In [12]:
import copy
import datasets
import itertools
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import AutoTokenizer

# Put the project directory on the import path for the project-local imports
# that follow. `sys` is imported directly rather than reached through the
# undocumented `os.sys` attribute, and the membership guard keeps repeated
# executions of this cell from stacking duplicate entries on `sys.path`.
import sys

FINANCIAL_PROJECT_DIR = '/data/kai/forecasting/multimodal/financial'
if FINANCIAL_PROJECT_DIR not in sys.path:
    sys.path.append(FINANCIAL_PROJECT_DIR)

from templates.PROMPTS import ForecstBaselinePrompts
from src.vllm import llm_chat

from queue import Queue
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from threading import Thread
import numpy as np

In [62]:
# Ticker symbol whose baseline forecast results are loaded below.
ticker = 'AAPL'

# Directory holding the results; created up front if it is missing.
save_dir = '/data/kai/forecasting/results'
os.makedirs(name=save_dir, exist_ok=True)

# Read '<ticker>_baseline.csv' from the results directory into a DataFrame.
baseline_path = os.path.join(save_dir, f'{ticker}_baseline.csv')
df = pd.read_csv(filepath_or_buffer=baseline_path)

In [63]:
# Preview the first rows of the baseline results.
# NOTE(review): the "Unnamed: 0" column in the output looks like a DataFrame
# index that was written to the CSV; if so, reading with index_col=0 would
# drop it — confirm against the code that wrote the file.
df.head()

Unnamed: 0,Result,Ground Truth
0,"184.21,181.95,179.69,178.43,176.17","190.29, 198.5, 195.21, 187.61, 186.63"
1,"202.15, 204.91, 207.67, 210.43, 213.19","187.61, 186.63, 192.03, 203.63, 207.84"
2,"194.85, 197.41, 200.12, 202.95, 205.73","198.5, 195.21, 187.61, 186.63, 192.03"
3,"203.12, 205.89, 208.75, 211.69, 214.63","195.21, 187.61, 186.63, 192.03, 203.63"
4,"192.11, 194.57, 197.03, 199.49, 201.95","186.63, 192.03, 203.63, 207.84, 216.49"


In [16]:

# Instruction delimiters placed around each user prompt by the
# small-vocabulary branch of tokenize_dialog.
B_INST = "[INST]"
E_INST = "[/INST]"

def tokenize_dialog(dialog, tokenizer):
    """Tokenize a chat dialog and build loss labels that ignore the prompt turns.

    Args:
        dialog: Sequence of message dicts. The small-vocabulary branch reads
            ``message["content"]`` and treats even positions as prompts and
            odd positions as answers; the large-vocabulary branch hands the
            whole sequence to ``tokenizer.apply_chat_template``.
        tokenizer: Object exposing ``vocab_size`` plus either
            ``apply_chat_template`` (when ``vocab_size >= 128000``) or
            ``encode``, ``bos_token`` and ``eos_token`` (otherwise).

    Returns:
        dict with three flat lists of equal length: ``input_ids``, ``labels``
        (``-100`` wherever the token is excluded from the loss) and
        ``attention_mask`` (all ones).
    """
    if tokenizer.vocab_size >= 128000:
        dialog_tokens = tokenizer.apply_chat_template(dialog)
        # Remove generation prompt <|start_header_id|>assistant<|end_header_id|>\n\n
        # NOTE(review): assumes the chat template always appends that 4-token
        # generation prompt — confirm for the tokenizer version in use.
        dialog_tokens = dialog_tokens[:-4]
        # Positions of the end-of-turn token id hard-coded for this branch.
        eot_indices = [i for i, n in enumerate(dialog_tokens) if n == 128009]
        labels = copy.copy(dialog_tokens)
        last_idx = 0
        for n, idx in enumerate(eot_indices):
            if n % 2 == 1:
                # Odd end-of-turn markers close an answer turn. The next masked
                # span must start AFTER this marker; starting on it (the former
                # `last_idx = idx`) overwrote the answer's own end-of-turn label
                # with -100 for every answer except the last one.
                last_idx = idx + 1
            else:
                # Even end-of-turn markers close a prompt turn: ignore everything
                # from the end of the previous answer through this marker.
                labels[last_idx:idx+1] = [-100] * (idx-last_idx+1)

        dialog_tokens = [dialog_tokens]
        labels_tokens = [labels]
    else:
        # Prompts are wrapped as "<bos>[INST] ... [/INST]"; answers end with the eos token.
        prompt_tokens = [tokenizer.encode(f"{tokenizer.bos_token}{B_INST} {(prompt['content']).strip()} {E_INST}", add_special_tokens=False) for prompt in dialog[::2]]
        answer_tokens = [tokenizer.encode(f"{answer['content'].strip()} {tokenizer.eos_token}", add_special_tokens=False) for answer in dialog[1::2]]
        # Interleave prompt/answer pairs; zip drops a trailing prompt that has no answer.
        dialog_tokens = list(itertools.chain.from_iterable(zip(prompt_tokens, answer_tokens)))

        # Add labels, convert prompt token to -100 in order to ignore in loss function
        labels_tokens = [len(c)*[-100,] if i % 2 == 0 else c for i, c in enumerate(dialog_tokens)]

    # Flatten the per-turn token lists into one sequence each.
    combined_tokens = {
        "input_ids": list(itertools.chain(*(t for t in dialog_tokens))),
        "labels": list(itertools.chain(*(t for t in labels_tokens))),
    }

    return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))


def get_financial_dataset(dataset_config, tokenizer, split):
    """Build a tokenized multi-turn dialog dataset from ``OpenAssistant/oasst1``.

    The message tree of the corpus is expanded into one row per root-to-leaf
    conversation thread, each thread is turned into an alternating
    user/assistant dialog, and every dialog is tokenized with
    ``tokenize_dialog``.

    Args:
        dataset_config: Not used by this function.
        tokenizer: Tokenizer forwarded to ``tokenize_dialog``.
        split: Split name forwarded to ``datasets.load_dataset``.

    Returns:
        The mapped dataset whose rows are the dicts produced by
        ``tokenize_dialog``.
    """
    dataset = datasets.load_dataset("OpenAssistant/oasst1", split=split)

    # Keep only the three columns needed to rebuild the conversation trees.
    dataset = dataset.map(lambda sample: {
        "message_id": sample["message_id"],
        "parent_id": sample["parent_id"],
        "text": sample["text"],
        },
        batched=True,
        remove_columns=list(dataset.features),)

    nodes = {}         # parent message_id -> list of child message_ids
    messages = {}      # message_id -> text
    root_ids = set()   # ids of messages without a parent; a set keeps the filter lookup O(1)

    for data in dataset:
        if data["parent_id"]:
            nodes.setdefault(data["parent_id"], []).append(data["message_id"])
        else:
            root_ids.add(data["message_id"])
        messages[data["message_id"]] = data["text"]

    def follow(thread, current_id):
        """Return every thread that extends `thread` through `current_id` down to a leaf."""
        # List concatenation builds a new list, so the caller's thread is not mutated.
        thread = thread + [messages[current_id]]
        if current_id in nodes:
            new_threads = []
            for next_id in nodes[current_id]:
                new_threads += follow(thread, next_id)
            return new_threads
        else:
            return [thread]

    def get_threads_from_root(root_id):
        """Return all root-to-leaf threads that start at `root_id`."""
        all_threads = []
        thread = [messages[root_id]]
        # A root without replies has no entry in `nodes`; it contributes no thread
        # instead of raising KeyError.
        for cid in nodes.get(root_id, []):
            all_threads += follow(thread, cid)
        return all_threads

    dataset = dataset.filter(lambda x: x["message_id"] in root_ids)
    dataset = dataset.map(lambda x: {"thread": get_threads_from_root(x["message_id"])}, remove_columns=list(dataset.features))
    # Batched flatten: each root row holds a list of threads; emit one row per thread.
    dataset = dataset.map(lambda x: {"thread": [i for row in x["thread"] for i in row]}, batched=True)

    def to_dialog(thread):
        """Assign alternating roles to the messages of a thread, starting with "user"."""
        dialog = []
        for i, content in enumerate(thread):
            dialog.append({
                "role": "user" if i % 2 == 0 else "assistant",
                "content": content,
            })
        return {"dialog": dialog}

    dataset = dataset.map(lambda x: to_dialog(x["thread"]), remove_columns=list(dataset.features))
    dataset = dataset.map(lambda x: tokenize_dialog(x["dialog"], tokenizer), remove_columns=list(dataset.features))

    return dataset

In [17]:
# Checkpoint name used to load the tokenizer.
model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the tokenized training split; no dataset config is passed.
dataset = get_financial_dataset(
    dataset_config=None,
    tokenizer=tokenizer,
    split="train",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 84437/84437 [00:01<00:00, 61143.86 examples/s]
Filter: 100%|██████████| 84437/84437 [00:28<00:00, 2986.73 examples/s]
Map: 100%|██████████| 9846/9846 [00:01<00:00, 7905.57 examples/s] 
Map: 100%|██████████| 9846/9846 [00:00<00:00, 9897.09 examples/s] 
Map: 100%|██████████| 44042/44042 [00:02<00:00, 20018.76 examples/s]
Map: 100%|██████████| 44042/44042 [00:52<00:00, 843.84 examples/s] 


In [20]:
# Inspect which fields a tokenized example carries.
dataset[0].keys()

dict_keys(['input_ids', 'labels', 'attention_mask'])

In [24]:
# Decode the first example's input ids back to text to check the chat format.
# print() renders the newlines of the decoded string instead of showing its repr.
print(tokenizer.decode(dataset[0]['input_ids']))

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.

Recent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industr

In [None]:
# Prepare the financial dataset in the following format:
# {
#     '100', 'summary...',
#     '105', 'summary...'
# }