# In [21]:
import copy
from datasets import Dataset
import itertools
import pandas as pd
import os
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import AutoTokenizer

os.sys.path.append('/data/kai/forecasting/multimodal/financial')

from templates.PROMPTS import ForecstBaselinePrompts
from src.vllm import llm_chat

from queue import Queue
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from threading import Thread
import numpy as np

# In [27]:
# Hugging Face checkpoint whose tokenizer encodes every dialog in this file.
model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Number of consecutive rows rendered into one prompt; get_financial_dataset
# slides a span of 2 * window rows over each input file.
window = 5

# Every entry directly under <data_dir>/formatted, in lexicographic order.
data_dir = '/data/kai/forecasting/data'
formatted_paths = glob(os.path.join(data_dir, 'formatted') + "/*")
formatted_paths.sort()


# Instruction delimiters used by tokenize_dialog for tokenizers whose
# vocab_size is below 128000.
B_INST = "[INST]"
E_INST = "[/INST]"

def tokenize_dialog(dialog, tokenizer):
    """Turn a chat dialog into ``input_ids``, ``labels`` and ``attention_mask``.

    Two encodings are supported, selected by ``tokenizer.vocab_size``:

    * ``>= 128000``: the dialog is rendered with
      ``tokenizer.apply_chat_template`` and the last four tokens are dropped.
      Label masking is driven by the positions of token id 128009
      (presumably the Llama 3 ``<|eot_id|>`` id -- confirm against the
      tokenizer).
    * otherwise: even-indexed messages are wrapped in ``[INST] ... [/INST]``
      and fully masked; odd-indexed messages are encoded with a trailing EOS
      token and kept as labels.

    Args:
        dialog: Sequence of message dicts; the ``content`` key is read in the
            legacy branch, and the whole sequence is handed to the chat
            template otherwise.
        tokenizer: Object exposing ``vocab_size`` plus either
            ``apply_chat_template`` or ``encode``/``bos_token``/``eos_token``.

    Returns:
        Dict with ``input_ids``, ``labels`` (``-100`` at masked positions) and
        an all-ones ``attention_mask`` of the same length as ``input_ids``.
    """
    if tokenizer.vocab_size >= 128000:
        token_ids = tokenizer.apply_chat_template(dialog)
        # Drop the trailing generation prompt
        # <|start_header_id|>assistant<|end_header_id|>\n\n
        # NOTE(review): assumes the template always appends those 4 tokens -- confirm.
        token_ids = token_ids[:-4]
        label_ids = copy.copy(token_ids)
        eot_positions = [pos for pos, tok in enumerate(token_ids) if tok == 128009]

        # Spans closed by the 0th, 2nd, 4th, ... EOT are masked (from the
        # previously remembered EOT up to and including this one); odd-numbered
        # EOTs only move the start marker, so the span before them stays in
        # the labels.
        # NOTE(review): this presumes turns strictly alternate prompt/answer
        # starting with a prompt -- verify for dialogs with a system message.
        span_start = 0
        for turn, eot_pos in enumerate(eot_positions):
            if turn % 2 == 0:
                span_len = eot_pos - span_start + 1
                label_ids[span_start:eot_pos + 1] = [-100] * span_len
            else:
                span_start = eot_pos

        input_chunks = [token_ids]
        label_chunks = [label_ids]
    else:
        # Encode every prompt first, then every answer.
        encoded_prompts = []
        for turn in dialog[::2]:
            text = f"{tokenizer.bos_token}{B_INST} {(turn['content']).strip()} {E_INST}"
            encoded_prompts.append(tokenizer.encode(text, add_special_tokens=False))

        encoded_answers = []
        for turn in dialog[1::2]:
            text = f"{turn['content'].strip()} {tokenizer.eos_token}"
            encoded_answers.append(tokenizer.encode(text, add_special_tokens=False))

        # Interleave prompt/answer pairs; zip drops a trailing unanswered prompt.
        # Prompt positions become -100 so the loss function ignores them.
        input_chunks = []
        label_chunks = []
        for prompt_ids, answer_ids in zip(encoded_prompts, encoded_answers):
            input_chunks.extend((prompt_ids, answer_ids))
            label_chunks.extend(([-100] * len(prompt_ids), answer_ids))

    input_ids = list(itertools.chain.from_iterable(input_chunks))
    labels = list(itertools.chain.from_iterable(label_chunks))

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [1] * len(input_ids),
    }


def get_financial_dataset():
    """Build a tokenized ``Dataset`` of sliding-window price/summary prompts.

    Every path in the module-level ``formatted_paths`` is read with
    ``pandas.read_csv``; its ``price`` and ``summary`` columns are used. A
    span of ``2 * window`` consecutive rows is slid over the file one row at
    a time. The first ``window`` rows of each span are rendered into the user
    message as ``<Day N Price>{price}, summary={summary}`` entries joined by
    ``<SEP>``; the system message is
    ``ForecstBaselinePrompts(window).SYSTEM_PROMPT``. Each dialog is passed
    through ``tokenize_dialog`` with the module-level ``tokenizer``.

    Files with fewer than ``2 * window`` rows contribute no examples.

    Returns:
        The ``Dataset`` produced by ``Dataset.from_generator`` over the
        tokenized dialogs.
    """
    def get_dataset():
        """Yield one tokenized dialog per full ``2 * window`` row span."""
        for ticker_path in formatted_paths:
            ticker_df = pd.read_csv(ticker_path)
            prompts = ForecstBaselinePrompts(window)

            # Pull the columns once per file instead of once per span.
            prices = ticker_df['price'].values
            summaries = ticker_df['summary'].values

            # A span needs 2 * window rows (window of history followed by
            # window future rows), so the last valid start index is
            # len - 2 * window; the "+ 1" keeps that final span.
            for start in range(len(ticker_df) - window * 2 + 1):
                history = zip(prices[start:start + window],
                              summaries[start:start + window])
                x = '<SEP>'.join(
                    f'<Day {d+1} Price>{price}, summary={summary}'
                    for d, (price, summary) in enumerate(history)
                )

                # NOTE(review): the next `window` prices (rows
                # start+window .. start+2*window) were previously joined into
                # a ground-truth string that was never added to the dialog.
                # If this dataset is meant for supervised fine-tuning, an
                # assistant turn carrying those prices is still missing, and
                # tokenize_dialog's label masking must be checked against a
                # system/user/assistant layout before adding it.
                messages = [
                    {"role": "system", "content": prompts.SYSTEM_PROMPT},
                    {"role": "user", 'content': x},
                ]

                yield tokenize_dialog(messages, tokenizer)

    ds = Dataset.from_generator(get_dataset)
    return ds


# Output: Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
