In [7]:
from darts.models import NLinearModel
from darts import TimeSeries
import json
import pandas as pd
from transformers import BertTokenizer, BertModel, AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
import numpy as np

In [2]:
"""
input: x_time and x_text

# LLM 2 ways
x_time: ("110.23") --> llm tokenizer --> time_embd_for_llm
x_text: input_id

# NLinear
x_time --> n linear --> time_embd_for_nlinear
x_text --> bert or llm embd --> text_embd for nlinear
"""

'\ninput: x_time and x_text\n\n# LLM 2 ways\nx_time: ("110.23") --> llm tokenizer --> time_embd_for_llm\nx_text: input_id\n\n# NLinear\nx_time --> n linear --> time_embd_for_nlinear\nx_text --> bert or llm embd --> text_embd for nlinear\n'

In [3]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

device = "cuda:3"
llm = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, )

# embeddings
embd_dim = 4096
window_size = 5
embed_timeseries = nn.Linear(1, embd_dim)

# Initialize weights and bias
nn.init.normal_(embed_timeseries.weight, mean=0.0, std=0.01)
nn.init.zeros_(embed_timeseries.bias)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:47<00:00, 11.90s/it]


Parameter containing:
tensor([0., 0., 0.,  ..., 0., 0., 0.], requires_grad=True)

In [22]:
# run Upload_finance.ipynb
# on path: "/data/kai/forecasting/data/summary/summary_with_price_v0.2/AMD"

data_path = "/data/kai/forecasting/data/test/AMD.csv"
df = pd.read_csv(data_path)

# n = df.shape[0]
# train_n = int(n * 0.8)
# train_df = df.iloc[:train_n]
# valid_df = df.iloc[train_n:]

def extract_inputs(df):
    time_series = df["input"].apply(lambda x: json.loads(x)["share_price"]).values
    input_time = np.array([time_series[i: i+window_size] for i in range(len(df) - window_size*2)])
    output_time = np.array([time_series[i: i+window_size] for i in range(window_size, len(df) - window_size)])
    # TODO: change to all instead of "summary"
    input_text = df["input"].apply(lambda x: json.loads(x)["summary"]).values.tolist()
    input_text = [input_text[i:i+window_size] for i in range(len(df) - window_size*2)]
    output_text = [input_text[i:i+window_size] for i in range(window_size, len(df)-window_size)]
    
    # input_text: N x T x len(str)
    return input_time, output_time, input_text, output_text

input_time, output_time, input_text, output_text = extract_inputs(df)
print("Input time:", len(input_time), len(input_time[0]), input_time[0])
print("Output time:", len(output_time), len(output_time[0]), output_time[0])
print("Input text:", len(input_text), len(input_text[0]), len(input_text[0][0]))
print("Output text:", len(output_text), len(output_text[0]), len(output_text[0][0]))

Input time: 267 5 [111.98 108.41 102.95 105.53 106.46]
Output time: 267 5 [104.29 102.25 109.33 115.37 111.69]
Input text: 267 5 159
Output text: 267 5 5


In [54]:
# 1) Use linear embedding

# B x window x 1
time_input = torch.tensor(input_time[0]).unsqueeze(0).unsqueeze(-1).type(torch.float32)
input_embeds_time = embed_timeseries(time_input).to(device)

# hidden_states = inputs_embeds
input_text_formatted = ','.join(input_text[0])
embeddings = tokenizer(input_text_formatted, padding=True, truncation=True, return_tensors="pt")["input_ids"]
input_embeds_text = llm.base_model.embed_tokens(embeddings[:window_size].to(device))

# concatenate
llm_embed = torch.cat((input_embeds_time, input_embeds_text), dim=1)
print(input_embeds_time.shape, input_embeds_text.shape, llm_embed.shape)

# or add embedding for each token
# tokenized[ day_1+time_embd_day_1, day_2+time_embd_day_1, day_3+time_embd_day_1...]

# or add some exponential weighting
# tokenized[ day_1+time_embd*[||___], day_2+time_embd_day_1*[__||___], day_3+time_embd_day_1*[____||]...]


# how to modify llama embedding
# transformers / models / llama / modeling_llama.py
# Class LlamaModel: def forward
# inputs_embeds
# line 945: hidden_states = inputs_embeds

torch.Size([1, 5, 4096]) torch.Size([1, 632, 4096]) torch.Size([1, 637, 4096])
