In [None]:
# Input CSVs read by the "Load Data" cell below; both are later concatenated
# into a single frame. Paths are relative to the notebook's working directory.
ESTIMATION_FILE_PATH = 'combined_data_preprocessed_top25_14Jun24.csv'
OOS_FILE_PATH = 'combined_data_preprocessed_top25_14Jun24_test.csv'

In [None]:
import pandas as pd
import numpy as np

# Load Data

In [None]:
# Read the two input files into separate frames (estimation sample first,
# then the out-of-sample file).
est_df = pd.read_csv(ESTIMATION_FILE_PATH)
oos_df = pd.read_csv(OOS_FILE_PATH)

In [None]:
# Stack both files row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
all_df = pd.concat([est_df, oos_df], ignore_index=True)

In [None]:
# Parse the week-boundary columns into datetimes so `.dt.year` can be used on them later.
for date_col in ('From', 'To'):
  all_df[date_col] = pd.to_datetime(all_df[date_col])

In [None]:
# Display the distinct PERMCO identifiers present in the combined data.
all_df['PERMCO'].unique()

In [None]:
# PERMCO identifiers to process, in the order the extraction loops iterate them.
# NOTE(review): this looks like a hand-copied snapshot of the `unique()` output
# above — confirm it still matches the data whenever the input files change.
PERMCO_LIST = [ 2381, 11937, 20064, 20253, 20333, 20440, 20678, 20868, 20972,
       21322, 21401, 21576, 21771, 21795, 22113, 26024, 29122, 29139,
       29634, 34829, 34920, 35222, 37900, 38393, 42001]

# Functions

### Yearly Rolling Window

In [None]:
# Rolling-window geometry used by roll_window(): each window covers
# TRAIN_YEARS calendar years of training data followed by TEST_YEARS of test data.
TRAIN_YEARS = 11
TEST_YEARS = 1

In [None]:
def roll_window(df, start_year):
  """Split ``df`` into the train and test slices of one rolling window.

  The window opens at ``start_year``: the first ``TRAIN_YEARS`` calendar years
  form the training span and the next ``TEST_YEARS`` form the test span.
  A row is assigned to a span only when the years of both its ``From`` and
  ``To`` dates lie inside that span, so a row whose ``From`` is in the last
  training year and whose ``To`` is in the following year lands in neither
  slice.

  Args:
    df: DataFrame with datetime columns ``From`` and ``To``.
    start_year: first calendar year of the training span.

  Returns:
    ``(train_df, test_df)`` — two boolean-mask selections of ``df``.
  """
  first_train_year = start_year
  last_train_year = start_year + TRAIN_YEARS - 1
  last_test_year = last_train_year + TEST_YEARS

  from_year = df['From'].dt.year
  to_year = df['To'].dt.year

  in_train = (from_year >= first_train_year) & (to_year <= last_train_year)
  in_test = (from_year > last_train_year) & (to_year <= last_test_year)

  # Progress trace: first train year, last train year, last test year
  print(first_train_year, last_train_year, last_test_year)

  return df[in_train], df[in_test]

### LLM

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import AdamW
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tqdm import tqdm

In [None]:
## Short alias -> checkpoint name handed to `from_pretrained`.
## Insertion order matters: the model is later selected by position (MODEL_IDX).
model_options = dict([
    ('bert', 'bert-large-uncased'),
    ('roberta', 'roberta-large'),
    ('distilbert', 'distilbert-base-uncased'),
    ('distilroberta', 'distilroberta-base'),
    ('finbert', 'yiyanghkust/finbert-tone'),
])

In [None]:
class EarlyStopping:
    """Track validation loss across epochs and flag when training should stop.

    Call the instance once per epoch with that epoch's validation loss. After
    ``patience`` consecutive calls without an improvement, ``early_stop`` is
    set to True; the caller is responsible for checking it and breaking.
    """

    def __init__(self, patience=3, verbose=False, delta=0):
        """
        Args:
            patience (int): How many epochs to wait after last time validation loss improved.
                            Default: 3
            verbose (bool): If True, prints a message when early stopping is triggered.
                            Default: False
            delta (float): Minimum decrease in loss to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.best_loss = None   # lowest validation loss seen so far
        self.wait = 0           # consecutive epochs without improvement
        self.early_stop = False

    def __call__(self, val_loss):
        """Record one epoch's validation loss and update ``early_stop``.

        Args:
            val_loss (float): validation loss of the epoch that just finished.
        """
        if self.best_loss is None:
            # First observation becomes the baseline.
            self.best_loss = val_loss
            return

        # Only a strict decrease of more than `delta` counts as an improvement.
        # Written as `<` (not as the negation of `>`) so that a NaN loss, for
        # which every comparison is False, is treated as "no improvement"
        # instead of silently replacing best_loss and resetting the counter.
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.wait = 0
            return

        self.wait += 1
        if self.wait >= self.patience:
            self.early_stop = True
            if self.verbose:
                print(f"EarlyStopping counter: {self.wait} out of {self.patience}, stopping training.")



def validate_model(loader, model, device):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    The model is switched to eval mode and run without gradient tracking.
    Its previous train/eval mode is restored before returning, so calling this
    between training epochs does not leave the model stuck in eval mode
    (which would disable dropout for every later epoch).

    Args:
        loader: sized iterable of ``(input_ids, attention_mask, labels)``
            tensor tuples.
        model: ``torch.nn.Module`` that accepts ``labels=`` and returns an
            object with a ``.loss`` tensor.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        float: sum of the batch losses divided by ``len(loader)``.
    """
    was_training = model.training
    model.eval()
    total_loss = 0
    try:
        with torch.no_grad():
            for batch in loader:
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_mask, labels = batch

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                total_loss += outputs.loss.item()
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    avg_loss = total_loss / len(loader)
    return avg_loss

In [None]:
def get_embedding(tokenizer, llm_model,
                  need_finetune,
                  train_df, test_df,
                  batch_size, max_seq_length, finetune_epoch,
                  val_fraction=0.1):
  """Optionally fine-tune ``llm_model`` and return embeddings for both frames.

  Fine-tuning uses rows of ``train_df`` only. The trailing ``val_fraction`` of
  ``train_df`` (in its given row order) is held out to compute the validation
  loss that drives early stopping; ``test_df`` is never used for fitting or
  for early stopping, so the test-window embeddings stay out-of-sample.
  NOTE(review): the hold-out assumes ``train_df`` rows are in chronological
  order — confirm against the caller.

  Args:
    tokenizer: callable tokenizer; invoked with a list of texts and
      ``truncation``, ``padding``, ``return_tensors`` and ``max_length``.
    llm_model: classification model. It must return ``hidden_states`` from
      its forward pass (the callers load it with ``output_hidden_states=True``)
      and a ``.loss`` when ``labels`` are passed. It is modified in place when
      fine-tuning is requested.
    need_finetune: when truthy, fine-tune before extracting embeddings.
    train_df, test_df: frames with columns ``combined_text`` (input text) and
      ``moving_prc_dir`` (label).
    batch_size: DataLoader batch size.
    max_seq_length: tokenizer truncation length.
    finetune_epoch: maximum number of fine-tuning epochs.
    val_fraction: share of ``train_df`` held out for early stopping. If the
      hold-out would be empty or would consume the whole frame, all training
      rows are used for fitting and early stopping is skipped.

  Returns:
    ``(train_emb, test_emb)`` — NumPy arrays with one row per input row, taken
    from the last hidden layer at sequence position 0.
  """

  def _make_loader(df):
    """Tokenise ``df`` and wrap it in an order-preserving DataLoader."""
    texts = list(df['combined_text'])
    labels = torch.tensor(list(df['moving_prc_dir']))

    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt", max_length=max_seq_length)
    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)

    # shuffle=False keeps embeddings aligned with the rows of `df`
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

  def _extract(loader):
    """Run the model in eval mode and collect one embedding per row."""
    print('extracting embedding ...')

    llm_model.eval()
    chunks = []
    with torch.no_grad():
      for batch in tqdm(loader):
        input_ids, attention_mask, _ = tuple(t.to(device) for t in batch)

        outputs = llm_model(input_ids, attention_mask=attention_mask)
        cls_embeddings = outputs.hidden_states[-1][:, 0, :]  # [CLS] token is at index 0
        # Move each batch off the accelerator right away so device memory
        # does not grow with the size of the dataset.
        chunks.append(cls_embeddings.cpu())

    # Concatenate all batches and convert to NumPy for non-deep-learning models
    return torch.cat(chunks, dim=0).numpy()

  ## get DataLoader for each dataset
  train_loader = _make_loader(train_df)
  test_loader = _make_loader(test_df)

  # Move model to GPU
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  llm_model.to(device)

  ## ----- Fine-tune Training ----- ##

  if need_finetune:

    print('fine-tuning ...')

    # Hold out the tail of the training rows for early stopping
    n_val = int(len(train_df) * val_fraction)
    if 0 < n_val < len(train_df):
      fit_loader = _make_loader(train_df.iloc[:-n_val])
      val_loader = _make_loader(train_df.iloc[-n_val:])
    else:
      fit_loader, val_loader = train_loader, None

    early_stopping = EarlyStopping(patience=3, verbose=True)

    # Initialize optimizer
    optimizer = AdamW(llm_model.parameters(), lr=1e-4)

    # Training loop
    for epoch in range(finetune_epoch):
      # Re-enter train mode every epoch: validation switches the model to eval
      llm_model.train()

      for batch in tqdm(fit_loader):
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = llm_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

      if val_loader is not None:
        val_loss = validate_model(val_loader, llm_model, device)

        # Early Stopping check
        early_stopping(val_loss)
        if early_stopping.early_stop:
          print("Early stopping triggered")
          break

      # Loss of the last training batch of this epoch
      print(f"Epoch {epoch+1}, Loss: {loss.item()}")

  ## ----- Embedding Extraction ----- ##

  train_emb = _extract(train_loader)
  test_emb = _extract(test_loader)

  return train_emb, test_emb

### Regression

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import GridSearchCV

In [None]:
def get_performance(train_emb, train_label, test_emb, test_label):
  """Fit a grid-searched logistic regression on embeddings and score it.

  An L2-penalised ``LogisticRegression`` is tuned over
  ``C in {0.01, 0.1, 1, 10, 100}`` with 5-fold cross-validation on the
  training data, then evaluated on the test data.

  Args:
    train_emb, train_label: training features and labels.
    test_emb, test_label: test features and labels.

  Returns:
    ``(predictions, sentiment_score, accuracy, precision)`` where
    ``sentiment_score`` is column 1 of ``predict_proba`` on the test features.
  """
  c_grid = [{'C': [0.01, 0.1, 1, 10, 100]}]

  ## Train the Model
  search = GridSearchCV(estimator=LogisticRegression(penalty='l2'),
                        param_grid=c_grid,
                        cv=5)
  search.fit(train_emb, train_label)

  ## Score the test set
  predictions = search.predict(test_emb)
  class_proba = search.predict_proba(test_emb)
  sentiment_score = class_proba[:, 1]

  accuracy = accuracy_score(test_label, predictions)
  precision = precision_score(test_label, predictions)

  return predictions, sentiment_score, accuracy, precision

# Embedding Extraction

In [None]:
## Select your model: MODEL_IDX is a position in `model_options`
## (keys in insertion order).
## 0 = bert
## 1 = roberta
## 2 = distilbert
## 3 = distilroberta
## 4 = finbert

MODEL_IDX = 4

In [None]:
## Hyper-parameters passed to get_embedding()
BATCH_SIZE, MAX_SEQ_LENGTH, FINETUNE_EPOCH = 16, 512, 10

## Inclusive index range into PERMCO_LIST handled by the extraction loops
start_PERMCO_idx, end_PERMCO_idx = 0, 24

## Resolve the positional choice to its alias in model_options
model_option = list(model_options)[MODEL_IDX]
print(model_option)

# Pre-trained

In [None]:
import time
start_time = time.time()

emb_list = []

## ----- Call LLM Model ----- ##
## No fine-tuning happens in this cell (need_finetune is always False), so the
## weights are never updated and one tokenizer/model pair serves every stock.

print(model_option)
tokenizer = AutoTokenizer.from_pretrained(model_options[model_option])
## finbert is loaded with a 3-way head, every other checkpoint with a 2-way head
num_labels = 3 if model_option == 'finbert' else 2
llm_model = AutoModelForSequenceClassification.from_pretrained(model_options[model_option], num_labels=num_labels, output_hidden_states=True)

for PERMCO in PERMCO_LIST[start_PERMCO_idx:end_PERMCO_idx+1]:
  ## Filter Stock (copy, so the new column is not written into a slice of all_df)
  df = all_df[all_df['PERMCO']==PERMCO].copy()
  if df.empty:
    print(PERMCO, 'has no rows in all_df, skipping')
    continue

  ## Concat headlines from the same week
  df['combined_text'] = df[['PERMCO', 'Week', 'Text', 'From', 'To']].groupby(['PERMCO', 'Week', 'From', 'To'])['Text'].transform(lambda x: ' '.join(x))

  ## Drop duplicates of combined_text
  df = df.drop_duplicates(subset=['PERMCO', 'Week', 'From', 'To', 'combined_text'])
  df = df.drop(columns=['Text'])

  ## Get the TICKER id (first value seen for this stock; used for logging only)
  TICKER = df['TICKER'].iloc[0]
  print(PERMCO, TICKER)

  df = df[['Week', 'From', 'To', 'PERMCO', 'combined_text', 'moving_prc_dir', 'Return']].drop_duplicates()

  emb_by_year_list = []

  ## ----- Rolling Window ----- ##
  ## One window per start year 2005..2012

  for y in range(2005, 2013):
    train_df, test_df = roll_window(df, y)

    ## An empty slice cannot be tokenised; skip the window instead of aborting the run
    if train_df.empty or test_df.empty:
      print(f'{PERMCO} {y}: empty train or test window, skipping')
      continue

    ## Pre-trained embeddings only: never fine-tune in this cell
    need_finetune = False

    ## Generate Embeddings
    train_emb, test_emb = get_embedding(tokenizer, llm_model,
                                        need_finetune,
                                        train_df, test_df,
                                        BATCH_SIZE, MAX_SEQ_LENGTH, FINETUNE_EPOCH)

    ## Save the Embeddings
    train_df_tmp = train_df.copy()
    train_df_tmp['feature'] = train_emb.tolist()
    train_df_tmp['year'] = y
    train_df_tmp['datatype'] = 'train'

    test_df_tmp = test_df.copy()
    test_df_tmp['feature'] = test_emb.tolist()
    test_df_tmp['year'] = y
    test_df_tmp['datatype'] = 'test'

    df_emb = pd.concat([train_df_tmp, test_df_tmp], ignore_index=True)

    emb_by_year_list.append(df_emb)

  emb_list.extend(emb_by_year_list)

emb_all = pd.concat(emb_list, ignore_index=True)
emb_all.to_csv(f'./embeddings_{model_option}.csv', index=False)

print("--- %s mins ---" % ((time.time() - start_time)/60))

In [None]:
# Sanity check: number of saved embedding rows per stock (train and test rows of all windows combined).
emb_all.groupby('PERMCO')['datatype'].count()

# Fine-tuned

In [None]:
## with FINETUNING
import time
start_time = time.time()

## Fixed seed so repeated runs draw the same random numbers
## (e.g. for dropout while the model is in train mode)
SEED = 42

emb_list = []

for PERMCO in PERMCO_LIST[start_PERMCO_idx:end_PERMCO_idx+1]:
  ## Filter Stock (copy, so the new column is not written into a slice of all_df)
  df = all_df[all_df['PERMCO']==PERMCO].copy()
  if df.empty:
    print(PERMCO, 'has no rows in all_df, skipping')
    continue

  ## Concat headlines from the same week
  df['combined_text'] = df[['PERMCO', 'Week', 'Text', 'From', 'To']].groupby(['PERMCO', 'Week', 'From', 'To'])['Text'].transform(lambda x: ' '.join(x))

  ## Drop duplicates of combined_text
  df = df.drop_duplicates(subset=['PERMCO', 'Week', 'From', 'To', 'combined_text'])
  df = df.drop(columns=['Text'])

  ## Get the TICKER id (first value seen for this stock; used for logging only)
  TICKER = df['TICKER'].iloc[0]
  print(PERMCO, TICKER)

  df = df[['Week', 'From', 'To', 'PERMCO', 'combined_text', 'moving_prc_dir', 'Return']].drop_duplicates()

  ## ----- Call LLM Model ----- ##
  ## A fresh model is loaded for every stock because fine-tuning below
  ## modifies the weights in place.

  print(model_option)
  ## Re-seed per stock so a stock's result does not depend on which stocks ran before it
  torch.manual_seed(SEED)
  tokenizer = AutoTokenizer.from_pretrained(model_options[model_option])
  ## Every checkpoint gets a 2-way head here; only finbert is loaded with
  ## ignore_mismatched_sizes=True
  llm_model = AutoModelForSequenceClassification.from_pretrained(model_options[model_option], num_labels=2, ignore_mismatched_sizes=(model_option == 'finbert'), output_hidden_states=True)

  emb_by_year_list = []

  ## ----- Rolling Window ----- ##
  ## One window per start year 2005..2012

  for y in range(2005, 2013):
    train_df, test_df = roll_window(df, y)

    ## An empty slice cannot be tokenised; skip the window instead of aborting the run
    if train_df.empty or test_df.empty:
      print(f'{PERMCO} {y}: empty train or test window, skipping')
      continue

    ## Fine-tune every three years, starting with the first window (2005, 2008, 2011)
    need_finetune = (y - 2005) % 3 == 0

    ## Generate Embeddings
    train_emb, test_emb = get_embedding(tokenizer, llm_model,
                                        need_finetune,
                                        train_df, test_df,
                                        BATCH_SIZE, MAX_SEQ_LENGTH, FINETUNE_EPOCH)

    ## Save the Embeddings
    train_df_tmp = train_df.copy()
    train_df_tmp['feature'] = train_emb.tolist()
    train_df_tmp['year'] = y
    train_df_tmp['datatype'] = 'train'

    test_df_tmp = test_df.copy()
    test_df_tmp['feature'] = test_emb.tolist()
    test_df_tmp['year'] = y
    test_df_tmp['datatype'] = 'test'

    df_emb = pd.concat([train_df_tmp, test_df_tmp], ignore_index=True)

    emb_by_year_list.append(df_emb)

  emb_list.extend(emb_by_year_list)

  # llm_model.save_pretrained(f'./models/finetuned_{model_option}_{PERMCO}_every3Y_wEarlyStopping.csv')

emb_all = pd.concat(emb_list, ignore_index=True)
emb_all.to_csv(f'./embeddings_finetuned_{model_option}_every3Y_wEarlyStopping.csv', index=False)

print("--- %s mins ---" % ((time.time() - start_time)/60))

In [None]:
# Sanity check: number of saved embedding rows per stock (train and test rows of all windows combined).
emb_all.groupby('PERMCO')['datatype'].count()