In [None]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from torch.utils.data import DataLoader, TensorDataset
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import wandb
from transformers import pipeline
from datasets import DatasetDict, Dataset
import llm_models

In [None]:
# Authenticate with Weights & Biases using the key stored in the WANDB_API_KEY
# environment variable, so no credential is hardcoded in the notebook.
# os.getenv returns None when the variable is unset; that value is passed to wandb.login unchanged.
wandb_api_key = os.getenv("WANDB_API_KEY")
wandb.login(key=wandb_api_key)

## Data Processing

In [None]:
# Load the labelled training CSV and the second CSV ("valid_..." on disk), which this
# notebook uses as its held-out test set. Both paths are relative to the working directory.
train_file_path = r'./data/train_fi_twitter_data.csv'
test_file_path = r'./data/valid_fi_twitter_data.csv'
training_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

In [None]:
# Run the project's shared cleaning routine (llm_models.data_cleaning) on both raw frames.
training_data_processed = llm_models.data_cleaning(training_data)
test_data_processed = llm_models.data_cleaning(test_data)

In [None]:
# Hold out 20% of the cleaned training rows for validation. The split is made on the
# frame's index labels with a fixed random_state, and the rows are then selected via .loc.
train_indices, valid_indices = train_test_split(
    training_data_processed.index, test_size=0.2, random_state=42
)
train_data_processed = training_data_processed.loc[train_indices]
valid_data_processed = training_data_processed.loc[valid_indices]

In [None]:
def clean_df(df):
    """Reduce a frame to complete ('text', 'label') rows with non-list labels.

    Rows whose 'label' value is a Python list are discarded, only the 'title'
    and 'label' columns are kept, rows with a missing value in either column
    are dropped, and the two columns are renamed to 'text' and 'label'.

    Args:
        df: DataFrame that has at least the columns 'title' and 'label'.

    Returns:
        A new two-column DataFrame ('text', 'label'); the original row index
        labels of the surviving rows are preserved.
    """
    label_is_list = df['label'].apply(lambda value: isinstance(value, list))
    scalar_rows = df[~label_is_list]
    cleaned = scalar_rows[['title', 'label']].dropna()
    cleaned.columns = ['text', 'label']
    return cleaned

In [None]:
# Bring every frame into the two-column ('text', 'label') layout used by the modelling cells.
# NOTE: this cell is not idempotent — clean_df selects the 'title' column and renames it to
# 'text', so running the cell a second time on the already-cleaned frames raises KeyError.
train_data_processed = clean_df(train_data_processed)
valid_data_processed = clean_df(valid_data_processed)
test_data_processed = clean_df(test_data_processed)
training_data_processed = clean_df(training_data_processed)

In [None]:
# Preview the cleaned training split (rendered as the cell's output).
train_data_processed

In [None]:
# The topic categories; a category's position in this list is its integer class id.
label_list = [
    "Analyst Update",
    "Fed and Central Banks",
    "Company and Product News",
    "Treasuries and Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy and Oil",
    "Financials",
    "Currencies",
    "General News and Opinion",
    "Gold and Metals and Materials",
    "IPO",
    "Legal and Regulation",
    "M&A and Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]
# Lookup tables in both directions between class id and category name.
ids = range(len(label_list))
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

## Modeling

### Baseline

In [None]:
# Baseline: TF-IDF bag-of-words features + logistic regression, trained on the full
# training set and evaluated on the test set.
# NOTE: `model` is rebound to a transformers model by later cells in this notebook.
X_train = training_data_processed['text']
y_train = training_data_processed['label']
X_test = test_data_processed['text']
y_test = test_data_processed['label']

# Initialize a TF-IDF Vectorizer
# The vocabulary is fitted on the training text only; the test text is transformed with it.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Score the predictions: plain accuracy and support-weighted F1.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Baseline Accuracy: {accuracy:.2f}")
print(f"Baseline F1 Score: {f1:.2f}")

### DistilBERT

#### Tokenize

In [None]:
# for bert
def encode_text(texts, tokenizer):
    """Batch-encode a list of strings with a fixed set of tokenizer options.

    Args:
        texts: the strings to encode.
        tokenizer: object exposing ``batch_encode_plus`` (e.g. a Hugging Face tokenizer).

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns when called with special
        tokens added, padding and truncation enabled, ``max_length=128``,
        attention masks requested and ``return_tensors='pt'``.
    """
    encoding_options = dict(
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=128,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.batch_encode_plus(texts, **encoding_options)
def get_inputs(data_processed, device, tokenizer):
    """Tokenize the 'text' column of a frame and move the encoded tensors to a device.

    Args:
        data_processed: DataFrame with a 'text' column.
        device: target passed to each encoded value's ``.to(...)`` (e.g. 'cuda' or 'cpu').
        tokenizer: tokenizer handed to ``encode_text``.

    Returns:
        A plain dict with one entry per key of the ``encode_text`` output, each value
        moved onto ``device``.
    """
    texts = data_processed['text'].values.tolist()
    encoded = encode_text(texts, tokenizer)
    # The previously built TensorDataset was never used or returned, so it is not created.
    return {key: value.to(device) for key, value in encoded.items()}

In [None]:
# Shared DistilBERT configuration: checkpoint name, its tokenizer, and the torch device.
# NOTE(review): batch_size is not referenced by any other visible cell.
batch_size = 8
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the GPU when torch reports one as available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Tokenize every split once with the DistilBERT tokenizer; get_inputs also moves the
# encoded tensors onto `device`. `training_*` is the full training set, while
# `train_*` / `valid_*` are its 80/20 split.
training_inputs = get_inputs(training_data_processed, device, tokenizer)
train_inputs = get_inputs(train_data_processed, device, tokenizer)
valid_inputs = get_inputs(valid_data_processed, device, tokenizer)
test_inputs = get_inputs(test_data_processed, device, tokenizer)


#### Direct classification

In [None]:
# Load DistilBERT with a sequence-classification head sized to label_list and score it
# on the test set without any training in this notebook.
# NOTE(review): the checkpoint is the base model, so the classification head is presumably
# freshly initialized and these metrics are only a reference point — confirm.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id)
model.to(device)
accuracy, f1 = llm_models.direct_classification(model, test_data_processed, test_inputs)

#### Feature extractor

In [None]:
# Load the DistilBERT model via AutoModel for use as a feature extractor and move it to `device`.
# NOTE(review): num_labels is passed although no classification head is requested here —
# presumably it has no effect; confirm.
model = AutoModel.from_pretrained(model_name, num_labels=len(label_list))
model.to(device)

In [None]:
# Compute hidden-state features for the full training set and for the test set.
# NOTE(review): `model` is not passed to get_hidden_states — presumably the helper obtains
# the model some other way; confirm against llm_models.
training_df_hidden = llm_models.get_hidden_states(training_inputs)
test_df_hidden = llm_models.get_hidden_states(test_inputs)

In [None]:
# Fit a logistic-regression classifier on the DistilBERT hidden-state features and
# evaluate it on the test set.
x_train = training_df_hidden
y_train = training_data_processed['label'].values.tolist()

x_test = test_df_hidden
y_test = test_data_processed['label'].values.tolist()

# Create a Logistic Regression model (or any other classifier)
classifier = LogisticRegression(max_iter=1000)

# Train the classifier
classifier.fit(x_train, y_train)

# Make predictions
y_pred = classifier.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Both metrics are fractions in [0, 1], so they are printed without a percent sign
# (the same format the baseline cell uses).
print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

#### Fine Tuning

In [None]:
# Convert each split's encoded tensors into plain nested lists (non-tensor values are
# kept as they are) so they can be loaded into pandas / datasets objects.
# NOTE(review): only the tokenizer outputs are stored here — no label column is attached;
# confirm that fine_tune_bert obtains the labels elsewhere.
train_dict, valid_dict, test_dict = (
    {
        name: tensor.cpu().numpy().tolist() if isinstance(tensor, torch.Tensor) else tensor
        for name, tensor in encoded_split.items()
    }
    for encoded_split in (train_inputs, valid_inputs, test_inputs)
)

# One Dataset per split, built through an intermediate DataFrame.
train_ds, valid_ds, test_ds = (
    Dataset.from_pandas(pd.DataFrame(columns))
    for columns in (train_dict, valid_dict, test_dict)
)

# Bundle the three splits under the keys 'train', 'valid' and 'test'.
dataset_dict = DatasetDict({'train': train_ds, 'valid': valid_ds, 'test': test_ds})

In [None]:
# Fine-tuning run 'earlyStop2': 10 epochs, learning rate 5e-6, weight decay 1e-2.
# The metrics are stored under the run's own index (_2, matching lr_initial_2 and
# weight_decay_2) so that the 'earlyStop3' run's assignment to metrics_df_3 does not
# overwrite them.
num_train_epochs = 10
lr_initial_2 = 5e-6
weight_decay_2 = 1e-2
metrics_df_2 = llm_models.fine_tune_bert(model_name, 'earlyStop2', dataset_dict, num_train_epochs, lr_initial_2, weight_decay_2)

In [None]:
# Fine-tuning run 'earlyStop3': 10 epochs, learning rate 2e-6, weight decay 1e-2.
num_train_epochs = 10
lr_initial_3 = 2e-6
weight_decay_3 = 1e-2
metrics_df_3 = llm_models.fine_tune_bert(model_name, 'earlyStop3', dataset_dict, num_train_epochs, lr_initial_3, weight_decay_3)

In [None]:
# Fine-tuning run 'earlyStop4': 4 epochs, learning rate 5e-5, weight decay 5e-2.
# The metrics are stored under the run's own index (_4), matching lr_initial_4,
# weight_decay_4 and the run name, instead of under an unrelated index.
num_train_epochs = 4
lr_initial_4 = 5e-5
weight_decay_4 = 5e-2
metrics_df_4 = llm_models.fine_tune_bert(model_name, 'earlyStop4', dataset_dict, num_train_epochs, lr_initial_4, weight_decay_4)

In [None]:
# Fine-tuning run 'earlyStop5': 4 epochs, learning rate 6e-5, weight decay 5e-3.
num_train_epochs = 4
lr_initial_5 = 6e-5
weight_decay_5 = 5e-3
metrics_df_5 = llm_models.fine_tune_bert(model_name, 'earlyStop5', dataset_dict, num_train_epochs, lr_initial_5, weight_decay_5)

In [None]:
# Fine-tuning run 'earlyStop6': 4 epochs, learning rate 8e-6, weight decay 5e-3.
num_train_epochs = 4
lr_initial_6 = 8e-6
weight_decay_6 = 5e-3
metrics_df_6 = llm_models.fine_tune_bert(model_name, 'earlyStop6', dataset_dict, num_train_epochs, lr_initial_6, weight_decay_6)

### DistilGPT2

In [None]:
# Rebind the shared checkpoint name to DistilGPT2 for the cells in this section.
model_name = 'distilgpt2'

#### Direct Classification

In [None]:
# DistilGPT2 must be fed inputs produced by its own tokenizer: `test_inputs` was encoded
# with the DistilBERT tokenizer, whose token ids refer to a different vocabulary.
# The GPT-2 tokenizer has no padding token by default, so the end-of-sequence token is
# reused for padding; padded batch encoding in get_inputs requires one.
gpt2_tokenizer = AutoTokenizer.from_pretrained(model_name)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
gpt2_test_inputs = get_inputs(test_data_processed, device, gpt2_tokenizer)

# Load DistilGPT2 with a sequence-classification head sized to label_list and score it
# on the test set without any training in this notebook.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id)
# The classification head needs to know the padding id to handle padded batches.
model.config.pad_token_id = gpt2_tokenizer.pad_token_id
model.to(device)
accuracy, f1 = llm_models.direct_classification(model, test_data_processed, gpt2_test_inputs)

#### Prompting

In [None]:
# Zero-shot ("basic") prompt: states the task, lists the categories, and ends with an
# open "Category:" slot; the {text} placeholder is filled in per example.
base_prompt_template = """
Please classify the text into one of the following financial categories:
Analyst Update, Fed and Central Banks, Company and Product News, Treasuries and Corporate Debt, Dividend, Earnings, Energy and Oil, Financials, Currencies, General News and Opinion, Gold and Metals and Materials, IPO, Legal and Regulation, M&A and Investments, Macro, Markets, Politics, Personnel Change, Stock Commentary, Stock Movement
Text:{text}
Category:
"""

In [None]:
# Zero-shot evaluation: run the base prompt through a text-generation pipeline built from
# `model_name` and score it on the first 20 test rows.
# NOTE(review): sampling is enabled (do_sample=True) and no seed is set in this cell, so
# results may differ between runs.
pipeline_kwargs={"max_new_tokens":200,"top_p":0.95,"do_sample":True,"top_k":50,"temperature":0.1,"repetition_penalty":2.0}
# device=-1 — presumably selects the CPU; confirm against the pipeline documentation.
llm = HuggingFacePipeline.from_model_id(model_id=model_name,task="text-generation",device=-1, pipeline_kwargs=pipeline_kwargs)
ending = ",\nCategory:"
base_prompt = PromptTemplate(input_variables=["text"], template=base_prompt_template)
# NOTE(review): prompt_test is called here with four arguments (PromptTemplate, llm, data,
# ending), while the few-shot cell calls it with five (template string, input dict, llm,
# data, ending) — confirm which signature llm_models.prompt_test expects.
metrics, test_result_df = llm_models.prompt_test(base_prompt, llm, test_data_processed.iloc[:20], ending)
print(metrics)

In [None]:
# Few-shot examples: load the hand-picked example rows. The file is read from the
# project-relative ./data directory, like the other CSVs this notebook loads.
example_df = pd.read_csv(r'./data/training_data_processed_example.csv')
# Keep only rows flagged in 'asExample'; a missing flag counts as "not an example".
example_df['asExample'] = example_df['asExample'].fillna(0)
example_df = example_df[example_df['asExample']!=0]
print(example_df.shape,example_df.columns)
# Same ('text', 'label') layout as the other processed frames.
example_df = example_df[['title', 'label']]
example_df.columns = ['text', 'label']
example_df=example_df.reset_index(drop=True)
# Render every example as a "Text: ... / Category: ..." pair, mapping the stored label
# through id2label to its category name.
example_list = [
    f"Text: {text}\nCategory: {id2label[label]}"
    for text, label in zip(example_df['text'], example_df['label'])
]
examples = "\n".join(example_list)


In [None]:
# Few-shot prompt: same instruction and category list as the base prompt, with an
# {examples} placeholder inserted before the {text} to classify.
prompt_template = """
Please classify the text into one of the following financial categories:
Analyst Update, Fed and Central Banks, Company and Product News, Treasuries and Corporate Debt, Dividend, Earnings, Energy and Oil, Financials, Currencies, General News and Opinion, Gold and Metals and Materials, IPO, Legal and Regulation, M&A and Investments, Macro, Markets, Politics, Personnel Change, Stock Commentary, Stock Movement
{examples}
Text: {text}
Category:
"""

In [None]:
# Collect every whitespace-separated word that occurs in the train or test text.
all_text = train_data_processed['text'].tolist() + test_data_processed["text"].tolist()
all_words = set(" ".join([str(x) for x in all_text]).split())

# Get unknown words
# NOTE: this rebinds the shared `model_name` from 'distilgpt2' to 'gpt2'.
model_name = 'gpt2'
tokenizer1 = GPT2Tokenizer.from_pretrained(model_name)
# Load the model unconditionally: the following cell uses `model1` whether or not any
# token ends up being added below.
model1 = GPT2LMHeadModel.from_pretrained(model_name)
vocab = tokenizer1.get_vocab()
# Sorted so that the order in which tokens are added does not depend on set iteration order.
unknown_words = sorted(word for word in all_words if word not in vocab)
if unknown_words:
    num_added_toks = tokenizer1.add_tokens(unknown_words)
    print(f"Added {num_added_toks} tokens to the tokenizer vocabulary.")
    # Resize the embedding matrix to the enlarged tokenizer vocabulary.
    model1.resize_token_embeddings(len(tokenizer1))
    

In [None]:
# Few-shot evaluation: wrap `model1` / `tokenizer1` in a text-generation pipeline and
# score the few-shot prompt on the first 20 test rows.
# NOTE(review): sampling is enabled (do_sample=True) and no seed is set in this cell, so
# results may differ between runs.
pipeline_kwargs={"max_new_tokens":20,"top_p":0.95,"do_sample":True,"top_k":50,"temperature":0.1,"repetition_penalty":2.0}
# Reuse the end-of-sequence id as the padding id on the model config.
model1.config.pad_token_id = model1.config.eos_token_id
# NOTE(review): pad_token_id=50256 below is hardcoded — presumably GPT-2's end-of-text id,
# i.e. the same value as the eos_token_id assigned above; confirm.
pipe = pipeline(
    task="text-generation",
    model=model1,
    pad_token_id=50256,
    tokenizer=tokenizer1,
    **pipeline_kwargs,
    # max_length=20,
    truncation=True,
)
llm1 = HuggingFacePipeline(pipeline=pipe)

ending = "\nCategory:"
# 'text' is left as None here — presumably prompt_test fills it in per test row; confirm.
input_dict = {'examples':examples, 'text':None}
# NOTE(review): this call passes the raw template string plus input_dict (five arguments),
# while the zero-shot cell passes a PromptTemplate and four arguments — confirm which
# signature llm_models.prompt_test expects.
metrics, test_result_df = llm_models.prompt_test(prompt_template, input_dict, llm1, test_data_processed.iloc[:20], ending)
print(metrics)

#### Fine Tuning

In [None]:
# Fine-tuning run 'ForSEC' on DistilGPT2: 4 epochs, learning rate 6e-5, weight decay 5e-3.
# NOTE(review): unfreeze_layer is defined but not passed to fine_tune_gpt — confirm
# whether it was meant to be an argument.
# NOTE(review): dataset_dict was built from inputs encoded with the DistilBERT tokenizer —
# confirm that fine_tune_gpt re-tokenizes, otherwise the token ids do not belong to
# DistilGPT2's vocabulary.
num_train_epochs = 4
lr_initial_1 = 6e-5
weight_decay_1 = 5e-3
unfreeze_layer = 2
model_name ='distilgpt2'
metrics_df_1 = llm_models.fine_tune_gpt(model_name, 'ForSEC', dataset_dict, num_train_epochs, lr_initial_1, weight_decay_1)