In [1]:
# General
import numpy as np
import pandas as pd

# Model
import torch.nn.functional as F
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, EarlyStoppingCallback, Trainer
from datasets import Dataset, DatasetDict

# Imbalance
from sklearn.utils.class_weight import compute_class_weight

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
import os
sys.path.append(os.path.abspath('..'))

# Preprocess
from src.preprocessing import PreprocessingPretrained

In [3]:
# Load data
train_df = pd.read_csv('../Data/train.csv')

In [4]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    train_df['text'], train_df['label'], 
    test_size=0.2, stratify=train_df['label'], random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training label distribution:\n{pd.Series(y_train).value_counts()}")

Training set size: 7634
Test set size: 1909
Training label distribution:
label
2    4942
1    1538
0    1154
Name: count, dtype: int64


In [5]:
# Light preprocessing
preprocessor = PreprocessingPretrained(translate=True)

X_train = preprocessor.preprocess(X_train)

In [6]:
# Set labels 
label2id = {"bearish": 0, "bullish": 1, "neutral": 2}
id2label = {0: "bearish", 1: "bullish", 2: "neutral"}

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Compute class weights
class_weights_np = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights_np, dtype=torch.float).to(device)

print(class_weights_np)

[2.20508377 1.65452969 0.51490625]


In [9]:
# Load model and tokenizer - finbert trained on financial data
model_name = "yiyanghkust/finbert-tone"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [10]:
# Freeze body 
for param in model.bert.parameters():
    param.requires_grad = False

print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
# Create DatasetDict - format needed for pre trained model 
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
})

In [12]:
texts = train_df['text'].tolist()  

# Check lengths of tokenized sequences
tokenized_lengths = [len(tokenizer.encode(text)) for text in texts]

print("Max tokenized length:", max(tokenized_lengths))
print("Average tokenized length:", sum(tokenized_lengths) / len(tokenized_lengths))


Max tokenized length: 82
Average tokenized length: 28.33625884202253


In [13]:
# Tokenize

tokenized_datasets = datasets.map(lambda examples: tokenizer(examples['text'], padding="max_length", truncation=True, max_length=82), batched=True)

Map:   0%|          | 0/7634 [00:00<?, ? examples/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 7634/7634 [00:00<00:00, 22238.35 examples/s]
Map: 100%|██████████| 1909/1909 [00:00<00:00, 32983.16 examples/s]


In [14]:
# Define custom metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average="macro"),
        "recall": recall_score(labels, preds, average="macro"),
        "f1": f1_score(labels, preds, average="macro"),
    }

In [15]:
# Create loss with weights
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# Custom trainer to account for imbalance
class WeightedLossTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights).to(self.model.device)
        else:
            self.class_weights = None
        self.loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = self.loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [16]:
# Define training arguments for fine-tuning FinBERT model
training_args = TrainingArguments(
    output_dir="../logs/finbert-finetuned",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=0.01,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False # we are minimizing loss
)

In [17]:
# Initialize custom trainer with weighted loss to handle class imbalance
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [18]:
# Train
trainer.train()

# Predict
predictions = trainer.predict(tokenized_datasets['test'])

# Evaluate 
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids
print(classification_report(labels, preds, target_names=["bearish", "bullish", "neutral"]))

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.958646,0.769513,0.821281,0.584875,0.636648
2,0.816900,0.545053,0.808277,0.769266,0.709031,0.732238
3,0.718700,0.546769,0.808277,0.796807,0.687,0.726999
4,0.642000,0.506567,0.808277,0.747808,0.757868,0.751528
5,0.582100,0.482936,0.824515,0.791808,0.737478,0.760917


              precision    recall  f1-score   support

     bearish       0.76      0.62      0.69       288
     bullish       0.76      0.67      0.71       385
     neutral       0.85      0.92      0.88      1236

    accuracy                           0.82      1909
   macro avg       0.79      0.74      0.76      1909
weighted avg       0.82      0.82      0.82      1909

