<div class="alert alert-block alert-info">

## <center> <b> Stock Sentiment </b> </center>
## <center> Predicting market behavior from tweets </center> <br>
## <center> <b> TRANSFORMER ENCODERS </b> </center> <br>
## <center> Spring Semester 2024-2025 </center>

<center> Group 35: </center>
<center>Joana Esteves, 20240746 <br></center>
<center>José Cavaco, 20240513 <br></center>
<center>Leonardo Di Caterina, 20240485 <br></center>
<center>Matilde Miguel, 20240549 <br></center>
<center>Rita Serra, 20240515 <br></center>

</div>

# Imports

In [None]:
# General
import numpy as np
import pandas as pd

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Model
from transformers import AutoTokenizer


# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

import warnings
# Suppress every Python warning from here on. NOTE(review): this also hides
# deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

# Single random seed shared by the hold-out split and the stratified k-fold splitter.
seed = 42

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
import os
# Add the parent of the current working directory to the import path so that
# the project's `src` package can be imported from this notebook.
sys.path.append(os.path.abspath('..'))

# Preprocess
from src.preprocessing import PreprocessingPretrained

# Model
# NOTE(review): the module is spelled `tranformer_encoder` (missing "s");
# presumably this matches the file name on disk - confirm before renaming.
from src.tranformer_encoder import TransformerEncoder

In [3]:
# Load data
# The frame is expected to expose the 'text' and 'label' columns used by the cells below.
train_df = pd.read_csv('../Data/train.csv')

# Initialize models

In [4]:
# Checkpoint identifiers; they are passed to AutoTokenizer.from_pretrained and
# to TransformerEncoder(model_name=...) in later cells.
bertweet = "vinai/bertweet-base"
finbert = "yiyanghkust/finbert-tone"

In [5]:
# Initialize models
# One three-class encoder per checkpoint. Only num_classes, model_name and
# base_model are set here; every other TransformerEncoder setting is left at
# the class's own default.
# NOTE(review): base_model presumably selects the architecture family of the
# checkpoint - confirm against src/tranformer_encoder.py.

Finbert = TransformerEncoder(num_classes=3, model_name=finbert, base_model="BERT")
Bertweet = TransformerEncoder(num_classes=3, model_name=bertweet, base_model="ROBERTA")

# Test pipeline

In [6]:
# Hold out 20% of the labelled tweets for validation. The split is stratified
# on the label column so both parts keep the same class proportions, and it is
# seeded so the same rows are selected on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['label'],
    test_size=0.2,
    stratify=train_df['label'],
    random_state=seed,
)

In [10]:
# Measure how many tokens each training tweet produces under each model's own
# tokenizer, then print the maximum, mean and 95th percentile per tokenizer.
tokenizer_bertweet = AutoTokenizer.from_pretrained(bertweet)
tokenizer_finbert = AutoTokenizer.from_pretrained(finbert)


def _token_lengths(tokenizer, texts):
    """Return the number of tokens `tokenizer.tokenize` yields for each text."""
    return [len(tokenizer.tokenize(text)) for text in texts]


lengths_bertweet = _token_lengths(tokenizer_bertweet, X_train)
lengths_finbert = _token_lengths(tokenizer_finbert, X_train)

for _header, _lengths in (
    ('-----BERTWEET-----', lengths_bertweet),
    ('-----FINBERT-----', lengths_finbert),
):
    print(_header)
    print(f'Max tokens in train set: {max(_lengths)}')
    print(f"Mean tokens: {np.mean(_lengths):.2f}")
    print(f"95th percentile tokens: {np.percentile(_lengths, 95)}")

-----BERTWEET-----
Max tokens in train set: 99
Mean tokens: 23.20
95th percentile tokens: 42.0
-----FINBERT-----
Max tokens in train set: 80
Mean tokens: 26.31
95th percentile tokens: 51.0


In [7]:
# Light preprocessing
# The same preprocessor instance is applied to the training and validation
# texts separately, after the split.
# NOTE(review): what translate=True does is defined in src/preprocessing.py
# and is not visible from this notebook.
preprocessor = PreprocessingPretrained(translate=True)

X_train_prep = preprocessor.preprocess(X_train)
X_val_prep = preprocessor.preprocess(X_val)

In [None]:
# Train FinBERT on the preprocessed training split and evaluate on the
# validation split; returns the validation predictions and a report.
predictions, Report_Finbert = Finbert.train_predict(X_train_prep, y_train, X_val_prep, y_val)

Map: 100%|██████████| 7634/7634 [00:00<00:00, 20217.97 examples/s]
Map: 100%|██████████| 1909/1909 [00:00<00:00, 29335.42 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.027896,0.792038,0.75792,0.70073,0.718998
2,1.043900,0.685775,0.756941,0.691084,0.746419,0.708894
3,0.885900,0.669761,0.80461,0.743891,0.75125,0.747187
4,0.785800,0.628902,0.706129,0.660319,0.761056,0.675595
5,0.715600,0.586313,0.781561,0.712696,0.76343,0.73312


In [9]:
# Show the validation report returned by the FinBERT run.
print(Report_Finbert)

              precision    recall  f1-score   support

     bearish       0.58      0.74      0.65       288
     bullish       0.66      0.75      0.70       385
     neutral       0.90      0.80      0.85      1236

    accuracy                           0.78      1909
   macro avg       0.71      0.76      0.73      1909
weighted avg       0.80      0.78      0.79      1909



In [None]:
# Same train/evaluate call for BERTweet. Note that `predictions` is rebound
# here, so the FinBERT predictions from the earlier cell are no longer available.
predictions, Report_Bertweet = Bertweet.train_predict(X_train_prep, y_train, X_val_prep, y_val)

Map: 100%|██████████| 7634/7634 [00:01<00:00, 5697.25 examples/s]
Map: 100%|██████████| 1909/1909 [00:00<00:00, 5539.27 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.059872,0.724463,0.437578,0.520217,0.475235
2,1.819300,1.403098,0.179675,0.385113,0.348166,0.118038
3,1.496900,1.054533,0.563122,0.736489,0.50686,0.406404
4,1.359000,0.913722,0.691461,0.608907,0.56039,0.489438
5,1.173600,0.820468,0.735987,0.626729,0.635606,0.63083


In [9]:
# Show the validation report returned by the BERTweet run.
print(Report_Bertweet)

              precision    recall  f1-score   support

     bearish       0.44      0.44      0.44       288
     bullish       0.58      0.62      0.60       385
     neutral       0.86      0.84      0.85      1236

    accuracy                           0.74      1909
   macro avg       0.63      0.64      0.63      1909
weighted avg       0.74      0.74      0.74      1909



# Tuning

In [None]:
# Initialize models with final parameters
# Both encoders share the same settings (batch size 16, learning rate 3e-5,
# 10 epochs); only the checkpoint and base_model differ.
Finbert_tuned = TransformerEncoder(num_classes=3, model_name=finbert, base_model="BERT", batch_size=16, learning_rate=3e-5, num_epochs=10)
Bertweet_tuned = TransformerEncoder(num_classes=3, model_name=bertweet, base_model="ROBERTA", batch_size=16, learning_rate=3e-5, num_epochs=10)

# Evaluation

In [None]:
# Light preprocessing
# Preprocess the full labelled set; X and y are the inputs for cross-validation.
# NOTE(review): in the "Test pipeline" section preprocess() is given a Series
# of texts, whereas here it is given the whole DataFrame and the result is
# indexed by column - confirm that preprocess() supports both and returns a
# DataFrame that still has 'text' and 'label' columns.
preprocessor = PreprocessingPretrained(translate=True)

train_df_prep = preprocessor.preprocess(train_df)

X = train_df_prep["text"]
y = train_df_prep["label"]

In [None]:
# Notebook-level accumulator: cross_validate_transformer appends one summary
# dict per evaluated model. Re-running this cell discards earlier summaries.
results = []

In [None]:
def cross_validate_transformer(encoder, X, y, model_name, k=5):
    """Run stratified k-fold cross-validation for one encoder and record a summary.

    For every fold, ``encoder.train_predict`` is called on that fold's
    train/validation split and the metrics of the report it returns are
    collected. A summary row is then appended to the notebook-level
    ``results`` list, the whole list is printed, and the row is returned.

    Parameters
    ----------
    encoder : object
        Must expose ``train_predict(X_train, y_train, X_val, y_val)`` returning
        ``(predictions, report)``. ``report`` has to be a nested mapping with
        the keys ``'accuracy'``, ``'macro avg'``, ``'weighted avg'`` and one
        entry per class holding ``'precision'``, ``'recall'`` and ``'f1-score'``.
    X : pandas.Series or array-like
        Input texts.
    y : pandas.Series or array-like
        Labels aligned with ``X``; also used to stratify the folds.
    model_name : str
        Stored under ``'Name'`` in the summary row.
    k : int, default 5
        Number of folds.

    Returns
    -------
    dict
        The summary row that was appended to ``results``: mean and standard
        deviation of accuracy, macro F1 and weighted F1 across folds, plus the
        minimum and maximum per-class precision, recall and F1 over all folds.

    Raises
    ------
    TypeError
        If ``train_predict`` returns the report as plain text instead of a
        mapping, since the metrics cannot be read from it.

    Notes
    -----
    NOTE(review): the same ``encoder`` object is trained in every fold. This
    only gives independent folds if ``train_predict`` re-initialises the model
    weights on each call - confirm in src/tranformer_encoder.py.
    """

    def _rows(data, positions):
        # StratifiedKFold.split yields positional indices. Plain `data[positions]`
        # on a pandas Series is a label lookup, which selects the wrong rows (or
        # raises KeyError) whenever the index is not 0..n-1, e.g. after filtering.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

    # One value per fold.
    fold_accuracy = []
    macro_f1 = []
    weighted_f1 = []

    # One value per (fold, class) pair.
    all_class_precisions = []
    all_class_recalls = []
    all_class_f1s = []

    aggregate_keys = ('accuracy', 'macro avg', 'weighted avg')

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):

        print(f"Training fold {fold}/{k}...")

        X_train, X_val = _rows(X, train_idx), _rows(X, val_idx)
        y_train, y_val = _rows(y, train_idx), _rows(y, val_idx)

        _predictions, report = encoder.train_predict(X_train, y_train, X_val, y_val)

        if isinstance(report, str):
            raise TypeError(
                "train_predict returned a text report; cross_validate_transformer "
                "needs the report as a dict (per-class and average metrics)."
            )

        fold_accuracy.append(report['accuracy'])
        macro_f1.append(report['macro avg']['f1-score'])
        weighted_f1.append(report['weighted avg']['f1-score'])

        # Per-class metrics: every report entry that is not an aggregate.
        for cls, metrics in report.items():
            if cls not in aggregate_keys:
                all_class_precisions.append(metrics['precision'])
                all_class_recalls.append(metrics['recall'])
                all_class_f1s.append(metrics['f1-score'])

    summary = {
        'Name': model_name,
        'CV_Accuracy': np.mean(fold_accuracy),
        'CV_Accuracy_Std': np.std(fold_accuracy),
        'CV_Macro_F1': np.mean(macro_f1),
        'CV_Macro_F1_Std': np.std(macro_f1),
        'CV_Weighted_F1': np.mean(weighted_f1),
        'CV_Weighted_F1_Std': np.std(weighted_f1),
        'Min_Class_Precision': np.min(all_class_precisions),
        'Max_Class_Precision': np.max(all_class_precisions),
        'Min_Class_Recall': np.min(all_class_recalls),
        'Max_Class_Recall': np.max(all_class_recalls),
        'Min_Class_F1': np.min(all_class_f1s),
        'Max_Class_F1': np.max(all_class_f1s)
    }
    results.append(summary)

    print(results)

    return summary

In [None]:
# Tabulate the cross-validation summaries collected in `results`.
# NOTE(review): in a top-to-bottom run `results` is still the empty list at
# this point because the cross_validate_transformer calls come in later cells,
# so this frame has no rows unless the cell is re-run after them.
successful_results = pd.DataFrame(results)

In [None]:
# Cross-validate the tuned FinBERT encoder on the full preprocessed set.
# X and y are required positional arguments of cross_validate_transformer.
cross_validate_transformer(Finbert_tuned, X, y, model_name="Finbert")

In [None]:
# Cross-validate the tuned BERTweet encoder on the full preprocessed set.
# X and y are required positional arguments of cross_validate_transformer.
cross_validate_transformer(Bertweet_tuned, X, y, model_name="Bertweet")

# Build the comparison table now that the cross-validation summaries exist in
# `results`; the plotting and model-selection cells below read this frame.
successful_results = pd.DataFrame(results)

In [None]:
# Visualize results
# 2x2 figure: accuracy bars, macro-F1 bars, accuracy vs. its standard
# deviation, and a text panel listing the top models per metric.
# Nothing is drawn when the results table has no rows.

if len(successful_results) > 0:
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

    x_pos = np.arange(len(successful_results))

    def _bar_panel(ax, mean_col, std_col, title, ylabel, **bar_kwargs):
        """Draw one bar per model with the fold standard deviation as error bar."""
        ax.bar(x_pos, successful_results[mean_col],
               yerr=successful_results[std_col], capsize=5, **bar_kwargs)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xticks(x_pos)
        ax.set_xticklabels(successful_results['Name'], rotation=45, ha='right')
        ax.grid(True, alpha=0.3)

    # Accuracy comparison
    _bar_panel(ax1, 'CV_Accuracy', 'CV_Accuracy_Std',
               'Cross-Validation Accuracy Comparison', 'Accuracy')

    # F1 Score comparison
    _bar_panel(ax2, 'CV_Macro_F1', 'CV_Macro_F1_Std',
               'Cross-Validation Macro F1 Comparison', 'Macro F1 Score',
               color='orange')

    # Performance vs Standard Deviation
    ax3.scatter(successful_results['CV_Accuracy'], successful_results['CV_Accuracy_Std'])
    for name, acc_mean, acc_std in zip(successful_results['Name'],
                                       successful_results['CV_Accuracy'],
                                       successful_results['CV_Accuracy_Std']):
        # Label each point with the part of the name before ' - '.
        ax3.annotate(name.split(' - ')[0], (acc_mean, acc_std),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)
    ax3.set_xlabel('CV Accuracy')
    ax3.set_ylabel('CV Accuracy Std')
    ax3.set_title('Accuracy vs Consistency Trade-off')
    ax3.grid(True, alpha=0.3)

    # Ranking by different metrics: up to three best model names per metric
    metrics = ['CV_Accuracy', 'CV_Macro_F1', 'CV_Weighted_F1']
    rankings = {
        metric: successful_results.nlargest(3, metric)['Name'].tolist()
        for metric in metrics
    }

    ax4.axis('off')
    text_parts = ["🏆 TOP PERFORMERS:\n\n"]
    for metric in metrics:
        text_parts.append(f"{metric.replace('CV_', '').replace('_', ' ')}:\n")
        text_parts.extend(
            f"  {rank}. {name.split(' - ')[0]}\n"
            for rank, name in enumerate(rankings[metric], start=1)
        )
        text_parts.append("\n")
    ranking_text = "".join(text_parts)

    ax4.text(0.1, 0.9, ranking_text, transform=ax4.transAxes,
             fontsize=12, verticalalignment='top', fontfamily='monospace')

    plt.suptitle('Pipeline Performance Analysis', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# Select the row with the highest mean weighted F1 across folds and print its
# headline cross-validation metrics as mean ± standard deviation.
best_row_label = successful_results['CV_Weighted_F1'].idxmax()
winning_model = successful_results.loc[best_row_label]

summary_parts = [
    "\n🏆 BEST PERFORMING MODEL:",
    f"\nName: {winning_model['Name']}",
    f"\nCV Accuracy: {winning_model['CV_Accuracy']:.4f} ± {winning_model['CV_Accuracy_Std']:.4f}",
    f"\nCV Macro F1: {winning_model['CV_Macro_F1']:.4f} ± {winning_model['CV_Macro_F1_Std']:.4f}",
    f"\nCV Weighted F1: {winning_model['CV_Weighted_F1']:.4f} ± {winning_model['CV_Weighted_F1_Std']:.4f}",
]
print("".join(summary_parts))

In [None]:
# Classification report and confusion matrix of the winning model.
# "\F" is not a valid escape sequence (it printed a literal backslash); the
# model name belongs on its own line, hence "\n".
print("\n📊 Classification Report of Winning Model:"
      "\nFINBERT")

# NOTE(review): the model name is hard-coded rather than taken from
# winning_model['Name'], and this retrains the default-parameter `Finbert`
# instance on the hold-out split instead of `Finbert_tuned` - confirm that
# this is the intended model.
predictions, Report_Finbert = Finbert.train_predict(X_train_prep, y_train, X_val_prep, y_val)

# The header above announces the report, so actually show it.
print(Report_Finbert)

class_labels = np.unique(y_val)
cm = confusion_matrix(y_val, predictions)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()