<div class="alert alert-block alert-info">

## <center> <b> Stock Sentiment </b> </center>
## <center> Predicting market behavior from tweets </center> <br>
## <center> <b> FINAL MODEL </b> </center> <br>
## <center> Spring Semester 2024-2025 </center>

<center> Group 35: </center>
<center>Joana Esteves, 20240746 <br></center>
<center>José Cavaco, 20240513 <br></center>
<center>Leonardo Di Caterina, 20240485 <br></center>
<center>Matilde Miguel, 20240549 <br></center>
<center>Rita Serra, 20240515 <br></center>

</div>

# Imports

In [None]:
# General
import numpy as np
import pandas as pd

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

import warnings
# Globally silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Single random seed reused by the train/validation split and the CV splitter.
seed = 42

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import sys

# Put the parent directory on the import path so the `src` package imported
# below can be resolved when the notebook runs from its own sub-directory.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    # Guarded so that re-running this cell does not keep stacking duplicates.
    sys.path.append(project_root)

# Preprocess
from src.preprocessing import PreprocessingPretrained

# Model
from src.tranformer_encoder import TransformerEncoder

In [3]:
# Read the labelled training tweets and the unlabelled test tweets from disk.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)

In [None]:
# Hold out 20% of the labelled tweets for validation; stratifying on the label
# keeps the class proportions the same in both partitions.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['label'],
    stratify=train_df['label'],
    test_size=0.2,
    random_state=seed,
)

# The test set only provides the raw text column as model input.
X_test = test_df["text"]

In [6]:
# Light preprocessing: one preprocessor instance is applied, in order, to the
# training, validation and test texts.
preprocessor = PreprocessingPretrained(translate=True)

X_train_prep, X_val_prep, X_test_prep = [
    preprocessor.preprocess(texts) for texts in (X_train, X_val, X_test)
]

# Initialize model

In [4]:
# Model identifier of the FinBERT tone checkpoint handed to the encoder wrapper.
finbert = "yiyanghkust/finbert-tone"

# Three-class encoder built on top of that checkpoint.
Finbert = TransformerEncoder(
    model_name=finbert,
    base_model="BERT",
    num_classes=3,
)

# Cross-Validation

In [None]:
# Accumulator for one summary dict per cross-validated model.
results = list()

In [None]:
def _select_rows(data, positions):
    """Return the entries of ``data`` located at the integer ``positions``."""
    if hasattr(data, "iloc"):
        # pandas objects: plain ``data[positions]`` is a label lookup, which
        # fails or picks the wrong rows when the index is not 0..n-1
        # (e.g. after a shuffled train/test split).
        return data.iloc[positions]
    return np.asarray(data)[positions]


def cross_validate_transformer(encoder, X, y, model_name, k=5):
    """Run stratified k-fold cross-validation for a transformer encoder.

    For every fold ``encoder.train_predict`` is called on that fold's training
    and validation partitions and the metrics of the returned report are
    collected. A summary row is then appended to the module-level ``results``
    list and the whole list is printed.

    Parameters
    ----------
    encoder : object
        Must expose ``train_predict(X_train, y_train, X_val, y_val)`` returning
        ``(predictions, report)``. ``report`` is read as a mapping with the keys
        ``'accuracy'``, ``'macro avg'`` and ``'weighted avg'`` plus one entry
        per class, each class entry holding ``'precision'``, ``'recall'`` and
        ``'f1-score'``.
    X : pandas object or array-like
        Input samples; rows are selected by position.
    y : pandas object or array-like
        Labels, used both for stratification and as fold targets.
    model_name : str
        Stored under ``'Name'`` in the summary row.
    k : int, default 5
        Number of folds.

    Returns
    -------
    dict
        The summary row that was appended to ``results``: mean and standard
        deviation of accuracy, macro F1 and weighted F1 across folds, and the
        min/max per-class precision, recall and F1 over all folds.
    """
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

    accuracies, macro_f1, weighted_f1 = [], [], []
    class_precisions, class_recalls, class_f1s = [], [], []

    # Report entries that are aggregates rather than individual classes.
    aggregate_keys = ('accuracy', 'macro avg', 'weighted avg')

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):

        print(f"Training fold {fold}/{k}...")

        # The splitter yields integer positions, so select positionally.
        X_train, X_val = _select_rows(X, train_idx), _select_rows(X, val_idx)
        y_train, y_val = _select_rows(y, train_idx), _select_rows(y, val_idx)

        # NOTE(review): the same encoder instance is reused for every fold. If
        # train_predict fine-tunes it in place without re-initialising the
        # weights, later folds start from a model that has already seen their
        # validation rows -- confirm against the encoder implementation.
        _, report = encoder.train_predict(X_train, y_train, X_val, y_val)

        accuracies.append(report['accuracy'])
        macro_f1.append(report['macro avg']['f1-score'])
        weighted_f1.append(report['weighted avg']['f1-score'])

        # Per-class metrics: every report entry that is not an aggregate.
        for cls, metrics in report.items():
            if cls not in aggregate_keys:
                class_precisions.append(metrics['precision'])
                class_recalls.append(metrics['recall'])
                class_f1s.append(metrics['f1-score'])

    summary = {
        'Name': model_name,
        'CV_Accuracy': np.mean(accuracies),
        'CV_Accuracy_Std': np.std(accuracies),
        'CV_Macro_F1': np.mean(macro_f1),
        'CV_Macro_F1_Std': np.std(macro_f1),
        'CV_Weighted_F1': np.mean(weighted_f1),
        'CV_Weighted_F1_Std': np.std(weighted_f1),
        'Min_Class_Precision': np.min(class_precisions),
        'Max_Class_Precision': np.max(class_precisions),
        'Min_Class_Recall': np.min(class_recalls),
        'Max_Class_Recall': np.max(class_recalls),
        'Min_Class_F1': np.min(class_f1s),
        'Max_Class_F1': np.max(class_f1s)
    }
    results.append(summary)

    print(results)

    return summary

In [None]:
# `results` is a plain list of per-model summary dicts, so it has to be turned
# into a DataFrame before label-based row selection is possible.
results_df = pd.DataFrame(results)
if results_df.empty:
    raise RuntimeError(
        "No cross-validation results available: run "
        "cross_validate_transformer(...) for at least one model first."
    )

# Row of the model with the highest mean weighted F1 across folds.
winning_model = results_df.loc[results_df['CV_Weighted_F1'].idxmax()]

print("\n🏆 BEST PERFORMING MODEL:"
      f"\nName: {winning_model['Name']}"
      f"\nCV Accuracy: {winning_model['CV_Accuracy']:.4f} ± {winning_model['CV_Accuracy_Std']:.4f}"
      f"\nCV Macro F1: {winning_model['CV_Macro_F1']:.4f} ± {winning_model['CV_Macro_F1_Std']:.4f}"
      f"\nCV Weighted F1: {winning_model['CV_Weighted_F1']:.4f} ± {winning_model['CV_Weighted_F1_Std']:.4f}")

In [None]:
# Classification report of the winning model
print("\n📊 Classification Report of Winning Model:"
      "\nFINBERT")

# Train on the training split and evaluate on the held-out validation split.
predictions, Report_Finbert = Finbert.train_predict(X_train_prep, y_train, X_val_prep, y_val)

# Show the report announced above. A mapping-style report (as it is read in
# cross_validate_transformer) is laid out with one row per class/aggregate;
# anything else is printed as returned.
if isinstance(Report_Finbert, dict):
    print(pd.DataFrame(Report_Finbert).transpose())
else:
    print(Report_Finbert)

# Fix the label order once so matrix rows/columns and tick labels always agree.
class_labels = np.unique(y_val)
cm = confusion_matrix(y_val, predictions, labels=class_labels)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

# Predict for test set

In [7]:
# Pair every preprocessed test text with its id from the test file.
# The name X_test_prep is rebound to the resulting DataFrame, so the guard keeps
# a second run of this cell from feeding that DataFrame back in as the 'text'
# column.
if not isinstance(X_test_prep, pd.DataFrame):
    X_test_prep = pd.DataFrame({
        'text': X_test_prep,
        'id': test_df['id'].values
    })

In [8]:
# Same training call as before, with the test frame passed as an extra fifth
# argument; the second returned value is discarded.
# NOTE(review): presumably `predictions` now holds the test-set predictions and
# replaces the validation predictions stored under the same name -- confirm.
predictions, _ = Finbert.train_predict(
    X_train_prep,
    y_train,
    X_val_prep,
    y_val,
    X_test_prep,
)

Map: 100%|██████████| 8588/8588 [00:00<00:00, 19568.93 examples/s]
Map: 100%|██████████| 955/955 [00:00<00:00, 25650.03 examples/s]
Map: 100%|██████████| 2388/2388 [00:00<00:00, 25795.55 examples/s]


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1284,0.763706,0.784293,0.718657,0.748945,0.729604
2,0.893,0.779668,0.717277,0.70206,0.756658,0.697831
3,0.8014,0.699396,0.736126,0.694975,0.7474,0.697957
4,0.7725,0.572064,0.767539,0.70676,0.788413,0.731538
5,0.6931,0.527713,0.787435,0.720679,0.788546,0.745981
