# Results Analysis


In [8]:
import sys
from pyprojroot import here
sys.path.insert(0, str(here()))
from src.utils import get_pooled_df
from src.data import get_fold_from_disk, DECODED_LABELS
from collections import Counter
from sklearn.metrics import classification_report, f1_score
import numpy as np

In [9]:
# Majority-class baseline: for every cross-validation fold, predict the most
# frequent training label for each validation example and score with macro-F1.
full_df = get_pooled_df()

# Single source of truth for the fold count, so the loop range and the `k`
# passed to the splitter cannot drift apart.
n_folds = 5

f1_scores = []
for fold in range(n_folds):
    train_fold, val_fold = get_fold_from_disk(full_df, fold=fold, k=n_folds, seed=7)

    # The "model" is just the most common label seen in this fold's training split.
    majority_label = Counter(train_fold["exaggeration_label"]).most_common(1)[0][0]
    y_true = val_fold["exaggeration_label"]
    y_pred = [majority_label] * len(y_true)

    # zero_division=0 yields the same score as the default ("warn" also
    # substitutes 0) but states the choice explicitly.
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    f1_scores.append(macro_f1)
    print(f"Fold {fold}: macro_f1={macro_f1:.4f}")

print(f"\nMajority-class baseline: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

# NOTE: y_true / y_pred still hold the values from the last loop iteration, so
# this per-class report describes the final fold only, not all folds pooled.
# The baseline never predicts the minority classes, so their precision is 0/0;
# zero_division=0 reports it as 0 (unchanged from the default) without
# emitting an UndefinedMetricWarning for each averaging mode.
print(
    classification_report(
        y_true,
        y_pred,
        target_names=list(DECODED_LABELS.values()),
        digits=4,
        zero_division=0,
    )
)

Fold 0: macro_f1=0.2523
Fold 1: macro_f1=0.2523
Fold 2: macro_f1=0.2523
Fold 3: macro_f1=0.2535
Fold 4: macro_f1=0.2555

Majority-class baseline: 0.2532 ± 0.0012
              precision    recall  f1-score   support

   downplays     0.0000    0.0000    0.0000        22
        same     0.6212    1.0000    0.7664        82
 exaggerates     0.0000    0.0000    0.0000        28

    accuracy                         0.6212       132
   macro avg     0.2071    0.3333    0.2555       132
weighted avg     0.3859    0.6212    0.4761       132



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# The baseline always predicts the majority class, 'same', as expected: in the final fold's report above it reaches 0.62 precision and 0.77 F1 on that class, while precision, recall, and F1 are all 0 for 'downplays' and 'exaggerates', which it never predicts. The mean macro-F1 across folds is 0.25; because macro-F1 weights every class equally, it reflects the balance of precision and recall across all classes rather than just the dominant one. These are the reference metrics against which we measure improvement in our experiments. The standard deviation of macro-F1 across the five folds is 0.0012. With variance this low, the scores are tightly clustered around the mean, so we can be confident that the baseline performance is consistent across folds. This matters because our dataset is significantly imbalanced and we are using k-fold cross-validation.