# Model Evaluation Results

This notebook compares the performance of three transformer models (DistilBERT, MiniLM, and TinyBERT) for emotion classification. Models are evaluated on their accuracy and F1 scores (macro, micro, and weighted), with a per-class breakdown for each model. DistilBERT is used as the baseline model.

## Import Libraries

In [19]:
%pip install -q pandas matplotlib

import json
import pandas as pd

from pathlib import Path

# The repository root is taken to be the parent of the current working
# directory, so the kernel must be started from a folder directly under it.
REPO_ROOT = Path.cwd().resolve().parent

Note: you may need to restart the kernel to use updated packages.


## Load Metrics

In [20]:
# Directories of the runs to compare, all located under ml/artifacts.
ARTIFACT_ROOT = REPO_ROOT.joinpath("ml", "artifacts")

runs = [
    ARTIFACT_ROOT / run_name
    for run_name in ("distilbert_v1", "minilm_v1", "tinybert_v1")
]

In [21]:
# Collect the contents of each run's test_metrics.json into one DataFrame
# (one row per run). A run without a metrics file is reported and skipped so
# the remaining runs can still be compared.
rows = []

for run in runs:
    metrics_path = run / "test_metrics.json"
    if not metrics_path.exists():
        print(f"Missing metrics for {run}")
        continue

    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # falls back to the platform locale and can mis-decode the file.
    with open(metrics_path, encoding="utf-8") as f:
        metrics = json.load(f)

    rows.append(metrics)

df = pd.DataFrame(rows)
df

Unnamed: 0,model,config,timestamp,accuracy,macro_f1,micro_f1,weighted_f1,per_class_precision,per_class_recall,per_class_f1,per_class_support,confusion_matrix,label_names
0,distilbert/distilbert-base-uncased,distilbert.yaml,2025-12-26T02:28:57.105729,0.540126,0.528026,0.540126,0.529034,"[0.32857142857142857, 0.27586206896551724, 0.5...","[0.3194444444444444, 0.09523809523809523, 0.59...","[0.323943661971831, 0.1415929203539823, 0.5714...","[72, 84, 91, 73, 78, 71, 65, 80, 75, 76, 67, 7...","[[23, 0, 0, 0, 2, 5, 0, 0, 1, 0, 2, 0, 0, 2, 0...","[afraid, angry, annoyed, anticipating, anxious..."
1,microsoft/MiniLM-L12-H384-uncased,minilm.yaml,2025-12-26T14:38:44.721575,0.516129,0.49023,0.516129,0.497901,"[0.3111111111111111, 0.29411764705882354, 0.59...","[0.19444444444444445, 0.11904761904761904, 0.4...","[0.23931623931623933, 0.1694915254237288, 0.53...","[72, 84, 91, 73, 78, 71, 65, 80, 75, 76, 67, 7...","[[14, 0, 1, 0, 4, 2, 0, 1, 0, 0, 1, 0, 1, 2, 1...","[afraid, angry, annoyed, anticipating, anxious..."
2,huawei-noah/TinyBERT_General_4L_312D,tinybert.yaml,2025-12-26T02:23:37.844413,0.460268,0.44545,0.460268,0.44889,"[0.23255813953488372, 0.27586206896551724, 0.4...","[0.1388888888888889, 0.09523809523809523, 0.50...","[0.17391304347826086, 0.1415929203539823, 0.49...","[72, 84, 91, 73, 78, 71, 65, 80, 75, 76, 67, 7...","[[10, 0, 1, 1, 1, 1, 1, 1, 0, 0, 4, 1, 2, 0, 0...","[afraid, angry, annoyed, anticipating, anxious..."


## Clean & Sort Metrics

In [22]:
# Keep only the identifying columns and the headline scores, then rank the
# models from highest to lowest macro F1.
display_cols = ["model", "config", "accuracy", "macro_f1", "micro_f1", "weighted_f1"]

df_sorted = (
    df.loc[:, display_cols]
    .sort_values(by="macro_f1", ascending=False)
    .reset_index(drop=True)
)

df_sorted

Unnamed: 0,model,config,accuracy,macro_f1,micro_f1,weighted_f1
0,distilbert/distilbert-base-uncased,distilbert.yaml,0.540126,0.528026,0.540126,0.529034
1,microsoft/MiniLM-L12-H384-uncased,minilm.yaml,0.516129,0.49023,0.516129,0.497901
2,huawei-noah/TinyBERT_General_4L_312D,tinybert.yaml,0.460268,0.44545,0.460268,0.44889


## Per-Class Performance Comparison

In [23]:
# For every model, build a per-class metrics table and show its five best and
# five worst classes by F1, each with a colour gradient on the F1 column.

# Number formats shared by both tables.
metric_formats = {"Precision": "{:.4f}", "Recall": "{:.4f}", "F1": "{:.4f}", "Support": "{:.0f}"}

# Table column -> key in the metrics row that supplies it.
column_sources = (
    ("Emotion", "label_names"),
    ("Precision", "per_class_precision"),
    ("Recall", "per_class_recall"),
    ("F1", "per_class_f1"),
    ("Support", "per_class_support"),
)

# (ascending, colormap, vmin, vmax): top 5 first, then bottom 5.
ranked_views = (
    (False, "Blues", 0.55, 0.8),
    (True, "Reds", 0.0, 0.35),
)

for _, row in df.iterrows():
    # Drop the organisation prefix, e.g. "microsoft/MiniLM-..." -> "MiniLM-...".
    model_name = row["model"].rsplit("/", 1)[-1]
    print(f"Per-Class Metrics: {model_name}")

    per_class_df = pd.DataFrame({column: row.get(key) for column, key in column_sources})

    for ascending, cmap, vmin, vmax in ranked_views:
        ranked = per_class_df.sort_values("F1", ascending=ascending).head(5)
        styled = ranked.style.format(metric_formats).background_gradient(
            subset=["F1"], cmap=cmap, vmin=vmin, vmax=vmax
        )
        display(styled)

Per-Class Metrics: distilbert-base-uncased


Unnamed: 0,Emotion,Precision,Recall,F1,Support
23,lonely,0.7471,0.8333,0.7879,78
12,disgusted,0.6372,0.8471,0.7273,85
9,content,0.6375,0.6711,0.6538,76
7,caring,0.6279,0.675,0.6506,80
24,nostalgic,0.5285,0.8442,0.65,77


Unnamed: 0,Emotion,Precision,Recall,F1,Support
1,angry,0.2759,0.0952,0.1416,84
6,ashamed,0.2321,0.2,0.2149,65
22,joyful,0.3333,0.253,0.2877,83
0,afraid,0.3286,0.3194,0.3239,72
28,sentimental,0.6286,0.2418,0.3492,91


Per-Class Metrics: MiniLM-L12-H384-uncased


Unnamed: 0,Emotion,Precision,Recall,F1,Support
23,lonely,0.7927,0.8333,0.8125,78
21,jealous,0.806,0.6585,0.7248,82
12,disgusted,0.6111,0.7765,0.6839,85
9,content,0.6111,0.7237,0.6627,76
24,nostalgic,0.5323,0.8571,0.6567,77


Unnamed: 0,Emotion,Precision,Recall,F1,Support
6,ashamed,0.0625,0.0154,0.0247,65
1,angry,0.2941,0.119,0.1695,84
28,sentimental,0.5,0.1429,0.2222,91
0,afraid,0.3111,0.1944,0.2393,72
15,faithful,0.6,0.18,0.2769,50


Per-Class Metrics: TinyBERT_General_4L_312D


Unnamed: 0,Emotion,Precision,Recall,F1,Support
23,lonely,0.6374,0.7436,0.6864,78
24,nostalgic,0.5138,0.7273,0.6022,77
12,disgusted,0.4958,0.6941,0.5784,85
15,faithful,0.5556,0.6,0.5769,50
21,jealous,0.6964,0.4756,0.5652,82


Unnamed: 0,Emotion,Precision,Recall,F1,Support
1,angry,0.2759,0.0952,0.1416,84
0,afraid,0.2326,0.1389,0.1739,72
6,ashamed,0.2564,0.1538,0.1923,65
5,apprehensive,0.3191,0.2113,0.2542,71
22,joyful,0.3387,0.253,0.2897,83


## Insights

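DistilBERT and MiniLM perform comparably, with DistilBERT achieving slightly higher accuracy (0.540 vs. 0.516) and macro F1 (0.528 vs. 0.490). TinyBERT consistently underperforms on both accuracy (0.460) and macro F1 (0.445). Most errors appear to come from semantically overlapping emotions that all models struggle to distinguish; for example, angry, afraid, and ashamed are among the five lowest-F1 classes for every model.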

**Conclusion**

Use DistilBERT for offline or service inference, and MiniLM for client-side or browser use due to its smaller size.
