In [None]:
#setup: import libraries
import json
import os
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Default (width, height) used for figures created later in this notebook.
plt.rcParams['figure.figsize'] = (8, 5)

In [None]:
# Each experiment corresponds to one trained model plus its evaluation JSON.

# Configure the experiment result files here. Each entry is a dict with:
#   'name'         - label used for the model in the tables and plots below
#   'results_path' - path to an evaluation_results.json file created by
#                    utils.evaluate_model

# Example: uncomment and adjust the paths once the files exist.
# EXPERIMENTS = [
#     {
#         'name': 'transformer_baseline',
#         'results_path': 'evaluation_transformer/evaluation_results.json',
#     },
#     {
#         'name': 'lstm_baseline',
#         'results_path': 'evaluation_lstm/evaluation_results.json',
#     },
# ]

# FILL THIS IN ONCE EVALUATION HAS BEEN RUN
EXPERIMENTS = []
# Echo the current configuration as the cell output.
EXPERIMENTS

In [None]:
#helper function to load eval results

def load_results(path: Path):
    """Load a single evaluation_results.json file if it exists.

    Parameters
    ----------
    path : Path or str
        Location of the JSON file to read. Plain strings are accepted and
        converted to ``Path``.

    Returns
    -------
    dict or None
        The parsed JSON content (expected to be a dict), or ``None`` when
        ``path`` does not point to a regular file. A warning is printed in
        the latter case.

    Raises
    ------
    json.JSONDecodeError
        If the file exists but does not contain valid JSON. This is not
        swallowed so that a corrupted results file is noticed.
    """
    path = Path(path)
    # Use is_file() rather than exists(): an empty path string becomes
    # Path('') == Path('.'), which exists but is a directory, and opening a
    # directory raises instead of being reported as a missing file.
    if not path.is_file():
        print(f"[WARN] Results file not found: {path}")
        return None
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with path.open('r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Build a small table summarizing each model:
#   - top-1 accuracy
#   - top-5 accuracy
#   - macro F1 score
#   - weighted F1 score
#   - number of evaluation samples

summary_rows = []

for exp in EXPERIMENTS:
    name = exp.get('name', 'unnamed')

    # A missing/empty 'results_path' would otherwise become Path(''), i.e. the
    # current directory, which is never a results file. Skip it explicitly.
    raw_results_path = exp.get('results_path')
    if not raw_results_path:
        print(f"[WARN] Experiment '{name}' has no 'results_path'; skipping.")
        continue

    results_path = Path(raw_results_path)
    res = load_results(results_path)
    if res is None:
        continue
    if not isinstance(res, dict):
        print(f"[WARN] Unexpected JSON structure in {results_path}; skipping.")
        continue

    # Only a dict-shaped report can provide the averaged F1 scores; anything
    # else (missing key, null, plain-text report) yields empty metrics.
    report = res.get('classification_report')
    if not isinstance(report, dict):
        report = {}
    macro = report.get('macro avg') or {}
    weighted = report.get('weighted avg') or {}

    summary_rows.append({
        'model': name,
        'accuracy': res.get('accuracy', None),
        'top5_accuracy': res.get('top5_accuracy', None),
        'macro_f1': macro.get('f1-score', None),
        'weighted_f1': weighted.get('f1-score', None),
        'num_samples': res.get('num_samples', None),
    })

# One row per successfully loaded experiment.
summary_df = pd.DataFrame(summary_rows)
if summary_df.empty:
    print("No evaluation results loaded yet.\n"
          "Once you have run utils.evaluate_model(...) for at least one model,\n"
          "add the corresponding JSON path to EXPERIMENTS above and re-run.")
else:
    display(summary_df)

In [None]:
# Simple bar plot comparing accuracy and F1 scores across models.
if 'summary_df' in globals() and not summary_df.empty:
    metrics_to_plot = ['accuracy', 'macro_f1', 'weighted_f1']

    # Work on a copy so the summary table shown earlier is left untouched.
    plot_df = summary_df.copy()

    # A metric is plottable only if its column exists AND holds at least one
    # numeric value. Columns that are entirely None (e.g. no classification
    # report in the JSON) would otherwise make DataFrame.plot raise
    # "no numeric data to plot".
    available_metrics = []
    for metric in metrics_to_plot:
        if metric not in plot_df.columns:
            continue
        plot_df[metric] = pd.to_numeric(plot_df[metric], errors='coerce')
        if plot_df[metric].notna().any():
            available_metrics.append(metric)

    if available_metrics:
        fig, ax = plt.subplots()
        plot_df.plot(
            x='model',
            y=available_metrics,
            kind='bar',
            ax=ax,
        )
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        # All plotted metrics are fractions, so fix the axis to [0, 1] to keep
        # different runs visually comparable.
        ax.set_ylim(0, 1)
        ax.set_ylabel('Score')
        ax.set_title('Model performance comparison')
        fig.tight_layout()
        plt.show()
    else:
        print("No numeric metrics available to plot.")
else:
    print("No summary table available yet; run the previous cell after loading results.")

In [None]:
# Per-class metrics for one selected model, for error analysis.

# Name of the experiment to inspect in detail, e.g. 'transformer_baseline'.
SELECTED_EXPERIMENT_NAME = None

# Only search when a name was actually chosen: comparing against None would
# otherwise match any EXPERIMENTS entry that has no 'name' key.
selected_exp = None
if SELECTED_EXPERIMENT_NAME is not None:
    for exp in EXPERIMENTS:
        if exp.get('name') == SELECTED_EXPERIMENT_NAME:
            selected_exp = exp
            break

if selected_exp is None:
    print("No SELECTED_EXPERIMENT_NAME set or experiment not found.\n"
          "Set SELECTED_EXPERIMENT_NAME to one of the names in EXPERIMENTS.")
elif not selected_exp.get('results_path'):
    # Report the configuration problem instead of failing with a KeyError.
    print(f"Experiment '{SELECTED_EXPERIMENT_NAME}' has no 'results_path' configured.")
else:
    res = load_results(Path(selected_exp['results_path']))
    if isinstance(res, dict):
        # Per-class rows can only be extracted from a dict-shaped report.
        report = res.get('classification_report')
        if not isinstance(report, dict):
            report = {}

        rows = []
        for label, stats in report.items():
            # Skip the global (non per-class) entries.
            if label in ['accuracy', 'macro avg', 'weighted avg']:
                continue
            if not isinstance(stats, dict):
                continue

            rows.append({
                'label': label,
                'precision': stats.get('precision', None),
                'recall': stats.get('recall', None),
                'f1': stats.get('f1-score', None),
                'support': stats.get('support', None),
            })

        per_class_df = pd.DataFrame(rows)
        if per_class_df.empty:
            print("No per-class statistics found in classification report.")
        else:
            # Sort by F1 descending; rows with a missing F1 end up last.
            per_class_df = per_class_df.sort_values('f1', ascending=False)
            print("Highest 10 F1-score classes:")
            display(per_class_df.head(10))
            print("\nLowest 10 F1-score classes:")
            display(per_class_df.tail(10))
    else:
        print("Could not load results for selected experiment.")

In [None]:
# Optional: show the saved confusion matrix image of every experiment.
# The image is looked up as 'confusion_matrix.png' next to the results JSON.
for exp in EXPERIMENTS:
    name = exp.get('name', 'unnamed')

    # Without a results path there is no directory to search; falling back to
    # the current directory could pick up an unrelated image.
    raw_results_path = exp.get('results_path')
    if not raw_results_path:
        print(f"No results_path configured for {name}; cannot locate a confusion matrix.")
        continue

    results_dir = Path(raw_results_path).parent
    cm_path = results_dir / 'confusion_matrix.png'
    if cm_path.is_file():
        print(f"Found confusion matrix for {name}: {cm_path}")
        # Actually display the image, as the cell description promises.
        fig, ax = plt.subplots()
        ax.imshow(plt.imread(str(cm_path)))
        ax.axis('off')
        ax.set_title(f"Confusion matrix: {name}")
        plt.show()
    else:
        print(f"No confusion matrix found for {name} (looked for {cm_path}).")