# 📊🧪 Literature Screening – Model Evaluation  
Aggregate results for the **train** and **test** splits, then report metrics and confusion matrices for each split separately.

In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 1 – Imports and helpers 🔌               ║
# ╚════════════════════════════════════════════════╝
import json
from pathlib import Path
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, f1_score,
                             confusion_matrix, classification_report)

# Apply seaborn's "whitegrid" look globally so every figure drawn by the
# cells below shares the same style.
sns.set(style="whitegrid")

In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 2 – Discover model folders 🔍            ║
# ╚════════════════════════════════════════════════╝
# Root folder expected to contain one sub-folder per evaluated model. A
# sub-folder named "datasets" is skipped because it is not treated as a model.
outputs_root = Path("outputs")

# Fail early with an explicit message (including the working directory)
# instead of the bare traceback that ``iterdir()`` produces on a missing path.
if not outputs_root.exists():
    raise FileNotFoundError(
        f"Results folder '{outputs_root}/' not found (cwd: {Path.cwd()})"
    )

# ``iterdir()`` yields entries in arbitrary, filesystem-dependent order;
# sorting by name keeps the printed list, and anything that later iterates
# over ``model_dirs``, reproducible between runs and machines.
model_dirs = sorted(
    (d for d in outputs_root.iterdir() if d.is_dir() and d.name != "datasets"),
    key=lambda d: d.name,
)

if not model_dirs:
    raise RuntimeError("No model result folders found inside 'outputs/'")

print("Models found:", ", ".join(d.name for d in model_dirs))


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 3 – Load predictions per split 🗄️        ║
# ╚════════════════════════════════════════════════╝
# Nested mapping filled below: {model name: {split name: DataFrame}}.
# Each DataFrame has one row per prediction file, with the columns
# "id" (file stem), "ground_truth" and "prediction".
all_predictions = defaultdict(dict)

# The only prediction values that count as a parsed screening decision.
# Anything else ("ParseError", a missing field, an unexpected string, ...)
# is reported as unparsable, matching how the metrics cell filters rows.
valid_labels = ["Included", "Excluded"]

for mdir in model_dirs:
    for split in ("train", "test"):
        preds_dir = mdir / split / "predictions"
        if not preds_dir.exists():
            continue  # no predictions folder for this model/split

        rows = []
        # ``glob`` order depends on the filesystem; sorting makes the row
        # order of the resulting DataFrame deterministic.
        for jf in sorted(preds_dir.glob("*.json")):
            with open(jf, encoding="utf-8") as f:
                data = json.load(f)
            rows.append({
                "id"          : jf.stem,
                # ``.get`` leaves None when a key is absent from the file.
                "ground_truth": data.get("ground_truth"),
                "prediction"  : data.get("prediction"),
            })

        if not rows:
            continue  # folder exists but holds no *.json files

        df = pd.DataFrame(rows)
        all_predictions[mdir.name][split] = df

        # Count every row that is not a valid decision, not only the literal
        # "ParseError" marker, so this number agrees with ``n_unparsed``.
        unparsable = (~df["prediction"].isin(valid_labels)).sum()
        print(f"{mdir.name} [{split}] -> {len(df):,} rows, {unparsable} unparsable")


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 4 – Build metrics table 📋               ║
# ╚════════════════════════════════════════════════╝
# One dict per (model, split) that has at least one parsable prediction.
metrics = []

for model, split_dict in all_predictions.items():
    for split, df in split_dict.items():
        # Score only rows whose prediction is an actual decision; everything
        # else is tallied in ``n_unparsed`` and excluded from the metrics.
        parsable = df[df["prediction"].isin(["Included", "Excluded"])]
        unparsed = len(df) - len(parsable)

        # NOTE(review): a split with zero parsable rows is dropped from the
        # table entirely (it cannot be scored) - confirm that is intended.
        if parsable.empty:
            continue

        y_true = parsable["ground_truth"]
        y_pred = parsable["prediction"]

        metrics.append({
            "model"      : model,
            "split"      : split,
            "n_total"    : len(df),
            "n_unparsed" : unparsed,
            "accuracy"   : accuracy_score(y_true, y_pred),
            # "Included" is the positive class for the binary metrics.
            "precision"  : precision_score(y_true, y_pred, pos_label="Included"),
            "recall"     : recall_score(y_true, y_pred, pos_label="Included"),
            "f1"         : f1_score(y_true, y_pred, pos_label="Included"),
        })

# Passing the columns explicitly keeps ``set_index`` working when ``metrics``
# is empty; a column-less frame would raise KeyError on "model"/"split".
metric_columns = ["model", "split", "n_total", "n_unparsed",
                  "accuracy", "precision", "recall", "f1"]

# Indexed by (model, split) and ordered by model, then split.
metrics_df = (pd.DataFrame(metrics, columns=metric_columns)
              .set_index(["model", "split"])
              .sort_index())

# Last expression of the cell: the notebook renders this styled table with
# the score columns shown to three decimals.
metrics_df.style.format({
    "accuracy" : "{:.3f}",
    "precision": "{:.3f}",
    "recall"   : "{:.3f}",
    "f1"       : "{:.3f}"
})


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 5 – Bar charts for each split 📊         ║
# ╚════════════════════════════════════════════════╝
# One figure per split with three side-by-side bar charts (one bar per model):
# accuracy, F1-score, and the share of predictions that could not be parsed.
for split in ("train", "test"):
    # ``xs`` raises KeyError when the requested label is missing from the
    # index level, so test membership first and skip splits with no results.
    if split not in metrics_df.index.get_level_values("split"):
        continue

    # Rows for this split only; the remaining index is the model name.
    subset = metrics_df.xs(split, level="split")

    fig, axes = plt.subplots(1, 3, figsize=(16, 4))

    subset["accuracy"].plot(kind="bar", ax=axes[0], color="mediumseagreen")
    axes[0].set_title(f"Accuracy ({split})")
    axes[0].set_ylim(0, 1)

    subset["f1"].plot(kind="bar", ax=axes[1], color="dodgerblue")
    axes[1].set_title(f"F1-score ({split})")
    axes[1].set_ylim(0, 1)

    # Scale to a real percentage so the axis matches the "Unparsed %" title.
    unparsed_pct = 100 * subset["n_unparsed"] / subset["n_total"]
    unparsed_pct.plot(kind="bar", ax=axes[2], color="indianred")
    axes[2].set_title(f"Unparsed % ({split})")
    axes[2].set_ylim(0, 100)

    fig.suptitle(f"Model comparison on {split} split")
    fig.tight_layout()
    plt.show()


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 6 – Confusion matrices 🔲                ║
# ╚════════════════════════════════════════════════╝
# Class order shared by the matrix rows/columns and the heat-map tick labels.
class_order = ["Included", "Excluded"]

# For every (model, split): draw the confusion matrix of the parsable
# predictions, then print the per-class classification report.
for model, split_dict in all_predictions.items():
    for split, df in split_dict.items():
        # Keep only rows where the model produced one of the two decisions.
        is_decision = df["prediction"].isin(class_order)
        parsable = df.loc[is_decision]
        if len(parsable) == 0:
            continue

        y_true, y_pred = parsable["ground_truth"], parsable["prediction"]

        # Rows are actual classes, columns are predicted classes.
        cm = confusion_matrix(y_true, y_pred, labels=class_order)

        plt.figure(figsize=(4, 3))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",  # cells hold integer counts
            cmap="Purples",
            xticklabels=class_order,
            yticklabels=class_order,
        )
        plt.title(f"Confusion Matrix - {model} ({split})")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.tight_layout()
        plt.show()

        print(f"Classification report for {model} ({split})")
        report = classification_report(y_true, y_pred, digits=3)
        print(report)


## ✔️ Evaluation complete  
You now have separate metrics, bar charts, and confusion matrices for the train and test splits of every model folder found under **outputs/**.