# Inspect misclassified examples - training split

This notebook gathers all misclassified items from the chosen model and split, joins them with their titles and abstracts, displays an interactive table, and writes a clean PDF report for human review.

**Steps**

1. Set `MODEL_NAME` in the Parameters cell (Cell 2) if it differs from the default.
2. Run all cells from top to bottom.
3. The PDF will appear in `outputs/<MODEL_NAME>/train/misclassified/misclassified_report.pdf`.

The notebook installs *reportlab* automatically if it is not already present.

In [1]:
# Cell 1 – Imports and package check
import subprocess, sys

def pip_install(pkg, import_name=None):
    """Install ``pkg`` with pip unless it can already be imported.

    Parameters
    ----------
    pkg : str
        Name passed to ``pip install``.
    import_name : str, optional
        Module name used for the import probe. Defaults to ``pkg``; pass it
        when the distribution name and the importable name differ.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    """
    import importlib  # local import keeps this bootstrap helper self-contained

    try:
        importlib.import_module(import_name or pkg)
    except ImportError:
        # Run pip through the current interpreter so the install targets
        # the same environment this code is running in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # Let the import system notice files created after it started.
        importlib.invalidate_caches()

# Must run before the `from reportlab ...` imports further down in this cell,
# so that reportlab is installed by the time they execute.
pip_install("reportlab")

import json, shutil
from pathlib import Path
from xml.sax.saxutils import escape

import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
print("Libraries ready")

Libraries ready


In [2]:
# Cell 2 – Parameters
# ---- User-tunable settings --------------------------------------------------
MODEL_NAME = "gpt-4.1"   # model whose outputs are inspected; change if needed
SPLIT = "train"          # this notebook targets the 'train' split

# ---- Output folders: outputs/<MODEL_NAME>/<SPLIT>/<kind> --------------------
base_dir = Path("outputs")
predictions_dir, misclassified_dir, unparsed_dir = (
    base_dir.joinpath(MODEL_NAME, SPLIT, kind)
    for kind in ("predictions", "misclassified", "unparsed")
)

# ---- Input dataset: <parent of cwd>/datasets/<SPLIT>_dataset.csv ------------
DATASET_DIR = Path.cwd().parent.joinpath("datasets")
dataset_path = DATASET_DIR.joinpath(f"{SPLIT}_dataset.csv")

# Fail fast when a required input is absent (the dataset is checked first).
if not dataset_path.exists():
    raise FileNotFoundError(f"Dataset not found: {dataset_path}")
if not predictions_dir.exists():
    raise FileNotFoundError(f"Folder not found: {predictions_dir}")

print("Dataset file:", dataset_path.resolve())

Dataset file: C:\Users\MILORTIE\Git repositories\padAIwan\datasets\train_dataset.csv


In [3]:
# Cell 2.5 - Build the misclassified folder on first run
#
# Copies every prediction file whose predicted domain differs from the
# ground-truth domain into `misclassified_dir`, skipping files whose name also
# appears in `unparsed_dir`. The copy only runs when `misclassified_dir` does
# not exist yet; an existing folder is reused untouched.

# Always bind the exclusion set: it is simply empty when there is no unparsed
# folder, so the membership test below can never hit an undefined name.
unparsed_files = (
    {fp.name for fp in unparsed_dir.glob("*.json")} if unparsed_dir.exists() else set()
)

if not misclassified_dir.exists():
    misclassified_dir.mkdir()
    for fp in predictions_dir.glob("*.json"):
        if fp.name in unparsed_files:
            continue  # excluded before its content is read
        with open(fp, encoding="utf-8") as f:
            data = json.load(f)
        # `or {}` tolerates a missing or null "ground_truth" / "prediction" entry.
        truth = (data.get("ground_truth") or {}).get("domain")
        predicted = (data.get("prediction") or {}).get("domain")
        if truth != predicted:
            shutil.copy(fp, misclassified_dir / fp.name)

print("Misclassified folder:", misclassified_dir.resolve())

Misclassified folder: C:\Users\MILORTIE\Git repositories\padAIwan\all_class_files\outputs\gpt-4.1\train\misclassified


In [4]:
# Cell 3 – Load misclassified JSON files
#
# Builds `mis_df` with one row per JSON file in `misclassified_dir`. The file
# stem becomes the `id` column that the later merge with the dataset keys on.
rows = []
# Sorted so the row order (and therefore the report order) is the same on every run.
for fp in sorted(misclassified_dir.glob("*.json")):
    with open(fp, encoding="utf-8") as f:
        data = json.load(f)

    # `or {}` tolerates a missing or null "ground_truth" / "prediction" entry.
    truth = data.get("ground_truth") or {}
    predicted = data.get("prediction") or {}
    rows.append({
        "id": fp.stem,
        "ground_truth": truth.get("domain"),
        "prediction": predicted.get("domain"),
        "rationale": predicted.get("domain_rationale")
    })

# Explicit columns keep the frame mergeable on "id" even when no files were found.
mis_df = pd.DataFrame(rows, columns=["id", "ground_truth", "prediction", "rationale"])
print("Loaded misclassified rows:", len(mis_df))

Loaded misclassified rows: 149


In [5]:
# Cell 4 – Merge with title and abstract
# Read only the join key and the two text columns.
dataset_df = pd.read_csv(
    dataset_path,
    usecols=["id", "Title", "abstract"],
)

# Left join: every misclassified row is kept, whether or not the dataset has a match.
report_df = pd.merge(mis_df, dataset_df, how="left", on="id")

# Count the rows that ended up without an abstract and flag them.
missing = report_df["abstract"].isnull().sum()
if missing > 0:
    print(f"Warning, {missing} abstracts missing after merge")
report_df.head()

Unnamed: 0,id,ground_truth,prediction,rationale,Title,abstract
0,0423fa8b-a657-4ced-9074-7b355368c790,Social,,,Examining tobacco use among gender minority yo...,Background/Issue(s) and Objectives\nAchieving ...
1,0532012f-fe0c-40f9-bbb4-5303082cbc98,Social,Behavioural,"The research investigates ""behaviours, motivat...",Exploratory Focus Group Research on Smoking Ce...,Exploratory Research on Smoking Cessation - Ex...
2,058e0486-1ed1-4ff4-9556-7310f2b25b0d,Social,Implementation,The primary focus is on assessing and piloting...,Regulatory Decision-Making in Canada: Explorin...,OBJECTIVES/BACKGROUND/ISSUE(S): Regulators aro...
3,05a86813-b938-434b-bde6-8d05a0bf2697,Social,Implementation,"The report ""investigates measures to effective...",Health of Canadians in a changing climate: Adv...,The Climate Change and Innovation Bureau (CCIB...
4,06672431-d568-4717-89f8-1b2f76d32ad6,Social,Behavioural,"The major focus is on «patterns of use», «perc...",Canadian Cannabis Vapig Survey,Canadian Cannabis Vaping Survey Executive Summ...


In [6]:
# Cell 5 – Optional preview in notebook
# Best effort: if importing `ace_tools` or calling its display helper fails for
# any reason, the failure is printed and the notebook carries on.
try:
    from ace_tools import display_dataframe_to_user
    display_dataframe_to_user("Misclassified examples", report_df)
except Exception as exc:
    print("Interactive display not available:", exc)

Interactive display not available: No module named 'ace_tools'


In [None]:
# Cell 6 – Create PDF report
#
# Writes one page per row of `report_df` to
# `<misclassified_dir>/misclassified_report.pdf`.
styles = getSampleStyleSheet()
normal = styles["Normal"]
bold = ParagraphStyle("Bold", parent=normal, fontName="Helvetica-Bold", fontSize=12, spaceAfter=6)

output_pdf = misclassified_dir / "misclassified_report.pdf"
doc = SimpleDocTemplate(str(output_pdf), pagesize=letter,
                        leftMargin=40, rightMargin=40, topMargin=40, bottomMargin=40)


def _pdf_text(value):
    """Return ``value`` as text that is safe to embed in a reportlab Paragraph.

    Paragraph interprets its input as XML-like markup, so ``&``, ``<`` and ``>``
    coming from the data are escaped. Line breaks become ``<br/>`` tags, and
    missing values (None / NaN) are shown as "N/A".
    """
    if not isinstance(value, str):
        if pd.isna(value):
            return "N/A"
        value = str(value)
    return escape(value).replace("\n", "<br/>")


total = len(report_df)
elements = []
# enumerate() numbers the pages independently of the DataFrame index.
for position, (_, row) in enumerate(report_df.iterrows(), start=1):
    heading = f"Example {position} of {total}"
    try:
        # Build the whole page first so a failure never leaves half a page behind.
        page = [
            Paragraph(heading, bold),
            Paragraph(f"<b>ID:</b> {_pdf_text(row['id'])}", normal),
            Paragraph(
                f"<b>Ground truth:</b> {_pdf_text(row['ground_truth'])}   "
                f"<b>Prediction:</b> {_pdf_text(row['prediction'])}",
                normal,
            ),
            Paragraph(f"<b>Rationale:</b> {_pdf_text(row['rationale'])}", normal),
            Spacer(1, 8),
            Paragraph(f"<b>Title:</b> {_pdf_text(row['Title'])}", normal),
            Spacer(1, 4),
            Paragraph(f"<b>Abstract:</b> {_pdf_text(row['abstract'])}", normal),
        ]
    except Exception as exc:
        # Report the cause and keep going; the page gets a visible placeholder.
        print("Error encountered with ", row["id"], "-", exc)
        page = [Paragraph(f"{heading} (could not be rendered)", bold)]
    elements.extend(page)
    elements.append(PageBreak())

doc.build(elements)
print("PDF saved to:", output_pdf.resolve())