# Scan Results Analysis

Compares scanner outputs across all runs in `core_bench/scan-results/`, with a focus on **validation agreement** — how well the scanner's numeric predictions match the hand-labeled targets stored in each `_scan.json`.

In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from inspect_scout import scan_results_df
from matplotlib.patches import Patch

# Root directory that holds one `scan_id=*` sub-directory per scan run.
# The path is relative, so it is resolved against the notebook's working directory.
SCAN_ROOT = Path("scan-results")

## 1. Load all scans

In [None]:
# Each scan run lives in its own `scan_id=<id>` directory under SCAN_ROOT.
scan_dirs = sorted(SCAN_ROOT.glob("scan_id=*"))
if not scan_dirs:
    # Fail early with an actionable message; otherwise the sort_values call
    # below dies with an opaque KeyError: 'timestamp' on an empty frame.
    raise FileNotFoundError(
        f"No 'scan_id=*' directories found under {SCAN_ROOT.resolve()}"
    )

scan_meta = []   # one summary record per scan, read from its _scan.json
scan_dfs  = {}   # {scan_id: {scanner_name: df}}

for d in scan_dirs:
    # Directory names look like "scan_id=<id>"; keep everything after the first "=".
    scan_id = d.name.split("=", 1)[1]
    # JSON is read explicitly as UTF-8 so the result does not depend on the platform locale.
    meta = json.loads((d / "_scan.json").read_text(encoding="utf-8"))
    scan_meta.append({
        "scan_id":    scan_id,
        "timestamp":  meta["timestamp"],
        "model":      meta["model"]["model"],
        "scanners":   list(meta["scanners"].keys()),
        # Names listed under the optional "validation" key (empty list when absent).
        "has_validation": list(meta.get("validation", {}).keys()),
    })
    r = scan_results_df(str(d))
    scan_dfs[scan_id] = r.scanners

# One row per scan, ordered by timestamp.
meta_df = pd.DataFrame(scan_meta).sort_values("timestamp").reset_index(drop=True)
meta_df

## 2. Combine all scanner results into one DataFrame

In [None]:
# Flatten {scan_id: {scanner_name: df}} into a flat list of per-scanner frames.
all_rows = [
    frame
    for scanners in scan_dfs.values()
    for frame in scanners.values()
]

combined = pd.concat(all_rows, ignore_index=True)

# Numeric view of the raw answer; values that cannot be parsed become NaN.
combined["answer_float"] = pd.to_numeric(combined["answer"], errors="coerce")
# Boolean flag: True only where validation_result equals the string "true".
combined["valid"] = combined["validation_result"].eq("true")

print(f"Total rows: {len(combined)}")
print("Scanner names:", combined["scanner_name"].unique().tolist())

preview_cols = [
    "scan_id", "scanner_name", "transcript_id", "answer_float",
    "validation_target", "validation_result",
]
combined[preview_cols].head(8)

## 3. Validation overview per scan × scanner

In [None]:
# Keep the rows that carry a validation target and coerce that target to a number
# (non-numeric targets become NaN but the rows are retained).
val = combined[combined["validation_target"].notna()].assign(
    validation_target=lambda d: pd.to_numeric(d["validation_target"], errors="coerce")
)

# Per (scan, scanner): how many rows were validated and how many were judged correct.
summary = (
    val.groupby(["scan_id", "scanner_name"])["valid"]
    .agg(n_validated="count", n_correct="sum")
)
summary["accuracy"] = summary["n_correct"] / summary["n_validated"]
summary = summary.reset_index()
summary

## 4. Scanner answer vs. validation target — scatter / strip plot

In [None]:
# Focus on the answer_format scanner, which has numeric targets.
af = val[val["scanner_name"] == "answer_format"].copy()

# One panel per scan; groupby yields the scans in sorted scan_id order.
panels = list(af.groupby("scan_id"))

# Shared axis range for both axes, so the perfect-agreement line is a true diagonal.
# NOTE(review): assumes answers/targets fall in 0..3 — points outside are not shown; confirm.
lim = [-0.5, 3.5]

if not panels:
    # plt.subplots(1, 0) would raise; say why there is nothing to draw instead.
    print("No validated answer_format rows to plot.")
else:
    # squeeze=False always returns a 2-D array of Axes, so the single-scan
    # case needs no special handling.
    fig, axes = plt.subplots(
        1, len(panels), figsize=(5 * len(panels), 4), sharey=True, squeeze=False
    )
    outcome_handles = [
        Patch(color="steelblue", label="correct"),
        Patch(color="tomato", label="wrong"),
    ]

    for ax, (sid, grp) in zip(axes.ravel(), panels):
        # Small horizontal jitter so identical (target, answer) pairs do not overlap.
        # The generator is re-seeded per panel, so every panel uses the same offsets.
        jitter = np.random.default_rng(0).uniform(-0.15, 0.15, len(grp))
        colors = grp["valid"].map({True: "steelblue", False: "tomato"})
        ax.scatter(
            grp["validation_target"] + jitter, grp["answer_float"],
            c=colors, alpha=0.7, edgecolors="k", linewidths=0.4,
        )
        # Perfect-agreement diagonal; keep the handle so it can be listed in the legend
        # (explicit legend handles would otherwise hide its label).
        (diagonal,) = ax.plot(lim, lim, "k--", lw=0.8, alpha=0.5, label="perfect")
        ax.set_xlim(lim)
        ax.set_ylim(lim)
        ax.set_xlabel("Validation target")
        ax.set_ylabel("Scanner answer")
        ax.set_title(f"scan_id=...{sid[-6:]}")
        ax.legend(handles=[*outcome_handles, diagonal], fontsize=8)

    fig.suptitle("answer_format: scanner prediction vs. validation target", y=1.01)
    plt.tight_layout()
    plt.show()

## 5. Confusion matrix (answer_format, scan with the most validation entries)

In [None]:
# Use the scan with the most validation entries
af_summary = summary[summary["scanner_name"] == "answer_format"]
if af_summary.empty:
    # .iloc[0] below would raise a bare IndexError; explain what is missing instead.
    raise ValueError("No validated answer_format results found in any scan")
best_scan = af_summary.sort_values("n_validated", ascending=False).iloc[0]["scan_id"]

# Rows of that scan where both the prediction and the target are numeric.
grp = af[af["scan_id"] == best_scan].dropna(subset=["answer_float", "validation_target"])

# Round both sides the same way so a target and a prediction with the same
# fractional value always map to the same integer class.
y_true = grp["validation_target"].round().astype(int)
y_pred = grp["answer_float"].round().astype(int)

# Every class that appears as either a target or a prediction.
labels = sorted(set(y_true) | set(y_pred))

# Reindex to the full label set so the matrix is square even when a class is
# never predicted (or never the true label); missing cells are zero counts.
cm = (
    pd.crosstab(y_true, y_pred, rownames=["True"], colnames=["Predicted"])
    .reindex(index=labels, columns=labels, fill_value=0)
    .rename_axis(index="True", columns="Predicted")
)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title(f"Confusion matrix — answer_format\n(scan ...{best_scan[-6:]})")
plt.tight_layout()
plt.show()

# Per-class precision/recall computed manually
rows = []
for lbl in labels:
    tp = ((y_true == lbl) & (y_pred == lbl)).sum()
    fp = ((y_true != lbl) & (y_pred == lbl)).sum()
    fn = ((y_true == lbl) & (y_pred != lbl)).sum()
    # A metric is reported as NaN when its denominator is zero.
    prec = tp / (tp + fp) if (tp + fp) else float("nan")
    rec  = tp / (tp + fn) if (tp + fn) else float("nan")
    f1   = 2 * prec * rec / (prec + rec) if (prec + rec) else float("nan")
    rows.append({"label": lbl, "support": int((y_true == lbl).sum()),
                 "precision": round(prec, 3), "recall": round(rec, 3), "f1": round(f1, 3)})
report = pd.DataFrame(rows).set_index("label")
print(report)
print(f"\nOverall accuracy: {(y_true == y_pred).mean():.3f}")

## 6. Validation accuracy vs. transcript outcome

Do scanner errors cluster on transcripts where the agent succeeded (`C`) or failed (`I`)?

In [None]:
# answer_format rows of the scan selected for the confusion matrix.
af_best = af[af["scan_id"] == best_scan].copy()

# Count scanner-correct / scanner-wrong rows per agent transcript score.
cross = (
    af_best.groupby(["transcript_score", "valid"])
    .size()
    .unstack("valid", fill_value=0)
    # Guarantee both outcome columns exist: if the scanner was always right
    # (or always wrong) unstack yields a single column and the lookups below
    # would raise KeyError.
    .reindex(columns=[False, True], fill_value=0)
    .rename(columns={True: "scanner correct", False: "scanner wrong"})
)
cross["accuracy"] = cross["scanner correct"] / (cross["scanner correct"] + cross["scanner wrong"])
print(cross)

cross[["scanner correct", "scanner wrong"]].plot(
    kind="bar", stacked=True, color=["steelblue", "tomato"],
    figsize=(5, 3), title="Scanner validation accuracy by agent outcome",
    xlabel="Agent transcript score (C=correct, I=incorrect)",
    ylabel="Count"
)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

## 7. Multi-scan comparison — accuracy over time

In [None]:
# Attach each scan's timestamp to its answer_format summary row, then order by timestamp.
ts_map = meta_df.set_index("scan_id")["timestamp"]
af_rows = summary[summary["scanner_name"] == "answer_format"]
plot_data = af_rows.assign(timestamp=af_rows["scan_id"].map(ts_map)).sort_values("timestamp")

bar_positions = range(len(plot_data))
# Tick label: last six characters of the scan id over the first ten characters of the timestamp.
tick_labels = [
    f"...{sid[-6:]}\n{ts[:10]}"
    for sid, ts in zip(plot_data["scan_id"], plot_data["timestamp"])
]

fig, ax = plt.subplots(figsize=(7, 3))
ax.bar(bar_positions, plot_data["accuracy"], color="steelblue")
ax.set_xticks(bar_positions)
ax.set_xticklabels(tick_labels, fontsize=8)
# Reference line at perfect accuracy.
ax.axhline(1.0, color="gray", linestyle="--", lw=0.8)
ax.set_ylim(0, 1.1)
ax.set_ylabel("Validation accuracy")
ax.set_title("answer_format validation accuracy across scan runs")

# Annotate every bar with its accuracy and the number of validated rows.
for pos, row in enumerate(plot_data.itertuples(index=False)):
    ax.text(pos, row.accuracy + 0.02, f"{row.accuracy:.2f}\n(n={row.n_validated})",
            ha="center", fontsize=8)

plt.tight_layout()
plt.show()

## 8. Error analysis — which transcripts are consistently mis-predicted?

In [None]:
def _join_unique(values):
    """Return the distinct values, sorted and joined with ', '."""
    return ", ".join(sorted(set(values)))


# answer_format rows (with a validation target) that failed validation.
error_rows = af.loc[~af["valid"]].copy()

# Describe each miss as "<rounded prediction> → <target>".
pred_str = error_rows["answer_float"].round().astype("Int64").astype(str)
target_str = error_rows["validation_target"].astype("Int64").astype(str)
error_rows["error"] = pred_str + " → " + target_str

# Per transcript: number of distinct scans it fails in, plus the distinct error kinds seen.
fail_counts = (
    error_rows.groupby("transcript_id")
    .agg(
        fail_in_n_scans=("scan_id", "nunique"),
        errors=("error", _join_unique),
        transcript_score=("transcript_score", "first"),
        validation_target=("validation_target", "first"),
    )
    .sort_values("fail_in_n_scans", ascending=False)
    .reset_index()
)
fail_counts