# Baseline vs Ablations: Check Impact Analysis

This notebook analyzes the baseline experiment results against ablation runs where certain checks were disabled. It focuses on how the outcome of the same experiments changed when the following checks were turned off:

- no_lint
- no_tests
- no_playwright

Data sources are CSV files in `analysis/` with a common schema:
- Case identifier and Assignee
- AB-xx check columns with values in {PASS, WARN, FAIL, NA}
- Aggregate columns: `PASS#`, `WARN#`, `FAIL#`, `PTS`
- Free-text `Notes`

We will:
- Load and clean the datasets
- Summarize the baseline
- Compare each ablation to baseline with per-case deltas and aggregate trends
- Visualize mean deltas and surface the largest regressions and improvements


In [1]:
from __future__ import annotations

import os
from pathlib import Path
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Display options: widen text columns so long free-text cells are not truncated.
pd.set_option("display.max_colwidth", 200)

# Directory holding the exported CSVs. The original absolute path remains the
# default so existing runs are unaffected; set ANALYSIS_DATA_DIR to point the
# notebook at another checkout. An empty value falls back to the default.
DATA_DIR = Path(
    os.environ.get("ANALYSIS_DATA_DIR") or "/home/eugenek/app.build-neurips25/analysis"
)
FILES = {
    "baseline": DATA_DIR / "app.build-neurips25 - baseline.csv",
    "no_lint": DATA_DIR / "app.build-neurips25 - ablations_no_lint.csv",
    "no_playwright": DATA_DIR / "app.build-neurips25 - ablations_no_playwright.csv",
    "no_tests": DATA_DIR / "app.build-neurips25 - ablations_no_tests.csv",
}

# Check columns that get scored. There is no AB-05 entry in this list.
# NOTE(review): the AB-06 / AB-07 names below are spelled with a non-ASCII
# hyphen character rather than "-". Presumably this mirrors the CSV headers
# exactly; confirm against the files before normalising, because a mismatch
# makes the column silently score as all-NaN.
AB_COLUMNS = [
    "AB-01 Boot",
    "AB-02 Prompt",
    "AB-03 Create",
    "AB-04 View/Edit",
    "AB‑06 Clickable Sweep",
    "AB‑07 Performance >75",
]

# Aggregate columns that are coerced to numeric on load.
AGG_COLUMNS = ["PASS#", "WARN#", "FAIL#", "PTS"]
# Column used to identify a case and to join runs together.
KEY_COLUMN = "Case"

# Row order used when tabulating status counts.
STATUS_ORDER = ["FAIL", "WARN", "NA", "PASS"]
# Numeric value of each status; NA maps to NaN so it is skipped by NaN-aware means.
STATUS_TO_SCORE = {"FAIL": 0, "WARN": 0.5, "NA": np.nan, "PASS": 1.0}


def load_csv(
    path: Path,
    key_column: str | None = None,
    agg_columns: List[str] | None = None,
) -> pd.DataFrame:
    """Read one results CSV and normalise its headers, aggregates and case key.

    Args:
        path: Location of the CSV (anything ``pd.read_csv`` accepts).
        key_column: Name of the case-identifier column; defaults to ``KEY_COLUMN``.
        agg_columns: Columns to coerce to numeric; defaults to ``AGG_COLUMNS``.

    Returns:
        A DataFrame with stripped column names, numeric aggregate columns
        (unparseable cells become NaN), a ``PTS`` column (all-NaN if the file
        had none) and a stripped string key column. Rows without a key are
        dropped and the index is reset.

    Raises:
        KeyError: If the key column is not present after header stripping.
    """
    key = KEY_COLUMN if key_column is None else key_column
    numeric_cols = AGG_COLUMNS if agg_columns is None else list(agg_columns)

    # pandas' default NA sentinels include the literal "NA", which is a valid
    # status value in these files. Treat only blank cells as missing so "NA"
    # survives as text.
    df = pd.read_csv(path, keep_default_na=False, na_values=[""])

    # Unify column names (strip surrounding spaces).
    df.columns = [str(c).strip() for c in df.columns]
    if key not in df.columns:
        raise KeyError(f"{path}: expected a {key!r} column, found {list(df.columns)}")

    # Ensure aggregate columns are numeric; bad cells become NaN.
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    # Ensure PTS exists so downstream code can always select it.
    if "PTS" not in df.columns:
        df["PTS"] = np.nan

    # Key normalisation. Rows with no key cannot be matched across runs, and
    # casting them to str would create a bogus "nan" case that joins between
    # files, so drop them before the cast.
    df = df[df[key].notna()].copy()
    df[key] = df[key].astype(str).str.strip()
    df = df[df[key] != ""].reset_index(drop=True)
    return df


def score_status_columns(
    df: pd.DataFrame,
    ab_columns: List[str] | None = None,
    status_to_score: Dict[str, float] | None = None,
) -> pd.DataFrame:
    """Return a copy of ``df`` with numeric ``<check>__score`` columns and their row mean.

    Args:
        df: Frame holding textual status columns.
        ab_columns: Check columns to score; defaults to ``AB_COLUMNS``.
        status_to_score: Status-to-number mapping; defaults to ``STATUS_TO_SCORE``.
            Keys are matched case-insensitively after stripping whitespace.

    Returns:
        A new DataFrame (the input is not modified) with one ``{col}__score``
        column per requested check and an ``AB_mean_score`` column holding the
        NaN-skipping mean of those scores. Checks absent from ``df`` and
        statuses not found in the mapping score as NaN.
    """
    checks = AB_COLUMNS if ab_columns is None else list(ab_columns)
    mapping = STATUS_TO_SCORE if status_to_score is None else status_to_score
    # Normalise mapping keys the same way as the cell values so the lookup is
    # insensitive to stray whitespace or casing in the spreadsheet export.
    lookup = {str(status).strip().upper(): value for status, value in mapping.items()}

    out = df.copy()
    score_cols = []
    for col in checks:
        score_col = f"{col}__score"
        score_cols.append(score_col)
        if col in out.columns:
            # Without normalisation, " PASS" or "Pass" would silently map to
            # NaN and be excluded from the mean.
            statuses = out[col].astype(str).str.strip().str.upper()
            out[score_col] = statuses.map(lookup).astype(float)
        else:
            out[score_col] = np.nan
    out["AB_mean_score"] = out[score_cols].mean(axis=1, skipna=True)
    return out


# Read every run from disk, then attach numeric scores to each frame
raw: Dict[str, pd.DataFrame] = {}
for run_name, csv_path in FILES.items():
    raw[run_name] = load_csv(csv_path)

scored: Dict[str, pd.DataFrame] = {
    run_name: score_status_columns(frame) for run_name, frame in raw.items()
}

# Keep only rows whose case also appears in the baseline, so every run is
# compared over the baseline's set of cases
baseline_cases = set(scored["baseline"][KEY_COLUMN])
scored = {
    run_name: frame.loc[frame[KEY_COLUMN].isin(baseline_cases)].reset_index(drop=True)
    for run_name, frame in scored.items()
}

# Sanity check: number of baseline cases and rows retained per run
rows_per_run = {run_name: len(frame) for run_name, frame in scored.items()}
len(baseline_cases), rows_per_run


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Baseline summary
base = scored["baseline"].copy()

# Aggregate means. AGG_COLUMNS already contains "PTS", so it is not appended a
# second time (doing so yields a duplicated PTS column in the summary). Columns
# missing from the file are skipped rather than raising.
summary_columns = [
    col
    for col in dict.fromkeys(AGG_COLUMNS + ["AB_mean_score"])
    if col in base.columns
]
baseline_agg = base[summary_columns].mean(numeric_only=True).to_frame("mean").T

# Status counts per AB column. value_counts() ignores NaN, and read_csv's
# default NA handling parses the literal "NA" status as NaN, so missing cells
# are relabelled "NA" first; otherwise the NA row is always zero. Genuinely
# blank cells are therefore counted as NA as well.
status_counts = {
    col: (
        base[col]
        .fillna("NA")
        .value_counts()
        .reindex(STATUS_ORDER, fill_value=0)
        .astype(int)
    )
    for col in AB_COLUMNS
    if col in base.columns
}
status_counts_df = pd.DataFrame(status_counts).T

print("Baseline aggregate means (PASS#/WARN#/FAIL#/PTS, AB_mean_score):")
display(baseline_agg)
print("Baseline status counts per check:")
display(status_counts_df)


In [None]:
# Compute deltas vs baseline for each ablation

def compute_deltas_vs_baseline(
    baseline: pd.DataFrame,
    variant: pd.DataFrame,
    label: str,
    ab_columns: List[str] | None = None,
    key_column: str | None = None,
) -> pd.DataFrame:
    """Join a variant run to the baseline per case and add variant-minus-baseline deltas.

    Args:
        baseline: Scored baseline frame (needs the key, ``AB_mean_score``,
            ``PTS`` and one ``{check}__score`` column per check).
        variant: Scored ablation frame with the same columns.
        label: Ablation name; used as the column suffix for variant values and
            for the ``delta_*`` columns.
        ab_columns: Checks to compare; defaults to ``AB_COLUMNS``.
        key_column: Join key; defaults to ``KEY_COLUMN``.

    Returns:
        One row per case present in both frames, holding ``*_base`` and
        ``*_{label}`` values plus ``delta_AB_mean_score_{label}``,
        ``delta_PTS_{label}`` and ``delta_{check}__score_{label}`` columns.
        Negative deltas mean the variant scored lower than the baseline.

    Raises:
        ValueError: If ``label`` is ``"base"`` (it would collide with the
            baseline suffix), or, via ``pandas.errors.MergeError``, if a case
            key is duplicated in either frame.
    """
    if label == "base":
        raise ValueError("label 'base' collides with the baseline column suffix")

    checks = AB_COLUMNS if ab_columns is None else list(ab_columns)
    key = KEY_COLUMN if key_column is None else key_column
    metrics = ["AB_mean_score", "PTS"] + [f"{check}__score" for check in checks]
    keep = [key] + metrics

    # validate="one_to_one" turns duplicated case keys into an explicit error;
    # an unvalidated merge would multiply the matching rows and quietly skew
    # every mean delta computed from the result.
    merged = baseline[keep].merge(
        variant[keep],
        on=key,
        suffixes=("_base", f"_{label}"),
        validate="one_to_one",
    )

    # Per-case deltas, in the same order as the metric columns.
    delta_columns = {
        f"delta_{metric}_{label}": merged[f"{metric}_{label}"] - merged[f"{metric}_base"]
        for metric in metrics
    }
    return merged.assign(**delta_columns)


# Per-case comparison of every ablation against the same baseline snapshot
base_df = scored["baseline"].copy()
deltas = {
    ablation: compute_deltas_vs_baseline(base_df, scored[ablation], ablation)
    for ablation in ("no_lint", "no_tests", "no_playwright")
}

# Collapse each per-case table into a single row of mean deltas
mean_deltas = [
    {
        "ablation": ablation,
        "mean_delta_AB_mean_score": frame[f"delta_AB_mean_score_{ablation}"].mean(),
        "mean_delta_PTS": frame[f"delta_PTS_{ablation}"].mean(),
        **{
            f"mean_delta_{check}__score": frame[f"delta_{check}__score_{ablation}"].mean()
            for check in AB_COLUMNS
        },
    }
    for ablation, frame in deltas.items()
]

mean_deltas_df = pd.DataFrame(mean_deltas)
mean_deltas_df


In [None]:
# Visualization: mean deltas and worst-case drops
sns.set_theme(style="whitegrid")

# Number of worst cases listed per ablation and metric.
TOP_N = 5

# One bar chart per metric: (column to plot, panel title, y-axis label).
panels = [
    (
        "mean_delta_AB_mean_score",
        "Mean Δ AB_mean_score vs baseline",
        "Δ score (PASS=1, WARN=0.5, FAIL=0)",
    ),
    ("mean_delta_PTS", "Mean Δ PTS vs baseline", "Δ PTS"),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (column, title, ylabel) in zip(axes, panels):
    # seaborn 0.13 deprecates `palette` without `hue`; colouring by the x
    # variable keeps the same look. dodge=False keeps full-width bars on older
    # seaborn releases, and the redundant legend is removed if one was drawn.
    sns.barplot(
        data=mean_deltas_df,
        x="ablation",
        y=column,
        hue="ablation",
        palette="Set2",
        dodge=False,
        ax=ax,
    )
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.axhline(0, color="gray", linewidth=1)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
plt.tight_layout()
plt.show()

# Identify worst-case drops per ablation by AB_mean_score and PTS.
# Rows whose delta is NaN are excluded so they cannot pad the "worst" tables,
# and .assign() returns a new frame instead of writing into a slice.
worst_rows = []
for label, df in deltas.items():
    score_col = f"delta_AB_mean_score_{label}"
    pts_col = f"delta_PTS_{label}"
    row_score = (
        df.dropna(subset=[score_col])
        .sort_values(score_col)
        .head(TOP_N)[[KEY_COLUMN, score_col]]
        .assign(ablation=label)
    )
    row_pts = (
        df.dropna(subset=[pts_col])
        .sort_values(pts_col)
        .head(TOP_N)[[KEY_COLUMN, pts_col]]
        .assign(ablation=label)
    )
    worst_rows.append((label, row_score, row_pts))

# Show the TOP_N worst Δ by AB_mean_score and PTS for each ablation
for label, w_score, w_pts in worst_rows:
    display(pd.DataFrame({"ablation": [label]}))
    display(w_score.rename(columns={f"delta_AB_mean_score_{label}": "delta_AB_mean_score"}))
    display(w_pts.rename(columns={f"delta_PTS_{label}": "delta_PTS"}))
