# Human Evaluation Analysis: Baseline vs Ablations

This notebook analyzes the **human-evaluated results** from the app.build benchmark.

**Data Source**: Manual evaluation CSV files containing:
- Human assessments with PASS/WARN/FAIL/NA values for each AB check
- Evaluator assignments (A1/A2)
- Performance scores and detailed notes

**For automated test results**, see `automated_results_analysis.ipynb`.


## Data Loading from Human-Evaluated CSV Files

This section loads the human-evaluated CSV files, whose AB columns hold a PASS/WARN/FAIL/NA verdict for each check.


In [2]:
from __future__ import annotations

import os
from pathlib import Path
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Display options
pd.set_option("display.max_colwidth", 200)

# Locate the directory holding the human-evaluated CSV files. Resolution order:
#   1. the APP_BUILD_ANALYSIS_DIR environment variable (portable override),
#   2. the current working directory, when the baseline CSV is found there,
#   3. the original hard-coded checkout path (kept only as a backward-compatible fallback).
_BASELINE_CSV_NAME = "app.build-neurips25 - baseline.csv"
_LEGACY_ANALYSIS_DIR = Path("/Users/evgenii.kniazev/projects/app.build-neurips25/analysis")
_analysis_dir_override = os.environ.get("APP_BUILD_ANALYSIS_DIR")
if _analysis_dir_override:
    ANALYSIS_DIR = Path(_analysis_dir_override).expanduser()
elif (Path(".") / _BASELINE_CSV_NAME).exists():
    ANALYSIS_DIR = Path(".")
else:
    ANALYSIS_DIR = _LEGACY_ANALYSIS_DIR

# Map ablation names to their CSV files
FILES = {
    "baseline": ANALYSIS_DIR / _BASELINE_CSV_NAME,
    "no_lint": ANALYSIS_DIR / "app.build-neurips25 - ablations_no_lint.csv",
    "no_playwright": ANALYSIS_DIR / "app.build-neurips25 - ablations_no_playwright.csv",
    "no_tests": ANALYSIS_DIR / "app.build-neurips25 - ablations_no_tests.csv",
}

# Canonical AB columns (ASCII hyphen)
AB_COLUMNS = [
    "AB-01 Boot",
    "AB-02 Prompt",
    "AB-03 Create",
    "AB-04 View/Edit",
    "AB-06 Clickable Sweep",
    "AB-07 Performance >75",
]

# Dash look-alikes that can replace the ASCII hyphen in a column header:
# U+2010 hyphen, U+2011 non-breaking hyphen, U+2012 figure dash,
# U+2013 en dash, U+2014 em dash, U+2212 minus sign.
_DASH_VARIANTS = "\u2010\u2011\u2012\u2013\u2014\u2212"
_DROPPED_AB_COLUMN = "AB-05 UI Sweep"

# Normalize AB column names (unify hyphens and drop AB-05 if present).
# The table is generated so that every canonical column is covered for every
# dash variant; a value of None marks a column that must be dropped.
AB_NORMALIZE = {
    canonical.replace("-", dash, 1): canonical
    for canonical in AB_COLUMNS
    for dash in _DASH_VARIANTS
}
AB_NORMALIZE[_DROPPED_AB_COLUMN] = None
AB_NORMALIZE.update(
    {_DROPPED_AB_COLUMN.replace("-", dash, 1): None for dash in _DASH_VARIANTS}
)

AGG_COLUMNS = ["PASS#", "WARN#", "FAIL#"]
KEY_COLUMN = "Case"

STATUS_ORDER = ["FAIL", "WARN", "NA", "PASS"]  # ordered for ordinal mapping
STATUS_TO_SCORE = {"FAIL": 0, "WARN": 0.5, "NA": np.nan, "PASS": 1.0}


def normalize_ab_columns(
    df: pd.DataFrame,
    mapping: Dict[str, str | None] | None = None,
) -> pd.DataFrame:
    """Return a copy of ``df`` with AB column names unified and AB-05 removed.

    Each column is resolved in this order:

    1. an exact hit in the normalization table,
    2. otherwise, if the name starts with ``AB`` followed by a Unicode dash,
       that dash is folded to an ASCII hyphen and the folded name is looked up
       in the table (so variants missing from the table are still handled),
    3. otherwise the (possibly folded) name is kept as is.

    A table value of ``None`` means "drop this column". Columns that do not
    start with ``AB`` + dash are never touched unless listed in the table.

    Args:
        df: Frame whose column labels should be normalized. Not modified.
        mapping: Optional normalization table (name -> canonical name, or
            ``None`` to drop). Defaults to the module-level ``AB_NORMALIZE``.

    Returns:
        A new DataFrame with renamed / dropped columns.
    """
    table = AB_NORMALIZE if mapping is None else mapping
    # U+2010..U+2014 hyphen/dash family plus U+2212 minus sign.
    unicode_dashes = "\u2010\u2011\u2012\u2013\u2014\u2212"

    rename_map = {}
    drop_cols = []
    for col in df.columns:
        folded = col
        if (
            isinstance(col, str)
            and len(col) > 2
            and col.startswith("AB")
            and col[2] in unicode_dashes
        ):
            folded = "AB-" + col[3:]

        if col in table:
            target = table[col]
        elif folded in table:
            target = table[folded]
        else:
            target = folded

        if target is None:
            drop_cols.append(col)
        elif target and target != col:
            # Falsy non-None targets (e.g. "") are ignored, as before.
            rename_map[col] = target

    kept = df.drop(columns=drop_cols) if drop_cols else df
    # rename() always returns a new object, so the caller's frame is never mutated.
    return kept.rename(columns=rename_map)


def load_csv(path: Path) -> pd.DataFrame:
    """Load one human-evaluation CSV and bring it into canonical shape.

    Steps:
      * strip whitespace from column names and normalize AB column naming,
      * add any missing AB column, filled with ``"NA"`` (a warning is printed),
      * clean every AB status cell: trim whitespace, upper-case, and turn
        missing/blank cells into the string ``"NA"``,
      * coerce the aggregate count columns to numeric,
      * cast the case key to a stripped string.

    Args:
        path: Location of the CSV file.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If the key column is absent from the file.
    """
    df = pd.read_csv(path)
    # unify column names (strip spaces) and types
    df.columns = [c.strip() for c in df.columns]

    # These are human-evaluated CSV files - normalize AB naming
    df = normalize_ab_columns(df)

    if KEY_COLUMN not in df.columns:
        raise KeyError(
            f"{path.name}: required column {KEY_COLUMN!r} not found; "
            f"available columns: {list(df.columns)}"
        )

    # ensure any missing AB columns exist (fill with 'NA' so analysis can proceed)
    missing = [c for c in AB_COLUMNS if c not in df.columns]
    if missing:
        print(f"Warning: Missing AB columns in {path.name}: {missing}. Filling with 'NA'.")
        for c in missing:
            df[c] = "NA"

    # Clean status cells. read_csv's default NA handling turns a literal "NA"
    # cell into NaN, and hand-entered values may carry stray spaces or mixed
    # case; without this step such cells would silently score as NaN.
    for c in AB_COLUMNS:
        original = df[c]
        cleaned = original.astype(str).str.strip().str.upper()
        df[c] = cleaned.where(original.notna() & (cleaned != ""), "NA")

    # ensure numeric columns are numeric
    for col in AGG_COLUMNS:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # key normalization (strip so that cases align across files)
    df[KEY_COLUMN] = df[KEY_COLUMN].astype(str).str.strip()
    return df


def score_status_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Attach a numeric ``<column>__score`` for every AB check plus their row mean.

    Works on a copy of ``df``. Statuses are translated through
    ``STATUS_TO_SCORE``; an AB column absent from the frame yields an all-NaN
    score column. ``AB_mean_score`` is the per-row mean of the score columns,
    ignoring NaN.
    """
    result = df.copy()
    score_names = []
    for ab_col in AB_COLUMNS:
        score_name = f"{ab_col}__score"
        score_names.append(score_name)
        if ab_col not in result.columns:
            result[score_name] = np.nan
            continue
        result[score_name] = result[ab_col].map(STATUS_TO_SCORE)
    result["AB_mean_score"] = result[score_names].mean(axis=1, skipna=True)
    return result


# Read every CSV, then derive the numeric score columns for each table
raw: Dict[str, pd.DataFrame] = {}
for name, csv_path in FILES.items():
    raw[name] = load_csv(csv_path)
scored: Dict[str, pd.DataFrame] = {
    name: score_status_columns(frame) for name, frame in raw.items()
}

# Keep only the cases that also appear in baseline, so comparisons are like-for-like
baseline_cases = set(scored["baseline"][KEY_COLUMN])
for name, frame in list(scored.items()):
    in_baseline = frame[KEY_COLUMN].isin(baseline_cases)
    scored[name] = frame[in_baseline].reset_index(drop=True)

print("Datasets loaded:")
print(f"- Baseline cases: {len(baseline_cases)}")
for name, frame in scored.items():
    print(f"- {name}: {len(frame)} cases")


Datasets loaded:
- Baseline cases: 30
- baseline: 30 cases
- no_lint: 30 cases
- no_playwright: 30 cases
- no_tests: 30 cases


## Compute Viability and Quality Scores


In [None]:
# Compute Viability (V) and Quality (Q) per new spec

# A FAIL on any of these checks makes the app non-viable
GATE_FAIL_COLUMNS = ["AB-01 Boot", "AB-02 Prompt"]

# Equal weights across AB checks; NA re-normalization happens via mean with skipna.
# Every check uses the same status -> score scale; each gets its own dict.
# AB-07 Performance >75 is a binary proxy of performance; map as before for legacy CSVs.
QUALITY_MAP = {
    check: {"PASS": 1.0, "WARN": 0.5, "FAIL": 0.0, "NA": np.nan}
    for check in (
        "AB-01 Boot",
        "AB-02 Prompt",
        "AB-03 Create",
        "AB-04 View/Edit",
        "AB-06 Clickable Sweep",
        "AB-07 Performance >75",
    )
}


def compute_viability(row: pd.Series) -> int:
    """Return 0 when any gate check present in ``row`` reads "FAIL", otherwise 1.

    Gate checks are the columns listed in ``GATE_FAIL_COLUMNS``; a gate column
    that is absent from the row does not affect the result.
    """
    gate_failed = any(
        gate in row and str(row[gate]) == "FAIL" for gate in GATE_FAIL_COLUMNS
    )
    return 0 if gate_failed else 1


def compute_quality(
    row: pd.Series,
    quality_map: Dict[str, Dict[str, float]] | None = None,
) -> float:
    """Return the quality score of one evaluated app on a 0-10 scale.

    Every check of the quality map that is present in ``row`` is translated
    from its status string to a score; statuses that are unknown or map to NaN
    (such as "NA") are left out, and the remaining scores are averaged with
    equal weight and multiplied by 10.

    Args:
        row: One row of an evaluation table (status strings per AB column).
        quality_map: Optional ``{column: {status: score}}`` table. Defaults to
            the module-level ``QUALITY_MAP``.

    Returns:
        The mean score times 10, or NaN when the row has no scorable check.
    """
    table = QUALITY_MAP if quality_map is None else quality_map
    valid_scores = []
    for col, mapping in table.items():
        if col not in row:
            continue
        score = mapping.get(str(row[col]), np.nan)
        if not np.isnan(score):
            valid_scores.append(score)
    # Averaging an all-NaN list with np.nanmean emits "RuntimeWarning: Mean of
    # empty slice"; filtering first returns NaN for such rows without a warning.
    if not valid_scores:
        return np.nan
    return float(np.mean(valid_scores) * 10.0)

# Add the V (viability) and Q (quality) columns to every table
for name, frame in scored.items():
    frame["V"] = frame.apply(compute_viability, axis=1)
    frame["Q"] = frame.apply(compute_quality, axis=1)

# Quick sanity: show mean Q and viability rate per table
print("\nHuman Evaluation Summary:")
for name, frame in scored.items():
    viability_rate = float(frame["V"].mean())
    mean_q = float(frame["Q"].mean())
    print(f"{name}: Viability={viability_rate:.1%}, Mean Q={mean_q:.1f}")

# Also show a complete summary table
summary_rows: List[Dict[str, object]] = []
for name, frame in scored.items():
    viable_only = frame[frame["V"] == 1]
    if len(viable_only) > 0:
        mean_q_viable = f"{viable_only['Q'].mean():.1f}"
    else:
        mean_q_viable = "N/A"
    summary_rows.append({
        "Ablation": name.replace("_", " ").title(),
        "N": len(frame),
        "Viability": f"{frame['V'].mean():.1%}",
        "Viable Count": int(frame["V"].sum()),
        "Mean Q (all)": f"{frame['Q'].mean():.1f}",
        "Mean Q (viable)": mean_q_viable,
    })
summary_df = pd.DataFrame(summary_rows)
# display() only exists inside IPython; fall back to plain text elsewhere
try:
    display(summary_df)
except Exception:
    print(summary_df.to_string(index=False))



Human Evaluation Summary:
baseline: Viability=73.3%, Mean Q=8.1
no_lint: Viability=80.0%, Mean Q=8.3
no_playwright: Viability=90.0%, Mean Q=8.6
no_tests: Viability=80.0%, Mean Q=7.8


## Baseline Performance Analysis


In [4]:
# Baseline summary: viability rate, mean quality, and a text histogram of Q
base = scored["baseline"].copy()
viable = base[base["V"] == 1]

n_apps = len(base)
n_viable = int(base["V"].sum())

print(f"Baseline Performance (n={n_apps}):")
print(f"- Viability rate: {base['V'].mean():.1%} ({n_viable}/{n_apps} apps)")
print("- Quality scores:")
print(f"  • Overall mean: {base['Q'].mean():.1f}")
print(f"  • Viable apps only: {viable['Q'].mean():.1f}")
print("\nQuality distribution for viable apps:")

# One bar per distinct Q value, ordered from lowest to highest
q_dist = viable["Q"].value_counts().sort_index()
for q_val, count in q_dist.items():
    bar = "█" * count
    print(f"  Q={q_val:.1f}: {bar} ({count} apps)")


Baseline Performance (n=30):
- Viability rate: 73.3% (22/30 apps)
- Quality scores:
  • Overall mean: 8.1
  • Viable apps only: 9.6

Quality distribution for viable apps:
  Q=7.5: █ (1 apps)
  Q=8.3: █ (1 apps)
  Q=8.8: █ (1 apps)
  Q=9.0: █ (1 apps)
  Q=9.2: ████ (4 apps)
  Q=10.0: ██████████████ (14 apps)


## Human Evaluation: Ablation Comparison


In [5]:
# Compare all ablations against the baseline
baseline_v = scored["baseline"]["V"].mean()
baseline_q = scored["baseline"]["Q"].mean()

comparison_data = []
for name in ["baseline", "no_lint", "no_playwright", "no_tests"]:
    frame = scored[name]
    viable_only = frame[frame["V"] == 1]
    is_baseline = name == "baseline"
    v_mean = frame["V"].mean()
    q_mean = frame["Q"].mean()

    if len(viable_only) > 0:
        mean_q_viable = f"{viable_only['Q'].mean():.1f}"
    else:
        mean_q_viable = "N/A"

    comparison_data.append({
        "Ablation": name.replace("_", " ").title(),
        "N": len(frame),
        "Viability": f"{v_mean:.1%}",
        "Viable Count": int(frame["V"].sum()),
        "Mean Q (all)": f"{q_mean:.1f}",
        "Mean Q (viable)": mean_q_viable,
        # Deltas are relative to baseline; the baseline row itself shows "-"
        "Δ Viability": "-" if is_baseline else f"{(v_mean - baseline_v)*100:+.1f}%",
        "Δ Quality": "-" if is_baseline else f"{(q_mean - baseline_q):+.1f}",
    })

comparison_df = pd.DataFrame(comparison_data)
print("\n=== HUMAN EVALUATION ABLATION COMPARISON ===")
# display() only exists inside IPython; fall back to plain text elsewhere
try:
    display(comparison_df)
except Exception:
    print(comparison_df.to_string(index=False))

# Per-ablation deltas versus baseline
print("\n=== Impact Analysis ===")

for name in ["no_lint", "no_playwright", "no_tests"]:
    frame = scored[name]
    v_rate = frame["V"].mean()
    q_mean = frame["Q"].mean()

    print(f"\n{name.replace('_', ' ').title()}:")
    print(f"  - Viability: {v_rate:.1%} ({(v_rate - baseline_v)*100:+.1f}% from baseline)")
    print(f"  - Quality: {q_mean:.1f} ({q_mean - baseline_q:+.1f} from baseline)")

    # Report AB checks whose PASS share moved by more than 5 points
    ab_changes = []
    for col in AB_COLUMNS:
        baseline_pass = (scored["baseline"][col] == "PASS").mean()
        ablation_pass = (frame[col] == "PASS").mean()
        pass_delta = ablation_pass - baseline_pass
        if abs(pass_delta) > 0.05:
            ab_changes.append(f"{col}: {pass_delta*100:+.1f}%")

    if ab_changes:
        print(f"  - Major changes: {', '.join(ab_changes)}")



=== HUMAN EVALUATION ABLATION COMPARISON ===


Unnamed: 0,Ablation,N,Viability,Viable Count,Mean Q (all),Mean Q (viable),Δ Viability,Δ Quality
0,Baseline,30,73.3%,22,8.1,9.6,-,-
1,No Lint,30,80.0%,24,8.3,9.3,+6.7%,+0.2
2,No Playwright,30,90.0%,27,8.6,9.4,+16.7%,+0.6
3,No Tests,30,80.0%,24,7.8,9.3,+6.7%,-0.3



=== Impact Analysis ===

No Lint:
  - Viability: 80.0% (+6.7% from baseline)
  - Quality: 8.3 (+0.2 from baseline)
  - Major changes: AB-03 Create: -6.7%, AB-04 View/Edit: -13.3%, AB-07 Performance >75: +10.0%

No Playwright:
  - Viability: 90.0% (+16.7% from baseline)
  - Quality: 8.6 (+0.6 from baseline)
  - Major changes: AB-02 Prompt: +13.3%, AB-03 Create: +6.7%, AB-04 View/Edit: +6.7%, AB-06 Clickable Sweep: +13.3%, AB-07 Performance >75: +10.0%

No Tests:
  - Viability: 80.0% (+6.7% from baseline)
  - Quality: 7.8 (-0.3 from baseline)
  - Major changes: AB-03 Create: -6.7%, AB-04 View/Edit: -20.0%, AB-06 Clickable Sweep: +6.7%
