# Threshold Diagnostics

This notebook visualizes score distributions and ABR sensitivity around decision thresholds.

**Purpose:** Verify that ABR/pAUC thresholds are consistent across all evaluations.

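**Does NOT:** Recompute or modify the metrics stored in experiment results. It trains a small diagnostic model and computes ABR on the holdout set for visualization only.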

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Make the repository root importable so the `src.*` imports in this notebook resolve.
# Assumes the kernel's working directory is a direct child of the project root
# (e.g. a notebooks folder) — TODO confirm for other launch setups.
PROJECT_ROOT = Path.cwd().parent
_project_root_str = str(PROJECT_ROOT)
# Re-running this cell must not keep prepending the same entry to sys.path;
# only insert when the project root is not already the first entry.
if not sys.path or sys.path[0] != _project_root_str:
    sys.path.insert(0, _project_root_str)

from src.evaluation.metrics import compute_abr
from src.evaluation.thresholds import ThresholdSpec

## 1. Load Data

In [None]:
# Location of the synthetic dataset used for these diagnostics
DATA_DIR = PROJECT_ROOT / "data" / "synthetic" / "test_fix"

# Read the three splits in a fixed order: accepts (Da), rejects (Dr), holdout (H)
Da, Dr, H = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("Da.csv", "Dr.csv", "H.csv")
)

# Sample counts, plus the mean of the `y` label for the two labeled splits
print(f"Da: {len(Da)} samples, bad_rate={Da['y'].mean():.4f}")
print(f"Dr: {len(Dr)} samples (unlabeled)")
print(f"H: {len(H)} samples, bad_rate={H['y'].mean():.4f}")

## 2. Train Model and Generate Scores

In [None]:
from src.config import XGBoostConfig
from src.models.xgboost_model import XGBoostModel

# Every column of the accepts frame except the label `y` is used as a feature
feature_cols = [col for col in Da.columns if col != 'y']

# Fit a small model on the accepted population only
model_cfg = XGBoostConfig(n_estimators=100, max_depth=3, random_seed=42)
model = XGBoostModel(model_cfg)
model.fit(Da[feature_cols].values, Da['y'].values)

# Score each split with the fitted model, in the order Da, Dr, H.
# NOTE(review): the code below treats predict_proba's result as an array of
# scores (it calls .min()/.max() on it) — confirm it is 1-D P(bad).
scores_Da, scores_Dr, scores_H = (
    model.predict_proba(split[feature_cols].values)
    for split in (Da, Dr, H)
)

# Min / max / median of the scores per split
print("Score ranges:")
for prefix, split_scores in (("Da: ", scores_Da), ("Dr: ", scores_Dr), ("H:  ", scores_H)):
    print(f"  {prefix}[{split_scores.min():.4f}, {split_scores.max():.4f}], median={np.median(split_scores):.4f}")

## 3. Score Distribution Histograms

In [None]:
# One histogram panel per split, each with a reference line at 0.5
panels = [
    ("Da (accepts)", scores_Da),
    ("Dr (rejects)", scores_Dr),
    ("H (holdout)", scores_H),
]
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel_ax, (split_label, split_scores) in zip(axes, panels):
    panel_ax.hist(split_scores, bins=50, alpha=0.7, edgecolor='black')
    panel_ax.axvline(x=0.5, color='red', linestyle='--', label='threshold=0.5')
    panel_ax.set(
        xlabel='Score (P(bad))',
        ylabel='Count',
        title=f'{split_label} Score Distribution',
    )
    panel_ax.legend()

plt.tight_layout()
plt.show()

## 4. Zoomed View Near Decision Boundary

In [None]:
# Zoom into scores near the decision boundary.
# The boundary, the zoom window and the "near boundary" band are named once
# here so the plot annotations and the printed summary are derived from the
# values actually used instead of repeating them as separate literals.
DECISION_THRESHOLD = 0.5      # typical decision boundary on the score
ZOOM_RANGE = (0.3, 0.7)       # score window shown in the zoomed histogram
NEAR_BAND_HALF_WIDTH = 0.05   # |score - threshold| below this counts as "near"

fig, ax = plt.subplots(figsize=(10, 5))

# Overlay the three splits on a shared axis, restricted to the zoom window
for name, scores, alpha in [("Da", scores_Da, 0.5), ("Dr", scores_Dr, 0.5), ("H", scores_H, 0.7)]:
    ax.hist(scores, bins=100, alpha=alpha, label=name, range=ZOOM_RANGE)

ax.axvline(
    x=DECISION_THRESHOLD,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'threshold={DECISION_THRESHOLD}',
)
ax.set_xlabel('Score (P(bad))')
ax.set_ylabel('Count')
ax.set_title(f'Score Distribution Near Decision Boundary [{ZOOM_RANGE[0]}, {ZOOM_RANGE[1]}]')
ax.legend()
plt.show()

# Proportion of scores near boundary (strict inequality on the distance)
band_lo = DECISION_THRESHOLD - NEAR_BAND_HALF_WIDTH
band_hi = DECISION_THRESHOLD + NEAR_BAND_HALF_WIDTH
for name, scores in [("Da", scores_Da), ("Dr", scores_Dr), ("H", scores_H)]:
    near_boundary = np.abs(scores - DECISION_THRESHOLD) < NEAR_BAND_HALF_WIDTH
    print(f"{name}: {near_boundary.mean()*100:.2f}% of scores in [{band_lo:.2f}, {band_hi:.2f}]")

## 5. ABR Sensitivity Curve

Shows how the bad rate among accepted samples (ABR) varies with the acceptance rate on the holdout set, accepting the lowest-scored samples first.

In [None]:
# Threshold spec that defines the ABR integration range. It is loaded first so
# that the shaded band in the plot and the integrated ABR printed below are
# both driven by the same range instead of a separately hard-coded [0.2, 0.4].
ts = ThresholdSpec.paper_default()
abr_lo, abr_hi = ts.abr_range[0], ts.abr_range[1]

# Compute ABR at various single acceptance rates
accept_rates = np.linspace(0.05, 0.80, 50)
abr_values = []

# Rank holdout samples by ascending score: the lowest scores are accepted first
sorted_idx = np.argsort(scores_H)
y_sorted = H['y'].values[sorted_idx]
n = len(y_sorted)

for rate in accept_rates:
    # Number of accepted samples at this rate (truncated, but at least one)
    k = max(1, int(rate * n))
    bad_rate = y_sorted[:k].mean()
    abr_values.append(bad_rate)

# Mean of the holdout label, used as the horizontal reference line
population_bad_rate = H['y'].mean()

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(accept_rates, abr_values, 'b-', linewidth=2)

# Mark the ABR integration range taken from the threshold spec
ax.axvspan(abr_lo, abr_hi, alpha=0.2, color='green', label=f'ABR integration range [{abr_lo}, {abr_hi}]')
ax.axhline(y=population_bad_rate, color='red', linestyle='--', label=f'Population bad rate ({population_bad_rate:.3f})')

ax.set_xlabel('Acceptance Rate')
ax.set_ylabel('Bad Rate Among Accepted')
ax.set_title('ABR vs Acceptance Rate (Holdout)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Compute integrated ABR over the same range that is shaded in the plot
abr_integrated = compute_abr(H['y'].values, scores_H, abr_lo, abr_hi)
print(f"\nIntegrated ABR over [{abr_lo}, {abr_hi}]: {abr_integrated:.6f}")

## 6. Threshold Invariance Check

Verify that experiment results use consistent thresholds.

In [None]:
# Load experiment results if available. Only directories count as experiment
# runs; a stray file matching the pattern cannot contain a results CSV.
exp_dirs = [
    path
    for path in (PROJECT_ROOT / "experiments").glob("exp1_*")
    if path.is_dir()
]

if exp_dirs:
    # NOTE(review): "latest" is the lexicographically last directory name —
    # presumably the names carry a sortable suffix (e.g. a timestamp); confirm.
    latest_exp = sorted(exp_dirs)[-1]
    results_file = latest_exp / "exp1_results.csv"

    if results_file.exists():
        results = pd.read_csv(results_file)

        if results.empty:
            # With zero rows there is nothing to compare; report it explicitly
            # rather than failing the assertions with a "varies" message.
            print(f"Results file {results_file} has no rows; nothing to check.")
        else:
            # Check threshold consistency
            print("Threshold values in experiment results:")
            print(f"  Unique abr_range values: {results['abr_range'].unique()}")
            print(f"  Unique pauc_max_fnr values: {results['pauc_max_fnr'].unique()}")
            print(f"  Unique threshold_policy values: {results['threshold_policy'].unique()}")

            # Verify all rows use same thresholds. dropna=False counts a missing
            # value as its own distinct value, so a file where only some rows
            # record a threshold is reported as inconsistent instead of passing.
            # threshold_policy is printed above for inspection but not asserted.
            assert results['abr_range'].nunique(dropna=False) == 1, "ABR range varies across rows!"
            assert results['pauc_max_fnr'].nunique(dropna=False) == 1, "pAUC max_fnr varies across rows!"
            print("\nThreshold invariance check: PASSED")
    else:
        print(f"No results file found at {results_file}")
else:
    print("No experiment results found. Run experiments first.")

## Summary

This notebook covered:
1. Score distributions across Da, Dr, H
2. Score density near the 0.5 decision boundary
3. ABR sensitivity to the acceptance rate on the holdout set
4. Threshold consistency across the rows of the latest `exp1_*` experiment results file