# Result Comparison: Our Experiments vs Paper

This notebook compares our experimental results with the reference values from the paper
"Fighting Sampling Bias: A Framework for Training and Evaluating Credit Scoring Models".

Key tables from the paper:
- **Table C.3**: Experiment I - Evaluation accuracy (Bias, Variance, RMSE)
- **Table C.4**: Experiment II - Loss due to bias and Gain from BASL

In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Notebook-wide plotting defaults: whitegrid theme, 12x6 figures, 11pt text.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

## 1. Load Experiment Results

In [None]:
# Load our experiment results
# `summary.json` must provide 'n_trials' and a 'config' mapping (printed below);
# later cells also read 'baseline', 'basl', 'improvement' and 'holdout_bad_rate'.
experiment_dir = Path('../experiments/experiment_20251201_141716_100seeds_early_stopping')
# Pin the encoding: without it open() decodes with the locale's default,
# which is not guaranteed to be UTF-8 on every platform.
with open(experiment_dir / 'summary.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

print(f"Number of trials: {results['n_trials']}")
print("Configuration:")
# Echo the configuration entries that matter for the comparison with the paper.
for key in ('n_periods', 'batch_size', 'accept_rate', 'bad_rate', 'holdout_size'):
    print(f"  - {key}: {results['config'][key]}")

In [None]:
# Reference figures transcribed from the paper (Table C.4, Page A10).
# One row per metric: (metric, loss due to bias, gain from BASL in %).
_c4_columns = ('Metric', 'Loss_due_to_bias', 'Gain_from_BASL_%')
_c4_rows = (
    ('AUC', 0.0591, 35.72),
    ('BS', 0.0432, 29.29),
    ('PAUC', 0.0535, 22.42),
    ('ABR', 0.0598, 24.82),
    ('MMD', 0.5737, 3.74),
)

# Pivot the rows into the column-oriented dict the DataFrame is built from.
paper_table_c4 = {
    column: list(values)
    for column, values in zip(_c4_columns, zip(*_c4_rows))
}
df_paper = pd.DataFrame(paper_table_c4)

print("Paper Table C.4 - Experiment II Results:")
print(df_paper.to_string(index=False))

## 2. Summary Statistics Comparison

In [None]:
# Build comparison table
# `metrics` holds the keys used inside the loaded results; `metric_labels`
# holds the matching display names in the same order. Later cells reuse both.
metrics = ['auc', 'pauc', 'brier', 'abr']
metric_labels = ['AUC', 'PAUC', 'Brier Score', 'ABR']

summary_data = []
for metric, label in zip(metrics, metric_labels):
    baseline = results['baseline'][metric]
    basl = results['basl'][metric]
    improvement = results['improvement'][metric]

    # A relative improvement is undefined for a zero baseline; report NaN
    # (as the paper-comparison table does) rather than a misleading 0%.
    if baseline['mean'] != 0:
        improvement_pct = improvement['mean'] / baseline['mean'] * 100
    else:
        improvement_pct = np.nan

    summary_data.append({
        'Metric': label,
        'Baseline Mean': baseline['mean'],
        'Baseline Std': baseline['std'],
        'BASL Mean': basl['mean'],
        'BASL Std': basl['std'],
        'Improvement Mean': improvement['mean'],
        'Improvement Std': improvement['std'],
        'Improvement %': improvement_pct
    })

df_summary = pd.DataFrame(summary_data)
# Take the run size from the results instead of hard-coding it, so the heading
# stays correct when `experiment_dir` points at a different run.
# NOTE(review): assumes one seed per trial - confirm.
print(f"Our Experiment Results ({results['n_trials']} seeds):")
print(df_summary.round(4).to_string(index=False))

In [None]:
# Put our improvements next to the paper's Table C.4 figures.
# The paper defines Gain = (BASL improvement) / (Loss due to bias) * 100%.
# Only the BASL improvement is available here (no full oracle comparison), so
# the paper's loss is used as the denominator of "Our Estimated Gain".
paper_by_metric = df_paper.set_index('Metric')
paper_names = {'Brier Score': 'BS'}  # the paper table abbreviates the Brier score

comparison_data = []
for metric, label in zip(metrics, metric_labels):
    paper_key = paper_names.get(label, label)
    if paper_key in paper_by_metric.index:
        paper_loss = paper_by_metric.at[paper_key, 'Loss_due_to_bias']
        paper_gain = paper_by_metric.at[paper_key, 'Gain_from_BASL_%']
    else:
        # Metric not reported in the paper table
        paper_loss = paper_gain = np.nan

    our_improvement = results['improvement'][metric]['mean']

    # A zero loss would make the ratio meaningless; a NaN loss simply propagates.
    if paper_loss != 0:
        estimated_gain = our_improvement / paper_loss * 100
    else:
        estimated_gain = np.nan

    comparison_data.append({
        'Metric': label,
        'Paper Loss due to Bias': paper_loss,
        'Paper Gain from BASL (%)': paper_gain,
        'Our Improvement': our_improvement,
        'Our Estimated Gain (%)': estimated_gain
    })

df_comparison = pd.DataFrame(comparison_data)
print("\nComparison with Paper Table C.4:")
print(df_comparison.round(2).to_string(index=False))

## 3. Baseline vs BASL Distribution Comparison

In [None]:
# Create side-by-side bar chart comparing baseline vs BASL
# One panel per metric (2x2 grid): the mean of each model, with its std as error bar.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, (metric, label) in enumerate(zip(metrics, metric_labels)):
    ax = axes[idx]

    baseline = results['baseline'][metric]
    basl = results['basl'][metric]

    # Data for bars
    x = ['Baseline', 'BASL']
    means = [baseline['mean'], basl['mean']]
    stds = [baseline['std'], basl['std']]

    # Create bars
    bars = ax.bar(x, means, yerr=stds, capsize=5, color=['#3498db', '#e74c3c'], alpha=0.8)

    # Add value labels just above the top of each error bar
    for bar, mean, std in zip(bars, means, stds):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + std + 0.01,
                f'{mean:.4f}', ha='center', va='bottom', fontsize=10)

    ax.set_title(label, fontsize=14, fontweight='bold')
    ax.set_ylabel('Value')

    # Add improvement annotation, centred between the two bars
    # (categorical positions are 0 and 1, so x=0.5 is the midpoint)
    improvement = results['improvement'][metric]['mean']
    direction = '+' if improvement > 0 else ''
    ax.annotate(f'{direction}{improvement:.4f}',
                xy=(0.5, max(means)),
                xytext=(0.5, max(means) * 1.15),
                ha='center', fontsize=11,
                arrowprops=dict(arrowstyle='->', color='gray'))

# Take the run size from the results instead of hard-coding it, so the title
# stays correct for other runs. NOTE(review): assumes one seed per trial - confirm.
plt.suptitle(f"Baseline vs BASL Performance ({results['n_trials']} Seeds)",
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(experiment_dir / 'baseline_vs_basl_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Improvement Distribution Analysis

In [None]:
# Show distribution of improvements with confidence intervals
# One panel per metric (2x2 grid). Only summary statistics are available, so
# the distribution is sketched with horizontal lines and a shaded band.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, (metric, label) in enumerate(zip(metrics, metric_labels)):
    ax = axes[idx]

    improvement = results['improvement'][metric]

    # Plot distribution range
    ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5, label='No improvement')
    # NOTE(review): the band spans the 'q2.5'..'q97.5' entries; presumably these
    # are percentiles across seeds rather than a confidence interval of the
    # mean - confirm that the '95% CI' label is the intended wording.
    ax.fill_between([0, 1], improvement['q2.5'], improvement['q97.5'],
                    alpha=0.3, color='#3498db', label='95% CI')
    ax.axhline(y=improvement['mean'], color='#e74c3c', linewidth=2, label=f"Mean: {improvement['mean']:.4f}")
    ax.axhline(y=improvement['median'], color='#2ecc71', linewidth=2, linestyle='--', label=f"Median: {improvement['median']:.4f}")

    # Range bars (min / max)
    ax.axhline(y=improvement['min'], color='gray', linewidth=1, alpha=0.5)
    ax.axhline(y=improvement['max'], color='gray', linewidth=1, alpha=0.5)

    ax.set_title(f'{label} Improvement Distribution', fontsize=14, fontweight='bold')
    ax.set_ylabel('Improvement')
    # The x axis carries no information; it only gives the band some width.
    ax.set_xlim(-0.1, 1.1)
    ax.set_xticks([])
    ax.legend(loc='best', fontsize=9)

    # Flag panels whose band straddles zero, placing the note inside the negative part
    if improvement['q97.5'] > 0 and improvement['q2.5'] < 0:
        ax.annotate('Mixed results\n(some seeds worse)', xy=(0.7, improvement['q2.5']/2),
                   fontsize=9, color='orange', ha='center')

plt.suptitle('Distribution of Improvement (BASL - Baseline)', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(experiment_dir / 'improvement_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Detailed Statistical Summary

In [None]:
# Long-format statistics table: one row per (metric, model) pair,
# with the baseline row listed before the BASL row for each metric.
model_variants = (('baseline', 'Baseline'), ('basl', 'BASL'))
# (output column, key inside the per-metric results) in display order
stat_columns = (
    ('Mean', 'mean'),
    ('Std', 'std'),
    ('Median', 'median'),
    ('Q2.5%', 'q2.5'),
    ('Q97.5%', 'q97.5'),
    ('Min', 'min'),
    ('Max', 'max'),
)

detailed_summary = [
    {
        'Model': model_name,
        'Metric': label,
        **{column: results[model_type][metric][key] for column, key in stat_columns},
    }
    for metric, label in zip(metrics, metric_labels)
    for model_type, model_name in model_variants
]

df_detailed = pd.DataFrame(detailed_summary)
print("Detailed Statistical Summary:")
print(df_detailed.round(4).to_string(index=False))

In [None]:
# Write the detailed table into the experiment directory as CSV (no index column).
detailed_csv_path = experiment_dir / 'detailed_summary.csv'
df_detailed.to_csv(detailed_csv_path, index=False)
print(f"Saved to {detailed_csv_path}")

## 6. Paper Comparison Analysis

In [None]:
# Create comparison visualization
# Grouped bars per metric: gain reported in the paper vs. our estimated gain.
fig, ax = plt.subplots(figsize=(12, 6))

# Metrics we can compare (Brier and MMD are excluded), as
# paper label -> key in `results`. A single mapping keeps the two spellings
# aligned instead of relying on two parallel lists.
comparable = {'AUC': 'auc', 'PAUC': 'pauc', 'ABR': 'abr'}
comparable_metrics = list(comparable)
paper_rows = df_paper.set_index('Metric').loc[comparable_metrics]
paper_gains = paper_rows['Gain_from_BASL_%'].tolist()

# Calculate our estimated gains: our improvement as a share of the paper's loss due to bias
our_gains = []
for paper_m, m in comparable.items():
    improvement = results['improvement'][m]['mean']
    paper_loss = paper_rows.at[paper_m, 'Loss_due_to_bias']
    our_gains.append((improvement / paper_loss * 100) if paper_loss != 0 else 0)

x = np.arange(len(comparable_metrics))
width = 0.35

bars1 = ax.bar(x - width/2, paper_gains, width, label='Paper Gain (%)', color='#3498db', alpha=0.8)
bars2 = ax.bar(x + width/2, our_gains, width, label='Our Estimated Gain (%)', color='#e74c3c', alpha=0.8)

ax.set_xlabel('Metric')
ax.set_ylabel('Gain from BASL (%)')
ax.set_title('Paper vs Our Results: Gain from BASL', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(comparable_metrics)
ax.legend()
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)

# Add value labels. Our gain can be negative when BASL makes a metric worse;
# bar_label places the text beyond the end of the bar on either side of zero,
# whereas a fixed "+1" offset would draw the label of a negative bar inside it.
for bars in (bars1, bars2):
    ax.bar_label(bars, fmt='%.1f%%', padding=3, fontsize=10)

plt.tight_layout()
plt.savefig(experiment_dir / 'paper_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Key Findings Summary

In [None]:
# Textual recap of how our run relates to the paper.
# Reference values and tolerances are named here so the printed numbers and
# the status checks cannot drift apart.
paper_auc_loss = 0.0591            # Table C.4: AUC loss due to bias
paper_auc_gain_pct = 35.72         # Table C.4: AUC gain from BASL (%)
paper_pauc_gain_pct = 22.42        # Table C.4: PAUC gain from BASL (%)
paper_abr_range = (0.20, 0.22)     # approximate baseline ABR quoted for the paper
expected_holdout_br = 0.70
auc_gain_tolerance_pct = 20        # "similar magnitude" if within this many points
holdout_br_tolerance = 0.02

print("="*60)
print("KEY FINDINGS SUMMARY")
print("="*60)

print("\n1. AUC Results:")
auc_improvement = results['improvement']['auc']['mean']
# Share of the paper's loss due to bias that our improvement would recover
auc_recovery_pct = auc_improvement / paper_auc_loss * 100
auc_is_similar = abs(auc_recovery_pct - paper_auc_gain_pct) < auc_gain_tolerance_pct
print(f"   - Our improvement: {auc_improvement:.4f}")
print(f"   - Paper loss due to bias: {paper_auc_loss}")
print(f"   - Estimated recovery: {auc_recovery_pct:.1f}%")
print(f"   - Paper expected: {paper_auc_gain_pct}%")
print(f"   - Status: {'Similar magnitude' if auc_is_similar else 'Different from paper'}")

print("\n2. PAUC Results:")
pauc_improvement = results['improvement']['pauc']['mean']
print(f"   - Our improvement: {pauc_improvement:.4f}")
print(f"   - Paper expected gain: {paper_pauc_gain_pct}%")
print(f"   - Status: {'ISSUE - PAUC getting worse' if pauc_improvement < 0 else 'OK'}")

print("\n3. ABR Results:")
abr_baseline = results['baseline']['abr']['mean']
abr_basl = results['basl']['abr']['mean']
# Derive the note from the measured value instead of printing a fixed
# conclusion, so it stays true when a different experiment is loaded.
if abr_baseline < paper_abr_range[0]:
    abr_note = "Our baseline ABR is lower than paper"
elif abr_baseline > paper_abr_range[1]:
    abr_note = "Our baseline ABR is higher than paper"
else:
    abr_note = "Our baseline ABR is within the paper's range"
print(f"   - Our baseline ABR: {abr_baseline:.4f}")
print(f"   - Our BASL ABR: {abr_basl:.4f}")
print(f"   - Paper baseline ABR: ~{paper_abr_range[0]:.2f}-{paper_abr_range[1]:.2f}")
print(f"   - Note: {abr_note}")

print("\n4. Holdout Bad Rate:")
holdout_br = results['holdout_bad_rate']['mean']
holdout_ok = abs(holdout_br - expected_holdout_br) < holdout_br_tolerance
print(f"   - Mean: {holdout_br:.4f}")
print(f"   - Expected: {expected_holdout_br:.2f}")
print(f"   - Status: {'OK' if holdout_ok else 'Check data generation'}")

print("\n" + "="*60)