# Aggregate Leaderboards - Kaggle Playground Series S5E8

This notebook scans all category results and creates:
1. Per-category summary (best model per category)
2. Overall leaderboard (all models ranked by test AUC)
3. Summary README with analysis

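**Categories** (expected; the actual list is discovered at run time from the subdirectories of `../outputs`): linear_models, svm_kernels, knn, naive_bayes, trees, bagging_forests, boosting_gbms, neural_nets  
**Output** (written to `../summary/`, relative to this notebook): summary/per_category.csv, summary/overall_leaderboard.csv, summary/README.md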

In [None]:
# Setup
import os, json, glob
import numpy as np
import pandas as pd
from pathlib import Path

# Confirm the imports succeeded and show the directory that the relative
# '../outputs' and '../summary' paths used below are resolved against.
print(
    "Aggregate Leaderboards - Setup Complete",
    f"Working directory: {os.getcwd()}",
    sep="\n",
)

In [None]:
# Scan outputs directory for all categories and models
outputs_dir = Path('../outputs')

# Directory names that hold aggregated artefacts rather than category/model results.
NON_RESULT_DIRS = {'summary'}

# cv_metrics.csv column -> prefix of the aggregated "<prefix>_mean"/"<prefix>_std" keys.
CV_METRIC_PREFIXES = {
    'roc_auc': 'cv_auc',
    'average_precision': 'cv_ap',
    'f1': 'cv_f1',
}


def _result_dirs(parent):
    """Return the sub-directories of `parent` that may hold results, sorted by name.

    Hidden directories (e.g. `.ipynb_checkpoints`) and the names listed in
    NON_RESULT_DIRS are skipped so they are not mistaken for categories or models.
    Sorting makes the scan order (and tie-breaking between equal AUCs) deterministic.
    """
    candidates = [
        d for d in parent.iterdir()
        if d.is_dir() and d.name not in NON_RESULT_DIRS and not d.name.startswith('.')
    ]
    return sorted(candidates, key=lambda d: d.name)


def _metric_value(metrics, key, default):
    """Return `metrics[key]` as a float, or `default` when the key is missing or null."""
    value = metrics.get(key)
    return default if value is None else float(value)


def _load_cv_stats(cv_metrics_file, model_name):
    """Summarise the per-fold rows of a cv_metrics.csv file into mean/std values.

    Returns a dict with "<prefix>_mean"/"<prefix>_std" entries for every column of
    CV_METRIC_PREFIXES present in the file. Returns an empty dict when the file is
    missing, cannot be parsed, has no 'fold' column, or has no per-fold rows.
    """
    if not cv_metrics_file.exists():
        return {}
    try:
        cv_df = pd.read_csv(cv_metrics_file)
    except (OSError, ValueError) as exc:
        # pandas' parser/empty-file errors are ValueError subclasses.
        print(f"  {model_name}: Could not read CV metrics ({exc})")
        return {}
    if 'fold' not in cv_df.columns:
        print(f"  {model_name}: CV metrics have no 'fold' column; CV stats skipped")
        return {}

    # Keep only real folds; summary rows (mean/std) have non-numeric fold labels.
    # Numeric coercion also keeps folds that pandas parsed as floats ("0.0").
    fold_rows = cv_df[pd.to_numeric(cv_df['fold'], errors='coerce').notna()]
    if fold_rows.empty:
        return {}

    cv_stats = {}
    for column, prefix in CV_METRIC_PREFIXES.items():
        if column in fold_rows.columns:
            cv_stats[f'{prefix}_mean'] = fold_rows[column].mean()
            cv_stats[f'{prefix}_std'] = fold_rows[column].std()
    return cv_stats


def _load_model_result(category, model_dir):
    """Build the leaderboard record for one model directory.

    Reads `logs/test_metrics.json` (required) and `logs/cv_metrics.csv` (optional).
    Returns the result dict, or None (after printing the reason) when the test
    metrics are missing or unreadable, so one bad model does not abort the scan.
    """
    model_name = model_dir.name
    test_metrics_file = model_dir / 'logs' / 'test_metrics.json'
    cv_metrics_file = model_dir / 'logs' / 'cv_metrics.csv'

    if not test_metrics_file.exists():
        print(f"  {model_name}: No test metrics found")
        return None

    try:
        with open(test_metrics_file, 'r', encoding='utf-8') as f:
            test_metrics = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"  {model_name}: Could not read test metrics ({exc})")
        return None
    if not isinstance(test_metrics, dict):
        print(f"  {model_name}: Test metrics are not a JSON object; skipped")
        return None

    # Combine all metrics (missing or null values fall back to the defaults below)
    return {
        'category': category,
        'model': model_name,
        'test_auc': _metric_value(test_metrics, 'roc_auc', 0),
        'test_ap': _metric_value(test_metrics, 'average_precision', 0),
        'test_f1': _metric_value(test_metrics, 'f1', 0),
        'test_accuracy': _metric_value(test_metrics, 'accuracy', 0),
        'test_precision': _metric_value(test_metrics, 'precision', 0),
        'test_recall': _metric_value(test_metrics, 'recall', 0),
        'test_logloss': _metric_value(test_metrics, 'logloss', np.inf),
        'chosen_threshold': _metric_value(test_metrics, 'chosen_threshold', 0.5),
        'artifacts_path': str(model_dir),
        **_load_cv_stats(cv_metrics_file, model_name)
    }


categories = [d.name for d in _result_dirs(outputs_dir)]

print(f"Found {len(categories)} categories:")
for cat in categories:
    print(f"  - {cat}")

all_results = []
category_summaries = []

for category in categories:
    print(f"\nProcessing category: {category}")

    category_results = []
    for model_dir in _result_dirs(outputs_dir / category):
        result = _load_model_result(category, model_dir)
        if result is None:
            continue
        all_results.append(result)
        category_results.append(result)
        print(f"  {result['model']}: AUC={result['test_auc']:.4f}")

    # Find best model in this category
    if category_results:
        best_model = max(category_results, key=lambda x: x['test_auc'])
        category_summaries.append({
            'category': category,
            'best_model': best_model['model'],
            'best_test_auc': best_model['test_auc'],
            'best_test_ap': best_model['test_ap'],
            'best_test_f1': best_model['test_f1'],
            # NaN (not 0) when CV stats are unavailable, matching how the
            # overall leaderboard represents the same missing values.
            'cv_auc_mean': best_model.get('cv_auc_mean', np.nan),
            'cv_auc_std': best_model.get('cv_auc_std', np.nan),
            'num_models': len(category_results),
            'artifacts_path': best_model['artifacts_path']
        })
        print(f"  Best: {best_model['model']} (AUC: {best_model['test_auc']:.4f})")

print(f"\nTotal models found: {len(all_results)}")
print(f"Categories processed: {len(category_summaries)}")

In [None]:
# Create overall leaderboard
overall_df = pd.DataFrame(all_results)

# An empty scan yields a frame without columns; fail with an actionable message
# instead of the KeyError that sort_values would raise below.
if overall_df.empty:
    raise ValueError(
        "No model results found: expected <category>/<model>/logs/test_metrics.json "
        "files under ../outputs"
    )

# Sort by test AUC (primary), then AP (secondary), then F1 (tertiary)
overall_df = overall_df.sort_values(
    ['test_auc', 'test_ap', 'test_f1'],
    ascending=[False, False, False]
).reset_index(drop=True)

# Add rank (1 = best); ties keep distinct consecutive ranks in sorted order
overall_df['rank'] = range(1, len(overall_df) + 1)

print("\nOVERALL LEADERBOARD (Top 10)")
print("=" * 80)
print(f"{'Rank':<4} {'Category':<15} {'Model':<25} {'AUC':<8} {'AP':<8} {'F1':<8}")
print("-" * 80)

for _, row in overall_df.head(10).iterrows():
    print(f"{row['rank']:<4} {row['category']:<15} {row['model']:<25} {row['test_auc']:<8.4f} {row['test_ap']:<8.4f} {row['test_f1']:<8.4f}")

# Save overall leaderboard
os.makedirs('../summary', exist_ok=True)
overall_df.to_csv('../summary/overall_leaderboard.csv', index=False)
print("\nOverall leaderboard saved to: ../summary/overall_leaderboard.csv")

In [None]:
# Create per-category summary
category_df = pd.DataFrame(category_summaries)

# Without any category results the frame has no columns; report that clearly
# instead of failing with a KeyError in sort_values.
if category_df.empty:
    raise ValueError(
        "No category summaries available: no model under ../outputs produced test metrics"
    )

category_df = category_df.sort_values('best_test_auc', ascending=False).reset_index(drop=True)
category_df['category_rank'] = range(1, len(category_df) + 1)

print("\nPER-CATEGORY SUMMARY")
print("=" * 80)
print(f"{'Rank':<4} {'Category':<15} {'Best Model':<25} {'AUC':<8} {'AP':<8} {'F1':<8} {'#Models':<8}")
print("-" * 80)

for _, row in category_df.iterrows():
    print(f"{row['category_rank']:<4} {row['category']:<15} {row['best_model']:<25} {row['best_test_auc']:<8.4f} {row['best_test_ap']:<8.4f} {row['best_test_f1']:<8.4f} {row['num_models']:<8}")

# Save per-category summary. The directory is created here as well so this cell
# does not depend on another cell having created it first.
os.makedirs('../summary', exist_ok=True)
category_df.to_csv('../summary/per_category.csv', index=False)
print("\nPer-category summary saved to: ../summary/per_category.csv")

In [None]:
# Generate analysis and insights
# Top rows of the two (already sorted) leaderboards.
best_overall, best_category = overall_df.iloc[0], category_df.iloc[0]

# Performance statistics
auc_stats = overall_df['test_auc'].describe()
category_performance = (
    category_df
    .groupby('category')['best_test_auc']
    .first()
    .sort_values(ascending=False)
)

# Category analysis: collect every model's test AUC under its category first,
# then summarise each category in the order of `categories`.
aucs_by_category = {}
for result in all_results:
    aucs_by_category.setdefault(result['category'], []).append(result['test_auc'])

category_analysis = [
    {
        'category': name,
        'mean_auc': np.mean(aucs_by_category[name]),
        'std_auc': np.std(aucs_by_category[name]),  # population std (numpy default ddof=0)
        'min_auc': np.min(aucs_by_category[name]),
        'max_auc': np.max(aucs_by_category[name]),
        'num_models': len(aucs_by_category[name]),
    }
    for name in categories
    if name in aucs_by_category
]

analysis_df = pd.DataFrame(category_analysis).sort_values('mean_auc', ascending=False)

rule_width = 80
print("\nCATEGORY ANALYSIS")
print("=" * rule_width)
print(f"{'Category':<15} {'Mean AUC':<10} {'Std AUC':<10} {'Min AUC':<10} {'Max AUC':<10} {'Models':<8}")
print("-" * rule_width)

for entry in analysis_df.itertuples(index=False):
    print(f"{entry.category:<15} {entry.mean_auc:<10.4f} {entry.std_auc:<10.4f} {entry.min_auc:<10.4f} {entry.max_auc:<10.4f} {entry.num_models:<8}")

In [None]:
# Create comprehensive README
# The overview counts only categories that produced at least one result, which is
# what the tables below list (category_df has one row per such category).
readme_content = f"""# ML Benchmarking Results - Kaggle Playground Series S5E8

## Overview

This document summarizes the comprehensive ML benchmarking results across {len(category_df)} model categories and {len(all_results)} total model configurations.

**Dataset**: Kaggle Playground Series S5E8  
**Evaluation Protocol**: 70/30 stratified split, 5-fold CV on train pool, final test on holdout  
**Primary Metric**: ROC-AUC  
**Secondary Metrics**: Average Precision, F1-Score  

## 🏆 Best Overall Results

| Rank | Category | Model | Test AUC | Test AP | Test F1 |
|------|----------|-------|----------|---------|----------|
"""

# Add top 10 results
for _, row in overall_df.head(10).iterrows():
    readme_content += f"| {row['rank']} | {row['category']} | {row['model']} | {row['test_auc']:.4f} | {row['test_ap']:.4f} | {row['test_f1']:.4f} |\n"

readme_content += """
## 📊 Category Performance Summary

| Rank | Category | Best Model | Test AUC | Models Tested |
|------|----------|------------|----------|---------------|
"""

# Add category summary
for _, row in category_df.iterrows():
    readme_content += f"| {row['category_rank']} | {row['category']} | {row['best_model']} | {row['best_test_auc']:.4f} | {row['num_models']} |\n"

readme_content += f"""
## 📈 Statistical Analysis

### Overall Performance Statistics
- **Best AUC**: {auc_stats['max']:.4f} ({best_overall['category']}/{best_overall['model']})
- **Mean AUC**: {auc_stats['mean']:.4f}
- **Median AUC**: {auc_stats['50%']:.4f}
- **Std AUC**: {auc_stats['std']:.4f}
- **Models Evaluated**: {len(all_results)}

### Category Analysis

| Category | Mean AUC | Std AUC | Best AUC | Models |
|----------|----------|---------|----------|--------|
"""

for _, row in analysis_df.iterrows():
    readme_content += f"| {row['category']} | {row['mean_auc']:.4f} | {row['std_auc']:.4f} | {row['max_auc']:.4f} | {row['num_models']} |\n"

# "Most consistent" is only meaningful for categories with at least two models:
# a single-model category always has a spread of 0 and would win trivially.
comparable_df = analysis_df[analysis_df['num_models'] >= 2]
if comparable_df.empty:
    most_consistent = "N/A (no category has at least two models)"
else:
    steadiest = comparable_df.loc[comparable_df['std_auc'].idxmin()]
    most_consistent = f"{steadiest['category']} (Std: {steadiest['std_auc']:.4f})"

# The file-structure tree lists summary/ next to outputs/, matching where this
# notebook writes its files ('../summary', a sibling of '../outputs').
readme_content += f"""
## 🎯 Key Insights

1. **Best Performing Category**: {best_category['category']} (AUC: {best_category['best_test_auc']:.4f})
2. **Most Consistent Category**: {most_consistent}
3. **Best Single Model**: {best_overall['model']} from {best_overall['category']} (AUC: {best_overall['test_auc']:.4f})

## 📁 File Structure

```
outputs/
├── <category>/
│   ├── <model>/
│   │   ├── logs/
│   │   │   ├── cv_metrics.csv
│   │   │   └── test_metrics.json
│   │   ├── models/
│   │   │   └── final_model.pkl
│   │   └── figures/
│   │       ├── roc_cv.png
│   │       ├── pr_cv.png
│   │       ├── confusion_matrix.png
│   │       ├── calibration_curve.png
│   │       ├── feature_importance.png
│   │       └── error_analysis.png
│   └── summary.csv
summary/
├── overall_leaderboard.csv
├── per_category.csv
└── README.md
```

## 🔍 Methodology

### Data Preprocessing
- Median imputation for missing values
- StandardScaler for distance-based models (linear, SVM, KNN, neural nets)
- No scaling for tree-based models (trees, forests, boosting)

### Evaluation Protocol
1. **Data Split**: 70% train pool, 30% test holdout (stratified)
2. **Cross-Validation**: 5-fold StratifiedKFold on train pool
3. **Hyperparameter Tuning**: GridSearchCV with ROC-AUC scoring
4. **Threshold Selection**: Youden's J statistic from CV folds
5. **Final Evaluation**: Single evaluation on test holdout

### Metrics Collected
- ROC-AUC (primary ranking metric)
- Average Precision (tie-breaker)
- F1-Score (tie-breaker)
- Accuracy, Precision, Recall, Log-Loss
- Confusion Matrix, Calibration Analysis

---
*Generated automatically by aggregate_leaderboards.ipynb*
"""

# Save README. The directory is created here so this cell does not rely on another
# cell having done it. UTF-8 is requested explicitly because the content contains
# emoji, which a non-UTF-8 platform default encoding cannot encode.
os.makedirs('../summary', exist_ok=True)
with open('../summary/README.md', 'w', encoding='utf-8') as f:
    f.write(readme_content)

print("\nComprehensive README generated: ../summary/README.md")

In [None]:
# Final summary
banner = "=" * 80
print("\n" + banner)
print("AGGREGATION COMPLETE")
print(banner)

print("\n📊 SUMMARY:")
# Count categories that actually produced results, the same quantity the scan
# cell reports as "Categories processed" (directories without any test metrics
# are not counted).
print(f"   • Categories processed: {len(category_summaries)}")
print(f"   • Total models evaluated: {len(all_results)}")
print(f"   • Best overall model: {best_overall['model']} ({best_overall['category']})")
print(f"   • Best overall AUC: {best_overall['test_auc']:.4f}")
print(f"   • Best category: {best_category['category']} (AUC: {best_category['best_test_auc']:.4f})")

print("\n📁 FILES GENERATED:")
print("   • summary/overall_leaderboard.csv - All models ranked by AUC")
print("   • summary/per_category.csv - Best model per category")
print("   • summary/README.md - Comprehensive analysis and insights")

print("\n🏆 TOP 3 MODELS:")
# head(3) copes with leaderboards that contain fewer than three models.
for position, (_, row) in enumerate(overall_df.head(3).iterrows(), start=1):
    print(f"   {position}. {row['model']} ({row['category']}) - AUC: {row['test_auc']:.4f}")

print("\n✅ Aggregation notebook completed successfully!")
print("   Check summary/ folder for detailed results and analysis.")