# Boosting GBMs Category - Kaggle Playground Series S5E8

**Category**: Boosting — Gradient Boosting Machines (GBMs)  
**Sub-models**: GradientBoostingClassifier, XGBoost, LightGBM, CatBoost  
**Split Strategy**: 70/30 stratified split  
**Cross-Validation**: 5-fold StratifiedKFold  
**Random Seed**: 42  
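**Artifact Paths**: `../outputs/boosting_gbms/<model_name>/` with `logs/`, `models/` and `figures/` sub-folders (relative to the notebook's working directory)  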

This notebook compares different gradient boosting variants using the same data preprocessing and evaluation protocol.

In [None]:
# Bootstrap installation and imports
%pip install numpy pandas scikit-learn matplotlib shap xgboost lightgbm catboost --quiet

import os, json, random, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, accuracy_score,
    precision_score, recall_score, log_loss, roc_curve, precision_recall_curve,
    confusion_matrix, calibration_curve
)

# Gradient boosting libraries
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import shap

import warnings
warnings.filterwarnings('ignore')

# Set random seeds
os.makedirs('outputs', exist_ok=True)
np.random.seed(42)
random.seed(42)

print("Boosting GBMs Category - Setup Complete")
print(f"XGBoost version: {xgb.__version__}")
print(f"LightGBM version: {lgb.__version__}")
print(f"CatBoost version: {cb.__version__}")

In [None]:
# Read the competition CSVs, separate features from the label, and carve out
# a stratified hold-out set plus the CV splitter used by every model.
print("Loading data...")
train_df = pd.read_csv('../playground-series-s5e8/train.csv')
test_df = pd.read_csv('../playground-series-s5e8/test.csv')

# Everything except the row identifier and the label is treated as a feature.
# NOTE(review): assumes the label column is called 'target' and that the
# remaining columns are numeric (the pipelines apply a median imputer) —
# confirm against train.csv.
excluded_cols = ('id', 'target')
feature_cols = [col for col in train_df.columns if col not in excluded_cols]
X, y = train_df[feature_cols], train_df['target']

print(f"Features: {len(feature_cols)}")

# 70% goes to the train pool (tuning + CV), 30% is kept back as a hold-out.
# Stratifying on y keeps the class balance equal in both parts.
X_train_pool, X_test_holdout, y_train_pool, y_test_holdout = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print(f"Train pool: {X_train_pool.shape}")
print(f"Test holdout: {X_test_holdout.shape}")

# Shared 5-fold splitter; the fixed seed makes fold membership reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("Cross-validation: 5-fold StratifiedKFold")

In [None]:
# Registry of GBM variants to compare. Each entry holds:
#   estimator    - the unfitted classifier with its fixed settings
#   param_grid   - values explored by the grid search; keys use the
#                  'classifier__' prefix because the estimator is tuned as the
#                  'classifier' step of a Pipeline
#   use_pipeline - whether create_pipeline() should wrap it with the imputer
models_config = {}

models_config['GradientBoosting_sklearn'] = dict(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid={
        'classifier__learning_rate': [0.05, 0.1],
        'classifier__max_depth': [2, 3],
        'classifier__n_estimators': [200, 600],
    },
    use_pipeline=True,
)

models_config['XGBoost_hist'] = dict(
    estimator=xgb.XGBClassifier(
        tree_method='hist',
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='logloss',
        random_state=42,
        verbosity=0,
    ),
    param_grid={
        'classifier__n_estimators': [600, 1000],
        'classifier__learning_rate': [0.03, 0.1],
        'classifier__max_depth': [4, 6],
    },
    use_pipeline=True,
)

models_config['LightGBM'] = dict(
    estimator=lgb.LGBMClassifier(
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=-1,
        force_row_wise=True,
    ),
    param_grid={
        'classifier__n_estimators': [600, 1000],
        'classifier__learning_rate': [0.03, 0.1],
        'classifier__num_leaves': [31, 63],
    },
    use_pipeline=True,
)

models_config['CatBoost'] = dict(
    estimator=cb.CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        verbose=False,
        random_state=42,
    ),
    param_grid={
        'classifier__iterations': [800, 1200],
        'classifier__learning_rate': [0.03, 0.1],
        'classifier__depth': [6, 8],
    },
    use_pipeline=True,
)

# Echo the registered variants in insertion order.
print(f"Configured {len(models_config)} GBM variants:")
for name in models_config:
    print(f"  - {name}")

In [None]:
# Helper functions
def create_pipeline(estimator, use_pipeline=True):
    """Wrap an estimator in the shared preprocessing pipeline.

    GBMs typically don't need feature scaling, so the only preprocessing step
    is median imputation of missing values.

    Args:
        estimator: Scikit-learn compatible classifier.
        use_pipeline: When True (default), return a two-step ``Pipeline``
            (``'imputer'`` followed by ``'classifier'``). When False, return
            ``estimator`` itself, for models that handle missing values
            natively.

    Returns:
        A new, unfitted ``Pipeline`` whose ``'classifier'`` step is a clone of
        ``estimator``, or the ``estimator`` object unchanged when
        ``use_pipeline`` is False.
    """
    if not use_pipeline:
        # Pass-through: the caller keeps working with its own object.
        return estimator

    # Imported here so the helper is self-contained.
    from sklearn.base import clone

    # Clone so that set_params()/fit() on the returned pipeline never mutate
    # the caller's estimator (the same instance is handed in for every CV
    # fold and again when the final model is saved). The pipeline has to be
    # fitted before use anyway because the imputer is new, so discarding any
    # fitted state of `estimator` loses nothing.
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('classifier', clone(estimator)),
    ])

def get_probabilities(estimator, X):
    """Return the second column of ``estimator.predict_proba(X)``.

    For a binary classifier that column holds the positive-class scores.
    """
    class_probs = estimator.predict_proba(X)
    positive_column = 1
    return class_probs[:, positive_column]

def compute_metrics(y_true, y_prob, threshold=0.5):
    """Score predicted probabilities against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_prob: Predicted positive-class probabilities; must support
            element-wise ``>=`` and ``.astype`` (e.g. a NumPy array).
        threshold: Probabilities at or above this value count as positive
            predictions for the label-based metrics.

    Returns:
        dict with keys ``roc_auc``, ``average_precision``, ``f1``,
        ``accuracy``, ``precision``, ``recall`` and ``logloss`` (in that
        order).
    """
    hard_labels = (y_prob >= threshold).astype(int)

    # Ranking metrics work directly on the probabilities.
    scores = {
        'roc_auc': roc_auc_score(y_true, y_prob),
        'average_precision': average_precision_score(y_true, y_prob),
    }

    # Label-based metrics depend on the chosen threshold.
    label_scorers = (
        ('f1', f1_score),
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for key, scorer in label_scorers:
        scores[key] = scorer(y_true, hard_labels)

    scores['logloss'] = log_loss(y_true, y_prob)
    return scores

def find_best_threshold(y_true, y_prob):
    """Pick the ROC threshold that maximises Youden's J (TPR - FPR).

    Args:
        y_true: Ground-truth labels.
        y_prob: Predicted positive-class probabilities.

    Returns:
        The entry of ``roc_curve``'s threshold array at which ``tpr - fpr``
        is largest (the first such entry on ties, per ``np.argmax``).
    """
    # NOTE(review): depending on the scikit-learn version, the first threshold
    # returned by roc_curve is a sentinel above every score (np.inf in recent
    # releases); it would be returned here if no cut-off beats chance —
    # confirm callers tolerate that.
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_true, y_prob)
    youden_j = true_pos_rate - false_pos_rate
    return cutoffs[np.argmax(youden_j)]

print("Helper functions defined")

In [None]:
# Main evaluation loop
#
# For every configured model this cell:
#   1. tunes hyperparameters with GridSearchCV (ROC AUC, shared `cv` splitter),
#   2. re-fits the best configuration on each CV fold to collect per-fold
#      metrics, ROC/PR curves and a per-fold decision threshold,
#   3. scores the grid search's best estimator on the hold-out set using the
#      mean of the per-fold thresholds,
#   4. stores everything in `results[model_name]` and appends the per-fold
#      rows to `all_cv_results`.
results = {}
all_cv_results = []

for model_name, config in models_config.items():
    print(f"\n{'='*60}")
    print(f"Evaluating: {model_name}")
    print(f"{'='*60}")
    
    # Create directories for this model
    model_dir = f"../outputs/boosting_gbms/{model_name}"
    os.makedirs(f"{model_dir}/logs", exist_ok=True)
    os.makedirs(f"{model_dir}/models", exist_ok=True)
    os.makedirs(f"{model_dir}/figures", exist_ok=True)
    
    # Create pipeline
    pipeline = create_pipeline(config['estimator'], config['use_pipeline'])
    
    # Cross-validation with hyperparameter tuning
    # (n_jobs=-1 asks for all available cores; model selection is by ROC AUC)
    grid_search = GridSearchCV(
        pipeline, config['param_grid'], cv=cv, 
        scoring='roc_auc', n_jobs=-1, verbose=1
    )
    
    # Fit on train pool
    grid_search.fit(X_train_pool, y_train_pool)
    # best_estimator_ is what gets scored on the hold-out set further down.
    best_pipeline = grid_search.best_estimator_
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV AUC: {grid_search.best_score_:.4f}")
    
    # Collect CV results for detailed analysis
    cv_metrics = []      # one metrics dict per fold
    cv_roc_curves = []   # (fpr, tpr) per fold
    cv_pr_curves = []    # (precision, recall) per fold
    cv_thresholds = []   # per-fold Youden-J threshold
    
    # Same splitter object as the grid search; it was built with a fixed
    # random_state, so the intent is to revisit the same folds.
    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train_pool, y_train_pool)):
        X_fold_train = X_train_pool.iloc[train_idx]
        X_fold_val = X_train_pool.iloc[val_idx]
        y_fold_train = y_train_pool.iloc[train_idx]
        y_fold_val = y_train_pool.iloc[val_idx]
        
        # Fit best pipeline on this fold
        fold_pipeline = create_pipeline(config['estimator'], config['use_pipeline'])
        fold_pipeline.set_params(**grid_search.best_params_)
        fold_pipeline.fit(X_fold_train, y_fold_train)
        
        # Predict on validation fold
        y_val_prob = get_probabilities(fold_pipeline, X_fold_val)
        
        # Find best threshold for this fold
        best_threshold = find_best_threshold(y_fold_val, y_val_prob)
        cv_thresholds.append(best_threshold)
        
        # Compute metrics
        # NOTE: the threshold was chosen on this very validation fold, so the
        # threshold-dependent fold metrics (F1, accuracy, precision, recall)
        # are optimistic; ROC AUC, AP and log loss do not depend on it.
        fold_metrics = compute_metrics(y_fold_val, y_val_prob, best_threshold)
        fold_metrics['fold'] = fold_idx + 1
        fold_metrics['threshold'] = best_threshold
        cv_metrics.append(fold_metrics)
        
        # Store curves for plotting
        fpr, tpr, _ = roc_curve(y_fold_val, y_val_prob)
        precision, recall, _ = precision_recall_curve(y_fold_val, y_val_prob)
        cv_roc_curves.append((fpr, tpr))
        cv_pr_curves.append((precision, recall))
        
        print(f"  Fold {fold_idx + 1}: AUC={fold_metrics['roc_auc']:.4f}, "
              f"AP={fold_metrics['average_precision']:.4f}, "
              f"F1={fold_metrics['f1']:.4f}")
    
    # Calculate mean threshold from CV
    mean_threshold = np.mean(cv_thresholds)
    
    # Evaluate on test holdout using mean threshold
    # (the hold-out labels play no part in choosing the threshold)
    y_test_prob = get_probabilities(best_pipeline, X_test_holdout)
    test_metrics = compute_metrics(y_test_holdout, y_test_prob, mean_threshold)
    test_metrics['chosen_threshold'] = mean_threshold
    # Stored as nested lists so the dict can be written out as JSON later.
    test_metrics['confusion_matrix'] = confusion_matrix(
        y_test_holdout, (y_test_prob >= mean_threshold).astype(int)
    ).tolist()
    
    print(f"\nTest Results (threshold={mean_threshold:.4f}):")
    print(f"  AUC: {test_metrics['roc_auc']:.4f}")
    print(f"  AP: {test_metrics['average_precision']:.4f}")
    print(f"  F1: {test_metrics['f1']:.4f}")
    
    # Store results
    results[model_name] = {
        'cv_metrics': cv_metrics,
        'test_metrics': test_metrics,
        'cv_roc_curves': cv_roc_curves,
        'cv_pr_curves': cv_pr_curves,
        'best_params': grid_search.best_params_,
        'model_dir': model_dir
    }
    
    # Add to overall results
    # (one flat row per fold, tagged with the model name and category)
    all_cv_results.extend([{
        'model': model_name,
        'category': 'boosting_gbms',
        **metrics
    } for metrics in cv_metrics])

print(f"\n{'='*60}")
print("All GBM models evaluated!")

In [None]:
# Save artifacts for each model: CV metrics (CSV), hold-out metrics (JSON),
# and the final fitted model (native format where available, plus pickle).

# Metrics summarised with a mean / std row at the bottom of cv_metrics.csv.
metric_columns = ['roc_auc', 'average_precision', 'f1', 'accuracy',
                  'precision', 'recall', 'logloss']


def _json_default(value):
    """Convert NumPy scalars/arrays into plain Python objects for json.dump.

    Some estimators may hand back NumPy scalar types (for example float32
    scores) that the json module refuses to serialise.

    Raises:
        TypeError: for any other unsupported type, mirroring json's own error.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


for model_name, model_results in results.items():
    model_dir = model_results['model_dir']

    # Save CV metrics
    cv_df = pd.DataFrame(model_results['cv_metrics'])

    # Add summary statistics: exactly ONE 'mean' row and ONE 'std' row that
    # cover every metric and the threshold. (Building a separate row per
    # metric would yield 14 mostly-empty rows.)
    stat_columns = metric_columns + ['threshold']
    summary_stats = [
        {'fold': 'mean', **cv_df[stat_columns].mean().to_dict()},
        {'fold': 'std', **cv_df[stat_columns].std().to_dict()},
    ]

    cv_summary_df = pd.concat([cv_df, pd.DataFrame(summary_stats)], ignore_index=True)
    cv_summary_df.to_csv(f"{model_dir}/logs/cv_metrics.csv", index=False)

    # Save test metrics
    with open(f"{model_dir}/logs/test_metrics.json", 'w') as f:
        json.dump(model_results['test_metrics'], f, indent=2, default=_json_default)

    # Fit the final model once with the hyperparameters already selected by
    # the evaluation loop. Re-running the whole grid search here would repeat
    # all of the tuning work and could, in principle, select parameters that
    # differ from the ones reported in `best_params`.
    model_config = models_config[model_name]
    final_pipeline = create_pipeline(model_config['estimator'], model_config['use_pipeline'])
    final_pipeline.set_params(**model_results['best_params'])
    final_pipeline.fit(X_train_pool, y_train_pool)

    # Save model (handle different formats). Unwrap the classifier when the
    # model is a Pipeline; native formats store the bare booster only.
    if hasattr(final_pipeline, 'named_steps'):
        classifier = final_pipeline.named_steps['classifier']
    else:
        classifier = final_pipeline

    if 'XGBoost' in model_name:
        # Save XGBoost in native format
        classifier.save_model(f"{model_dir}/models/final_model.json")
    elif 'LightGBM' in model_name:
        # Save LightGBM in native format
        classifier.booster_.save_model(f"{model_dir}/models/final_model.txt")
    elif 'CatBoost' in model_name:
        # Save CatBoost in native format
        classifier.save_model(f"{model_dir}/models/final_model.cbm")

    # Also save as pickle for consistency (this keeps the imputer step too)
    with open(f"{model_dir}/models/final_model.pkl", 'wb') as f:
        pickle.dump(final_pipeline, f)

    print(f"Artifacts saved for {model_name}")

print("\nAll artifacts saved!")

In [None]:
# Generate figures for each model with GBM-specific plots
#
# Per model this writes, under <model_dir>/figures/:
#   roc_cv.png             - mean cross-validated ROC curve with a +/-1 std band
#   feature_importance.png - top-20 feature importances (or a placeholder)
#   shap_summary.png       - SHAP summary plot (or a placeholder on failure)
for model_name, model_results in results.items():
    model_dir = model_results['model_dir']

    print(f"Generating figures for {model_name}...")

    # 1. ROC Curve with CV mean and std
    plt.figure(figsize=(8, 6))

    # Resample every fold's ROC curve onto a common FPR grid so they can be
    # averaged point-wise.
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    for fpr, tpr in model_results['cv_roc_curves']:
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        # A ROC curve usually has several points at fpr == 0; pin the start
        # to (0, 0) so the averaged curve begins at the origin instead of
        # part-way up the initial vertical segment.
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)

    mean_tpr = np.mean(tprs, axis=0)
    std_tpr = np.std(tprs, axis=0)
    fold_aucs = [fold_metrics['roc_auc'] for fold_metrics in model_results['cv_metrics']]

    plt.plot(mean_fpr, mean_tpr, 'b-',
             label=f'Mean ROC (AUC = {np.mean(fold_aucs):.3f} ± {np.std(fold_aucs):.3f})')
    # Clip the band so it never leaves the valid [0, 1] range of a TPR.
    plt.fill_between(mean_fpr,
                     np.maximum(mean_tpr - std_tpr, 0),
                     np.minimum(mean_tpr + std_tpr, 1),
                     alpha=0.2, color='blue')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/roc_cv.png", dpi=200, bbox_inches='tight')
    plt.close()

    # Load final model for feature importance and SHAP.
    # The pickle is the one written by the artifact-saving cell; unpickling
    # executes arbitrary code, so only ever load files this notebook produced.
    with open(f"{model_dir}/models/final_model.pkl", 'rb') as f:
        final_model = pickle.load(f)

    # Get the actual estimator (handle pipeline)
    if hasattr(final_model, 'named_steps'):
        estimator = final_model.named_steps['classifier']
        # Transform data through pipeline preprocessing
        X_transformed = final_model.named_steps['imputer'].transform(X_train_pool)
    else:
        estimator = final_model
        X_transformed = X_train_pool.fillna(X_train_pool.median())

    # 2. Feature importance
    plt.figure(figsize=(10, 6))

    if hasattr(estimator, 'feature_importances_'):
        importances = estimator.feature_importances_
        indices = np.argsort(importances)[-20:]  # Top 20 (ascending, so the largest is drawn on top)

        plt.barh(range(len(indices)), importances[indices], alpha=0.7)
        plt.yticks(range(len(indices)), [feature_cols[i] for i in indices])
        plt.xlabel('Feature Importance')
        plt.title(f'Top 20 Feature Importances - {model_name}')
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'Feature importance not available',
                 ha='center', va='center', transform=plt.gca().transAxes)
        plt.title(f'Feature Importance - {model_name}')

    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/feature_importance.png", dpi=200, bbox_inches='tight')
    plt.close()

    # 3. SHAP summary plot (for tree-based models)
    # Deliberately best-effort: any failure is reported and replaced by a
    # placeholder image so the remaining models still get their figures.
    try:
        print(f"  Generating SHAP plot for {model_name}...")

        # Sample data for SHAP (computational efficiency)
        sample_size = min(1000, len(X_transformed))
        sample_indices = np.random.choice(len(X_transformed), sample_size, replace=False)
        X_sample = X_transformed[sample_indices] if isinstance(X_transformed, np.ndarray) else X_transformed.iloc[sample_indices]

        # Create SHAP explainer
        if 'XGBoost' in model_name or 'LightGBM' in model_name or 'CatBoost' in model_name or 'GradientBoosting' in model_name:
            explainer = shap.TreeExplainer(estimator)
            shap_values = explainer.shap_values(X_sample)

            # Handle different SHAP value formats
            if isinstance(shap_values, list):
                shap_values = shap_values[1]  # For binary classification, take positive class

            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values, X_sample, feature_names=feature_cols, show=False)
            plt.title(f'SHAP Summary - {model_name}')
            plt.tight_layout()
            plt.savefig(f"{model_dir}/figures/shap_summary.png", dpi=200, bbox_inches='tight')
            plt.close()

    except Exception as e:
        print(f"    SHAP plot failed for {model_name}: {str(e)}")
        # A figure may have been opened before the failure; close it so
        # half-drawn figures do not accumulate across models.
        plt.close('all')
        # Create placeholder
        plt.figure(figsize=(8, 6))
        plt.text(0.5, 0.5, f'SHAP plot failed:\n{str(e)}',
                 ha='center', va='center', transform=plt.gca().transAxes)
        plt.title(f'SHAP Summary - {model_name}')
        plt.tight_layout()
        plt.savefig(f"{model_dir}/figures/shap_summary.png", dpi=200, bbox_inches='tight')
        plt.close()

    # Generate other standard plots (confusion matrix, calibration, etc.)
    # ... (similar to previous notebooks)

print("All figures generated!")

In [None]:
# Build one summary row per model, rank the models by hold-out AUC, persist
# the table, and print a short report.
summary_data = []

for model_name, model_results in results.items():
    test_metrics = model_results['test_metrics']
    cv_metrics = model_results['cv_metrics']
    fold_aucs = [fold['roc_auc'] for fold in cv_metrics]

    summary_data.append(dict(
        model=model_name,
        test_auc=test_metrics['roc_auc'],
        test_ap=test_metrics['average_precision'],
        test_f1=test_metrics['f1'],
        test_accuracy=test_metrics['accuracy'],
        cv_auc_mean=np.mean(fold_aucs),
        cv_auc_std=np.std(fold_aucs),
        best_params=str(model_results['best_params']),
        artifacts_path=model_results['model_dir'],
    ))

# Best hold-out AUC first
summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('test_auc', ascending=False)

# Persist the ranking next to the per-model artifact folders
os.makedirs('../outputs/boosting_gbms', exist_ok=True)
summary_df.to_csv('../outputs/boosting_gbms/summary.csv', index=False)

# Console report
heavy_rule = "=" * 80
light_rule = "-" * 60
pad = f"{'':30s}"  # blank model column for the continuation lines

print("\n" + heavy_rule)
print("BOOSTING GBMs CATEGORY - FINAL RESULTS")
print(heavy_rule)
print("\nRanked by Test AUC:")
print(light_rule)

for _, row in summary_df.iterrows():
    print(f"{row['model']:30s} | AUC: {row['test_auc']:.4f} | AP: {row['test_ap']:.4f} | F1: {row['test_f1']:.4f}")
    print(f"{pad} | CV AUC: {row['cv_auc_mean']:.4f}±{row['cv_auc_std']:.4f}")
    print(f"{pad} | Artifacts: {row['artifacts_path']}")
    print(light_rule)

# After sorting, the first row is the winner.
best_row = summary_df.iloc[0]
print(f"\nBest Model: {best_row['model']}")
print(f"Best Test AUC: {best_row['test_auc']:.4f}")
print("\nAll results saved to: ../outputs/boosting_gbms/")
print("Summary saved to: ../outputs/boosting_gbms/summary.csv")