# Boosting GBMs Category - Kaggle Playground Series S5E8

**Category**: Boosting Gradient Boosting Machines  
**Sub-models**: GradientBoostingClassifier, XGBoost, LightGBM, CatBoost  
**Split Strategy**: 70/30 stratified split  
**Cross-Validation**: 5-fold StratifiedKFold  
**Random Seed**: 42  
**Artifact Paths**: outputs/boosting_gbms/  

This notebook compares different gradient boosting variants using the same data preprocessing and evaluation protocol.

In [3]:
# Safe imports and availability checks (no internet installs)
import os, json, random, pickle, warnings, sys, platform, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, accuracy_score, precision_score,
    recall_score, roc_curve, precision_recall_curve, log_loss, auc
)

# Optional libs
HAS_XGB = HAS_LGBM = HAS_CATBOOST = False
try:
    import xgboost as xgb
    HAS_XGB = True
except Exception as e:
    print("[WARN] xgboost not available:", e)

try:
    from lightgbm import LGBMClassifier
    HAS_LGBM = True
except Exception as e:
    print("[WARN] lightgbm not available:", e)

try:
    from catboost import CatBoostClassifier
    HAS_CATBOOST = True
except Exception as e:
    print("[WARN] catboost not available:", e)

try:
    import shap
    HAS_SHAP = True
except Exception as e:
    print("[WARN] shap not available:", e)
    HAS_SHAP = False

print(f"[ENV] Python {platform.python_version()}  Numpy {np.__version__}  Pandas {pd.__version__}")
print(f"[ENV] XGBoost={HAS_XGB}  LightGBM={HAS_LGBM}  CatBoost={HAS_CATBOOST}  SHAP={HAS_SHAP}")

# Reproducibility
np.random.seed(42)
random.seed(42)


[WARN] xgboost not available: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/opt/homebrew/Caskroom/miniconda/base/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <8E129FE8-EF1C-38EA-A9CF-202782564052> /opt/homebrew/Caskroom/miniconda/base/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/home

In [4]:
# Load and prepare data (with fallback)
print("Loading data...")
from sklearn.datasets import make_classification

train_path = '../playground-series-s5e8/train.csv'
test_path = '../playground-series-s5e8/test.csv'

if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
    feature_cols = [c for c in train_df.columns if c not in ['id', 'target']]
    X = train_df[feature_cols]
    y = train_df['target']
    print(f"Loaded Kaggle dataset. Features={len(feature_cols)} Rows={len(train_df)}")
else:
    print("[WARN] Kaggle files not found. Using a synthetic binary classification dataset.")
    X, y = make_classification(n_samples=3000, n_features=20, n_informative=10, n_redundant=4,
                               n_classes=2, weights=[0.6, 0.4], random_state=42)
    X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

X_train_pool, X_test_holdout, y_train_pool, y_test_holdout = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
print(f"Train pool: {X_train_pool.shape}, Test holdout: {X_test_holdout.shape}")


Loading data...


KeyError: 'target'

In [None]:
# Define GBM models and their hyperparameter grids (fast version)
models_config = {}

# Always include sklearn GradientBoosting
models_config['GradientBoosting_sklearn'] = {
    'estimator': GradientBoostingClassifier(random_state=42),
    'param_grid': {
        'classifier__learning_rate': [0.1],
        'classifier__max_depth': [3],
        'classifier__n_estimators': [100]
    },
    'use_pipeline': True
}

if HAS_XGB:
    models_config['XGBoost_hist'] = {
        'estimator': xgb.XGBClassifier(
            tree_method='hist',
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric='logloss',
            random_state=42,
            n_estimators=150,
            learning_rate=0.1,
            max_depth=4
        ),
        'param_grid': {
            'classifier__n_estimators': [150],
            'classifier__learning_rate': [0.1],
            'classifier__max_depth': [4]
        },
        'use_pipeline': True
    }

if HAS_LGBM:
    models_config['LightGBM'] = {
        'estimator': LGBMClassifier(
            objective='binary', n_estimators=150, learning_rate=0.1, max_depth=-1, random_state=42
        ),
        'param_grid': {
            'classifier__n_estimators': [150],
            'classifier__learning_rate': [0.1]
        },
        'use_pipeline': True
    }

if HAS_CATBOOST:
    models_config['CatBoost'] = {
        'estimator': CatBoostClassifier(
            verbose=False, random_state=42, iterations=200, learning_rate=0.1, depth=6, loss_function='Logloss'
        ),
        'param_grid': {
            'classifier__iterations': [200],
            'classifier__learning_rate': [0.1],
            'classifier__depth': [6]
        },
        'use_pipeline': True
    }

print(f"Configured {len(models_config)} GBM variants:")
for name in models_config.keys():
    print(f"  - {name}")


Configured 4 GBM variants:
  - GradientBoosting_sklearn
  - XGBoost_hist
  - LightGBM
  - CatBoost


In [None]:
# Helper functions
def create_pipeline(estimator, use_pipeline=True):
    """Create preprocessing pipeline - GBMs typically don't need scaling"""
    if use_pipeline:
        return Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('classifier', estimator)
        ])
    else:
        # For some models that handle missing values natively
        return estimator

def get_probabilities(estimator, X):
    """Get probabilities from estimator"""
    return estimator.predict_proba(X)[:, 1]

def compute_metrics(y_true, y_prob, threshold=0.5):
    """Compute all evaluation metrics"""
    y_pred = (y_prob >= threshold).astype(int)
    
    return {
        'roc_auc': roc_auc_score(y_true, y_prob),
        'average_precision': average_precision_score(y_true, y_prob),
        'f1': f1_score(y_true, y_pred),
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'logloss': log_loss(y_true, y_prob)
    }

def find_best_threshold(y_true, y_prob):
    """Find best threshold using Youden's J statistic"""
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    j_scores = tpr - fpr
    best_idx = np.argmax(j_scores)
    return thresholds[best_idx]

print("Helper functions defined")

Helper functions defined


In [None]:
# Main evaluation loop
results = {}
all_cv_results = []

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for model_name, cfg in models_config.items():
    print(f"\n{'='*60}\n[START] {model_name}")
    estimator = cfg['estimator']
    pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('classifier', estimator)]) if cfg['use_pipeline'] else estimator
    
    print("[INFO] Parameter grid:", cfg['param_grid'])
    grid_search = GridSearchCV(
        pipeline,
        param_grid=cfg['param_grid'],
        scoring='roc_auc',
        cv=cv,
        n_jobs=1,
        verbose=2,
        refit=True,
        return_train_score=False
    )
    grid_search.fit(X_train_pool, y_train_pool)
    print("[DONE] Grid search complete.")
    print("[BEST] params:", grid_search.best_params_, "score:", round(grid_search.best_score_, 5))
    
    # Evaluate on holdout
    y_prob = grid_search.predict_proba(X_test_holdout)[:, 1]
    best_thresh = None
    try:
        best_thresh = find_best_threshold(y_test_holdout, y_prob)
    except Exception as e:
        print("[WARN] Threshold search failed:", e)
        best_thresh = 0.5
    y_pred = (y_prob >= best_thresh).astype(int)
    
    metrics = {
        'auc': roc_auc_score(y_test_holdout, y_prob),
        'ap': average_precision_score(y_test_holdout, y_prob),
        'f1': f1_score(y_test_holdout, y_pred),
        'accuracy': accuracy_score(y_test_holdout, y_pred),
        'precision': precision_score(y_test_holdout, y_pred),
        'recall': recall_score(y_test_holdout, y_pred),
        'logloss': log_loss(y_test_holdout, y_prob)
    }
    print("[METRICS]", json.dumps(metrics, indent=2))
    
    # Store curves for plotting
    fpr, tpr, _ = roc_curve(y_test_holdout, y_prob)
    prec, rec, _ = precision_recall_curve(y_test_holdout, y_prob)
    cv_roc_curves = []
    for train_idx, test_idx in cv.split(X_train_pool, y_train_pool):
        # Quick CV curve (no refit) for visualization consistency
        X_tr, X_te = X_train_pool.iloc[train_idx], X_train_pool.iloc[test_idx]
        y_tr, y_te = y_train_pool.iloc[train_idx], y_train_pool.iloc[test_idx]
        est = grid_search.best_estimator_
        prob_cv = est.predict_proba(X_te)[:, 1]
        fpr_cv, tpr_cv, _ = roc_curve(y_te, prob_cv)
        cv_roc_curves.append((fpr_cv, tpr_cv))
    
    model_dir = f"../outputs/boosting_gbms/{model_name.replace(' ', '_')}"
    os.makedirs(f"{model_dir}/figures", exist_ok=True)
    os.makedirs(f"{model_dir}/models", exist_ok=True)
    
    results[model_name] = {
        'best_params': grid_search.best_params_,
        'best_score_cv': grid_search.best_score_,
        'holdout_metrics': metrics,
        'fpr': fpr, 'tpr': tpr, 'prec': prec, 'rec': rec,
        'cv_roc_curves': cv_roc_curves,
        'model_dir': model_dir,
        'best_estimator': grid_search.best_estimator_
    }

print(f"\n{'='*60}")
print("All GBM models evaluated!")



[START] GradientBoosting_sklearn
[INFO] Parameter grid: {'classifier__learning_rate': [0.1], 'classifier__max_depth': [3], 'classifier__n_estimators': [100]}
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[CV] END classifier__learning_rate=0.1, classifier__max_depth=3, classifier__n_estimators=100; total time=   1.2s


[CV] END classifier__learning_rate=0.1, classifier__max_depth=3, classifier__n_estimators=100; total time=   1.1s


[CV] END classifier__learning_rate=0.1, classifier__max_depth=3, classifier__n_estimators=100; total time=   1.2s


[DONE] Grid search complete.
[BEST] params: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 100} score: 0.96365
[METRICS] {
  "auc": 0.9770893290446363,
  "ap": 0.9720942399467696,
  "f1": 0.9313186813186813,
  "accuracy": 0.9444444444444444,
  "precision": 0.9287671232876712,
  "recall": 0.9338842975206612,
  "logloss": 0.20294365616273274
}


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
# Save artifacts for each model
for model_name, model_results in results.items():
    model_dir = model_results['model_dir']
    
    # Save CV metrics
    cv_df = pd.DataFrame(model_results['cv_metrics'])
    
    # Add summary statistics
    summary_stats = []
    for metric in ['roc_auc', 'average_precision', 'f1', 'accuracy', 'precision', 'recall', 'logloss']:
        summary_stats.append({
            'fold': 'mean',
            metric: cv_df[metric].mean(),
            'threshold': cv_df['threshold'].mean()
        })
        summary_stats.append({
            'fold': 'std',
            metric: cv_df[metric].std(),
            'threshold': cv_df['threshold'].std()
        })
    
    cv_summary_df = pd.concat([cv_df, pd.DataFrame(summary_stats)], ignore_index=True)
    cv_summary_df.to_csv(f"{model_dir}/logs/cv_metrics.csv", index=False)
    
    # Save test metrics
    with open(f"{model_dir}/logs/test_metrics.json", 'w') as f:
        json.dump(model_results['test_metrics'], f, indent=2)
    
    # Save model (handle different formats)
    final_pipeline = create_pipeline(models_config[model_name]['estimator'], models_config[model_name]['use_pipeline'])
    grid_search_final = GridSearchCV(
        final_pipeline, models_config[model_name]['param_grid'], 
        cv=cv, scoring='roc_auc', n_jobs=-1
    )
    grid_search_final.fit(X_train_pool, y_train_pool)
    
    if 'XGBoost' in model_name:
        # Save XGBoost in native format
        if hasattr(grid_search_final.best_estimator_, 'named_steps'):
            xgb_model = grid_search_final.best_estimator_.named_steps['classifier']
        else:
            xgb_model = grid_search_final.best_estimator_
        xgb_model.save_model(f"{model_dir}/models/final_model.json")
    elif 'LightGBM' in model_name:
        # Save LightGBM in native format
        if hasattr(grid_search_final.best_estimator_, 'named_steps'):
            lgb_model = grid_search_final.best_estimator_.named_steps['classifier']
        else:
            lgb_model = grid_search_final.best_estimator_
        lgb_model.booster_.save_model(f"{model_dir}/models/final_model.txt")
    elif 'CatBoost' in model_name:
        # Save CatBoost in native format
        if hasattr(grid_search_final.best_estimator_, 'named_steps'):
            cb_model = grid_search_final.best_estimator_.named_steps['classifier']
        else:
            cb_model = grid_search_final.best_estimator_
        cb_model.save_model(f"{model_dir}/models/final_model.cbm")
    
    # Also save as pickle for consistency
    with open(f"{model_dir}/models/final_model.pkl", 'wb') as f:
        pickle.dump(grid_search_final.best_estimator_, f)
    
    print(f"Artifacts saved for {model_name}")

print("\nAll artifacts saved!")


All artifacts saved!


In [None]:
HAS_SHAP = globals().get('HAS_SHAP', False)
print(f'[INFO] SHAP available: {HAS_SHAP}')
# Generate figures for each model with GBM-specific plots
for model_name, model_results in results.items():
    model_dir = model_results['model_dir']
    
    print(f"Generating figures for {model_name}...")
    
    # 1. ROC Curve with CV mean and std
    plt.figure(figsize=(8, 6))
    
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    for fpr, tpr in model_results['cv_roc_curves']:
        tprs.append(np.interp(mean_fpr, fpr, tpr))
    
    mean_tpr = np.mean(tprs, axis=0)
    std_tpr = np.std(tprs, axis=0)
    
    plt.plot(mean_fpr, mean_tpr, 'b-', 
             label=f'Mean ROC (AUC = {np.mean([cv["roc_auc"] for cv in model_results["cv_metrics"]]):.3f} ± {np.std([cv["roc_auc"] for cv in model_results["cv_metrics"]]):.3f})')
    plt.fill_between(mean_fpr, mean_tpr - std_tpr, mean_tpr + std_tpr, alpha=0.2, color='blue')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/roc_cv.png", dpi=200, bbox_inches='tight')
    plt.close()
    
    # Load final model for feature importance and SHAP
    with open(f"{model_dir}/models/final_model.pkl", 'rb') as f:
        final_model = pickle.load(f)
    
    # Get the actual estimator (handle pipeline)
    if hasattr(final_model, 'named_steps'):
        estimator = final_model.named_steps['classifier']
        # Transform data through pipeline preprocessing
        X_transformed = final_model.named_steps['imputer'].transform(X_train_pool)
    else:
        estimator = final_model
        X_transformed = X_train_pool.fillna(X_train_pool.median())
    
    # 2. Feature importance
    plt.figure(figsize=(10, 6))
    
    if hasattr(estimator, 'feature_importances_'):
        importances = estimator.feature_importances_
        indices = np.argsort(importances)[-20:]  # Top 20
        
        plt.barh(range(len(indices)), importances[indices], alpha=0.7)
        plt.yticks(range(len(indices)), [feature_cols[i] for i in indices])
        plt.xlabel('Feature Importance')
        plt.title(f'Top 20 Feature Importances - {model_name}')
        plt.grid(True, alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'Feature importance not available', 
                ha='center', va='center', transform=plt.gca().transAxes)
        plt.title(f'Feature Importance - {model_name}')
    
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/feature_importance.png", dpi=200, bbox_inches='tight')
    plt.close()
    
    # 3. SHAP summary plot (for tree-based models)
    try:
        print(f"  Generating SHAP plot for {model_name}...")
        
        # Sample data for SHAP (computational efficiency)
        sample_size = min(1000, len(X_transformed))
        sample_indices = np.random.choice(len(X_transformed), sample_size, replace=False)
        X_sample = X_transformed[sample_indices] if isinstance(X_transformed, np.ndarray) else X_transformed.iloc[sample_indices]
        
        # Create SHAP explainer
        if 'XGBoost' in model_name or 'LightGBM' in model_name or 'CatBoost' in model_name or 'GradientBoosting' in model_name:
            explainer = # SHAP usage disabled when not available
shap.TreeExplainer(estimator)
            shap_values = explainer.shap_values(X_sample)
            
            # Handle different SHAP value formats
            if isinstance(shap_values, list):
                shap_values = shap_values[1]  # For binary classification, take positive class
            
            plt.figure(figsize=(10, 6))
            # SHAP usage disabled when not available
shap.summary_plot(shap_values, X_sample, feature_names=feature_cols, show=False)
            plt.title(f'SHAP Summary - {model_name}')
            plt.tight_layout()
            plt.savefig(f"{model_dir}/figures/shap_summary.png", dpi=200, bbox_inches='tight')
            plt.close()
            
    except Exception as e:
        print(f"    SHAP plot failed for {model_name}: {str(e)}")
        # Create placeholder
        plt.figure(figsize=(8, 6))
        plt.text(0.5, 0.5, f'SHAP plot failed:\n{str(e)}', 
                ha='center', va='center', transform=plt.gca().transAxes)
        plt.title(f'SHAP Summary - {model_name}')
        plt.tight_layout()
        plt.savefig(f"{model_dir}/figures/shap_summary.png", dpi=200, bbox_inches='tight')
        plt.close()
    
    # Generate other standard plots (confusion matrix, calibration, etc.)
    # ... (similar to previous notebooks)

print("All figures generated!")

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 95)

In [None]:
# Create summary table and final results
summary_data = []

for model_name, model_results in results.items():
    test_metrics = model_results['test_metrics']
    cv_metrics = model_results['cv_metrics']
    
    summary_data.append({
        'model': model_name,
        'test_auc': test_metrics['roc_auc'],
        'test_ap': test_metrics['average_precision'],
        'test_f1': test_metrics['f1'],
        'test_accuracy': test_metrics['accuracy'],
        'cv_auc_mean': np.mean([cv['roc_auc'] for cv in cv_metrics]),
        'cv_auc_std': np.std([cv['roc_auc'] for cv in cv_metrics]),
        'best_params': str(model_results['best_params']),
        'artifacts_path': model_results['model_dir']
    })

# Sort by test AUC
summary_df = pd.DataFrame(summary_data).sort_values('test_auc', ascending=False)

# Save summary
os.makedirs('../outputs/boosting_gbms', exist_ok=True)
summary_df.to_csv('../outputs/boosting_gbms/summary.csv', index=False)

# Display results
print("\n" + "="*80)
print("BOOSTING GBMs CATEGORY - FINAL RESULTS")
print("="*80)
print("\nRanked by Test AUC:")
print("-" * 60)

for idx, row in summary_df.iterrows():
    print(f"{row['model']:30s} | AUC: {row['test_auc']:.4f} | AP: {row['test_ap']:.4f} | F1: {row['test_f1']:.4f}")
    print(f"{'':30s} | CV AUC: {row['cv_auc_mean']:.4f}±{row['cv_auc_std']:.4f}")
    print(f"{'':30s} | Artifacts: {row['artifacts_path']}")
    print("-" * 60)

print(f"\nBest Model: {summary_df.iloc[0]['model']}")
print(f"Best Test AUC: {summary_df.iloc[0]['test_auc']:.4f}")
print(f"\nAll results saved to: ../outputs/boosting_gbms/")
print(f"Summary saved to: ../outputs/boosting_gbms/summary.csv")

KeyError: 'test_auc'