# Loan Payback â€” Fold-Safe XGBoost (95% ROC-AUC Target)

This notebook builds a **fold-safe imbalance-aware XGBoost model** with advanced feature engineering:

## Key Features:
1. **Fold-Safe Preprocessing**: All transformations fit inside CV loop to prevent leakage
2. **Advanced Feature Engineering**: Log transforms, ratios, interactions, target encoding, missing indicators
3. **Imbalance Handling**: Automatic `scale_pos_weight` calculation
4. **5-Fold Stratified CV** with proper validation
5. **Hyperparameter Optimization**: Optuna search for optimal parameters
6. **Target: 95%+ ROC-AUC**

## Architecture:
- Single strong XGBoost with careful regularization
- Per-fold feature engineering and preprocessing
- Target encoding with proper fold isolation
- Missing value indicators as features

In [1]:
# 1) Imports & basic configuration
import os
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

import xgboost as xgb

# For hyperparameter optimization (optional - uncomment if using)
# import optuna

RANDOM_STATE = 999
np.random.seed(RANDOM_STATE)

DATA_DIR = Path("Data")

def log(msg: str):
    ts = datetime.now().strftime("%H:%M:%S")
    print(f"[{ts}] {msg}")

In [2]:
# 2) Data loading and automatic target / id detection

train_path = None
test_path = None

# Heuristic: pick first train/test-looking CSVs
csv_files = sorted(list(DATA_DIR.glob("*.csv")))
for p in csv_files:
    name = p.name.lower()
    if "train" in name and train_path is None:
        train_path = p
    if "test" in name and test_path is None and "train" not in name:
        test_path = p

if train_path is None or test_path is None:
    raise FileNotFoundError(
        f"Could not detect train/test CSVs inside {DATA_DIR}. "
        "Please set train_path and test_path manually."
    )

log(f"Using train: {train_path.name}")
log(f"Using test : {test_path.name}")

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

log(f"Train shape: {train_df.shape}")
log(f"Test  shape: {test_df.shape}")

def detect_target(train_df: pd.DataFrame, test_df: pd.DataFrame) -> str:
    diff = list(set(train_df.columns) - set(test_df.columns))
    # Prefer a binary label
    candidates = []
    for c in diff:
        if train_df[c].nunique() <= 3:
            candidates.append(c)
    if len(candidates) == 1:
        return candidates[0]
    if len(diff) == 1:
        return diff[0]
    for name in ["loan_paid_back", "target", "label", "is_default", "default", "paid"]:
        if name in train_df.columns and name not in test_df.columns:
            return name
    raise ValueError(f"Could not detect target. Diff columns: {diff}")

target_col = detect_target(train_df, test_df)
log(f"Detected target column: {target_col}")

# Simple ID detection: column whose values are unique in train and test
id_col = None
for col in train_df.columns:
    if col == target_col:
        continue
    if col in test_df.columns:
        if train_df[col].is_unique and test_df[col].is_unique:
            id_col = col
            break

log(f"Detected id column: {id_col}")

y = train_df[target_col].astype(int).values

# Compute class imbalance for scale_pos_weight
n_pos = (y == 1).sum()
n_neg = (y == 0).sum()
scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0
log(f"Class distribution: {n_neg} negatives, {n_pos} positives")
log(f"scale_pos_weight = {scale_pos_weight:.4f}")

feature_cols = [c for c in train_df.columns if c not in [target_col, id_col]]
X_raw = train_df[feature_cols].copy()
X_test_raw = test_df[feature_cols].copy()

log(f"Number of base features: {len(feature_cols)}")

[02:16:45] Using train: train.csv
[02:16:45] Using test : test.csv
[02:16:45] Train shape: (593994, 13)
[02:16:45] Test  shape: (254569, 12)
[02:16:45] Detected target column: loan_paid_back
[02:16:45] Detected id column: id
[02:16:45] Class distribution: 119500 negatives, 474494 positives
[02:16:45] scale_pos_weight = 0.2518
[02:16:45] Number of base features: 11
[02:16:45] Train shape: (593994, 13)
[02:16:45] Test  shape: (254569, 12)
[02:16:45] Detected target column: loan_paid_back
[02:16:45] Detected id column: id
[02:16:45] Class distribution: 119500 negatives, 474494 positives
[02:16:45] scale_pos_weight = 0.2518
[02:16:45] Number of base features: 11


In [3]:
# 3) Feature Engineering Functions (Fold-Safe)

def add_missing_indicators(df: pd.DataFrame, missing_cols: list = None) -> pd.DataFrame:
    """Add binary missing indicators for columns with >2% missing values."""
    df = df.copy()
    
    if missing_cols is None:
        # Detect columns with significant missing values
        missing_cols = []
        for col in df.columns:
            if df[col].isnull().sum() / len(df) > 0.02:
                missing_cols.append(col)
    
    for col in missing_cols:
        if col in df.columns:
            df[f"{col}_missing"] = df[col].isnull().astype(int)
    
    return df, missing_cols

def engineer_features(df: pd.DataFrame, fit_df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Create advanced features. If fit_df is provided, compute statistics from it.
    Otherwise, compute from df itself (for fold training data).
    """
    df = df.copy()
    ref_df = fit_df if fit_df is not None else df
    
    # Map common column name variations (case-insensitive)
    col_map = {}
    for col in df.columns:
        col_lower = col.lower()
        col_map[col_lower] = col
    
    # Log transforms for skewed amounts
    if "loan_amount" in col_map or "loan_amnt" in col_map:
        loan_col = col_map.get("loan_amount", col_map.get("loan_amnt"))
        df["loan_amount_log"] = np.log1p(df[loan_col])
    
    if "annual_income" in col_map or "annual_inc" in col_map:
        income_col = col_map.get("annual_income", col_map.get("annual_inc"))
        df["annual_income_log"] = np.log1p(df[income_col])
    
    # Income per loan ratio
    loan_col = col_map.get("loan_amount", col_map.get("loan_amnt"))
    income_col = col_map.get("annual_income", col_map.get("annual_inc"))
    if loan_col and income_col:
        df["income_per_loan"] = df[income_col] / (df[loan_col] + 1)
        df["loan_to_income"] = df[loan_col] / (df[income_col] + 1)
    
    # DTI Ã— Interest Rate interaction
    dti_col = col_map.get("debt_to_income_ratio", col_map.get("dti"))
    rate_col = col_map.get("interest_rate", col_map.get("int_rate"))
    if dti_col and rate_col:
        df["dti_x_rate"] = df[dti_col] * df[rate_col]
        df["dti_div_rate"] = df[dti_col] / (df[rate_col] + 0.01)
    
    # Normalized credit score (using statistics from reference df)
    credit_col = col_map.get("credit_score", col_map.get("fico_range_low"))
    if credit_col:
        mean_credit = ref_df[credit_col].mean()
        std_credit = ref_df[credit_col].std()
        if std_credit > 0:
            df["credit_score_norm"] = (df[credit_col] - mean_credit) / std_credit
    
    # Percentile rank for credit score
    if credit_col:
        df["credit_score_rank"] = df[credit_col].rank(pct=True)
    
    # Loan amount squared (for non-linear relationships)
    if loan_col:
        df["loan_amount_sq"] = df[loan_col] ** 2
    
    # Interest rate interactions
    if rate_col and income_col:
        df["income_x_rate"] = df[income_col] * df[rate_col]
    
    return df

def target_encode_categorical(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: pd.DataFrame,
    cat_cols: list,
    target: np.ndarray,
    smoothing: float = 10.0
) -> tuple:
    """
    Target encode categorical columns with smoothing.
    Computes mean target per category on train, applies to val and test.
    """
    train_encoded = train_df.copy()
    val_encoded = val_df.copy()
    test_encoded = test_df.copy()
    
    global_mean = target.mean()
    
    for col in cat_cols:
        if col not in train_df.columns:
            continue
            
        # Compute target statistics per category
        target_stats = pd.DataFrame({
            col: train_df[col],
            'target': target
        }).groupby(col)['target'].agg(['sum', 'count'])
        
        # Smoothed target encoding
        target_stats['target_enc'] = (
            (target_stats['sum'] + smoothing * global_mean) /
            (target_stats['count'] + smoothing)
        )
        
        # Create mapping
        encoding_map = target_stats['target_enc'].to_dict()
        
        # Apply encoding
        train_encoded[f"{col}_target_enc"] = train_df[col].map(encoding_map).fillna(global_mean)
        val_encoded[f"{col}_target_enc"] = val_df[col].map(encoding_map).fillna(global_mean)
        test_encoded[f"{col}_target_enc"] = test_df[col].map(encoding_map).fillna(global_mean)
    
    return train_encoded, val_encoded, test_encoded

In [4]:
# 4) Preprocessing Functions (Fold-Safe)

def preprocess_fold(
    X_train: pd.DataFrame,
    X_val: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: np.ndarray,
    numeric_cols: list,
    categorical_cols: list
) -> tuple:
    """
    Preprocess data for a single fold.
    Fit transformers on X_train only, then transform all sets.
    """
    from scipy import sparse
    
    # 1) Add missing indicators
    X_train_eng, missing_cols = add_missing_indicators(X_train)
    X_val_eng, _ = add_missing_indicators(X_val, missing_cols)
    X_test_eng, _ = add_missing_indicators(X_test, missing_cols)
    
    # 2) Engineer features (fold-safe: use X_train stats)
    X_train_eng = engineer_features(X_train_eng, fit_df=X_train_eng)
    X_val_eng = engineer_features(X_val_eng, fit_df=X_train_eng)
    X_test_eng = engineer_features(X_test_eng, fit_df=X_train_eng)
    
    # 3) Target encode categoricals
    X_train_eng, X_val_eng, X_test_eng = target_encode_categorical(
        X_train_eng, X_val_eng, X_test_eng,
        categorical_cols, y_train, smoothing=10.0
    )
    
    # 4) Update column lists with new features
    numeric_cols_ext = X_train_eng.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols_ext = X_train_eng.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    
    # 5) Impute numerics
    numeric_imputer = SimpleImputer(strategy="median")
    X_train_num = numeric_imputer.fit_transform(X_train_eng[numeric_cols_ext])
    X_val_num = numeric_imputer.transform(X_val_eng[numeric_cols_ext])
    X_test_num = numeric_imputer.transform(X_test_eng[numeric_cols_ext])
    
    # 6) One-hot encode categoricals (sparse)
    if len(categorical_cols_ext) > 0:
        cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True, drop='if_binary')
        X_train_cat = cat_encoder.fit_transform(X_train_eng[categorical_cols_ext])
        X_val_cat = cat_encoder.transform(X_val_eng[categorical_cols_ext])
        X_test_cat = cat_encoder.transform(X_test_eng[categorical_cols_ext])
        
        # Combine numeric and categorical
        X_train_final = sparse.hstack([X_train_num, X_train_cat])
        X_val_final = sparse.hstack([X_val_num, X_val_cat])
        X_test_final = sparse.hstack([X_test_num, X_test_cat])
    else:
        X_train_final = X_train_num
        X_val_final = X_val_num
        X_test_final = X_test_num
    
    return X_train_final, X_val_final, X_test_final

In [11]:
# 5) Main Training Loop - Fold-Safe XGBoost

# Identify column types from raw data
numeric_cols = X_raw.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_raw.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

log(f"Base numeric features    : {len(numeric_cols)}")
log(f"Base categorical features: {len(categorical_cols)}")

# Cross-validation setup
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

oof_predictions = np.zeros(len(X_raw))
test_predictions_folds = []

# Optimized parameters targeting 95% ROC-AUC
_default_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 8,
    "min_child_weight": 10,
    "learning_rate": 0.02,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "colsample_bylevel": 0.85,
    "reg_lambda": 2.0,
    "reg_alpha": 0.5,
    "gamma": 1.0,
    "scale_pos_weight": scale_pos_weight,
    "tree_method": "hist",
    "random_state": RANDOM_STATE,
}

# If a tuning cell populated `params`, merge it over defaults
try:
    params  # noqa: F821
    params = {**_default_params, **params}
    log("Using overridden params from search.")
except NameError:
    params = _default_params

log("\nTraining Fold-Safe XGBoost model...")
log(f"Params: max_depth={params['max_depth']}, "
    f"min_child_weight={params['min_child_weight']}, "
    f"eta={params['learning_rate']:.3f}, "
    f"scale_pos_weight={params['scale_pos_weight']:.3f}")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_raw, y), 1):
    log(f"\n{'='*60}")
    log(f"Fold {fold}/{n_splits}")
    log(f"{'='*60}")
    
    # Split raw data
    X_train_fold = X_raw.iloc[train_idx].copy()
    X_val_fold = X_raw.iloc[val_idx].copy()
    y_train_fold = y[train_idx]
    y_val_fold = y[val_idx]
    
    # Preprocess fold (fit on train, transform all)
    X_train_proc, X_val_proc, X_test_proc = preprocess_fold(
        X_train_fold, X_val_fold, X_test_raw,
        y_train_fold, numeric_cols, categorical_cols
    )
    
    log(f"Processed shapes - Train: {X_train_proc.shape}, Val: {X_val_proc.shape}, Test: {X_test_proc.shape}")
    
    # Create DMatrix
    dtrain = xgb.DMatrix(X_train_proc, label=y_train_fold)
    dval = xgb.DMatrix(X_val_proc, label=y_val_fold)
    dtest = xgb.DMatrix(X_test_proc)
    
    evals = [(dtrain, "train"), (dval, "valid")]
    
    # Train with more rounds for high AUC
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=5000,
        evals=evals,
        early_stopping_rounds=150,
        verbose_eval=100,
    )
    
    # Predictions
    oof_predictions[val_idx] = booster.predict(dval, iteration_range=(0, booster.best_iteration + 1))
    test_predictions_folds.append(
        booster.predict(dtest, iteration_range=(0, booster.best_iteration + 1))
    )
    
    # Fold metrics
    fold_auc = roc_auc_score(y_val_fold, oof_predictions[val_idx])
    log(f"Fold {fold} ROC-AUC: {fold_auc:.6f}")

# Overall OOF ROC-AUC
oof_roc_auc = roc_auc_score(y, oof_predictions)
log(f"\n{'='*60}")
log(f"Overall OOF ROC-AUC: {oof_roc_auc:.6f}")
log(f"{'='*60}")

if oof_roc_auc >= 0.95:
    log("âœ“ TARGET ACHIEVED: 95%+ ROC-AUC!")
else:
    log(f"âš  Current: {oof_roc_auc:.4f}, Target: 0.9500 (Gap: {0.95 - oof_roc_auc:.4f})")

[12:19:24] Base numeric features    : 5
[12:19:24] Base categorical features: 6
[12:19:24] Using overridden params from search.
[12:19:24] 
Training Fold-Safe XGBoost model...
[12:19:24] Params: max_depth=8, min_child_weight=20, eta=0.030, scale_pos_weight=0.301
[12:19:24] 
[12:19:24] Fold 1/5
[12:19:26] Processed shapes - Train: (475195, 76), Val: (118799, 76), Test: (254569, 76)
[12:19:26] Processed shapes - Train: (475195, 76), Val: (118799, 76), Test: (254569, 76)
[0]	train-auc:0.91308	valid-auc:0.91329
[0]	train-auc:0.91308	valid-auc:0.91329
[100]	train-auc:0.91781	valid-auc:0.91723
[100]	train-auc:0.91781	valid-auc:0.91723
[200]	train-auc:0.92091	valid-auc:0.91906
[200]	train-auc:0.92091	valid-auc:0.91906
[300]	train-auc:0.92392	valid-auc:0.92019
[300]	train-auc:0.92392	valid-auc:0.92019
[400]	train-auc:0.92621	valid-auc:0.92075
[400]	train-auc:0.92621	valid-auc:0.92075
[500]	train-auc:0.92833	valid-auc:0.92124
[500]	train-auc:0.92833	valid-auc:0.92124
[600]	train-auc:0.93010	val

In [6]:
# 6) Detailed Performance Analysis

def evaluate_at_threshold(y_true, proba, thr: float) -> dict:
    pred = (proba >= thr).astype(int)
    return {
        "accuracy": accuracy_score(y_true, pred),
        "f1": f1_score(y_true, pred),
        "logloss": log_loss(y_true, proba),
    }

# Find optimal threshold
thr_grid = np.linspace(0.1, 0.9, 17)

log("\\nSearching for optimal F1 threshold on OOF predictions...")
best_thr = 0.5
best_f1 = -1.0
for thr in thr_grid:    
    metrics = evaluate_at_threshold(y, oof_predictions, thr)
    if metrics["f1"] > best_f1:
        best_f1 = metrics["f1"]
        best_thr = thr

log(f"Best threshold: {best_thr:.3f} (F1={best_f1:.4f})")

metrics_opt = evaluate_at_threshold(y, oof_predictions, best_thr)

print('\\n' + '='*60)
print('=== Model Performance Summary ===')
print('='*60)
print(f"ROC-AUC  : {oof_roc_auc:.6f} {'âœ“' if oof_roc_auc >= 0.95 else 'âš '}")
print(f"Accuracy : {metrics_opt['accuracy']:.6f}")
print(f"F1 Score : {metrics_opt['f1']:.6f}")
print(f"LogLoss  : {metrics_opt['logloss']:.6f}")
print(f"Threshold: {best_thr:.3f}")
print('='*60)

# Per-fold AUC variance
fold_aucs = []
for fold, (_, val_idx) in enumerate(skf.split(X_raw, y), 1):
    fold_auc = roc_auc_score(y[val_idx], oof_predictions[val_idx])
    fold_aucs.append(fold_auc)

print(f"\\nPer-Fold ROC-AUC:")
for fold, auc in enumerate(fold_aucs, 1):
    print(f"  Fold {fold}: {auc:.6f}")
print(f"  Mean: {np.mean(fold_aucs):.6f} Â± {np.std(fold_aucs):.6f}")
print('='*60)

[02:27:11] \nSearching for optimal F1 threshold on OOF predictions...
[02:27:12] Best threshold: 0.200 (F1=0.9428)
=== Model Performance Summary ===
ROC-AUC  : 0.920962 âš 
Accuracy : 0.904822
F1 Score : 0.942799
LogLoss  : 0.322461
Threshold: 0.200
[02:27:12] Best threshold: 0.200 (F1=0.9428)
=== Model Performance Summary ===
ROC-AUC  : 0.920962 âš 
Accuracy : 0.904822
F1 Score : 0.942799
LogLoss  : 0.322461
Threshold: 0.200
\nPer-Fold ROC-AUC:
  Fold 1: 0.921743
  Fold 2: 0.921532
  Fold 3: 0.920112
  Fold 4: 0.921009
  Fold 5: 0.920448
  Mean: 0.920969 Â± 0.000620
\nPer-Fold ROC-AUC:
  Fold 1: 0.921743
  Fold 2: 0.921532
  Fold 3: 0.920112
  Fold 4: 0.921009
  Fold 5: 0.920448
  Mean: 0.920969 Â± 0.000620


In [12]:
# 7) Generate Submission File

# Average test predictions across folds
test_predictions = np.mean(test_predictions_folds, axis=0)

log(f"\\nGenerating submission file...")
log(f"Test predictions shape: {test_predictions.shape}")
log(f"Test predictions range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

sub = pd.DataFrame()
if id_col is not None:
    sub[id_col] = test_df[id_col]
else:
    sub["id"] = np.arange(len(test_df))

sub[target_col] = test_predictions

# Save submission
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
sub_path = Path(f"submissions/loan_foldsafe_{oof_roc_auc:.4f}_{ts}.csv")
sub_path.parent.mkdir(exist_ok=True, parents=True)
sub.to_csv(sub_path, index=False)

log(f"âœ“ Saved submission to: {sub_path.name}")
log(f"  OOF ROC-AUC: {oof_roc_auc:.6f}")

# Display sample
print(f"\\nSubmission preview:")
print(sub.head(10))

[12:31:43] \nGenerating submission file...
[12:31:43] Test predictions shape: (254569,)
[12:31:43] Test predictions range: [0.0004, 0.9995]
[12:31:43] âœ“ Saved submission to: loan_foldsafe_0.9211_20251121_123143.csv
[12:31:43]   OOF ROC-AUC: 0.921090
\nSubmission preview:
       id  loan_paid_back
0  593994        0.809298
1  593995        0.926775
2  593996        0.230836
3  593997        0.757955
4  593998        0.905684
5  593999        0.926507
6  594000        0.969937
7  594001        0.903081
8  594002        0.833782
9  594003        0.001224


In [8]:
# 8) Feature Importance Analysis (Optional)

# Note: Feature importance requires retraining or storing feature names
# Since we use sparse matrices, feature names are not directly available
# This cell provides a template for future SHAP or importance analysis

log("\\n" + "="*60)
log("Feature Importance Analysis")
log("="*60)
log("To analyze feature importance:")
log("1. Install SHAP: pip install shap")
log("2. Retrain on full data or use last fold's booster")
log("3. Extract top features to guide next iteration")
log("")
log("Example SHAP analysis:")
log("  import shap")
log("  explainer = shap.TreeExplainer(booster)")
log("  shap_values = explainer.shap_values(X_val_proc)")
log("  shap.summary_plot(shap_values, X_val_proc)")
log("="*60)

[02:27:13] Feature Importance Analysis
[02:27:13] To analyze feature importance:
[02:27:13] 1. Install SHAP: pip install shap
[02:27:13] 2. Retrain on full data or use last fold's booster
[02:27:13] 3. Extract top features to guide next iteration
[02:27:13] 
[02:27:13] Example SHAP analysis:
[02:27:13]   import shap
[02:27:13]   explainer = shap.TreeExplainer(booster)
[02:27:13]   shap_values = explainer.shap_values(X_val_proc)
[02:27:13]   shap.summary_plot(shap_values, X_val_proc)


In [9]:
# 9) Hyperparameter Optimization with Optuna (Optional)

# Uncomment and run this cell to find optimal hyperparameters
# Requires: pip install optuna

import optuna
from optuna.samplers import TPESampler

def objective(trial):
    # Suggest hyperparameters
    params_opt = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "max_depth": trial.suggest_int("max_depth", 5, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 5, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05, log=True),
        "subsample": trial.suggest_float("subsample", 0.7, 0.95),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 0.95),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.5, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 2.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 
                                                 scale_pos_weight * 0.8, 
                                                 scale_pos_weight * 1.2),
        "tree_method": "hist",
        "random_state": RANDOM_STATE,
    }
    
    # Quick CV (2 folds for speed, reduced rounds)
    skf_opt = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)
    oof_scores = []
    
    for train_idx, val_idx in skf_opt.split(X_raw, y):
        X_train_fold = X_raw.iloc[train_idx].copy()
        X_val_fold = X_raw.iloc[val_idx].copy()
        y_train_fold = y[train_idx]
        y_val_fold = y[val_idx]
        
        X_train_proc, X_val_proc, _ = preprocess_fold(
            X_train_fold, X_val_fold, X_test_raw,
            y_train_fold, numeric_cols, categorical_cols
        )
        
        dtrain = xgb.DMatrix(X_train_proc, label=y_train_fold)
        dval = xgb.DMatrix(X_val_proc, label=y_val_fold)
        
        booster = xgb.train(
            params_opt,
            dtrain,
            num_boost_round=500,  # Reduced from 3000
            evals=[(dval, "valid")],
            early_stopping_rounds=30,  # Reduced from 100
            verbose_eval=False,
        )
        
        preds = booster.predict(dval, iteration_range=(0, booster.best_iteration + 1))
        score = roc_auc_score(y_val_fold, preds)
        oof_scores.append(score)
        
        # Early trial pruning - stop fold loop if first fold is terrible
        if len(oof_scores) == 1 and score < 0.85:
            trial.report(score, 0)
            if trial.should_prune():
                raise optuna.TrialPruned()
    
    return np.mean(oof_scores)

# Run optimization
log("Starting Optuna hyperparameter search...")
study = optuna.create_study(
    direction="maximize",
    sampler=TPESampler(seed=RANDOM_STATE),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0)
)
study.optimize(objective, n_trials=30, show_progress_bar=True, n_jobs=1)  # Reduced to 30 trials

log(f"\\nBest ROC-AUC: {study.best_value:.6f}")
log(f"Best params: {study.best_params}")

# Update params with best values
params.update(study.best_params)

log("\\nTo enable Optuna search:")
log("1. Uncomment the code in this cell")
log("2. Install: pip install optuna")
log("3. Run cell to find optimal hyperparameters")
log("4. Retrain model with optimized params")

[I 2025-11-21 02:27:13,567] A new study created in memory with name: no-name-ee85fb72-40b2-40a2-abf5-c89f51304362


[02:27:13] Starting Optuna hyperparameter search...


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-21 02:28:03,297] Trial 0 finished with value: 0.9181749461434654 and parameters: {'max_depth': 9, 'min_child_weight': 13, 'learning_rate': 0.01211310107833567, 'subsample': 0.8599203610605888, 'colsample_bytree': 0.72273131569352, 'reg_lambda': 1.9950155635685554, 'reg_alpha': 0.8547618948826736, 'gamma': 2.771929062153263, 'scale_pos_weight': 0.2647545549035971}. Best is trial 0 with value: 0.9181749461434654.
[I 2025-11-21 02:28:52,563] Trial 1 finished with value: 0.9178427572076648 and parameters: {'max_depth': 9, 'min_child_weight': 17, 'learning_rate': 0.012364808668605618, 'subsample': 0.7856926129081762, 'colsample_bytree': 0.7503899027038833, 'reg_lambda': 3.6829590354594766, 'reg_alpha': 0.06679852871655512, 'gamma': 4.546250218040291, 'scale_pos_weight': 0.24229322137135695}. Best is trial 0 with value: 0.9181749461434654.
[I 2025-11-21 02:28:52,563] Trial 1 finished with value: 0.9178427572076648 and parameters: {'max_depth': 9, 'min_child_weight': 17, 'learning_

In [10]:
# FAST ALTERNATIVE: Quick manual hyperparameter search (use this instead of Optuna)
# This tests a small grid of promising param combinations

from itertools import product

log("Starting fast manual hyperparameter grid search...")

# Define search grid (only most impactful params)
param_grid = {
    'max_depth': [6, 7, 8],
    'min_child_weight': [10, 15, 20],
    'learning_rate': [0.02, 0.03],
    'reg_lambda': [1.0, 1.5, 2.0],
}

best_score = 0
best_params_found = {}

# Quick 2-fold CV for speed
skf_quick = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)

grid_combinations = list(product(*param_grid.values()))
log(f"Testing {len(grid_combinations)} parameter combinations...")

for i, combo in enumerate(grid_combinations):
    test_params = dict(zip(param_grid.keys(), combo))
    
    # Build full params
    params_test = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "subsample": 0.85,
        "colsample_bytree": 0.85,
        "reg_alpha": 0.1,
        "gamma": 0.5,
        "scale_pos_weight": scale_pos_weight,
        "tree_method": "hist",
        "random_state": RANDOM_STATE,
    }
    params_test.update(test_params)
    
    fold_scores = []
    for train_idx, val_idx in skf_quick.split(X_raw, y):
        X_train_fold = X_raw.iloc[train_idx].copy()
        X_val_fold = X_raw.iloc[val_idx].copy()
        y_train_fold = y[train_idx]
        y_val_fold = y[val_idx]
        
        X_train_proc, X_val_proc, _ = preprocess_fold(
            X_train_fold, X_val_fold, X_test_raw,
            y_train_fold, numeric_cols, categorical_cols
        )
        
        dtrain = xgb.DMatrix(X_train_proc, label=y_train_fold)
        dval = xgb.DMatrix(X_val_proc, label=y_val_fold)
        
        booster = xgb.train(
            params_test,
            dtrain,
            num_boost_round=400,
            evals=[(dval, "valid")],
            early_stopping_rounds=30,
            verbose_eval=False,
        )
        
        preds = booster.predict(dval, iteration_range=(0, booster.best_iteration + 1))
        score = roc_auc_score(y_val_fold, preds)
        fold_scores.append(score)
    
    avg_score = np.mean(fold_scores)
    
    if avg_score > best_score:
        best_score = avg_score
        best_params_found = test_params.copy()
        log(f"Trial {i+1}/{len(grid_combinations)}: ROC-AUC={avg_score:.5f} âœ“ NEW BEST")
    elif (i+1) % 5 == 0:
        log(f"Trial {i+1}/{len(grid_combinations)}: ROC-AUC={avg_score:.5f}")

log(f"\nðŸŽ¯ Best ROC-AUC: {best_score:.6f}")
log(f"Best params: {best_params_found}")

# Update global params
params.update(best_params_found)
log("\nâœ“ Updated params with best values")

[02:49:41] Starting fast manual hyperparameter grid search...
[02:49:41] Testing 54 parameter combinations...
[02:50:16] Trial 1/54: ROC-AUC=0.91740 âœ“ NEW BEST
[02:50:16] Trial 1/54: ROC-AUC=0.91740 âœ“ NEW BEST
[02:50:52] Trial 2/54: ROC-AUC=0.91743 âœ“ NEW BEST
[02:50:52] Trial 2/54: ROC-AUC=0.91743 âœ“ NEW BEST
[02:52:01] Trial 4/54: ROC-AUC=0.91855 âœ“ NEW BEST
[02:52:01] Trial 4/54: ROC-AUC=0.91855 âœ“ NEW BEST
[02:52:37] Trial 5/54: ROC-AUC=0.91846
[02:52:37] Trial 5/54: ROC-AUC=0.91846
[02:55:32] Trial 10/54: ROC-AUC=0.91852
[02:55:32] Trial 10/54: ROC-AUC=0.91852
[02:58:22] Trial 15/54: ROC-AUC=0.91739
[02:58:22] Trial 15/54: ROC-AUC=0.91739
[02:59:32] Trial 17/54: ROC-AUC=0.91857 âœ“ NEW BEST
[02:59:32] Trial 17/54: ROC-AUC=0.91857 âœ“ NEW BEST
[03:01:18] Trial 20/54: ROC-AUC=0.91794
[03:01:18] Trial 20/54: ROC-AUC=0.91794
[03:02:31] Trial 22/54: ROC-AUC=0.91892 âœ“ NEW BEST
[03:02:31] Trial 22/54: ROC-AUC=0.91892 âœ“ NEW BEST
[03:20:36] Trial 25/54: ROC-AUC=0.91799
[03:20:3