# Loan Payback — Meta-Boosted XGBoost (Copilot Edition)

This notebook builds a **two–stage imbalance-aware boosted model** with advanced feature engineering:

## Key Features:
1. **Auto Feature Engineering**: Log transforms, ratios, normalized credit scores
2. **Imbalance Handling**: Automatic `scale_pos_weight` calculation
3. **Stage 1**: Strong XGBoost with deeper trees and careful regularization
4. **Stage 2 (Meta Boost)**: Boosts over Stage-1 logits using `base_margin`
5. **5-Fold Stratified CV** with threshold optimization
6. **Automatic model selection** based on OOF AUC

## Target: 0.925+ AUC
Follow the tuning guide at the end to iteratively improve performance.

In [20]:

# 1) Imports & basic configuration
import os
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import xgboost as xgb

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_DIR = Path("Data")
# If running locally, you can override DATA_DIR, e.g.:
# DATA_DIR = Path("/mnt/data") / "loan-payback"

def log(msg: str):
    ts = datetime.now().strftime("%H:%M:%S")
    print(f"[{ts}] {msg}")


In [21]:

# 2) Data loading and automatic target / id detection

train_path = None
test_path = None

# Heuristic: pick first train/test-looking CSVs
csv_files = sorted(list(DATA_DIR.glob("*.csv")))
for p in csv_files:
    name = p.name.lower()
    if "train" in name and train_path is None:
        train_path = p
    if "test" in name and test_path is None and "train" not in name:
        test_path = p

if train_path is None or test_path is None:
    raise FileNotFoundError(
        f"Could not detect train/test CSVs inside {DATA_DIR}. "
        "Please set train_path and test_path manually."
    )

log(f"Using train: {train_path.name}")
log(f"Using test : {test_path.name}")

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

log(f"Train shape: {train_df.shape}")
log(f"Test  shape: {test_df.shape}")

def detect_target(train_df: pd.DataFrame, test_df: pd.DataFrame) -> str:
    diff = list(set(train_df.columns) - set(test_df.columns))
    # Prefer a binary label
    candidates = []
    for c in diff:
        if train_df[c].nunique() <= 3:
            candidates.append(c)
    if len(candidates) == 1:
        return candidates[0]
    if len(diff) == 1:
        return diff[0]
    for name in ["loan_paid_back", "target", "label", "is_default", "default", "paid"]:
        if name in train_df.columns and name not in test_df.columns:
            return name
    raise ValueError(f"Could not detect target. Diff columns: {diff}")

target_col = detect_target(train_df, test_df)
log(f"Detected target column: {target_col}")

# Simple ID detection: column whose values are unique in train and test
id_col = None
for col in train_df.columns:
    if col == target_col:
        continue
    if col in test_df.columns:
        if train_df[col].is_unique and test_df[col].is_unique:
            id_col = col
            break

log(f"Detected id column: {id_col}")

y = train_df[target_col].astype(int).values

# Compute class imbalance for scale_pos_weight
n_pos = (y == 1).sum()
n_neg = (y == 0).sum()
scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0
log(f"Class distribution: {n_neg} negatives, {n_pos} positives")
log(f"scale_pos_weight = {scale_pos_weight:.4f}")

feature_cols = [c for c in train_df.columns if c not in [target_col, id_col]]
X = train_df[feature_cols].copy()
X_test = test_df[feature_cols].copy()

log(f"Number of features: {len(feature_cols)}")


[23:16:23] Using train: train.csv
[23:16:23] Using test : test.csv
[23:16:24] Train shape: (593994, 13)
[23:16:24] Test  shape: (254569, 12)
[23:16:24] Detected target column: loan_paid_back
[23:16:24] Detected id column: id
[23:16:24] Class distribution: 119500 negatives, 474494 positives
[23:16:24] scale_pos_weight = 0.2518
[23:16:24] Number of features: 11
[23:16:24] Train shape: (593994, 13)
[23:16:24] Test  shape: (254569, 12)
[23:16:24] Detected target column: loan_paid_back
[23:16:24] Detected id column: id
[23:16:24] Class distribution: 119500 negatives, 474494 positives
[23:16:24] scale_pos_weight = 0.2518
[23:16:24] Number of features: 11


In [22]:

# 4) Preprocessing: numeric + categorical pipelines

numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

log(f"Numeric features    : {len(numeric_cols)}")
log(f"Categorical features: {len(categorical_cols)}")

numeric_transformer = SimpleImputer(strategy="median")

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Fit on full training data, then transform train & test
from sklearn.pipeline import Pipeline as SklearnPipeline

dummy_model = SklearnPipeline(steps=[
    ("preprocess", preprocess),
])

log("Fitting preprocessing on full training data...")
dummy_model.fit(X)

X_proc = dummy_model.transform(X)
X_test_proc = dummy_model.transform(X_test)

log(f"Processed X shape      : {X_proc.shape}")
log(f"Processed X_test shape : {X_test_proc.shape}")


[23:16:24] Numeric features    : 5
[23:16:24] Categorical features: 6
[23:16:24] Fitting preprocessing on full training data...
[23:16:25] Processed X shape      : (593994, 60)
[23:16:25] Processed X_test shape : (254569, 60)
[23:16:25] Processed X shape      : (593994, 60)
[23:16:25] Processed X_test shape : (254569, 60)


In [23]:

# 3) Feature Engineering: Add powerful derived features

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Automatically creates advanced features if the base columns exist.
    Returns a copy with new features added.
    """
    df = df.copy()
    
    # Map common column name variations (case-insensitive)
    col_map = {}
    for col in df.columns:
        col_lower = col.lower()
        col_map[col_lower] = col
    
    # Log transforms for skewed amounts
    if "loan_amount" in col_map or "loan_amnt" in col_map:
        loan_col = col_map.get("loan_amount", col_map.get("loan_amnt"))
        df["loan_amount_log"] = np.log1p(df[loan_col])
        log(f"✓ Created loan_amount_log from {loan_col}")
    
    if "annual_income" in col_map or "annual_inc" in col_map:
        income_col = col_map.get("annual_income", col_map.get("annual_inc"))
        df["annual_income_log"] = np.log1p(df[income_col])
        log(f"✓ Created annual_income_log from {income_col}")
    
    # Income per loan ratio
    loan_col = col_map.get("loan_amount", col_map.get("loan_amnt"))
    income_col = col_map.get("annual_income", col_map.get("annual_inc"))
    if loan_col and income_col:
        df["income_per_loan"] = df[income_col] / (df[loan_col] + 1)
        log(f"✓ Created income_per_loan")
    
    # DTI × Interest Rate interaction
    dti_col = col_map.get("debt_to_income_ratio", col_map.get("dti"))
    rate_col = col_map.get("interest_rate", col_map.get("int_rate"))
    if dti_col and rate_col:
        df["dti_x_rate"] = df[dti_col] * df[rate_col]
        log(f"✓ Created dti_x_rate")
    
    # Normalized credit score
    credit_col = col_map.get("credit_score", col_map.get("fico_range_low"))
    if credit_col:
        mean_credit = df[credit_col].mean()
        std_credit = df[credit_col].std()
        if std_credit > 0:
            df["credit_score_norm"] = (df[credit_col] - mean_credit) / std_credit
            log(f"✓ Created credit_score_norm from {credit_col}")
    
    return df

log("Engineering features for train and test...")
X = engineer_features(X)
X_test = engineer_features(X_test)

log(f"Features after engineering: {X.shape[1]}")


[23:16:25] Engineering features for train and test...
[23:16:25] ✓ Created loan_amount_log from loan_amount
[23:16:25] ✓ Created annual_income_log from annual_income
[23:16:25] ✓ Created income_per_loan
[23:16:25] ✓ Created dti_x_rate
[23:16:25] ✓ Created credit_score_norm from credit_score
[23:16:25] ✓ Created loan_amount_log from loan_amount
[23:16:25] ✓ Created annual_income_log from annual_income
[23:16:25] ✓ Created income_per_loan
[23:16:25] ✓ Created dti_x_rate
[23:16:25] ✓ Created credit_score_norm from credit_score
[23:16:25] Features after engineering: 16


In [24]:

# 5) Stage 1: Imbalance-Aware XGBoost with Stronger Regularization

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

oof_pred_stage1 = np.zeros(X_proc.shape[0])
test_pred_stage1_folds = []

params_stage1 = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 5,
    "min_child_weight": 20,
    "learning_rate": 0.035,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "scale_pos_weight": scale_pos_weight,
    "tree_method": "hist",
    "random_state": RANDOM_STATE,
}

log("Training Stage 1 model (imbalance-aware XGBoost)...")
log(f"Params: max_depth={params_stage1['max_depth']}, "
    f"min_child_weight={params_stage1['min_child_weight']}, "
    f"eta={params_stage1['learning_rate']:.3f}, "
    f"scale_pos_weight={params_stage1['scale_pos_weight']:.3f}")

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_proc, y), 1):
    log(f"Fold {fold}/{n_splits}")
    X_tr, X_val = X_proc[tr_idx], X_proc[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test_proc)

    evals = [(dtrain, "train"), (dval, "valid")]

    booster = xgb.train(
        params_stage1,
        dtrain,
        num_boost_round=2000,
        evals=evals,
        early_stopping_rounds=100,
        verbose_eval=200,
    )

    oof_pred_stage1[val_idx] = booster.predict(dval, iteration_range=(0, booster.best_iteration + 1))
    test_pred_stage1_folds.append(
        booster.predict(dtest, iteration_range=(0, booster.best_iteration + 1))
    )

auc_stage1 = roc_auc_score(y, oof_pred_stage1)
log(f"Stage 1 OOF ROC-AUC: {auc_stage1:.5f}")


[23:16:25] Training Stage 1 model (imbalance-aware XGBoost)...
[23:16:25] Params: max_depth=5, min_child_weight=20, eta=0.035, scale_pos_weight=0.252
[23:16:25] Fold 1/5
[0]	train-auc:0.90254	valid-auc:0.90304
[0]	train-auc:0.90254	valid-auc:0.90304
[200]	train-auc:0.91698	valid-auc:0.91762
[200]	train-auc:0.91698	valid-auc:0.91762
[400]	train-auc:0.92000	valid-auc:0.91963
[400]	train-auc:0.92000	valid-auc:0.91963
[600]	train-auc:0.92239	valid-auc:0.92088
[600]	train-auc:0.92239	valid-auc:0.92088
[800]	train-auc:0.92448	valid-auc:0.92186
[800]	train-auc:0.92448	valid-auc:0.92186
[1000]	train-auc:0.92601	valid-auc:0.92240
[1000]	train-auc:0.92601	valid-auc:0.92240
[1200]	train-auc:0.92723	valid-auc:0.92263
[1200]	train-auc:0.92723	valid-auc:0.92263
[1400]	train-auc:0.92831	valid-auc:0.92284
[1400]	train-auc:0.92831	valid-auc:0.92284
[1600]	train-auc:0.92929	valid-auc:0.92294
[1600]	train-auc:0.92929	valid-auc:0.92294
[1800]	train-auc:0.93018	valid-auc:0.92299
[1800]	train-auc:0.93018	va

In [25]:

# 6) Stage 2: Meta XGBoost boosting over Stage‑1 logits (base_margin trick)

def prob_to_logit(p: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    p = np.clip(p, eps, 1 - eps)
    return np.log(p / (1 - p))

log("Computing Stage 1 logits for OOF and test...")

logits_oof_stage1 = prob_to_logit(oof_pred_stage1)
test_pred_stage1_folds = np.vstack(test_pred_stage1_folds)  # (n_splits, n_test)
logits_test_stage1_folds = prob_to_logit(test_pred_stage1_folds)

oof_pred_stage2 = np.zeros(X_proc.shape[0])
test_pred_stage2_folds = []

params_stage2 = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 4,
    "min_child_weight": 10,
    "learning_rate": 0.03,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.5,
    "reg_alpha": 0.2,
    "scale_pos_weight": scale_pos_weight,
    "tree_method": "hist",
    "random_state": RANDOM_STATE + 1,
}

log("Training Stage 2 meta model with base_margin (logit boosting)...")
log(f"Params: max_depth={params_stage2['max_depth']}, "
    f"min_child_weight={params_stage2['min_child_weight']}, "
    f"eta={params_stage2['learning_rate']:.3f}")

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_proc, y), 1):
    log(f"Stage 2 Fold {fold}/{n_splits}")
    X_tr, X_val = X_proc[tr_idx], X_proc[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test_proc)

    # base_margin = Stage‑1 logits
    dtrain.set_base_margin(logits_oof_stage1[tr_idx])
    dval.set_base_margin(logits_oof_stage1[val_idx])
    dtest.set_base_margin(logits_test_stage1_folds[fold - 1])

    evals = [(dtrain, "train"), (dval, "valid")]

    booster_meta = xgb.train(
        params_stage2,
        dtrain,
        num_boost_round=2000,
        evals=evals,
        early_stopping_rounds=100,
        verbose_eval=200,
    )

    oof_pred_stage2[val_idx] = booster_meta.predict(
        dval, iteration_range=(0, booster_meta.best_iteration + 1)
    )
    test_pred_stage2_folds.append(
        booster_meta.predict(dtest, iteration_range=(0, booster_meta.best_iteration + 1))
    )

auc_stage2 = roc_auc_score(y, oof_pred_stage2)
log(f"Stage 2 OOF ROC-AUC: {auc_stage2:.5f}")



[23:25:52] Computing Stage 1 logits for OOF and test...
[23:25:52] Training Stage 2 meta model with base_margin (logit boosting)...
[23:25:52] Params: max_depth=4, min_child_weight=10, eta=0.030
[23:25:52] Stage 2 Fold 1/5
[0]	train-auc:0.92172	valid-auc:0.92300
[0]	train-auc:0.92172	valid-auc:0.92300
[100]	train-auc:0.92208	valid-auc:0.92276
[100]	train-auc:0.92208	valid-auc:0.92276
[23:25:58] Stage 2 Fold 2/5
[0]	train-auc:0.92179	valid-auc:0.92274
[23:25:58] Stage 2 Fold 2/5
[0]	train-auc:0.92179	valid-auc:0.92274
[100]	train-auc:0.92207	valid-auc:0.92253
[100]	train-auc:0.92207	valid-auc:0.92253
[23:26:04] Stage 2 Fold 3/5
[0]	train-auc:0.92224	valid-auc:0.92093
[23:26:04] Stage 2 Fold 3/5
[0]	train-auc:0.92224	valid-auc:0.92093
[100]	train-auc:0.92254	valid-auc:0.92068
[100]	train-auc:0.92254	valid-auc:0.92068
[23:26:10] Stage 2 Fold 4/5
[0]	train-auc:0.92206	valid-auc:0.92163
[23:26:10] Stage 2 Fold 4/5
[0]	train-auc:0.92206	valid-auc:0.92163
[99]	train-auc:0.92237	valid-auc:0.92

In [26]:

# 7) Compare Stage 1 vs Stage 2 and find optimal F1 threshold

def evaluate_at_threshold(y_true, proba, thr: float) -> dict:
    pred = (proba >= thr).astype(int)
    return {
        "accuracy": accuracy_score(y_true, pred),
        "f1": f1_score(y_true, pred),
        "logloss": log_loss(y_true, proba),
    }

thr_grid = np.linspace(0.1, 0.9, 17)

log("Searching F1-optimal threshold on Stage 2 OOF probabilities...")
best_thr = 0.5
best_f1 = -1.0
for thr in thr_grid:
    metrics = evaluate_at_threshold(y, oof_pred_stage2, thr)
    if metrics["f1"] > best_f1:
        best_f1 = metrics["f1"]
        best_thr = thr

log(f"Best threshold on OOF: {best_thr:.3f} (F1={best_f1:.4f})")

metrics1 = evaluate_at_threshold(y, oof_pred_stage1, best_thr)
metrics2 = evaluate_at_threshold(y, oof_pred_stage2, best_thr)

print('\n' + '='*50)
print('=== Stage 1 (base model) ===')
print(f"ROC-AUC : {roc_auc_score(y, oof_pred_stage1):.5f}")
print(f"Accuracy: {metrics1['accuracy']:.5f}")
print(f"F1      : {metrics1['f1']:.5f}")
print(f"LogLoss : {metrics1['logloss']:.5f}")

print('\n=== Stage 2 (meta boosted) ===')
print(f"ROC-AUC : {roc_auc_score(y, oof_pred_stage2):.5f}")
print(f"Accuracy: {metrics2['accuracy']:.5f}")
print(f"F1      : {metrics2['f1']:.5f}")
print(f"LogLoss : {metrics2['logloss']:.5f}")

improvement = auc_stage2 - auc_stage1
print(f"\n{'✓' if improvement > 0 else '✗'} Stage 2 improvement: {improvement:+.5f} AUC")
print('='*50)


[23:26:21] Searching F1-optimal threshold on Stage 2 OOF probabilities...
[23:26:23] Best threshold on OOF: 0.200 (F1=0.9430)

=== Stage 1 (base model) ===
[23:26:23] Best threshold on OOF: 0.200 (F1=0.9430)

=== Stage 1 (base model) ===
ROC-AUC : 0.92197
Accuracy: 0.90537
F1      : 0.94304
LogLoss : 0.32496

=== Stage 2 (meta boosted) ===
ROC-AUC : 0.92197
Accuracy: 0.90538
F1      : 0.94304
LogLoss : 0.32508

✗ Stage 2 improvement: -0.00000 AUC
ROC-AUC : 0.92197
Accuracy: 0.90537
F1      : 0.94304
LogLoss : 0.32496

=== Stage 2 (meta boosted) ===
ROC-AUC : 0.92197
Accuracy: 0.90538
F1      : 0.94304
LogLoss : 0.32508

✗ Stage 2 improvement: -0.00000 AUC


In [27]:

# 8) Build final test predictions and submission file

test_pred_stage1 = test_pred_stage1_folds.mean(axis=0)
test_pred_stage2 = np.mean(np.vstack(test_pred_stage2_folds), axis=0)

# If meta model improved ROC-AUC, we use Stage 2; otherwise fall back to Stage 1
use_stage2 = auc_stage2 >= auc_stage1
final_test_proba = test_pred_stage2 if use_stage2 else test_pred_stage1

log(f"Using {'Stage 2 meta' if use_stage2 else 'Stage 1 base'} predictions for submission.")

sub = pd.DataFrame()
if id_col is not None:
    sub[id_col] = test_df[id_col]
else:
    sub["id"] = np.arange(len(test_df))

sub[target_col] = final_test_proba

ts = datetime.now().strftime("%Y%m%d_%H%M%S")
sub_path = Path(f"submissions/loan_meta_copilot_{ts}.csv")
sub_path.parent.mkdir(exist_ok=True)
sub.to_csv(sub_path, index=False)
log(f"✓ Saved submission to: {sub_path.resolve()}")


[23:26:23] Using Stage 1 base predictions for submission.
[23:26:23] ✓ Saved submission to: /Users/lionelweng/Downloads/s5e11-Predicting-Loan-Payback/submissions/loan_meta_copilot_20251120_232623.csv
[23:26:23] ✓ Saved submission to: /Users/lionelweng/Downloads/s5e11-Predicting-Loan-Payback/submissions/loan_meta_copilot_20251120_232623.csv
