# Loan Payback — Fast Meta Ensemble (AUC-first)

Goals:
- Reach >= 0.92 AUC quickly; once hit, run 1–2 more seeds then stop.
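- Push toward 0.93 AUC via a shallow Meta-XGB (depth 3–4). Also report an isotonic-calibrated variant and a threshold sweep on the meta probabilities; note that AUC is rank-based, so thresholding and monotone calibration help decision quality (F1, calibrated probabilities) rather than AUC itself.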
- Keep training cycles lean (no 100+ seed runs).

Outputs:
- Best CV AUC and configuration.
- Optional calibrated predictions and threshold.
- Submission file under `submissions/`.

In [None]:
# Imports & quick checks
import os, sys, json, math, warnings, gc, time, random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, roc_curve, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.calibration import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from scipy import stats

# Try XGBoost for meta
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except Exception as e:
    # Deliberately broad: the notebook should fall back to the logistic meta
    # learner whatever the reason the import failed. Report the actual error
    # so a failure other than "package missing" is not mislabelled.
    XGB_AVAILABLE = False
    print(f"xgboost unavailable ({type(e).__name__}: {e}); meta-XGB will be skipped.")

# Silence library warnings for readable notebook output.
warnings.filterwarnings('ignore')

# Seed both NumPy's and Python's global RNGs from one base value.
RANDOM_BASE = 42
np.random.seed(RANDOM_BASE)
random.seed(RANDOM_BASE)

# All paths are resolved relative to the current working directory.
ROOT = Path.cwd()
DATA_DIR = ROOT / 'Data'
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH = DATA_DIR / 'test.csv'
SAMPLE_SUB_PATH = DATA_DIR / 'sample_submission.csv'
SUB_DIR = ROOT / 'submissions'
# Create the output folder up front so the submission cell can write to it.
SUB_DIR.mkdir(exist_ok=True, parents=True)

print(f'ROOT: {ROOT}')
print(f'Files exist? train={TRAIN_PATH.exists()} test={TEST_PATH.exists()} sample={SAMPLE_SUB_PATH.exists()}')

ROOT: /Users/lionelweng/Downloads/s5e11-Predicting-Loan-Payback
Files exist? train=True test=True sample=True


In [None]:
# Config & target/id detection
TARGET_CANDIDATES = ['target','TARGET','label','Label','default','is_default','loan_status','loan_repaid']
ID_CANDIDATES = ['id','ID','loan_id','Loan_ID']

def detect_columns(df: pd.DataFrame):
    """Guess the ID and target column names of ``df``.

    The first name from ``ID_CANDIDATES`` / ``TARGET_CANDIDATES`` that is
    present wins. If no target candidate is present, the last column is used
    as the target when more than 90% of its non-null values are 0 or 1.

    Args:
        df: Frame (a small preview is enough) whose columns are inspected.

    Returns:
        ``(id_col, target_col)``; either element is ``None`` when nothing
        suitable was found.
    """
    cols = df.columns.tolist()
    id_col = next((c for c in ID_CANDIDATES if c in cols), None)
    target_col = next((c for c in TARGET_CANDIDATES if c in cols), None)

    # Fallback heuristic. Guarded so a frame without columns does not raise,
    # and so the detected ID column is never reported as the target as well.
    if target_col is None and cols:
        last = cols[-1]
        if last != id_col:
            non_null = df[last].dropna()
            if not non_null.empty and non_null.isin([0, 1]).mean() > 0.9:
                target_col = last
    return id_col, target_col

# Detect the schema from a 100-row sample before reading the whole file
preview = pd.read_csv(TRAIN_PATH, nrows=100)
ID_COL, TARGET = detect_columns(preview)
print('Detected ID_COL=', ID_COL, ' TARGET=', TARGET)
assert TARGET is not None, 'Target column not detected; please set TARGET manually.'

# Full read; `test` stays None when the test file is absent
train = pd.read_csv(TRAIN_PATH)
if TEST_PATH.exists():
    test = pd.read_csv(TEST_PATH)
else:
    test = None
print(train.shape, 'train shape')
if test is not None:
    print(test.shape, 'test shape')

In [None]:
# Feature/target split and preprocessing pipeline

# Label vector and feature frame: the label and (if detected) the ID column
# are removed from the features.
# NOTE(review): X / X_test are built here; a later cell that edits
# `train` / `test` will not change them unless they are rebuilt.
y = train[TARGET].astype(int)
X = train.drop(columns=[TARGET, ID_COL] if ID_COL else [TARGET])
if 'test' in globals() and test is not None:
    X_test = test.drop(columns=[ID_COL] if ID_COL else [])
else:
    X_test = None

# Every column that is not selected as numeric is treated as categorical.
num_cols = X.select_dtypes(include=['number','float','int','Int8','Int16','Int32','Int64']).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]
print(f'Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}')

# Numeric branch: median imputation only.
numeric_tf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])
# Categorical branch: mode imputation, then one-hot encoding that ignores
# categories not seen during fit.
categorical_tf = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])
preprocess = ColumnTransformer([
    ('num', numeric_tf, num_cols),
    ('cat', categorical_tf, cat_cols),
])


def build_model(name: str):
    """Build an unfitted preprocessing + classifier pipeline.

    Args:
        name: One of ``'logreg'``, ``'rf'`` or ``'gb'``.

    Returns:
        A ``Pipeline`` with steps ``'prep'`` (a fresh copy of the module-level
        ``preprocess`` transformer) and ``'clf'`` (the chosen classifier).

    Raises:
        ValueError: If ``name`` is not a known base model.
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of the file.
    from sklearn.base import clone

    if name == 'logreg':
        clf = LogisticRegression(max_iter=2000, n_jobs=None, C=1.0, solver='lbfgs')
    elif name == 'rf':
        clf = RandomForestClassifier(n_estimators=250, max_depth=None, n_jobs=-1, random_state=0)
    elif name == 'gb':
        clf = GradientBoostingClassifier(random_state=0)
    else:
        raise ValueError('Unknown base model')
    # Pipeline does not copy its steps, so handing the global `preprocess` to
    # every pipeline would make all of them share (and refit) one transformer.
    # Cloning gives each pipeline its own independent, unfitted copy.
    return Pipeline(steps=[('prep', clone(preprocess)), ('clf', clf)])

In [None]:
# CV, metrics, threshold sweep, and isotonic calibration utils
def get_cv(n_splits=5, seed=42):
    """Return a shuffled ``StratifiedKFold`` with ``n_splits`` folds seeded by ``seed``."""
    splitter = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    return splitter

def threshold_sweep(y_true, prob, thresholds=None):
    """Find the probability cut-off with the highest F1 score.

    Args:
        y_true: Binary (0/1) ground-truth labels.
        prob: Predicted positive-class scores, same length as ``y_true``.
        thresholds: Candidate cut-offs; defaults to 0.05 .. 0.95 in 19 steps.

    Returns:
        Dict with keys ``'threshold'``, ``'f1'``, ``'precision'`` and
        ``'recall'`` for the first threshold that reaches the best F1.
    """
    # Accept plain lists / Series as well as arrays.
    y_true = np.asarray(y_true)
    prob = np.asarray(prob)
    if thresholds is None:
        thresholds = np.linspace(0.05, 0.95, 19)
    best = {'threshold': None, 'f1': -1, 'precision': None, 'recall': None}
    for t in thresholds:
        pred = (prob >= t).astype(int)
        f1 = f1_score(y_true, pred)
        if f1 > best['f1']:
            # labels=[0, 1] forces a 2x2 matrix even when only one class occurs
            # in y_true/pred; without it the 4-way unpack below fails.
            tn, fp, fn, tp = confusion_matrix(y_true, pred, labels=[0, 1]).ravel()
            # Small epsilon avoids division by zero when a count sum is 0.
            prec = tp / (tp + fp + 1e-9)
            rec = tp / (tp + fn + 1e-9)
            best = {'threshold': float(t), 'f1': float(f1), 'precision': float(prec), 'recall': float(rec)}
    return best

def fit_isotonic(y_true, prob):
    """Fit an isotonic regression mapping ``prob`` to ``y_true`` and return it.

    Inputs outside the fitted range are clipped at prediction time
    (``out_of_bounds='clip'``).
    """
    return IsotonicRegression(out_of_bounds='clip').fit(prob, y_true)

In [None]:
# Train base models and produce OOF meta features

def train_base_models(X, y, X_test=None, seed=42, n_splits=5):
    """Cross-validate the base models and collect out-of-fold predictions.

    Args:
        X: Feature frame (indexed positionally with ``.iloc``).
        y: Label Series aligned with ``X``.
        X_test: Optional test features; predictions are averaged over folds.
        seed: Seed for the CV splitter; ``seed + fold`` seeds each estimator
            that exposes ``random_state``.
        n_splits: Number of CV folds.

    Returns:
        ``(oof, test_preds, aucs)``: an ``(n_rows, n_models)`` array of
        out-of-fold probabilities, the matching fold-averaged test array (or
        ``None``), and a dict of per-fold AUC lists keyed by model name.
    """
    base_names = ['logreg','rf','gb']
    splitter = get_cv(n_splits=n_splits, seed=seed)
    has_test = X_test is not None

    oof = np.zeros((len(X), len(base_names)))
    test_preds = np.zeros((len(X_test), len(base_names))) if has_test else None
    aucs = {name: [] for name in base_names}

    for col, name in enumerate(base_names):
        for fold_no, (fit_rows, hold_rows) in enumerate(splitter.split(X, y)):
            model = build_model(name)
            # Only some estimators accept a seed; vary it per fold when they do
            if 'random_state' in model.named_steps['clf'].get_params():
                model.set_params(clf__random_state=seed + fold_no)
            model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

            hold_prob = model.predict_proba(X.iloc[hold_rows])[:,1]
            oof[hold_rows, col] = hold_prob
            aucs[name].append(roc_auc_score(y.iloc[hold_rows], hold_prob))

            if has_test:
                # Each fold contributes an equal share of the test prediction
                test_preds[:, col] += model.predict_proba(X_test)[:,1] / splitter.get_n_splits()
        print(f"Base {name}: AUC per fold {np.round(aucs[name], 4)} -> mean {np.mean(aucs[name]):.4f}")
    return oof, test_preds, aucs

In [None]:
# Leakage audit utilities
# 1. Single-feature AUC to flag suspicious leak features.
# 2. Temporal leakage heuristic: columns that look like aggregates (e.g., total_*, avg_*) might encode future info.
# 3. Type casting helpers.

import re

LEAK_MAX_FEATURES = 40  # cap evaluation for speed

def single_feature_auc_scan(df: pd.DataFrame, y: pd.Series, max_features=LEAK_MAX_FEATURES):
    """Score each of the first ``max_features`` columns by its stand-alone AUC.

    Numeric columns are median-filled and used as the score directly; object
    columns are filled with ``'missing'`` and replaced by their value
    frequency. Columns with fewer than two distinct values are skipped, and
    any column whose scoring raises is skipped as well.

    Returns:
        List of ``(column, auc)`` tuples sorted by AUC, highest first.
    """
    scored = []
    for col in df.columns[:max_features]:
        try:
            series = df[col]
            if series.nunique() < 2:
                continue
            if series.dtype != 'O':
                filled = series.fillna(series.median())
            else:
                filled = series.fillna('missing')
            if filled.dtype == 'O':
                # Frequency-encode categories so they can be ranked
                freq = filled.value_counts(normalize=True).to_dict()
                encoded = filled.map(freq).astype(float)
            else:
                encoded = filled.astype(float)
            if len(np.unique(encoded)) > 1:
                score = roc_auc_score(y, encoded)
            else:
                score = 0.5
            scored.append((col, score))
        except Exception:
            continue
    return sorted(scored, key=lambda item: item[1], reverse=True)

AGG_PATTERNS = [r'^total_', r'^sum_', r'^avg_', r'^mean_', r'^max_', r'^min_']

def looks_leaky(colname: str) -> bool:
    """Return True when ``colname`` matches any pattern in ``AGG_PATTERNS``."""
    return any(re.search(pat, colname) is not None for pat in AGG_PATTERNS)

# KS & PSI drift checks between train/test

def ks_stat(train_col, test_col):
    """Two-sample Kolmogorov-Smirnov statistic between two columns.

    Missing values are dropped first. Returns 0.0 when either side has fewer
    than two distinct values or when the SciPy test raises.
    """
    left = pd.Series(train_col).dropna()
    right = pd.Series(test_col).dropna()
    if min(left.nunique(), right.nunique()) < 2:
        return 0.0
    try:
        result = stats.ks_2samp(left, right)
    except Exception:
        return 0.0
    return result.statistic

# Population Stability Index for binned values

def psi(train_col, test_col, buckets=10):
    """Population Stability Index of ``test_col`` against ``train_col``.

    Bin edges are the train quantiles (duplicates removed). The two outer
    bins are open-ended, so test values outside the train range are counted
    rather than discarded, and every bin share is floored at 1e-6, so a bin
    that is empty on one side contributes to the index instead of being
    skipped.

    Args:
        train_col: Reference values (numeric, array-like).
        test_col: Values to compare (numeric, array-like).
        buckets: Number of quantile buckets requested on the train side.

    Returns:
        The PSI as a float; 0.0 when either side has fewer than two distinct
        non-null values.
    """
    a = pd.Series(train_col).dropna()
    b = pd.Series(test_col).dropna()
    if a.nunique() < 2 or b.nunique() < 2:
        return 0.0

    quantiles = np.linspace(0, 1, buckets + 1)
    edges = np.unique(a.quantile(quantiles).to_numpy(dtype=float))
    # Only interior edges are needed: everything at or below the first one
    # falls in bin 0 and everything above the last one in the final bin.
    inner = edges[1:-1]
    n_bins = len(inner) + 1
    eps = 1e-6

    def _shares(values):
        # side='left' makes the bins right-closed: (lo, hi].
        idx = np.searchsorted(inner, values.to_numpy(dtype=float), side='left')
        counts = np.bincount(idx, minlength=n_bins)
        return np.clip(counts / counts.sum(), eps, None)

    expected = _shares(a)
    actual = _shares(b)
    return float(np.sum((actual - expected) * np.log(actual / expected)))

DRIFT_REPORT_LIMIT = 40

def drift_report(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Tabulate KS and PSI drift for columns shared by train and test.

    Only the first ``DRIFT_REPORT_LIMIT`` shared columns (in train order) are
    evaluated. A column whose statistics raise is left out of the report.

    Returns:
        DataFrame with columns ``feature``, ``ks`` and ``psi``, sorted by
        ``ks`` then ``psi`` descending. The three columns are present even
        when no row could be computed, so callers can filter on them safely.
    """
    report_columns = ['feature', 'ks', 'psi']
    test_columns = set(test_df.columns)
    shared = [c for c in train_df.columns if c in test_columns]

    rows = []
    for col in shared[:DRIFT_REPORT_LIMIT]:
        try:
            k = ks_stat(train_df[col], test_df[col])
            p = psi(train_df[col], test_df[col])
        except Exception:
            # Best-effort report: a column that cannot be scored is omitted.
            continue
        rows.append({'feature': col, 'ks': k, 'psi': p})

    # Fixing the columns keeps `rep['ks']` / `rep['psi']` valid on an empty report.
    rep = pd.DataFrame(rows, columns=report_columns)
    if not rep.empty:
        rep.sort_values(['ks','psi'], ascending=False, inplace=True)
    return rep

BOOL_LIKE = ['y','n','yes','no','true','false']

def cast_types(df: pd.DataFrame):
    """Convert yes/no style text columns to nullable ``Int8`` 0/1, in place.

    An object column is converted only when every non-null value is a string
    that, after trimming whitespace and lower-casing, is one of ``BOOL_LIKE``.
    Requiring all values to match (rather than most of them) prevents other
    tokens from being silently turned into missing values. Missing entries
    stay missing. Object columns containing non-string values are left
    untouched instead of raising.

    Args:
        df: Frame to modify.

    Returns:
        The same ``df`` object, for chaining.
    """
    bool_map = {'y':1,'yes':1,'true':1,'n':0,'no':0,'false':0}
    for c in df.columns:
        col = df[c]
        if col.dtype != 'O':
            continue
        present = col.dropna()
        # The .str accessor needs real strings; skip empty or mixed columns.
        if present.empty or not present.map(lambda v: isinstance(v, str)).all():
            continue
        low = col.str.strip().str.lower()
        if low.dropna().isin(BOOL_LIKE).all():
            df[c] = low.map(bool_map).astype('Int8')
    return df

print('Leakage & drift utilities ready.')

In [None]:
# Apply type casting, leakage audit, and drift checks
# Must run after data load (Cell 3)

assert 'train' in globals(), 'Run the data load cell first.'

has_test = 'test' in globals() and test is not None

# 1) Type casting
# IDs are stored as strings so they are never treated as a numeric feature.
if 'ID_COL' in globals() and ID_COL:
    train[ID_COL] = train[ID_COL].astype(str)
    if has_test and ID_COL in test.columns:
        test[ID_COL] = test[ID_COL].astype(str)

train = cast_types(train)
if has_test:
    test = cast_types(test)

# 2) Leakage audit (simple, top-N features)
# A feature is flagged when it alone separates the target almost perfectly
# (in either direction) or when its name matches an aggregate-style pattern.
feat_cols = [c for c in train.columns if c not in [TARGET] + ([ID_COL] if ID_COL else [])]
scan_df = train[feat_cols].copy()
scan_aucs = single_feature_auc_scan(scan_df, train[TARGET], max_features=min(LEAK_MAX_FEATURES, len(feat_cols)))
leaky = [c for (c, auc) in scan_aucs if auc >= 0.92 or auc <= 0.08 or looks_leaky(c)]

if len(leaky) > 0:
    print('Dropping suspicious leakage features:', leaky)
    train.drop(columns=[c for c in leaky if c in train.columns], inplace=True)
    if has_test:
        test.drop(columns=[c for c in leaky if c in test.columns], inplace=True)
else:
    print('No leakage features flagged by simple scan.')

# 3) Drift check (requires test)
if has_test:
    tr_common = train.drop(columns=[TARGET] + ([ID_COL] if ID_COL else []), errors='ignore')
    te_common = test.drop(columns=[ID_COL] if ID_COL else [], errors='ignore')
    rep = drift_report(tr_common, te_common)
    display(rep.head(12))
    # Drop worst offenders by simple rule. An empty report may carry no
    # 'ks'/'psi' columns, so it is handled explicitly instead of indexed.
    if rep.empty:
        drop_drift = []
    else:
        drop_drift = rep.loc[(rep['ks'] >= 0.2) | (rep['psi'] >= 0.25), 'feature'].tolist()
    if drop_drift:
        print('Dropping drift-heavy features:', drop_drift)
        train.drop(columns=[c for c in drop_drift if c in train.columns], inplace=True)
        test.drop(columns=[c for c in drop_drift if c in test.columns], inplace=True)
else:
    print('Test set not available; skipping drift check.')

# 4) Refresh the modelling inputs
# X / X_test / the column lists / `preprocess` are created by the feature-split
# cell, which appears before this one. They are independent of `train`/`test`
# from that point on, so without this step the casts and drops above would
# never reach the models. Skipped when the feature-split cell has not run yet.
if 'numeric_tf' in globals() and 'categorical_tf' in globals():
    y = train[TARGET].astype(int)
    X = train.drop(columns=[TARGET] + ([ID_COL] if ID_COL else []))
    X_test = test.drop(columns=[ID_COL] if ID_COL else []) if has_test else None
    num_cols = X.select_dtypes(include=['number','float','int','Int8','Int16','Int32','Int64']).columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols]
    preprocess = ColumnTransformer(transformers=[
        ('num', numeric_tf, num_cols),
        ('cat', categorical_tf, cat_cols)
    ])
    print(f'Modelling inputs refreshed -> Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}')

print('Preprocessing audits complete.')

In [None]:
# Meta learner training (XGB if available, else Logistic) + isotonic calibration

def train_meta(oof_feats, y, test_feats=None, seed=42, depth=3, n_splits=5):
    """Fit the stacking meta learner and an isotonic calibrator on top of it.

    The meta learner (XGBoost when available, otherwise logistic regression)
    is cross-validated on the base models' out-of-fold features. Calibrated
    out-of-fold scores are cross-fitted: each fold is calibrated by an
    isotonic model fitted on the other folds, so ``auc_cal`` and the
    calibrated threshold are not measured on the data the calibrator was
    fitted to. Test scores are calibrated by one isotonic model fitted on all
    out-of-fold predictions.

    Args:
        oof_feats: ``(n_rows, n_base_models)`` array of base predictions.
        y: Binary labels aligned with ``oof_feats`` (Series or array-like).
        test_feats: Optional ``(n_test, n_base_models)`` array.
        seed: Seed for the splitter; ``seed + fold`` seeds each XGB model.
        depth: ``max_depth`` of the XGB meta learner.
        n_splits: Number of CV folds.

    Returns:
        Dict with keys ``oof``, ``oof_cal``, ``auc``, ``auc_cal``, ``test``,
        ``test_cal``, ``best_thr_raw`` and ``best_thr_cal``. The test entries
        are ``None`` when ``test_feats`` is ``None``.
    """
    oof_feats = np.asarray(oof_feats)
    y_arr = np.asarray(y)
    cv = get_cv(n_splits=n_splits, seed=seed)
    # Materialise the folds once so the calibration pass reuses the same split.
    folds = list(cv.split(oof_feats, y_arr))

    oof_meta = np.zeros(len(y_arr))
    test_meta = np.zeros(len(test_feats)) if test_feats is not None else None
    fold_aucs = []

    for fold, (tr_idx, va_idx) in enumerate(folds):
        X_tr, X_va = oof_feats[tr_idx], oof_feats[va_idx]
        y_tr, y_va = y_arr[tr_idx], y_arr[va_idx]
        if XGB_AVAILABLE:
            clf = xgb.XGBClassifier(
                max_depth=depth, n_estimators=300, learning_rate=0.05,
                subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0, reg_alpha=0.0,
                objective='binary:logistic', eval_metric='auc', random_state=seed+fold, tree_method='hist'
            )
        else:
            clf = LogisticRegression(max_iter=2000)
        clf.fit(X_tr, y_tr)
        p = clf.predict_proba(X_va)[:,1]
        oof_meta[va_idx] = p
        fold_aucs.append(roc_auc_score(y_va, p))
        if test_feats is not None:
            # Equal-weight average of the fold models' test predictions.
            test_meta += clf.predict_proba(test_feats)[:,1] / len(folds)

    meta_auc = roc_auc_score(y_arr, oof_meta)
    print(f'Meta AUC: {meta_auc:.5f}; folds {np.round(fold_aucs,4)}')

    # Cross-fitted isotonic calibration of the OOF scores.
    oof_meta_cal = np.zeros(len(y_arr))
    for tr_idx, va_idx in folds:
        fold_iso = fit_isotonic(y_arr[tr_idx], oof_meta[tr_idx])
        oof_meta_cal[va_idx] = fold_iso.predict(oof_meta[va_idx])
    meta_auc_cal = roc_auc_score(y_arr, oof_meta_cal)
    print(f'Meta AUC (isotonic-calibrated): {meta_auc_cal:.5f}')

    # The calibrator applied to test uses every OOF prediction.
    iso = fit_isotonic(y_arr, oof_meta)
    test_meta_cal = iso.predict(test_meta) if test_meta is not None else None

    best_thr_raw = threshold_sweep(y_arr, oof_meta)
    best_thr_cal = threshold_sweep(y_arr, oof_meta_cal)
    print('Best threshold (raw):', best_thr_raw)
    print('Best threshold (cal):', best_thr_cal)

    return {
        'oof': oof_meta, 'oof_cal': oof_meta_cal, 'auc': meta_auc, 'auc_cal': meta_auc_cal,
        'test': test_meta, 'test_cal': test_meta_cal, 'best_thr_raw': best_thr_raw, 'best_thr_cal': best_thr_cal
    }

In [None]:
# Training orchestrator with early exit once AUC >=0.92 (then 1–2 extra seeds) aiming for 0.93

def run_training(seeds=None, extra_runs_after_hit=2, target_auc=0.92, aspire_auc=0.93, max_seeds=10):
    """Train the stack over several seeds with early stopping on calibrated AUC.

    Reads the module-level ``X``, ``y`` and ``X_test``. For each seed the base
    models and the meta learner are trained; the run with the highest
    calibrated AUC is kept as ``best``.

    Stopping rules, checked after every seed:
      * once ``target_auc`` is reached, run ``extra_runs_after_hit`` more
        seeds and stop (0 or less stops immediately);
      * stop as soon as ``aspire_auc`` is reached;
      * stop after ``max_seeds`` seeds.

    Args:
        seeds: Iterable of integer seeds; defaults to 42..51.
        extra_runs_after_hit: Seeds to run after the target is first reached.
        target_auc: Calibrated AUC that triggers the wind-down.
        aspire_auc: Calibrated AUC that stops the loop immediately.
        max_seeds: Hard cap on the number of seeds processed.

    Returns:
        ``(results, best)``: a DataFrame with one row per seed, and the best
        run's record extended with ``oof_cal`` and ``test_cal`` (``None`` if
        no seed was run).
    """
    # A None default avoids sharing one mutable list between calls.
    seeds = list(range(42, 52)) if seeds is None else list(seeds)

    results = []
    hit = False
    extra = 0
    best = None
    for i, seed in enumerate(seeds):
        print(f"\n==== Seed {seed} ({i+1}/{len(seeds)}) ====")
        oof, test_feats, base_aucs = train_base_models(X, y, X_test, seed=seed, n_splits=5)
        meta = train_meta(oof, y, test_feats, seed=seed, depth=4)
        auc = float(meta['auc'])
        auc_cal = float(meta['auc_cal'])
        record = {
            'seed': seed, 'auc': auc, 'auc_cal': auc_cal,
            'best_thr_raw': meta['best_thr_raw'], 'best_thr_cal': meta['best_thr_cal']
        }
        results.append(record)
        if (best is None) or (auc_cal > best['auc_cal']):
            best = {**record, 'oof_cal': meta['oof_cal'], 'test_cal': meta['test_cal']}

        if hit:
            # Counting down the extra seeds granted after the first hit.
            extra -= 1
            if extra <= 0:
                print('Extra runs finished after hit. Stopping.')
                break
        elif auc_cal >= target_auc:
            hit = True
            extra = extra_runs_after_hit
            print(f'Hit target AUC {target_auc:.2f}. Will run {extra} extra seeds then stop.')
            if extra <= 0:
                # Without this, a request for zero extra runs would never stop.
                print('No extra runs requested. Stopping.')
                break
        if auc_cal >= aspire_auc:
            print(f'Reached aspirational AUC {aspire_auc:.2f}. Stopping early!')
            break
        if i + 1 >= max_seeds:
            print(f'Reached max seed cap ({max_seeds}). Stopping.')
            break
    return pd.DataFrame(results), best

In [None]:
# Execute training
# Eight consecutive seeds, 42 through 49
SEEDS = list(range(42, 50))
results_df, best = run_training(
    seeds=SEEDS,
    extra_runs_after_hit=2,
    target_auc=0.92,
    aspire_auc=0.93,
)
# Show the five runs with the highest calibrated AUC
display(results_df.sort_values(by='auc_cal', ascending=False).head(5))
print('Best:', best)

In [None]:
# Build submission (if test and sample_submission available)
if 'test' in globals() and test is not None and SAMPLE_SUB_PATH.exists() and best is not None:
    # The sample file supplies the column names (first = id, second = target).
    sub = pd.read_csv(SAMPLE_SUB_PATH)
    sub_id_col = sub.columns[0]
    sub_target_col = sub.columns[1] if len(sub.columns) > 1 else (TARGET if TARGET is not None else 'target')
    preds = best.get('test_cal')
    if preds is None:
        print('No test predictions available to build submission.')
    else:
        if len(preds) != len(sub):
            raise ValueError(
                f'Prediction count ({len(preds)}) does not match sample submission rows ({len(sub)}).'
            )
        # Predictions follow the row order of `test`, so take the ids from
        # `test` whenever its id column is available; otherwise fall back to
        # the sample file's ids.
        if 'ID_COL' in globals() and ID_COL and ID_COL in test.columns:
            sub[sub_id_col] = test[ID_COL].values
        sub[sub_target_col] = preds
        timestamp = time.strftime('%Y%m%d_%H%M%S')
        out_path = SUB_DIR / f'advanced_submission_META_optimized_{timestamp}.csv'
        sub.to_csv(out_path, index=False)
        print('Saved submission to:', out_path)
else:
    print('Submission not created (missing test or sample submission or best).')

In [None]:
# Notes & next steps (ideas only; nothing below is implemented in this notebook)
# - Consider adding LightGBM/CatBoost base models for more diversity (stacking boost).
# - Hyperparameter refinement for XGB depth, learning_rate, n_estimators with early stopping.
# - Feature engineering: interaction terms, target encoding (with CV), monotonic constraints.
# - Robust leakage detection: time-based splits if timestamp exists.
# - Advanced calibration: Platt scaling or ensemble of calibrators.
# - Drift mitigation: reweight training samples by inverse propensity if heavy shift.
print('Notebook complete.')