# Loan Payback Prediction - LightGBM + Optuna

**Objective:** Achieve **92.7%+ ROC-AUC** using single LightGBM model with Optuna optimization

## Strategy:
1. **Advanced Feature Engineering**: Interactions, polynomials, ratios, target encoding
2. **LightGBM Model**: Fast, efficient gradient boosting
3. **Optuna Hyperparameter Tuning**: 100 trials with intelligent pruning
4. **Single Seed (42)**: Full reproducibility
5. **5-Fold Stratified CV**: Robust validation
6. **Fold-Safe Preprocessing**: No data leakage

In [8]:
# 1) Imports and Configuration
import os
import warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler

warnings.filterwarnings('ignore')

# Configuration
RANDOM_STATE = 42
N_FOLDS = 5
N_OPTUNA_TRIALS = 100

np.random.seed(RANDOM_STATE)

DATA_DIR = Path("Data")

def log(msg: str):
    ts = datetime.now().strftime("%H:%M:%S")
    print(f"[{ts}] {msg}")

log(f"üéØ Target: 92.7% ROC-AUC")
log(f"Random seed: {RANDOM_STATE}")
log(f"CV folds: {N_FOLDS}")
log(f"Optuna trials: {N_OPTUNA_TRIALS}")

[15:35:09] üéØ Target: 92.7% ROC-AUC
[15:35:09] Random seed: 42
[15:35:09] CV folds: 5
[15:35:09] Optuna trials: 100


In [9]:
# 2) Load Data
log("Loading train/test datasets...")

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

log(f"Train shape: {train_df.shape}")
log(f"Test shape: {test_df.shape}")

# Detect target column
target_col = list(set(train_df.columns) - set(test_df.columns))[0]
log(f"Target column: {target_col}")

# Detect ID column
id_col = None
for col in train_df.columns:
    if col == target_col:
        continue
    if col in test_df.columns and train_df[col].is_unique and test_df[col].is_unique:
        id_col = col
        break

log(f"ID column: {id_col}")

# Display info
print("\n" + "="*60)
print("DATASET OVERVIEW")
print("="*60)
print(train_df.head())
print(f"\nTarget distribution:")
print(train_df[target_col].value_counts())
print(f"Imbalance ratio: {train_df[target_col].value_counts().max() / train_df[target_col].value_counts().min():.2f}:1")

[15:35:09] Loading train/test datasets...
[15:35:10] Train shape: (593994, 13)
[15:35:10] Test shape: (254569, 12)
[15:35:10] Target column: loan_paid_back
[15:35:10] ID column: id

DATASET OVERVIEW
   id  annual_income  debt_to_income_ratio  credit_score  loan_amount  \
0   0       29367.99                 0.084           736      2528.42   
1   1       22108.02                 0.166           636      4593.10   
2   2       49566.20                 0.097           694     17005.15   
3   3       46858.25                 0.065           533      4682.48   
4   4       25496.70                 0.053           665     12184.43   

   interest_rate  gender marital_status education_level employment_status  \
0          13.67  Female         Single     High School     Self-employed   
1          12.92    Male        Married        Master's          Employed   
2           9.76    Male         Single     High School          Employed   
3          16.10  Female         Single     High Schoo

In [10]:
# 3) EDA and Missing Values
print("\n" + "="*60)
print("EXPLORATORY DATA ANALYSIS")
print("="*60)

# Missing values
print("\nMissing values in train:")
missing = train_df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
if len(missing) > 0:
    for col, count in missing.items():
        pct = count / len(train_df) * 100
        print(f"  {col}: {count} ({pct:.2f}%)")
else:
    print("  No missing values")

# Column types
numeric_cols_raw = train_df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols_raw = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove target and ID
if target_col in numeric_cols_raw:
    numeric_cols_raw.remove(target_col)
if id_col and id_col in numeric_cols_raw:
    numeric_cols_raw.remove(id_col)
if id_col and id_col in categorical_cols_raw:
    categorical_cols_raw.remove(id_col)

print(f"\nNumeric columns: {len(numeric_cols_raw)}")
print(f"Categorical columns: {len(categorical_cols_raw)}")

# Basic stats
print(f"\nNumeric features summary:")
print(train_df[numeric_cols_raw].describe())


EXPLORATORY DATA ANALYSIS

Missing values in train:
  No missing values

Numeric columns: 5
Categorical columns: 6

Numeric features summary:
       annual_income  debt_to_income_ratio   credit_score    loan_amount  \
count  593994.000000         593994.000000  593994.000000  593994.000000   
mean    48212.202976              0.120696     680.916009   15020.297629   
std     26711.942078              0.068573      55.424956    6926.530568   
min      6002.430000              0.011000     395.000000     500.090000   
25%     27934.400000              0.072000     646.000000   10279.620000   
50%     46557.680000              0.096000     682.000000   15000.220000   
75%     60981.320000              0.156000     719.000000   18858.580000   
max    393381.740000              0.627000     849.000000   48959.950000   

       interest_rate  
count  593994.000000  
mean       12.356345  
std         2.008959  
min         3.200000  
25%        10.990000  
50%        12.370000  
75%        

## Schema Alignment for External Datasets
We will align extra datasets to the test schema. Any fields not present will be set to NaN/"Unknown" and we will flip targets so that `loan_paid_back = 1` consistently means paid back.

In [None]:
# 3b) Align External Datasets to Master Schema

MASTER_COLS = [
    "id",
    "annual_income",
    "debt_to_income_ratio",
    "credit_score",
    "loan_amount",
    "interest_rate",
    "gender",
    "marital_status",
    "education_level",
    "employment_status",
    "loan_purpose",
    "grade_subgrade",
    "loan_paid_back",
]

MASTER_FEATURE_COLS = [c for c in MASTER_COLS if c != "loan_paid_back"]

log("\nAligning external datasets to master schema...")

# Helper mappers

def align_kaggle_train(df: pd.DataFrame) -> pd.DataFrame:
    # Ensure all master columns exist, copy what we have
    aligned = pd.DataFrame()
    for c in MASTER_FEATURE_COLS:
        aligned[c] = df[c] if c in df.columns else np.nan
    aligned["loan_paid_back"] = df[target_col].astype(int)
    return aligned[MASTER_COLS]


def align_lendingclub(df: pd.DataFrame, id_prefix: str = "LC") -> pd.DataFrame:
    aligned = pd.DataFrame()
    n = len(df)
    aligned["id"] = [f"{id_prefix}_{i}" for i in range(n)]
    aligned["annual_income"] = np.exp(df["log.annual.inc"]) if "log.annual.inc" in df.columns else np.nan
    aligned["debt_to_income_ratio"] = df.get("dti", np.nan)
    aligned["credit_score"] = df.get("fico", np.nan)
    aligned["loan_amount"] = np.nan
    aligned["interest_rate"] = df.get("int.rate", np.nan)
    aligned["gender"] = np.nan
    aligned["marital_status"] = np.nan
    aligned["education_level"] = np.nan
    aligned["employment_status"] = np.nan
    aligned["loan_purpose"] = df.get("purpose", np.nan)
    aligned["grade_subgrade"] = np.nan
    # Flip target: not.fully.paid == 1 -> loan_paid_back == 0
    if "not.fully.paid" in df.columns:
        aligned["loan_paid_back"] = 1 - df["not.fully.paid"].astype(int)
    else:
        aligned["loan_paid_back"] = np.nan
    return aligned[MASTER_COLS]


def align_loan_payments(df: pd.DataFrame) -> pd.DataFrame:
    aligned = pd.DataFrame()
    aligned["id"] = df.get("Loan_ID", np.arange(len(df)))
    aligned["annual_income"] = np.nan
    aligned["debt_to_income_ratio"] = np.nan
    aligned["credit_score"] = np.nan
    aligned["loan_amount"] = df.get("Principal", np.nan)
    aligned["interest_rate"] = np.nan
    aligned["gender"] = df.get("Gender", np.nan)
    aligned["marital_status"] = np.nan
    aligned["education_level"] = df.get("education", np.nan)
    aligned["employment_status"] = np.nan
    aligned["loan_purpose"] = np.nan
    aligned["grade_subgrade"] = np.nan
    status_map = {
        "PAIDOFF": 1,
        "COLLECTION_PAIDOFF": 1,
        "COLLECTION": 0,
    }
    if "loan_status" in df.columns:
        aligned["loan_paid_back"] = df["loan_status"].map(status_map)
    else:
        aligned["loan_paid_back"] = np.nan
    return aligned[MASTER_COLS]


# Build aligned DataFrames
kaggle_train_aligned = align_kaggle_train(train_df)

ext_parts = []
# LendingClub-like datasets
for fname in ["loan_data.csv", "loan_data 2.csv"]:
    fpath = DATA_DIR / fname
    if fpath.exists():
        try:
            df_ext = pd.read_csv(fpath)
            ext_parts.append(align_lendingclub(df_ext, id_prefix=fname.split('.')[0]))
            log(f"Aligned {fname}: {df_ext.shape}")
        except Exception as e:
            log(f"Failed to load {fname}: {e}")

# Loan payments dataset
lp_path = DATA_DIR / "Loan payments data.csv"
if lp_path.exists():
    try:
        df_lp = pd.read_csv(lp_path)
        ext_parts.append(align_loan_payments(df_lp))
        log(f"Aligned Loan payments data.csv: {df_lp.shape}")
    except Exception as e:
        log(f"Failed to load Loan payments data.csv: {e}")

# Concatenate all aligned training data
if len(ext_parts) > 0:
    big_train = pd.concat([kaggle_train_aligned] + ext_parts, ignore_index=True)
else:
    big_train = kaggle_train_aligned.copy()

# Test aligned (ensure all features present)
test_aligned = pd.DataFrame()
for c in MASTER_FEATURE_COLS:
    test_aligned[c] = test_df[c] if c in test_df.columns else np.nan

log(f"\nCombined training rows: {len(big_train)} (main={len(kaggle_train_aligned)}, external={len(big_train)-len(kaggle_train_aligned)})")
log(f"Train aligned shape: {big_train.shape}")
log(f"Test aligned shape : {test_aligned.shape}")

# Quick check of target consistency
if big_train["loan_paid_back"].isna().any():
    na_rows = big_train["loan_paid_back"].isna().sum()
    log(f"Warning: {na_rows} rows have missing target (will be dropped before training)")

# Drop rows without target (if any)
big_train = big_train.dropna(subset=["loan_paid_back"]).reset_index(drop=True)
log(f"After drop missing target: {big_train.shape}")

In [11]:
# 4) Feature Engineering Functions

def engineer_features(df, is_train=True, ref_stats=None):
    """
    Advanced feature engineering for loan prediction.
    Includes interactions, ratios, polynomials, and derived features.
    """
    df = df.copy()
    
    if ref_stats is None:
        ref_stats = {}
    
    # 1) Log transforms for skewed features
    if 'annual_income' in df.columns:
        df['annual_income_log'] = np.log1p(df['annual_income'].fillna(0))
        df['annual_income_sqrt'] = np.sqrt(df['annual_income'].fillna(0))
    
    if 'loan_amount' in df.columns:
        df['loan_amount_log'] = np.log1p(df['loan_amount'].fillna(0))
        df['loan_amount_sqrt'] = np.sqrt(df['loan_amount'].fillna(0))
        df['loan_amount_sq'] = df['loan_amount'] ** 2
    
    # 2) Financial ratios
    if 'loan_amount' in df.columns and 'annual_income' in df.columns:
        df['loan_to_income'] = df['loan_amount'] / (df['annual_income'] + 1)
        df['income_to_loan'] = df['annual_income'] / (df['loan_amount'] + 1)
        df['loan_income_ratio_log'] = np.log1p(df['loan_to_income'])
    
    if 'debt_to_income_ratio' in df.columns and 'interest_rate' in df.columns:
        df['dti_times_rate'] = df['debt_to_income_ratio'] * df['interest_rate']
        df['dti_div_rate'] = df['debt_to_income_ratio'] / (df['interest_rate'] + 0.01)
        df['dti_rate_interaction'] = df['debt_to_income_ratio'] * df['interest_rate'] ** 2
    
    # 3) Credit score features
    if 'credit_score' in df.columns:
        # Normalized
        if is_train:
            ref_stats['credit_mean'] = df['credit_score'].mean()
            ref_stats['credit_std'] = df['credit_score'].std()
        
        if ref_stats.get('credit_std', 0) > 0:
            df['credit_score_norm'] = (
                (df['credit_score'] - ref_stats['credit_mean']) / ref_stats['credit_std']
            )
        
        # Bins (convert to string immediately to avoid categorical issues)
        credit_bins = pd.cut(
            df['credit_score'].fillna(650),
            bins=[0, 600, 650, 700, 750, 850],
            labels=['very_low', 'low', 'medium', 'high', 'excellent']
        )
        df['credit_bin'] = credit_bins.astype(str)
        
        # Interactions
        if 'interest_rate' in df.columns:
            df['credit_times_rate'] = df['credit_score'] * df['interest_rate']
            df['credit_div_rate'] = df['credit_score'] / (df['interest_rate'] + 0.01)
        
        if 'loan_amount' in df.columns:
            df['credit_per_loan'] = df['credit_score'] / (df['loan_amount'] + 1)
            df['loan_per_credit'] = df['loan_amount'] / (df['credit_score'] + 1)
        
        if 'debt_to_income_ratio' in df.columns:
            df['dti_times_credit'] = df['debt_to_income_ratio'] * df['credit_score']
            df['dti_div_credit'] = df['debt_to_income_ratio'] / (df['credit_score'] + 1)
    
    # 4) Interest rate features
    if 'interest_rate' in df.columns:
        df['interest_rate_sq'] = df['interest_rate'] ** 2
        df['interest_rate_cube'] = df['interest_rate'] ** 3
        df['interest_rate_log'] = np.log1p(df['interest_rate'])
        
        # High risk indicators
        if is_train:
            ref_stats['rate_75th'] = df['interest_rate'].quantile(0.75)
            ref_stats['rate_90th'] = df['interest_rate'].quantile(0.90)
        
        if 'rate_75th' in ref_stats:
            df['high_interest'] = (df['interest_rate'] > ref_stats['rate_75th']).astype(int)
            df['very_high_interest'] = (df['interest_rate'] > ref_stats['rate_90th']).astype(int)
        
        if 'annual_income' in df.columns:
            df['income_times_rate'] = df['annual_income'] * df['interest_rate']
            df['income_div_rate'] = df['annual_income'] / (df['interest_rate'] + 0.01)
    
    # 5) Loan amount bins (convert to string immediately)
    if 'loan_amount' in df.columns:
        if is_train:
            ref_stats['loan_quantiles'] = df['loan_amount'].quantile([0.25, 0.5, 0.75]).tolist()
        
        if 'loan_quantiles' in ref_stats:
            q25, q50, q75 = ref_stats['loan_quantiles']
            loan_bins = pd.cut(
                df['loan_amount'].fillna(q50),
                bins=[0, q25, q50, q75, float('inf')],
                labels=['small', 'medium', 'large', 'xlarge']
            )
            df['loan_size'] = loan_bins.astype(str)
    
    # 6) Composite risk score
    if all(col in df.columns for col in ['loan_amount', 'annual_income', 'credit_score', 'interest_rate']):
        df['risk_score'] = (
            df['loan_amount'] / (df['annual_income'] + 1) * 
            df['interest_rate'] * 
            (800 - df['credit_score'].fillna(650))
        )
        df['risk_score_log'] = np.log1p(df['risk_score'])
    
    # 7) Missing value indicators
    for col in ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']:
        if col in df.columns:
            df[f'{col}_missing'] = df[col].isnull().astype(int)
    
    # 8) Demographic features (convert to string immediately)
    if 'age' in df.columns:
        age_bins = pd.cut(
            df['age'].fillna(35),
            bins=[0, 25, 35, 45, 55, 100],
            labels=['young', 'young_adult', 'middle', 'mature', 'senior']
        )
        df['age_group'] = age_bins.astype(str)
    
    return df, ref_stats


def target_encode_categorical(train_df, val_df, test_df, cat_cols, target, smoothing=10.0):
    """
    Target encode categorical features with smoothing to prevent overfitting.
    """
    train_enc = train_df.copy()
    val_enc = val_df.copy()
    test_enc = test_df.copy()
    
    global_mean = target.mean()
    
    for col in cat_cols:
        if col not in train_df.columns:
            continue
        
        # Convert to string to avoid categorical dtype issues
        train_col = train_df[col].astype(str)
        val_col = val_df[col].astype(str)
        test_col = test_df[col].astype(str)
        
        # Calculate target statistics per category
        stats = pd.DataFrame({
            col: train_col,
            'target': target
        }).groupby(col)['target'].agg(['sum', 'count'])
        
        # Smoothed encoding
        stats['encoded'] = (
            (stats['sum'] + smoothing * global_mean) / 
            (stats['count'] + smoothing)
        )
        
        encoding_map = stats['encoded'].to_dict()
        
        # Apply encoding (create new float column)
        train_enc[f'{col}_target_enc'] = train_col.map(encoding_map).fillna(global_mean).astype(float)
        val_enc[f'{col}_target_enc'] = val_col.map(encoding_map).fillna(global_mean).astype(float)
        test_enc[f'{col}_target_enc'] = test_col.map(encoding_map).fillna(global_mean).astype(float)
    
    return train_enc, val_enc, test_enc


log("Feature engineering functions defined")

[15:35:10] Feature engineering functions defined


In [12]:
# 5) Preprocessing Pipeline

def preprocess_data(X_train, X_val, X_test, y_train):
    """
    Full preprocessing: feature engineering, encoding, imputation.
    Fits on train, transforms all sets (fold-safe).
    """
    # Engineer features
    X_train_eng, ref_stats = engineer_features(X_train, is_train=True)
    X_val_eng, _ = engineer_features(X_val, is_train=False, ref_stats=ref_stats)
    X_test_eng, _ = engineer_features(X_test, is_train=False, ref_stats=ref_stats)
    
    # Identify column types
    numeric_cols = X_train_eng.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train_eng.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Remove target and ID
    for col in [target_col, id_col]:
        if col in numeric_cols:
            numeric_cols.remove(col)
        if col in categorical_cols:
            categorical_cols.remove(col)
    
    # Target encode categoricals
    X_train_enc, X_val_enc, X_test_enc = target_encode_categorical(
        X_train_eng, X_val_eng, X_test_eng,
        categorical_cols, y_train, smoothing=10.0
    )
    
    # Label encode remaining categoricals for LightGBM
    for col in categorical_cols:
        if col in X_train_enc.columns:
            le = LabelEncoder()
            
            # Fit on train
            X_train_enc[col] = X_train_enc[col].fillna('missing')
            le.fit(X_train_enc[col])
            X_train_enc[col] = le.transform(X_train_enc[col])
            
            # Transform val/test with unseen category handling
            X_val_enc[col] = X_val_enc[col].fillna('missing')
            X_val_enc[col] = X_val_enc[col].map(
                lambda x: x if x in le.classes_ else 'missing'
            )
            X_val_enc[col] = le.transform(X_val_enc[col])
            
            X_test_enc[col] = X_test_enc[col].fillna('missing')
            X_test_enc[col] = X_test_enc[col].map(
                lambda x: x if x in le.classes_ else 'missing'
            )
            X_test_enc[col] = le.transform(X_test_enc[col])
    
    # Fill missing numeric values
    for col in numeric_cols:
        if col in X_train_enc.columns:
            fill_value = X_train_enc[col].median()
            X_train_enc[col] = X_train_enc[col].fillna(fill_value)
            X_val_enc[col] = X_val_enc[col].fillna(fill_value)
            X_test_enc[col] = X_test_enc[col].fillna(fill_value)
    
    # Select features (exclude target and ID)
    feature_cols = [c for c in X_train_enc.columns if c not in [target_col, id_col]]
    
    return X_train_enc[feature_cols], X_val_enc[feature_cols], X_test_enc[feature_cols], categorical_cols


log("Preprocessing pipeline defined")

[15:35:10] Preprocessing pipeline defined


In [13]:
# 6) Prepare Data

# Use the merged/aligned training data and aligned test
train_used = big_train.copy()
test_used = test_aligned.copy()

# Extract target
y = train_used["loan_paid_back"].astype(int).values

# Calculate class weights
n_pos = (y == 1).sum()
n_neg = (y == 0).sum()
scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0

log(f"\nClass distribution: Negative={n_neg}, Positive={n_pos}")
log(f"Scale pos weight: {scale_pos_weight:.4f}")

# Feature columns (exactly the test schema features)
feature_cols = MASTER_FEATURE_COLS
X_raw = train_used[feature_cols].copy()
X_test_raw = test_used[feature_cols].copy()

log(f"Training features: {X_raw.shape}")
log(f"Test features: {X_test_raw.shape}")

[15:35:10] 
Class distribution: Negative=119500, Positive=474494
[15:35:10] Scale pos weight: 0.2518
[15:35:10] Training features: (593994, 11)
[15:35:10] Test features: (254569, 11)


In [14]:
# 7) Optuna Hyperparameter Optimization

def objective(trial):
    """Optuna objective for LightGBM hyperparameter tuning."""
    
    # Suggest hyperparameters
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'seed': RANDOM_STATE,
        'n_jobs': -1,
        
        # Tree structure
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
        
        # Learning
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        
        # Regularization
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 100.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 100.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 10.0),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        
        # Sampling
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        
        # Class imbalance
        'scale_pos_weight': scale_pos_weight,
    }
    
    # Quick 2-fold CV
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_raw, y)):
        X_train_fold = X_raw.iloc[train_idx].copy()
        X_val_fold = X_raw.iloc[val_idx].copy()
        y_train_fold = y[train_idx]
        y_val_fold = y[val_idx]
        
        # Preprocess
        X_train_proc, X_val_proc, _, _ = preprocess_data(
            X_train_fold, X_val_fold, X_test_raw.iloc[:100], y_train_fold
        )
        
        # Train
        train_data = lgb.Dataset(X_train_proc, label=y_train_fold)
        val_data = lgb.Dataset(X_val_proc, label=y_val_fold, reference=train_data)
        
        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[val_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50, verbose=False),
                lgb.log_evaluation(period=0)
            ]
        )
        
        # Score
        y_pred = model.predict(X_val_proc, num_iteration=model.best_iteration)
        score = roc_auc_score(y_val_fold, y_pred)
        scores.append(score)
        
        # Prune unpromising trials
        trial.report(score, fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return np.mean(scores)


# Run optimization
log("\n" + "="*60)
log("üîç OPTUNA HYPERPARAMETER OPTIMIZATION")
log("="*60)

study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=RANDOM_STATE),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=1)
)

study.optimize(objective, n_trials=N_OPTUNA_TRIALS, show_progress_bar=True)

log("\n" + "="*60)
log(f"‚úÖ Best ROC-AUC: {study.best_value:.6f}")
log("="*60)
log("Best hyperparameters:")
for param, value in study.best_params.items():
    log(f"  {param}: {value}")

# Store best params
best_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': RANDOM_STATE,
    'n_jobs': -1,
    'scale_pos_weight': scale_pos_weight,
    **study.best_params
}

[I 2025-11-21 15:35:10,822] A new study created in memory with name: no-name-ff1be21f-c971-4221-b52a-1bb3f97e0a6a


[15:35:10] 
[15:35:10] üîç OPTUNA HYPERPARAMETER OPTIMIZATION


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-11-21 15:35:58,607] Trial 0 finished with value: 0.9175307401059147 and parameters: {'num_leaves': 115, 'max_depth': 15, 'min_child_samples': 152, 'min_child_weight': 0.24810409748678114, 'learning_rate': 0.007979118876474874, 'lambda_l1': 3.6303224667798554e-07, 'lambda_l2': 3.809220577048033e-08, 'min_gain_to_split': 8.661761457749352, 'min_split_gain': 0.6011150117432088, 'feature_fraction': 0.8832290311184182, 'bagging_fraction': 0.608233797718321, 'bagging_freq': 10}. Best is trial 0 with value: 0.9175307401059147.
[I 2025-11-21 15:36:51,645] Trial 1 finished with value: 0.9188126001055377 and parameters: {'num_leaves': 218, 'max_depth': 7, 'min_child_samples': 52, 'min_child_weight': 0.00541524411940254, 'learning_rate': 0.012439367209907218, 'lambda_l1': 0.001768334077666253, 'lambda_l2': 0.00020866527711063722, 'min_gain_to_split': 2.9122914019804194, 'min_split_gain': 0.6118528947223795, 'feature_fraction': 0.6557975442608167, 'bagging_fraction': 0.7168578594140872, 'b

In [15]:
# 8) Train Final Model with 5-Fold CV

log("\n" + "="*60)
log("üöÄ TRAINING FINAL MODEL")
log("="*60)

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

oof_predictions = np.zeros(len(X_raw))
test_predictions = np.zeros(len(X_test_raw))
feature_importance = pd.DataFrame()
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_raw, y), 1):
    log(f"\n{'='*60}")
    log(f"Fold {fold}/{N_FOLDS}")
    log(f"{'='*60}")
    
    # Split
    X_train_fold = X_raw.iloc[train_idx].copy()
    X_val_fold = X_raw.iloc[val_idx].copy()
    y_train_fold = y[train_idx]
    y_val_fold = y[val_idx]
    
    log(f"Train: {len(X_train_fold)}, Val: {len(X_val_fold)}")
    
    # Preprocess
    X_train_proc, X_val_proc, X_test_proc, _ = preprocess_data(
        X_train_fold, X_val_fold, X_test_raw, y_train_fold
    )
    
    log(f"Features: {X_train_proc.shape[1]}")
    
    # Create datasets
    train_data = lgb.Dataset(X_train_proc, label=y_train_fold)
    val_data = lgb.Dataset(X_val_proc, label=y_val_fold, reference=train_data)
    
    # Train
    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=5000,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=150, verbose=False),
            lgb.log_evaluation(period=200)
        ]
    )
    
    # Predict
    oof_predictions[val_idx] = model.predict(X_val_proc, num_iteration=model.best_iteration)
    test_predictions += model.predict(X_test_proc, num_iteration=model.best_iteration) / N_FOLDS
    
    # Score
    fold_auc = roc_auc_score(y_val_fold, oof_predictions[val_idx])
    fold_scores.append(fold_auc)
    log(f"Fold {fold} ROC-AUC: {fold_auc:.6f}")
    
    # Feature importance
    fold_fi = pd.DataFrame({
        'feature': X_train_proc.columns,
        'importance': model.feature_importance(importance_type='gain'),
        'fold': fold
    })
    feature_importance = pd.concat([feature_importance, fold_fi], axis=0)

# Overall OOF
oof_auc = roc_auc_score(y, oof_predictions)

log("\n" + "="*60)
log("üéØ FINAL RESULTS")
log("="*60)
log(f"Overall OOF ROC-AUC: {oof_auc:.6f}")
log(f"Fold scores: {[f'{s:.6f}' for s in fold_scores]}")
log(f"Mean ¬± Std: {np.mean(fold_scores):.6f} ¬± {np.std(fold_scores):.6f}")

if oof_auc >= 0.927:
    log("‚úÖ TARGET ACHIEVED: 92.7%+ ROC-AUC!")
else:
    log(f"‚ö†Ô∏è  Gap to target: {0.927 - oof_auc:.6f}")

log("="*60)

[16:54:59] 
[16:54:59] üöÄ TRAINING FINAL MODEL
[16:54:59] 
[16:54:59] Fold 1/5
[16:54:59] Train: 475195, Val: 118799
[16:54:59] Train: 475195, Val: 118799
[16:55:07] Features: 53
[16:55:07] Features: 53
[200]	train's auc: 0.920889	valid's auc: 0.920195
[200]	train's auc: 0.920889	valid's auc: 0.920195
[400]	train's auc: 0.925067	valid's auc: 0.922068
[400]	train's auc: 0.925067	valid's auc: 0.922068
[600]	train's auc: 0.927751	valid's auc: 0.922772
[600]	train's auc: 0.927751	valid's auc: 0.922772
[800]	train's auc: 0.929754	valid's auc: 0.923155
[800]	train's auc: 0.929754	valid's auc: 0.923155
[1000]	train's auc: 0.930974	valid's auc: 0.923384
[1000]	train's auc: 0.930974	valid's auc: 0.923384
[16:55:45] Fold 1 ROC-AUC: 0.923386
[16:55:45] 
[16:55:45] Fold 2/5
[16:55:45] Train: 475195, Val: 118799
[16:55:45] Fold 1 ROC-AUC: 0.923386
[16:55:45] 
[16:55:45] Fold 2/5
[16:55:45] Train: 475195, Val: 118799
[16:55:52] Features: 53
[16:55:52] Features: 53
[200]	train's auc: 0.921	valid's 

In [16]:
# 9) Performance Analysis

# Find optimal threshold
thresholds = np.linspace(0.1, 0.9, 17)
best_f1 = 0
best_threshold = 0.5

for thr in thresholds:
    pred_binary = (oof_predictions >= thr).astype(int)
    f1 = f1_score(y, pred_binary)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thr

# Metrics at optimal threshold
pred_binary = (oof_predictions >= best_threshold).astype(int)
acc = accuracy_score(y, pred_binary)
logloss = log_loss(y, oof_predictions)

print("\n" + "="*60)
print("üìä PERFORMANCE SUMMARY")
print("="*60)
print(f"ROC-AUC    : {oof_auc:.6f} {'‚úÖ' if oof_auc >= 0.927 else '‚ö†Ô∏è'}")
print(f"Accuracy   : {acc:.6f}")
print(f"F1 Score   : {best_f1:.6f}")
print(f"Log Loss   : {logloss:.6f}")
print(f"Threshold  : {best_threshold:.3f}")
if oof_auc >= 0.927:
    print(f"üéØ TARGET ACHIEVED! ({oof_auc:.6f} >= 0.927)")
else:
    print(f"Gap to target: {0.927 - oof_auc:.6f}")
print("="*60)

# Top features
print("\nüîù Top 20 Most Important Features:")
print("="*60)
fi_agg = feature_importance.groupby('feature')['importance'].mean()
fi_agg = fi_agg.sort_values(ascending=False)

for i, (feat, imp) in enumerate(fi_agg.head(20).items(), 1):
    print(f"{i:2d}. {feat:45s} {imp:10.2f}")
print("="*60)


üìä PERFORMANCE SUMMARY
ROC-AUC    : 0.922289 ‚ö†Ô∏è
Accuracy   : 0.905718
F1 Score   : 0.943278
Log Loss   : 0.324562
Threshold  : 0.200
Gap to target: 0.004711

üîù Top 20 Most Important Features:
 1. employment_status                              777559.11
 2. dti_div_credit                                 283631.90
 3. employment_status_target_enc                   128216.61
 4. credit_score                                   113451.14
 5. debt_to_income_ratio                            86223.55
 6. dti_times_rate                                  33180.76
 7. grade_subgrade                                  24511.68
 8. credit_score_norm                               18403.53
 9. dti_rate_interaction                            12458.81
10. loan_amount                                     10214.57
11. annual_income                                    7716.93
12. interest_rate                                    6095.89
13. dti_times_credit                                 4435.21
14. g

In [17]:
# 10) Generate Submission

log("\nüìù Generating submission file...")

submission = pd.DataFrame()
if id_col and id_col in test_df.columns:
    submission[id_col] = test_df[id_col]
else:
    submission['id'] = np.arange(len(test_df))

submission[target_col] = test_predictions

# Save
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
sub_path = Path(f"submissions/lgbm_optuna_{oof_auc:.4f}_{timestamp}.csv")
sub_path.parent.mkdir(exist_ok=True, parents=True)
submission.to_csv(sub_path, index=False)

log(f"‚úÖ Submission saved: {sub_path.name}")
log(f"   OOF ROC-AUC: {oof_auc:.6f}")
log(f"   Predictions range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")

print("\nSubmission preview:")
print(submission.head(10))
print(f"\nShape: {submission.shape}")

[16:58:39] 
üìù Generating submission file...
[16:58:40] ‚úÖ Submission saved: lgbm_optuna_0.9223_20251121_165839.csv
[16:58:40]    OOF ROC-AUC: 0.922289
[16:58:40]    Predictions range: [0.0005, 0.9994]

Submission preview:
       id  loan_paid_back
0  593994        0.749214
1  593995        0.918495
2  593996        0.215136
3  593997        0.742797
4  593998        0.891635
5  593999        0.925256
6  594000        0.955934
7  594001        0.877053
8  594002        0.804365
9  594003        0.001436

Shape: (254569, 2)
[16:58:40] ‚úÖ Submission saved: lgbm_optuna_0.9223_20251121_165839.csv
[16:58:40]    OOF ROC-AUC: 0.922289
[16:58:40]    Predictions range: [0.0005, 0.9994]

Submission preview:
       id  loan_paid_back
0  593994        0.749214
1  593995        0.918495
2  593996        0.215136
3  593997        0.742797
4  593998        0.891635
5  593999        0.925256
6  594000        0.955934
7  594001        0.877053
8  594002        0.804365
9  594003        0.001436

Sh

In [18]:
# 11) Save Feature Importance

# Save detailed feature importance
fi_path = Path(f"visualizations/feature_importance_{timestamp}.csv")
fi_path.parent.mkdir(exist_ok=True, parents=True)

fi_summary = feature_importance.groupby('feature')['importance'].agg(['mean', 'std'])
fi_summary = fi_summary.sort_values('mean', ascending=False)
fi_summary.to_csv(fi_path)

log(f"\n‚úÖ Feature importance saved: {fi_path.name}")
log(f"\n{'='*60}")
log("üèÅ PIPELINE COMPLETE!")
log(f"{'='*60}")

[16:58:40] 
‚úÖ Feature importance saved: feature_importance_20251121_165839.csv
[16:58:40] 
[16:58:40] üèÅ PIPELINE COMPLETE!
