# LightGBM Model with Engineered Features

Train a LightGBM model using engineered features.

**Acceptance Criteria:**
- Model trained successfully
- ROC-AUC improvement over baseline (0.747)
- Hyperparameters documented

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import lightgbm as lgb
lgb.__version__
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
import matplotlib.pyplot as plt

## 1. Load Data

In [None]:
# Read the pre-merged, engineered feature table from disk
df = pd.read_csv('../data/processed/final_train_features.csv')

# Sanity check: overall size of the table and relative frequency of each TARGET value
target_share = df['TARGET'].value_counts(normalize=True).to_dict()
print(f"Dataset shape: {df.shape}")
print(f"Target distribution: {target_share}")

## 2. Preprocessing

In [None]:
# Split the table into the label vector (y) and the feature matrix (X).
# SK_ID_CURR is dropped together with TARGET so it is never used as a predictor
# (presumably a row identifier -- confirm against the data dictionary).
y = df['TARGET']
X = df.drop(columns=['SK_ID_CURR', 'TARGET'])

print(f"Features: {X.shape}")
print(f"Target distribution: {y.value_counts(normalize=True).to_dict()}")

In [None]:
# Group feature columns by storage dtype:
#   - int64 / float64 columns are treated as numeric
#   - object columns are treated as categorical
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

n_numeric = len(numeric_cols)
n_categorical = len(categorical_cols)
print(f"Numeric columns: {n_numeric}")
print(f"Categorical columns: {n_categorical}")

In [None]:
# Cast every object column to pandas' `category` dtype in a single pass.
# LightGBM accepts category-typed columns directly, so no manual encoding is done here.
category_dtypes = {col: 'category' for col in categorical_cols}
X = X.astype(category_dtypes)

print(f"Converted {len(categorical_cols)} columns to category dtype")

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## 3. Train LightGBM Model

In [None]:
# Wrap both partitions in LightGBM Dataset objects.
# The test Dataset is created with `reference=train_data`, which ties it to the
# training Dataset.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

In [None]:
# Hyperparameters passed to lgb.train (insertion order is kept for the printout below)
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
    n_jobs=-1,
)

# Echo the configuration so that it is recorded in the notebook output
print("Hyperparameters:")
print("\n".join(f"  {name}: {setting}" for name, setting in params.items()))

In [None]:
# Train model
#
# Early stopping needs a validation set to monitor. Monitoring the held-out
# test set would tune the number of boosting rounds on the very data that is
# later used to report the final ROC-AUC, which biases that score upwards.
# A separate validation fold is therefore carved out of the training partition
# here, and the test set stays untouched until the evaluation step.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
fit_data = lgb.Dataset(X_fit, label=y_fit)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=fit_data)

model = lgb.train(
    params,
    fit_data,
    num_boost_round=1000,  # upper bound; early stopping normally ends training sooner
    valid_sets=[fit_data, valid_data],
    valid_names=['train', 'valid'],
    callbacks=[
        # Stop once the validation metric has not improved for 50 consecutive rounds
        lgb.early_stopping(stopping_rounds=50),
        # Log the evaluation metrics every 100 rounds
        lgb.log_evaluation(period=100),
    ],
)

print(f"\nBest iteration: {model.best_iteration}")

## 4. Evaluate Model

In [None]:
# Score both partitions using the best iteration recorded during training
best_iter = model.best_iteration
y_train_pred = model.predict(X_train, num_iteration=best_iter)
y_test_pred = model.predict(X_test, num_iteration=best_iter)

# ROC-AUC on each partition
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test, y_test_pred)

# Compare against the baseline ROC-AUC stated in the acceptance criteria
baseline_auc = 0.747
improvement = test_auc - baseline_auc

# Assemble the report first, then emit it in one go
report_lines = [
    "="*60,
    "LIGHTGBM MODEL RESULTS",
    "="*60,
    f"Training ROC-AUC:   {train_auc:.4f}",
    f"Test ROC-AUC:       {test_auc:.4f}",
    "",
    f"Baseline ROC-AUC:   {baseline_auc:.4f}",
    f"Improvement:        {improvement:+.4f} ({improvement/baseline_auc*100:+.2f}%)",
    "="*60,
]
print("\n".join(report_lines))

In [None]:
# Gain-based importance per feature, ranked from most to least important
gain_per_feature = model.feature_importance(importance_type='gain')
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': gain_per_feature}
).sort_values(by='importance', ascending=False)

# Plain-text table of the 20 highest-ranked features
top_20_table = feature_importance.head(20).to_string(index=False)
print("Top 20 Most Important Features:")
print(top_20_table)

In [None]:
# Horizontal bar chart of the 20 highest-ranked features.
# The rows are reversed so that the most important feature is drawn at the top.
top_features = feature_importance.head(20).iloc[::-1]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_features['feature'], top_features['importance'])
ax.set_xlabel('Importance (gain)')
ax.set_title('Top 20 Feature Importances')
fig.tight_layout()
plt.show()

## 5. Model Documentation

### Hyperparameters

| Parameter | Value | Description |
|-----------|-------|-------------|
| objective | binary | Binary classification |
| metric | auc | Evaluation metric (ROC-AUC), monitored for early stopping |
| boosting_type | gbdt | Gradient Boosting Decision Tree |
| num_leaves | 31 | Max leaves per tree |
| learning_rate | 0.05 | Step size shrinkage |
| feature_fraction | 0.8 | Subsample ratio of features |
| bagging_fraction | 0.8 | Subsample ratio of data |
| bagging_freq | 5 | Frequency for bagging |
| early_stopping_rounds | 50 | Stop if no improvement |

### Results Summary

| Metric | Baseline (LogReg) | LightGBM | Improvement |
|--------|-------------------|----------|-------------|
| Test ROC-AUC | 0.747 | TBD (fill in from the Section 4 output) | TBD (fill in from the Section 4 output) |

### Features Used
- Application features (main table)
- Bureau features (credit history)
- Previous application features (past loans)
- Payment balance features (repayment behavior)

In [None]:
# Collect the headline numbers of this run in a single dictionary
results = dict(
    model='LightGBM',
    train_auc=train_auc,
    test_auc=test_auc,
    baseline_auc=baseline_auc,
    improvement=improvement,
    n_features=X.shape[1],
    best_iteration=model.best_iteration,
    hyperparameters=params,
)

# Print every entry except the hyperparameter dict, which is printed in its own cell
print("\nFinal Results Summary:")
for field in results:
    if field == 'hyperparameters':
        continue
    print(f"  {field}: {results[field]}")

Save the trained model to disk so that the SHAP notebook can load it:

In [None]:
# Persist the trained booster so that other notebooks can load it.
from pathlib import Path

import joblib

save_path = '../models/lightgbm_model.pkl'

# joblib.dump does not create missing parent directories; on a fresh checkout
# without a models/ folder it would fail with FileNotFoundError.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE: the file is a pickle -- only load it from a trusted location.
joblib.dump(model, save_path)

print(f"Model saved to: {save_path}")
