In [None]:
# 0️⃣ LightGBM Benchmark Model
# Goal: Train a gradient boosting model and compare its performance against the logistic regression + scorecard models.


In [None]:
# 1️⃣ Imports and project paths

import pandas as pd
import numpy as np
from pathlib import Path

# Project root lives under the current user's home directory ("~" is expanded here).
PROJECT_DIR = Path("~/Documents/credit-scoring-home-credit").expanduser()
DATA_PROCESSED = PROJECT_DIR.joinpath("data", "processed")
clean_path = DATA_PROCESSED.joinpath("application_cleaned.csv")

# Read the cleaned application table; the cell output is its (rows, columns) shape.
df = pd.read_csv(clean_path)
df.shape


In [None]:
# 2️⃣ Define target and drop ID columns (keep it simple)

target = "TARGET"

# SK_ID_CURR is only an identifier: remember its name if the column exists,
# but never feed it to the model as a feature.
id_col = "SK_ID_CURR" if "SK_ID_CURR" in df.columns else None

# Features = everything except the target and (when present) the ID column.
X = df.drop(columns=[col for col in (target, id_col) if col])
# Labels as integers.
y = df[target].astype(int)

# Feature-matrix shape and the mean of the labels (share of 1s for a 0/1 target).
(X.shape, y.mean())


In [None]:
# 3️⃣ Train/validation split (stratified)

from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed; stratify=y keeps the class ratio of y
# the same in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

(X_train.shape, X_valid.shape)


In [None]:
# 4️⃣ Import LightGBM and display the installed version
import lightgbm as lgb
lgb.__version__


In [None]:
# 5️⃣ Identify categorical columns and cast to 'category' dtype (LightGBM native support)

# Work on independent copies: the split frames are derived from X, and assigning
# columns on them directly can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_valid = X_valid.copy()

# Select "category" as well as "object" so that re-running this cell after the
# cast has already happened still finds the same columns. With "object" alone,
# a second run would leave cat_cols empty.
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

for c in cat_cols:
    # Infer the category set from the training data only, then reuse that exact
    # dtype for validation so both frames share one encoding. Validation values
    # that never occur in training become NaN (missing).
    X_train[c] = X_train[c].astype("category")
    X_valid[c] = X_valid[c].astype(X_train[c].dtype)

# Number of categorical columns and a preview of their names.
len(cat_cols), cat_cols[:15]


In [None]:
# 6️⃣ Train LightGBM model (baseline)

lgbm = lgb.LGBMClassifier(
    n_estimators=2000,       # upper bound; early stopping below picks the actual count
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    # LightGBM only applies row subsampling when the bagging frequency is > 0;
    # with the default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# The validation fold is used as the early-stopping monitor (AUC), so any metric
# later computed on this same fold is not a fully independent estimate.
lgbm.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    categorical_feature=cat_cols,
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=100)]
)


In [None]:
# 7️⃣ Evaluate LightGBM (ROC-AUC and PR-AUC)

from sklearn.metrics import roc_auc_score, average_precision_score

# Second column of predict_proba = predicted probability of class 1.
p_valid = lgbm.predict_proba(X_valid)[:, 1]

# NOTE: X_valid was also the early-stopping eval_set during fit, so these
# numbers are likely somewhat optimistic compared with a truly held-out set.
roc_auc, pr_auc = (
    metric(y_valid, p_valid)
    for metric in (roc_auc_score, average_precision_score)
)

(roc_auc, pr_auc)


In [None]:
# 8️⃣ KS statistic (credit risk standard metric)

import numpy as np
import pandas as pd

def ks_statistic(y_true, y_score, n_bins=100):
    """Binned Kolmogorov-Smirnov statistic between the bad and good populations.

    Scores are cut into up to ``n_bins`` quantile bins (duplicate bin edges are
    dropped). The result is the largest absolute gap between the cumulative
    share of bads (``y == 1``) and the cumulative share of goods (``y == 0``)
    across those bins, walking from the lowest to the highest score.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_score : array-like of scores, same length as ``y_true``.
    n_bins : int, number of quantile bins requested (default 100).

    Returns
    -------
    float
        The KS value, or NaN when ``y_true`` does not contain both classes
        (one of the cumulative distributions is undefined in that case).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_score`` have different lengths.
    """
    # Pair labels and scores by position. Building the frame straight from two
    # pandas Series would align them on their index labels instead, which
    # silently produces NaNs when the indexes differ.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_score).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true and y_score must have the same length "
            f"({labels.shape[0]} != {scores.shape[0]})"
        )

    data = pd.DataFrame({"y": labels, "score": scores}).sort_values("score")

    n_bad = data["y"].sum()
    n_good = len(data) - n_bad
    if n_bad == 0 or n_good == 0:
        # Only one class present: avoid dividing by zero and say so explicitly.
        return float("nan")

    data["bin"] = pd.qcut(data["score"], q=n_bins, duplicates="drop")
    grouped = data.groupby("bin", observed=False)["y"]

    # Per-bin counts, taken directly as sums rather than via mean * size.
    bad_per_bin = grouped.sum()
    good_per_bin = grouped.size() - bad_per_bin

    bad_cum = bad_per_bin.cumsum() / n_bad
    good_cum = good_per_bin.cumsum() / n_good
    return np.max(np.abs(bad_cum - good_cum))

# KS on the validation labels and predicted probabilities, using the default 100 bins.
ks = ks_statistic(y_valid, p_valid)
ks


In [None]:
# 9️⃣ Feature importance (quick view)

# Pair each training column with the model's built-in importance score and rank
# from most to least important.
# NOTE(review): presumably split-count importance, since importance_type is not
# set on the classifier — confirm against the LightGBM default.
imp = pd.DataFrame(
    {"feature": X_train.columns, "importance": lgbm.feature_importances_}
).sort_values(by="importance", ascending=False)

# Top 20 rows of the ranking.
imp.iloc[:20]


In [None]:
# 🔟 Install SHAP (run once if needed)

import sys

!{sys.executable} -m pip install shap


In [None]:
# 1️⃣1️⃣ Import SHAP and display the installed version
# (this cell only imports; the TreeExplainer is created in a later cell)

import shap
import numpy as np
import pandas as pd

shap.__version__


In [None]:
# 1️⃣2️⃣ Create a smaller sample for SHAP (faster + still representative)

# SHAP can be heavy on large frames, so we explain a random subset of the
# validation set. The request is capped at the number of available rows:
# DataFrame.sample raises a ValueError when asked for more rows than exist
# (sampling without replacement).
sample_size = min(5000, len(X_valid))

X_valid_sample = X_valid.sample(sample_size, random_state=42)

# Important: keep categorical dtypes the same for LightGBM.
# This re-applies the 'category' dtype to every categorical column of the sample;
# it has no effect on columns that already carry it.
for c in cat_cols:
    X_valid_sample[c] = X_valid_sample[c].astype("category")

X_valid_sample.shape


In [None]:
# 1️⃣3️⃣ Compute SHAP values (TreeExplainer)

explainer = shap.TreeExplainer(lgbm)

# For binary classification the result is usually a 2D array of shape
# (n_samples, n_features); the next cell also handles the list-of-arrays form.
shap_values = explainer.shap_values(X_valid_sample)

# Inspect the container type and overall shape before picking a matrix.
(type(shap_values), np.shape(shap_values))


In [None]:
# 1️⃣4️⃣ Select the right SHAP matrix for the "default" class

# The downstream plots and the mean-|SHAP| ranking need a single 2D matrix of
# shape (n_samples, n_features). Normalise the possible output layouts here.
if isinstance(shap_values, list):
    # List layout: one (n_samples, n_features) array per class.
    shap_matrix = shap_values[1]   # class 1 = default
elif np.ndim(shap_values) == 3:
    # Stacked layout, presumably (n_samples, n_features, n_classes) as used by
    # newer SHAP releases for multi-output models — confirm against the installed
    # version. Take the class-1 slice.
    shap_matrix = np.asarray(shap_values)[:, :, 1]
else:
    # Already a single 2D matrix.
    shap_matrix = shap_values

shap_matrix.shape


In [None]:
# 1️⃣5️⃣ Global feature importance (SHAP summary bar plot)
from pathlib import Path
import matplotlib.pyplot as plt

# Figures go under <project>/plots. PROJECT_DIR is the project root defined in
# the first setup cell of this notebook; it is reused here rather than redefined
# so there is a single source of truth for the path.
PLOTS_DIR = PROJECT_DIR / "plots"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

plt.figure()
# show=False leaves the figure open so it can be titled and saved before display.
shap.summary_plot(shap_matrix, X_valid_sample, plot_type="bar", show=False)
plt.title("SHAP Feature Importance (Global)")
plt.savefig(PLOTS_DIR / "shap_feature_importance.png", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# 1️⃣6️⃣ Global explanation (SHAP summary beeswarm plot)

import shap
import matplotlib.pyplot as plt

# Draw without showing, so the title can be added and the image written to disk
# before the figure is displayed and released.
shap.summary_plot(shap_matrix, X_valid_sample, show=False)
plt.gca().set_title("SHAP Summary (How features push risk up/down)")
plt.gcf().savefig(PLOTS_DIR / "shap_summary.png", dpi=300, bbox_inches="tight")

plt.show()
plt.close()


In [None]:
# 1️⃣7️⃣ Dependence plot for the top feature
# (this cell only picks the feature; no plot is drawn here)

# Average |SHAP| per feature over the sampled rows, then take the column whose
# average is largest.
mean_abs_shap = np.mean(np.abs(shap_matrix), axis=0)
top_feature = X_valid_sample.columns[mean_abs_shap.argmax()]

top_feature
