In [None]:
# 0. Scorecard (WOE) notebook
# Goal: Build an interpretable credit scorecard using manual binning + WOE + Logistic Regression


In [None]:
# 1. Imports and project paths

from pathlib import Path

import numpy as np
import pandas as pd

# Project root under the user's home directory ("~" is expanded at runtime).
PROJECT_DIR = Path("~/Documents/credit-scoring-home-credit").expanduser()

# Folder holding processed datasets; created together with any missing parents.
DATA_PROCESSED = PROJECT_DIR.joinpath("data", "processed")
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)


In [None]:
# 2. Load the cleaned dataset (written by the previous notebook, per the original note)

clean_path = DATA_PROCESSED.joinpath("application_cleaned.csv")
df_enhanced = pd.read_csv(filepath_or_buffer=clean_path)

# Show (rows, columns) as a quick check that the load worked.
df_enhanced.shape


In [None]:
# 3. Select a small set of variables for the scorecard

scorecard_vars = [
    # numeric candidates
    "EXT_SOURCE_2", "EXT_SOURCE_3", "AMT_CREDIT", "AMT_GOODS_PRICE",
    # categorical candidates
    "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "OCCUPATION_TYPE",
    # modelling target
    "TARGET",
]

# Work on an independent copy so later column additions never touch df_enhanced.
df_sc = df_enhanced.loc[:, scorecard_vars].copy()
df_sc.shape


In [None]:
# Give missing categorical values an explicit "MISSING" level so they form their
# own group when WOE is computed (pandas groupby drops NaN keys by default).

categorical_vars = ["NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "OCCUPATION_TYPE"]

df_sc[categorical_vars] = df_sc[categorical_vars].fillna("MISSING")

# Remaining NaN count per categorical column.
df_sc[categorical_vars].isna().sum()


In [None]:
# 4. Create quantile bins and handle missing values explicitly

numeric_vars = ["EXT_SOURCE_2", "EXT_SOURCE_3", "AMT_CREDIT", "AMT_GOODS_PRICE"]

for col in numeric_vars:
    bin_col = f"{col}_BIN"

    # Up to five quantile bins (duplicate edges are dropped), with the interval
    # labels rendered as plain strings.
    interval_labels = pd.qcut(df_sc[col], q=5, duplicates="drop").astype(str)

    # Rows whose raw value is missing get an explicit label of their own instead
    # of the string produced by the cast above.
    df_sc[bin_col] = interval_labels.mask(df_sc[col].isna(), "MISSING")


In [None]:
# 5. Compute WOE and IV (with additive smoothing to avoid infinite values)

def compute_woe_iv(df, feature, target="TARGET", smoothing=0.5):
    """Compute the Weight-of-Evidence table and Information Value of one feature.

    Rows are grouped by ``feature``. For every level the table holds the number
    of rows with a non-null target (``total``), the sum of the target (``bad``)
    and their difference (``good``). Each count is smoothed by adding
    ``smoothing`` before it is turned into a share of all bads / all goods, and
    the denominators are increased by ``smoothing * number_of_levels`` so that
    the smoothed ``bad_dist`` and ``good_dist`` columns each sum to exactly 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``feature`` and ``target``.
    feature : str
        Column to group by. Rows where it is NaN are dropped by the groupby.
    target : str, default "TARGET"
        Column that is summed to count bads (assumed to be coded 0/1 with
        1 = bad; confirm against the dataset).
    smoothing : float, default 0.5
        Pseudo-count added to every level's good and bad count. ``0`` disables
        smoothing, in which case levels with no goods or no bads yield
        infinite WOE.

    Returns
    -------
    (pandas.DataFrame, float)
        The per-level summary (columns ``total``, ``bad``, ``good``,
        ``bad_dist``, ``good_dist``, ``woe``, ``iv``; indexed by level) and the
        total Information Value. ``woe`` is ``ln(good_dist / bad_dist)``, so
        larger values mean relatively more goods.

    Raises
    ------
    ValueError
        If ``smoothing`` is negative.
    """
    if smoothing < 0:
        raise ValueError("smoothing must be non-negative")

    summary = df.groupby(feature)[target].agg(total="count", bad="sum")
    summary["good"] = summary["total"] - summary["bad"]

    n_levels = len(summary)
    total_bad = summary["bad"].sum()
    total_good = summary["good"].sum()

    # Additive smoothing: every level receives the same pseudo-count, therefore
    # the denominator must grow by pseudo-count * number of levels; otherwise
    # the smoothed shares would not add up to one.
    summary["bad_dist"] = (summary["bad"] + smoothing) / (total_bad + smoothing * n_levels)
    summary["good_dist"] = (summary["good"] + smoothing) / (total_good + smoothing * n_levels)

    summary["woe"] = np.log(summary["good_dist"] / summary["bad_dist"])
    summary["iv"] = (summary["good_dist"] - summary["bad_dist"]) * summary["woe"]

    return summary, summary["iv"].sum()


In [None]:
# 6. Compute IV for all scorecard variables

features_to_evaluate = [
    # binned numeric variables
    "EXT_SOURCE_2_BIN", "EXT_SOURCE_3_BIN", "AMT_CREDIT_BIN", "AMT_GOODS_PRICE_BIN",
    # categorical variables
    "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "OCCUPATION_TYPE",
]

# compute_woe_iv returns (table, iv); only the IV is kept here.
iv_results = {
    feature: compute_woe_iv(df_sc, feature)[1]
    for feature in features_to_evaluate
}

iv_results


In [None]:
# 7. Inspect WOE table for EXT_SOURCE_3_BIN

# Keep both the per-bin table and the total IV; only the table is displayed.
woe_ext3, iv_ext3 = compute_woe_iv(df=df_sc, feature="EXT_SOURCE_3_BIN")

woe_ext3


In [None]:
# 8. Sort IV results from strongest to weakest

# Rank the feature names by their IV (descending), then rebuild the dict in
# that order.
ranked_features = sorted(iv_results, key=iv_results.get, reverse=True)
iv_sorted = {name: iv_results[name] for name in ranked_features}
iv_sorted


In [None]:
# 9. Create WOE mapping dictionary: {feature: {level: woe}}
#
# NOTE(review): the maps are fitted on all of df_sc, i.e. before the
# train/validation split done later in this notebook, so validation-row targets
# influence the encoding. Consider fitting the maps on training rows only.

woe_maps = {}

for feature in features_to_evaluate:
    woe_table = compute_woe_iv(df_sc, feature)[0]
    level_to_woe = woe_table["woe"].to_dict()
    woe_maps[feature] = level_to_woe


In [None]:
# üîü Transform variables into WOE values

# Names of the WOE-encoded columns, one per evaluated feature.
woe_columns = [f"{feature}_WOE" for feature in features_to_evaluate]

df_woe = df_sc.copy()

# Look every level up in the feature's WOE map.
for feature, woe_col in zip(features_to_evaluate, woe_columns):
    df_woe[woe_col] = df_sc[feature].map(woe_maps[feature])

# Modelling frame: the WOE columns followed by the target.
df_woe_model = df_woe[[*woe_columns, "TARGET"]].copy()

df_woe_model.head()


In [None]:
# Count NaN per column of the modelling frame; a level absent from its WOE map
# would show up here as a missing value.
missing_per_column = df_woe_model.isna().sum()
missing_per_column


In [None]:
# 11. Prepare X (WOE features) and y (target)

# Target as integers; predictors are every remaining column.
y = df_woe_model["TARGET"].astype(int)
X = df_woe_model.drop("TARGET", axis=1)

X.shape, y.shape


In [None]:
# 12. Train/validation split (stratified)

from sklearn.model_selection import train_test_split

# 20% of rows go to validation; stratifying on y keeps the class mix the same in
# both parts, and the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}

X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

X_train.shape, X_valid.shape


In [None]:
# 13. Train logistic regression on WOE features (scorecard model)

from sklearn.linear_model import LogisticRegression

# NOTE(review): class_weight="balanced" reweights the classes during fitting, so
# the predicted probabilities are presumably not calibrated to the sample
# default rate. Confirm before reading them as PDs.
logit_options = dict(solver="lbfgs", max_iter=2000, class_weight="balanced")
woe_logit = LogisticRegression(**logit_options)

woe_logit.fit(X_train, y_train)


In [None]:
# 14. Evaluate scorecard model (ROC-AUC and PR-AUC)

from sklearn.metrics import roc_auc_score, average_precision_score

# Second column of predict_proba: probability of the second class
# (TARGET == 1 when the labels are 0/1).
p_valid = woe_logit.predict_proba(X_valid)[:, 1]

# Both metrics take (true labels, predicted scores).
roc_auc, pr_auc = (
    metric(y_valid, p_valid)
    for metric in (roc_auc_score, average_precision_score)
)

roc_auc, pr_auc


In [None]:
# 15. Compute KS statistic (credit risk standard metric)

import numpy as np
import pandas as pd

def ks_statistic(y_true, y_score, n_bins=100):
    """Kolmogorov-Smirnov separation between the score distributions of bads and goods.

    The statistic is the largest absolute gap between the cumulative share of
    bads (``y_true == 1``) and the cumulative share of goods (``y_true == 0``)
    when observations are ordered by ``y_score``.

    Parameters
    ----------
    y_true : array-like
        Binary labels (1 = bad, 0 = good).
    y_score : array-like
        Scores, paired with ``y_true`` by position. Any pandas index on either
        input is ignored so that differently indexed Series cannot be
        misaligned.
    n_bins : int or None, default 100
        Number of quantile bins in which the cumulative shares are compared
        (duplicate bin edges are dropped). ``None`` compares them at every
        distinct score, which gives the exact statistic.

    Returns
    -------
    float
        KS statistic.

    Raises
    ------
    ValueError
        If the inputs differ in length, or ``y_true`` does not contain both
        classes (the statistic is undefined then).
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_score, dtype=float).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError("y_true and y_score must have the same length")

    data = pd.DataFrame({"y": labels, "score": scores})

    n_bad = data["y"].sum()
    n_good = (1 - data["y"]).sum()
    if n_bad == 0 or n_good == 0:
        raise ValueError("KS is undefined unless y_true contains both classes")

    if n_bins is None:
        grouped = data.groupby("score")["y"]
    else:
        data["bin"] = pd.qcut(data["score"], q=n_bins, duplicates="drop")
        # observed=False keeps empty bins; they add zero to both cumulative
        # sums and therefore cannot change the maximum gap.
        grouped = data.groupby("bin", observed=False)["y"]

    bad_per_group = grouped.sum()
    good_per_group = grouped.size() - bad_per_group

    bad_cum = bad_per_group.cumsum() / n_bad
    good_cum = good_per_group.cumsum() / n_good

    return (bad_cum - good_cum).abs().max()

# KS of the model's predicted probabilities on the validation set.
ks = ks_statistic(y_true=y_valid, y_score=p_valid)
ks


In [None]:
# 16. Inspect scorecard model coefficients (interpretation)

# coef_ has one row per fitted decision function; row 0 is used here.
betas = woe_logit.coef_[0]

coef_columns = {
    "feature": X.columns,
    "coefficient": betas,
    "odds_ratio": np.exp(betas),  # multiplicative change in odds per unit of WOE
}
coef_df = pd.DataFrame(coef_columns).sort_values(by="coefficient", ascending=False)

coef_df


In [None]:
# 17. Choose score scaling parameters (Base Score + PDO)

# Base score: the score assigned at a chosen good:bad odds level
# (odds of 50:1 correspond to PD = 1/51, about 1.96%).
BASE_SCORE = 600
BASE_ODDS = 50   # good:bad odds (50 means 50 non-default for 1 default)

# Points to Double the Odds (PDO): every +PDO points doubles the good:bad odds,
# i.e. halves the odds of default.
PDO = 50


In [None]:
# 18. Convert scaling parameters into A and B for: Score = A - B * log_odds
#
# log_odds is the model's log(PD / (1 - PD)), i.e. the log of the bad:good odds:
#     Score = A - B * ln(bad:good odds) = A + B * ln(good:bad odds)
# Adding PDO points must double the good:bad odds   ->  B = PDO / ln(2)
# Score must equal BASE_SCORE at good:bad == BASE_ODDS
#                                                   ->  A = BASE_SCORE - B * ln(BASE_ODDS)
# (With "+" instead of "-" the anchor would sit 2 * B * ln(BASE_ODDS) points too high.)

import numpy as np

B = PDO / np.log(2)                     # points per unit of log-odds
A = BASE_SCORE - B * np.log(BASE_ODDS)  # offset that anchors BASE_SCORE at BASE_ODDS

A, B


In [None]:
# 19. Compute log-odds, PD, and credit score on the validation set

# Linear predictor of the logistic regression:
#   log(p / (1 - p)) = intercept + X @ beta
intercept = woe_logit.intercept_[0]
betas = woe_logit.coef_[0]
log_odds_valid = intercept + X_valid.to_numpy() @ betas

# Logistic transform of the log-odds gives the predicted probability.
pd_valid = 1.0 / (1.0 + np.exp(-log_odds_valid))

# Linear rescaling of the log-odds into score points (higher = lower risk).
score_valid = A - B * log_odds_valid

pd_valid[:5], score_valid[:5]


In [None]:
# 20. Put results into a single dataframe (score + PD + actual target)

# assign() returns a copy of X_valid with the three extra columns appended;
# the target is attached by position, not by index.
valid_scored = X_valid.assign(
    PD=pd_valid,
    SCORE=score_valid,
    TARGET=y_valid.to_numpy(),
)

valid_scored.loc[:, ["PD", "SCORE", "TARGET"]].head(10)


In [None]:
# 21. Check score summary and default rate by score decile

# Ten quantile bins of the score (fewer if bin edges coincide).
valid_scored["score_decile"] = pd.qcut(valid_scored["SCORE"], q=10, duplicates="drop")

# Output column -> (input column, aggregation)
decile_aggregations = {
    "n": ("TARGET", "count"),
    "default_rate": ("TARGET", "mean"),
    "avg_score": ("SCORE", "mean"),
    "avg_pd": ("PD", "mean"),
}

decile_summary = (
    valid_scored
    .groupby("score_decile", observed=False)
    .agg(**decile_aggregations)
    .sort_index(ascending=False)  # highest-score decile first
)

decile_summary


In [None]:
# 22. Plot score distribution for goods vs bads
from pathlib import Path
import matplotlib.pyplot as plt

# Same project root as the setup cell; the figure is written to <project>/plots.
PROJECT_DIR = Path("~/Documents/credit-scoring-home-credit").expanduser()
PLOTS_DIR = PROJECT_DIR / "plots"
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# One set of bin edges for both classes. Letting each histogram choose its own
# 50 bins would give the two classes different bar widths and positions, which
# makes the counts impossible to compare bar by bar.
score_bin_edges = np.histogram_bin_edges(valid_scored["SCORE"], bins=50)

fig, ax = plt.subplots()
for target_value, label in ((0, "Non-default (0)"), (1, "Default (1)")):
    ax.hist(
        valid_scored.loc[valid_scored["TARGET"] == target_value, "SCORE"],
        bins=score_bin_edges,
        alpha=0.6,
        label=label,
    )
ax.set_xlabel("Credit Score")
ax.set_ylabel("Count")
ax.set_title("Score Distribution (Validation)")
ax.legend()
fig.savefig(PLOTS_DIR / "score_distribution_validation.png", dpi=300, bbox_inches="tight")
