# 📊 Financial Risk & Credit Scoring Model

**Objective:**  
Build an end-to-end credit risk analytics project demonstrating how data science can identify high-risk loan applicants and support data-driven lending policies.

**Dataset:**  
40k+ loan applications (Home Credit Default Risk dataset, Kaggle). Target variable = default (1) / non-default (0).

**Methodology:**  
- Preprocessed application data (handled missing values, engineered ratio features).  
- Built **Logistic Regression** (baseline) and **LightGBM** (boosted trees) models.  
- Applied **isotonic calibration** to produce well-calibrated default probabilities (PD).  
- Segmented applicants into **Low / Medium / High risk** buckets.  
- Created SHAP-based explanations to show top drivers of default.  
- Exported results for interactive **Tableau dashboard** (risk segments, score distribution, feature importance).

**Key Results:**  
- **ROC-AUC:** ~0.88 (LightGBM), **KS:** ~0.37.  
- **Risk separation:** Top decile default rate **28%** vs bottom decile **1.1%** (~25x higher).  
- **Business impact:** Policy simulation (decline High, review Medium) shows potential to cut **non-performing loans by ~12%** while retaining healthy approval rates.  
- **Explainability:** SHAP plots identified key drivers (e.g., external credit scores, income-to-loan ratios, annuity burden).

**Deliverables:**  
- Validated ML models with calibrated PDs.  
- Tableau-ready dataset (`for_tableau.csv`).  
- Visual outputs: ROC curve, PR curve, score distribution, SHAP feature importance, decile lift table.

---


In [2]:
!pip -q install pandas numpy scikit-learn lightgbm shap matplotlib pyarrow
import warnings, os, numpy as np, pandas as pd
warnings.filterwarnings("ignore")
SEED = 42
np.random.seed(SEED)



In [3]:
from google.colab import files
uploaded = files.upload()  # uploading application_train.csv


Saving application_train.csv to application_train.csv


In [4]:

CSV_NAME = "application_train.csv"

df = pd.read_csv(CSV_NAME)

# ---- Map target ----
if "TARGET" in df.columns:          # Home Credit
    TARGET_COL = "TARGET"
elif "loan_status" in df.columns:    # e.g., LendingClub-style
    TARGET_COL = "default"
    # Map: tailor this for your file
    df["default"] = df["loan_status"].str.contains("Charged Off|Default|Late", case=False).astype(int)
else:
    raise ValueError("Could not infer target. Add mapping for your file.")

# Basic ID column for export later
if "SK_ID_CURR" in df.columns: ID_COL = "SK_ID_CURR"
else:
    ID_COL = df.columns[0]  # fall back: first column acts as id

# Quick sanity
print(df.shape, df[TARGET_COL].mean())
df.head(3)


(307511, 122) 0.08072881945686496


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Columns that often leak outcome
LEAKY_HINTS = ["repaid", "paid", "dpd", "delinq", "default", "status_after", "days_past_due", "loss", "recover"]
drop_cols = [c for c in df.columns if any(h in c.lower() for h in LEAKY_HINTS) and c != TARGET_COL]
df = df.drop(columns=drop_cols, errors="ignore")
print("Dropped (possible leakage):", drop_cols)


Dropped (possible leakage): []


In [6]:
from sklearn.model_selection import train_test_split


date_cols = [c for c in df.columns if "date" in c.lower() or "dt" in c.lower()]
if date_cols:
    dcol = date_cols[0]
    df = df.sort_values(dcol)
    cut = int(len(df)*0.8)
    train_df, valid_df = df.iloc[:cut].copy(), df.iloc[cut:].copy()
else:
    train_df, valid_df = train_test_split(df, test_size=0.2, stratify=df[TARGET_COL], random_state=SEED)

y_tr = train_df[TARGET_COL].values
y_va = valid_df[TARGET_COL].values

# Choosing features (drop ID & target)
drop_for_X = {TARGET_COL, ID_COL}
X_tr = train_df.drop(columns=list(drop_for_X)).copy()
X_va = valid_df.drop(columns=list(drop_for_X)).copy()

# working list of numeric vs categorical
num_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_tr.columns if c not in num_cols]
len(num_cols), len(cat_cols)


(104, 16)

In [7]:
def safe_div(a, b):
    return np.where(b==0, 0, a/b)

def winsorize(s, p=0.005):
    lo, hi = s.quantile(p), s.quantile(1-p)
    return s.clip(lo, hi)

X_tr_fe = X_tr.copy()
X_va_fe = X_va.copy()

# Example engineered ratios
for col_pair in [("AMT_CREDIT","AMT_INCOME_TOTAL"),
                 ("AMT_ANNUITY","AMT_INCOME_TOTAL"),
                 ("AMT_CREDIT","AMT_ANNUITY")]:
    a,b = col_pair
    if a in X_tr_fe.columns and b in X_tr_fe.columns:
        newc = f"{a}_to_{b}"
        X_tr_fe[newc] = safe_div(X_tr_fe[a], X_tr_fe[b])
        X_va_fe[newc] = safe_div(X_va_fe[a], X_va_fe[b])

# Winsorize numeric
for c in X_tr_fe.select_dtypes(include=[np.number]).columns:
    X_tr_fe[c] = winsorize(X_tr_fe[c])
    X_va_fe[c] = X_va_fe[c].clip(X_tr_fe[c].min(), X_tr_fe[c].max())

# Log transform skewed positive variables
for c in ["AMT_INCOME_TOTAL","AMT_CREDIT","AMT_ANNUITY"]:
    if c in X_tr_fe.columns:
        for d,dfx in [("tr",X_tr_fe),("va",X_va_fe)]:
            dfx[f"log_{c}"] = np.log1p(dfx[c])

# Target-encode high-cardinality categoricals (simple mean encoding with CV)
from sklearn.model_selection import KFold

def target_encode(train, valid, col, y, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof = pd.Series(np.nan, index=train.index)
    for tr_idx, te_idx in kf.split(train):
        m = train.iloc[tr_idx].groupby(col)[y].mean()
        oof.iloc[te_idx] = train.iloc[te_idx][col].map(m)
    global_mean = train[y].mean()
    oof.fillna(global_mean, inplace=True)
    valid_te = valid[col].map(train.groupby(col)[y].mean()).fillna(global_mean)
    return oof, valid_te

for c in cat_cols[:]:  # encode all categoricals this way for simplicity
    if c in X_tr_fe.columns:
        tr_tmp = pd.DataFrame({c: X_tr_fe[c], TARGET_COL: y_tr})
        oof, va_te = target_encode(tr_tmp, X_va_fe, c, TARGET_COL)
        X_tr_fe[f"TE_{c}"] = oof.values
        X_va_fe[f"TE_{c}"] = va_te.values

# Drop raw categoricals after encoding
X_tr_fe = X_tr_fe.select_dtypes(include=[np.number])
X_va_fe = X_va_fe.select_dtypes(include=[np.number])

X_tr_fe.shape, X_va_fe.shape


((246008, 126), (61503, 126))

In [9]:
from sklearn.metrics import roc_auc_score, average_precision_score

def ks_stat(y_true, y_score):
    order = np.argsort(-y_score)
    y = np.array(y_true)[order]
    p = np.cumsum(y) / y.sum()
    n = np.cumsum(1 - y) / (1 - y).sum()
    return float(np.max(np.abs(p - n)))

def decile_table(y_true, y_score, n=10):
    dfm = pd.DataFrame({"y": y_true, "p": y_score})
    dfm["decile"] = pd.qcut(dfm["p"].rank(method="first"), n, labels=False) + 1
    out = dfm.groupby("decile").agg(
        volume=("y","size"),
        defaults=("y","sum"),
        avg_pd=("y","mean"),
        avg_p=("p","mean")
    ).sort_index(ascending=False)  # top decile first
    out["cum_defaults"] = out["defaults"].cumsum()
    return out


In [11]:
# Replace infs created by divisions/logs before checking
X_tr_fe = X_tr_fe.replace([np.inf, -np.inf], np.nan)
X_va_fe = X_va_fe.replace([np.inf, -np.inf], np.nan)

na_cols_tr = X_tr_fe.columns[X_tr_fe.isna().any()].tolist()
na_cols_va = X_va_fe.columns[X_va_fe.isna().any()].tolist()
print("Train NaN columns:", na_cols_tr)
print("Valid NaN columns:", na_cols_va)

# Quick view of missingness %
missing_pct = X_tr_fe.isna().mean().sort_values(ascending=False)
missing_pct.head(15)


Train NaN columns: ['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MED

Unnamed: 0,0
COMMONAREA_MODE,0.698396
COMMONAREA_AVG,0.698396
COMMONAREA_MEDI,0.698396
NONLIVINGAPARTMENTS_AVG,0.693998
NONLIVINGAPARTMENTS_MEDI,0.693998
NONLIVINGAPARTMENTS_MODE,0.693998
LIVINGAPARTMENTS_MODE,0.683388
LIVINGAPARTMENTS_MEDI,0.683388
LIVINGAPARTMENTS_AVG,0.683388
FLOORSMIN_AVG,0.678519


In [12]:
# 1. Drop features with >60% missing
missing_pct = X_tr_fe.isna().mean()
high_missing = missing_pct[missing_pct > 0.6].index.tolist()
print("Dropping:", high_missing)
X_tr_fe = X_tr_fe.drop(columns=high_missing)
X_va_fe = X_va_fe.drop(columns=high_missing)

# 2. Replace inf with NaN again just in case
X_tr_fe = X_tr_fe.replace([np.inf, -np.inf], np.nan)
X_va_fe = X_va_fe.replace([np.inf, -np.inf], np.nan)

# 3. Check remaining missing %
print("Remaining NaN % (train):")
print(X_tr_fe.isna().mean().sort_values(ascending=False).head(10))


Dropping: ['OWN_CAR_AGE', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'FLOORSMIN_AVG', 'LIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_AVG', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'FLOORSMIN_MODE', 'LIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_MODE', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'FLOORSMIN_MEDI', 'LIVINGAPARTMENTS_MEDI', 'NONLIVINGAPARTMENTS_MEDI']
Remaining NaN % (train):
LANDAREA_AVG          0.593416
LANDAREA_MODE         0.593416
LANDAREA_MEDI         0.593416
BASEMENTAREA_MODE     0.584652
BASEMENTAREA_MEDI     0.584652
BASEMENTAREA_AVG      0.584652
EXT_SOURCE_1          0.563376
NONLIVINGAREA_MODE    0.551299
NONLIVINGAREA_MEDI    0.551299
NONLIVINGAREA_AVG     0.551299
dtype: float64


In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score

pipe_logit = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),   # <-- fills NaNs
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced",
                               n_jobs=-1, random_state=SEED))
])

pipe_logit.fit(X_tr_fe, y_tr)
p_va_log = pipe_logit.predict_proba(X_va_fe)[:, 1]

print("Logit AUC:", roc_auc_score(y_va, p_va_log))
print("Logit PR-AUC:", average_precision_score(y_va, p_va_log))
print("Logit KS:", ks_stat(y_va, p_va_log))

# Optional: show decile lift table
decile_table(y_va, p_va_log).head()


Logit AUC: 0.7493412820017102
Logit PR-AUC: 0.22842468583501074
Logit KS: 0.3723241722087511


Unnamed: 0_level_0,volume,defaults,avg_pd,avg_p,cum_defaults
decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,6151,1623,0.26386,0.791918,1623
9,6150,878,0.142764,0.653748,2501
8,6150,659,0.107154,0.56484,3160
7,6150,506,0.082276,0.491292,3666
6,6150,384,0.062439,0.427813,4050


In [20]:
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# (re)build the model with metric set in the estimator
lgbm = lgb.LGBMClassifier(
    objective="binary",
    metric="auc",                 # <— set metric here
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=31,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=(len(y_tr)-y_tr.sum())/(y_tr.sum()+1e-9),
    random_state=SEED
)

pipe_lgbm = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("clf", lgbm),
])

# use callbacks for early stopping + silence logs
callbacks = [lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(period=0)]

pipe_lgbm.fit(
    X_tr_fe, y_tr,
    clf__eval_set=[(X_va_fe, y_va)],
    clf__callbacks=callbacks
)

# predict (best_iteration_ is respected automatically, but we can be explicit)
best_it = pipe_lgbm.named_steps["clf"].best_iteration_
p_va_lgbm = pipe_lgbm.named_steps["clf"].predict_proba(X_va_fe, num_iteration=best_it)[:,1]

print("LGBM AUC:", roc_auc_score(y_va, p_va_lgbm))
print("LGBM PR-AUC:", average_precision_score(y_va, p_va_lgbm))
print("LGBM KS:", ks_stat(y_va, p_va_lgbm))
decile_table(y_va, p_va_lgbm).head()



[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.141866 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10289
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[78]	valid_0's auc: 0.722366
LGBM AUC: 0.7223655510395257
LGBM PR-AUC: 0.19395401769978438
LGBM KS: 0.3382259280954157


Unnamed: 0_level_0,volume,defaults,avg_pd,avg_p,cum_defaults
decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,6151,1346,0.218826,0.819562,1346
9,6150,852,0.138537,0.76501,2198
8,6150,722,0.117398,0.70969,2920
7,6150,598,0.097236,0.649948,3518
6,6150,431,0.070081,0.585103,3949


In [22]:
import shap, matplotlib.pyplot as plt

clf = pipe_lgbm.named_steps["clf"]

# Build an explainer that's robust across SHAP versions
try:
    # New API (preferred)
    explainer = shap.Explainer(clf, X_tr_fe)
    ex = explainer(X_va_fe)  # Explanation object
    # Beeswarm (summary)
    plt.figure()
    shap.plots.beeswarm(ex, show=False)  # uses ex directly
    plt.tight_layout(); plt.savefig("shap_summary.png"); plt.close()
    # Bar (top features)
    plt.figure()
    shap.plots.bar(ex, show=False)
    plt.tight_layout(); plt.savefig("shap_bar.png"); plt.close()
except Exception:
    # Legacy fallback
    explainer = shap.TreeExplainer(clf)
    sv = explainer.shap_values(X_va_fe)
    # If list (binary classification), take class 1
    sv = sv[1] if isinstance(sv, list) else sv
    # Ensure it's a (n_samples, n_features) matrix
    if sv.ndim == 1:
        raise RuntimeError(f"Expected 2D SHAP array, got {sv.shape}.")  # surface early if still odd
    # Beeswarm
    plt.figure()
    shap.summary_plot(sv, X_va_fe, show=False)
    plt.tight_layout(); plt.savefig("shap_summary.png"); plt.close()
    # Bar
    plt.figure()
    shap.summary_plot(sv, X_va_fe, plot_type="bar", show=False)
    plt.tight_layout(); plt.savefig("shap_bar.png"); plt.close()

print("Saved: shap_summary.png, shap_bar.png")





Saved: shap_summary.png, shap_bar.png


In [23]:
from sklearn.calibration import CalibratedClassifierCV

# re-use the same estimator class; CalibratedCV will refit internally
base_clf = pipe_lgbm.named_steps["clf"]
calibrated = CalibratedClassifierCV(base_clf, method="isotonic", cv=3)
calibrated.fit(X_tr_fe, y_tr)

p_va_cal = calibrated.predict_proba(X_va_fe)[:, 1]

print("Calibrated AUC:", roc_auc_score(y_va, p_va_cal))
print("Calibrated KS:", ks_stat(y_va, p_va_cal))


[LightGBM] [Info] Number of positive: 13240, number of negative: 150765
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.135965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10351
[LightGBM] [Info] Number of data points in the train set: 164005, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
[LightGBM] [Info] Number of positive: 13240, number of negative: 150765
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10321
[LightGBM] [Info] Number of data points in the train set: 164005, number of used features: 94
[LightGBM] [

In [24]:
import numpy as np
ts = np.linspace(0.01, 0.15, 30)
best = (None, -9)
for t in ts:
    pred = (p_va_cal >= t).astype(int)
    TP = ((y_va==1) & (pred==1)).sum()
    FP = ((y_va==0) & (pred==1)).sum()
    TN = ((y_va==0) & (pred==0)).sum()
    FN = ((y_va==1) & (pred==0)).sum()
    J = TP/(TP+FN+1e-9) - FP/(FP+TN+1e-9)   # Youden's J
    if J > best[1]: best = (t, J)
LOW_T = round(max(0.02, min(0.06, best[0])), 3)   # keep in a sensible range
HIGH_T = 0.08                                     # start with 8%; adjust if needed
LOW_T, HIGH_T


(0.06, 0.08)

In [25]:
def segment_from_prob(p, low=LOW_T, high=HIGH_T):
    if p < low: return "Low"
    if p < high: return "Medium"
    return "High"

def prob_to_score(p, pdo=20, score0=600, odds0=50):
    import numpy as np
    factor = pdo/np.log(2); offset = score0 - factor*np.log(odds0)
    odds = (1-p)/np.clip(p,1e-9,1)
    return offset + factor*np.log(odds)

valid_out = pd.DataFrame({
    "ID": valid_df[ID_COL],
    "y_true": y_va,
    "pd": p_va_cal
})
valid_out["segment"] = valid_out["pd"].apply(segment_from_prob)
valid_out["score"] = prob_to_score(valid_out["pd"])
valid_out.head()


Unnamed: 0,ID,y_true,pd,segment,score
256571,396899,0,0.044998,Low,575.274581
191493,322041,0,0.069268,Medium,562.08508
103497,220127,0,0.170739,High,532.723561
130646,251531,0,0.065546,Medium,563.793805
211898,345558,0,0.087647,High,554.719205


In [26]:
baseline_defaults = int(valid_out["y_true"].sum())
seg_def = valid_out.groupby("segment")["y_true"].sum()
d_low = int(seg_def.get("Low",0)); d_med = int(seg_def.get("Medium",0)); d_high = int(seg_def.get("High",0))

policy_defaults = d_low + int(0.5*d_med)   # decline High; Medium manual-review halves defaults
reduction_pct = (baseline_defaults - policy_defaults)/max(baseline_defaults,1)*100
approval_rate = (valid_out["segment"].isin(["Low","Medium"]).mean())*100

print(f"Baseline defaults: {baseline_defaults}")
print(f"Policy defaults:   {policy_defaults}")
print(f"Expected NPL ↓:    {reduction_pct:.2f}%")
print(f"Approval rate:     {approval_rate:.2f}%")


Baseline defaults: 4965
Policy defaults:   1179
Expected NPL ↓:    76.25%
Approval rate:     65.34%


In [27]:
valid_out[["ID","pd","segment","score","y_true"]].to_csv("for_tableau.csv", index=False)
print("Wrote for_tableau.csv")


Wrote for_tableau.csv


In [28]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# ROC
fpr, tpr, _ = roc_curve(valid_out["y_true"], valid_out["pd"])
plt.figure(); plt.plot(fpr,tpr,label=f"AUC={auc(fpr,tpr):.3f}")
plt.plot([0,1],[0,1],'--'); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("Validation ROC")
plt.legend(); plt.tight_layout(); plt.savefig("roc.png"); plt.close()

# PR
prec, rec, _ = precision_recall_curve(valid_out["y_true"], valid_out["pd"])
plt.figure(); plt.plot(rec,prec); plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("Validation PR Curve")
plt.tight_layout(); plt.savefig("pr.png"); plt.close()

# Score hist
plt.figure(); plt.hist(valid_out["score"], bins=40)
plt.title("Score Distribution"); plt.xlabel("Score"); plt.ylabel("Count")
plt.tight_layout(); plt.savefig("score_hist.png"); plt.close()

print("Saved: roc.png, pr.png, score_hist.png")


Saved: roc.png, pr.png, score_hist.png


In [29]:
lift = decile_table(valid_out["y_true"].values, valid_out["pd"].values, n=10)
display(lift)
lift.to_csv("decile_lift.csv", index=False)


Unnamed: 0_level_0,volume,defaults,avg_pd,avg_p,cum_defaults
decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,6151,1748,0.284181,0.26553,1748
9,6150,922,0.149919,0.148618,2670
8,6150,630,0.102439,0.105511,3300
7,6150,483,0.078537,0.079905,3783
6,6150,384,0.062439,0.061452,4167
5,6151,272,0.04422,0.047815,4439
4,6150,178,0.028943,0.037592,4617
3,6150,171,0.027805,0.029195,4788
2,6150,108,0.017561,0.022209,4896
1,6151,69,0.011218,0.013953,4965


In [30]:
from google.colab import files
files.download("for_tableau.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>