In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix,
    precision_recall_curve
)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

RANDOM_STATE = 42

DATA_DIR = Path("../data")
DATA_SAMPLE_DIR = Path("../data_sample")
ART_DIR = Path("../artifacts")
DATA_SAMPLE_DIR.mkdir(parents=True, exist_ok=True)
ART_DIR.mkdir(parents=True, exist_ok=True)

ACCEPTED_CSV = DATA_DIR / "accepted_2007_to_2018Q4.csv"
STAGE2_PARQUET = DATA_SAMPLE_DIR / "stage2_sample.parquet"

MODEL_PATH = ART_DIR / "stage2_pipeline.pkl"
METRICS_PATH = ART_DIR / "stage2_metrics.json"
UI_META_PATH = ART_DIR / "stage2_ui_metadata.json"


In [3]:
CANONICAL_COLS = [
    "loan_amount",
    "term",
    "purpose",
    "annual_income",
    "emp_length",
    "dti",
    "utilization",
    "delinquencies",
    "fico_est",
    "fico_missing",
    "emp_length_missing",
]
TARGET = "is_default"  # Stage 2 label: 1=bad(default), 0=good(paid)


In [None]:
def to_float(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce")

def pct_to_float(s: pd.Series) -> pd.Series:
    # handles "55.2%" or numeric
    if s.dtype == "O":
        s = s.astype(str).str.replace("%", "", regex=False).str.strip()
    return pd.to_numeric(s, errors="coerce")

def clean_emp_length_fast(s: pd.Series) -> pd.Series:
    if s.dtype != "O":
        return pd.to_numeric(s, errors="coerce")

    st = s.astype(str).str.lower()
    out = pd.Series(np.nan, index=s.index, dtype="float32")

    out[st.str.contains(r"<\s*1", na=False)] = 0.5
    out[st.str.contains(r"10\+", na=False)] = 10.0

    extracted = st.str.extract(r"(\d+)", expand=False)
    extracted_num = pd.to_numeric(extracted, errors="coerce")

    out = out.fillna(extracted_num)
    return out.astype("float32")

def clip_fico(s: pd.Series) -> pd.Series:
    s = pd.to_numeric(s, errors="coerce")
    return s.clip(lower=300, upper=850)

def clip_nonneg(s: pd.Series) -> pd.Series:
    s = pd.to_numeric(s, errors="coerce")
    return s.clip(lower=0)

def parse_term_months(s: pd.Series) -> pd.Series:
    if pd.api.types.is_numeric_dtype(s):
        return pd.to_numeric(s, errors="coerce")
    #extract first number found
    extracted = s.astype(str).str.extract(r"(\d+)", expand=False)
    return pd.to_numeric(extracted, errors="coerce")


In [None]:
USECOLS = [
    "loan_amnt", "term", "purpose", "annual_inc", "emp_length", "dti",
    "revol_util", "delinq_2yrs", "fico_range_low", "fico_range_high",
    "loan_status"
]

acc = pd.read_csv(ACCEPTED_CSV, usecols=USECOLS, low_memory=False)
print("Loaded accepted shape:", acc.shape)

#Build target
good_status = {
    "Fully Paid",
    "Does not meet the credit policy. Status:Fully Paid"
}
bad_status = {
    "Charged Off",
    "Default",
    "Does not meet the credit policy. Status:Charged Off"
}

acc = acc[acc["loan_status"].isin(good_status | bad_status)].copy()
acc[TARGET] = acc["loan_status"].isin(bad_status).astype("int8")

print("Filtered to final-outcome loans:", acc.shape)
print(acc["loan_status"].value_counts())
print("Default rate:", float(acc[TARGET].mean()))

#Canonicalize features
df2 = pd.DataFrame({
    "loan_amount": to_float(acc["loan_amnt"]),
    "term": parse_term_months(acc["term"]),  
    "purpose": acc["purpose"].astype(str),
    "annual_income": to_float(acc["annual_inc"]),
    "emp_length": clean_emp_length_fast(acc["emp_length"]),
    "dti": pct_to_float(acc["dti"]),
    "utilization": pct_to_float(acc["revol_util"]),
    "delinquencies": to_float(acc["delinq_2yrs"]),
    "fico_est": (to_float(acc["fico_range_low"]) + to_float(acc["fico_range_high"])) / 2.0,
    TARGET: acc[TARGET].astype("int8")
})

#If term came in like " 36 months" fix it:
if df2["term"].dtype == "O":
    df2["term"] = pd.to_numeric(df2["term"].astype(str).str.extract(r"(\d+)", expand=False), errors="coerce")

df2["fico_est"] = clip_fico(df2["fico_est"])

#Missingness flags BEFORE fill
df2["fico_missing"] = df2["fico_est"].isna().astype("int8")
df2["emp_length_missing"] = df2["emp_length"].isna().astype("int8")

#Fill numeric with medians
for col in ["loan_amount","term","annual_income","emp_length","dti","utilization","delinquencies","fico_est"]:
    med = float(pd.to_numeric(df2[col], errors="coerce").median(skipna=True))
    df2[col] = pd.to_numeric(df2[col], errors="coerce").fillna(med)

#Sanity clips to  remove outliers
df2["loan_amount"] = clip_nonneg(df2["loan_amount"])
df2["annual_income"] = clip_nonneg(df2["annual_income"])
df2["delinquencies"] = clip_nonneg(df2["delinquencies"]).clip(upper=50)
df2["dti"] = df2["dti"].clip(lower=0, upper=80)
df2["utilization"] = df2["utilization"].clip(lower=0, upper=100)
df2["emp_length"] = df2["emp_length"].clip(lower=0, upper=50)
df2["term"] = df2["term"].clip(lower=0, upper=120)

print("Stage2 canonical shape:", df2.shape)
print("Any NaNs left?", df2.isna().any().any())

#Save Parquet
df2.to_parquet(STAGE2_PARQUET, index=False)
print("✅ Saved Stage 2 Parquet:", STAGE2_PARQUET)

df2.head()
print("term dtype:", df2["term"].dtype)
print("term NaN rate:", float(df2["term"].isna().mean()))
print("term value counts:")
print(df2["term"].value_counts().head(10))


Loaded accepted shape: (2260701, 11)
Filtered to final-outcome loans: (1348099, 12)
loan_status
Fully Paid                                             1076751
Charged Off                                             268559
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: count, dtype: int64
Default rate: 0.19980728418313493


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Stage2 canonical shape: (1348099, 12)
Any NaNs left? True
✅ Saved Stage 2 Parquet: ..\data_sample\stage2_sample.parquet


Unnamed: 0,loan_amount,term,purpose,annual_income,emp_length,dti,utilization,delinquencies,fico_est,is_default,fico_missing,emp_length_missing
0,3600.0,,debt_consolidation,55000.0,10.0,5.91,29.7,0.0,677.0,0,0,0
1,24700.0,,small_business,65000.0,10.0,16.06,19.2,1.0,717.0,0,0,0
2,20000.0,,home_improvement,63000.0,10.0,10.78,56.2,0.0,697.0,0,0,0
4,10400.0,,major_purchase,104433.0,3.0,25.37,64.5,1.0,697.0,0,0,0
5,11950.0,,debt_consolidation,34000.0,4.0,10.2,68.4,0.0,692.0,0,0,0


In [None]:
df2 = pd.read_parquet(STAGE2_PARQUET)
print(df2.shape)
print("Default rate:", float(df2[TARGET].mean()))
df2.head()


(1348099, 12)
Default rate: 0.19980728418313493


Unnamed: 0,loan_amnt,term,emp_length,annual_inc,loan_status,purpose,dti,delinq_2yrs,fico_range_low,fico_range_high,revol_util,is_default
0,3600.0,36 months,10+ years,55000.0,Fully Paid,debt_consolidation,5.91,0.0,675.0,679.0,29.7,0
1,24700.0,36 months,10+ years,65000.0,Fully Paid,small_business,16.06,1.0,715.0,719.0,19.2,0
2,20000.0,60 months,10+ years,63000.0,Fully Paid,home_improvement,10.78,0.0,695.0,699.0,56.2,0
4,10400.0,60 months,3 years,104433.0,Fully Paid,major_purchase,25.37,1.0,695.0,699.0,64.5,0
5,11950.0,36 months,4 years,34000.0,Fully Paid,debt_consolidation,10.2,0.0,690.0,694.0,68.4,0
