In [1]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix,
    precision_recall_curve, roc_curve
)

# Display settings: let wide frames render without column truncation.
pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 200)

# Fixed seed so that random splits are reproducible across runs.
RANDOM_STATE = 123

# Directories are relative to the working directory the notebook runs from.
_PROJECT_ROOT = Path("..")
DATA_SAMPLE_DIR = _PROJECT_ROOT / "data_sample"
ART_DIR = _PROJECT_ROOT / "artifacts"
# Create the artifact directory up front; no-op when it already exists.
ART_DIR.mkdir(parents=True, exist_ok=True)

# Input: the sampled dataset read by the loading cell.
SAMPLE_PATH = DATA_SAMPLE_DIR / "stage1_sample.parquet"

# Outputs: artifact file locations under ART_DIR.
MODEL_PATH = ART_DIR / "stage1_pipeline.pkl"
METRICS_PATH = ART_DIR / "stage1_metrics.json"
UI_META_PATH = ART_DIR / "stage1_ui_metadata.json"


In [2]:
# Load the sampled dataset and take a first look at its size, schema,
# and the class balance of the `is_accepted` label.
df = pd.read_parquet(SAMPLE_PATH)

column_names = df.columns.tolist()
label_share = df["is_accepted"].value_counts(normalize=True)

print(f"Shape: {df.shape}")
print(f"\nColumns: {column_names}")
print("\nLabel distribution:")
print(label_share)

# Rich preview of the first rows.
display(df.head())


Shape: (500000, 7)

Columns: ['loan_amount', 'emp_length', 'dti', 'fico_est', 'fico_missing', 'emp_length_missing', 'is_accepted']

Label distribution:
is_accepted
0    0.924416
1    0.075584
Name: proportion, dtype: float64


Unnamed: 0,loan_amount,emp_length,dti,fico_est,fico_missing,emp_length_missing,is_accepted
0,3000.0,0.5,0.0,637.0,1,1,0
1,3000.0,0.5,26.35,640.0,0,0,0
2,4000.0,0.5,18.22,674.0,0,0,0
3,1200.0,0.5,4.74,579.0,0,0,0
4,20000.0,0.5,17.13,683.0,0,0,0


In [None]:
# Missingness check: fraction of NaN per column, largest first.
missing = df.isna().mean().sort_values(ascending=False)
print("Top missingness should be ~0:")
display(missing.head(10))

# Quick descriptive stats, transposed so each column of `df` is one row.
display(df.describe().T)

# Flag sanity: the missingness indicator columns must contain only 0/1.
# Enforce it explicitly instead of relying on eyeballing the summary table;
# NaN also fails this check because it is neither 0 nor 1.
flag_cols = ["fico_missing", "emp_length_missing"]
non_binary = ~df[flag_cols].isin([0, 1]).all()
assert not non_binary.any(), (
    f"Non-binary values in flag columns: {non_binary[non_binary].index.tolist()}"
)

# NOTE(review): the saved output of this cell shows fico_missing averaging
# ~0.67 for is_accepted == 0 versus ~0 for is_accepted == 1, i.e. the flag
# almost separates the classes on its own. Confirm this is not target leakage
# (e.g. an artifact of how the two classes were collected) before modelling.
print("\nFlag rates by class:")
display(df.groupby("is_accepted")[flag_cols].mean())


Top missingness should be ~0:


loan_amount           0.0
emp_length            0.0
dti                   0.0
fico_est              0.0
fico_missing          0.0
emp_length_missing    0.0
is_accepted           0.0
dtype: float64

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
loan_amount,500000.0,13285.239483,14623.166121,0.0,5000.0,10000.0,20000.0,300000.0
emp_length,500000.0,1.488603,2.352001,0.5,0.5,0.5,0.5,10.0
dti,500000.0,25.340552,22.573579,0.0,8.52,19.62,35.05,80.0
fico_est,500000.0,639.944702,44.805004,300.0,637.0,637.0,637.0,850.0
fico_missing,500000.0,0.619336,0.485551,0.0,0.0,1.0,1.0,1.0
emp_length_missing,500000.0,0.036474,0.187467,0.0,0.0,0.0,0.0,1.0
is_accepted,500000.0,0.075584,0.264332,0.0,0.0,0.0,0.0,1.0



Flag rates by class:


Unnamed: 0_level_0,fico_missing,emp_length_missing
is_accepted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.669973,0.034156
1,2.6e-05,0.064829


In [None]:
# Name of the binary label column.
TARGET = "is_accepted"

# Features are every column except the label; the label is cast to int.
y = df[TARGET].astype(int).copy()
X = df.drop(columns=[TARGET]).copy()

# Partition feature columns by dtype: numeric/bool vs. everything else.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
numeric_lookup = set(num_cols)
cat_cols = [col for col in X.columns if col not in numeric_lookup]

print(f"Numeric cols: {num_cols}")
print(f"Categorical cols: {cat_cols}")


Numeric cols: ['loan_amount', 'emp_length', 'dti', 'fico_est', 'fico_missing', 'emp_length_missing']
Categorical cols: []


In [None]:
# Two-step stratified split: first carve off a 30% holdout, then cut that
# holdout in half to obtain validation and test sets.
holdout_frac = 0.30
test_share_of_holdout = 0.50

X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=holdout_frac,
    stratify=y,
    random_state=RANDOM_STATE,
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=test_share_of_holdout,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

print(f"Train: {X_train.shape} Val: {X_val.shape} Test: {X_test.shape}")
# Positive-class share in the training split.
print(f"Train pos rate: {float(y_train.mean())}")
