In [7]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix,
    precision_recall_curve, roc_curve
)

# Seed passed as `random_state` to the train/validation/test splits.
RANDOM_STATE = 123

# Input: the Stage 1 sample, located relative to the notebook's working directory.
DATA_SAMPLE_DIR = Path("..") / "data_sample"
SAMPLE_PATH = DATA_SAMPLE_DIR.joinpath("stage1_sample.parquet")

# Output: every artifact this notebook writes lives under ART_DIR.
ART_DIR = Path("..") / "artifacts"
MODEL_PATH = ART_DIR.joinpath("stage1_pipeline.pkl")
METRICS_PATH = ART_DIR.joinpath("stage1_metrics.json")
UI_META_PATH = ART_DIR.joinpath("stage1_ui_metadata.json")

# Create the artifact directory up front; harmless if it already exists.
ART_DIR.mkdir(parents=True, exist_ok=True)

# Widen pandas rendering so wide frames are shown without truncation.
pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 200)


In [8]:
# Load the Stage 1 sample and take a first look at its size, schema and
# class balance before any modelling.
df = pd.read_parquet(SAMPLE_PATH)

print(f"Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

# Relative frequency of each label value (normalize=True -> proportions).
print("\nLabel distribution:", df["is_accepted"].value_counts(normalize=True), sep="\n")

# Rich (HTML) preview of the first rows.
display(df.head(5))


Shape: (500000, 7)

Columns: ['loan_amount', 'emp_length', 'dti', 'fico_est', 'fico_missing', 'emp_length_missing', 'is_accepted']

Label distribution:
is_accepted
0    0.924416
1    0.075584
Name: proportion, dtype: float64


Unnamed: 0,loan_amount,emp_length,dti,fico_est,fico_missing,emp_length_missing,is_accepted
0,3000.0,0.5,0.0,637.0,1,1,0
1,3000.0,0.5,26.35,640.0,0,0,0
2,4000.0,0.5,18.22,674.0,0,0,0
3,1200.0,0.5,4.74,579.0,0,0,0
4,20000.0,0.5,17.13,683.0,0,0,0


In [9]:
# Missingness check: share of NaN per column, worst offenders first.
# The label printed below states the expectation that these are all ~0
# (presumably because gaps were filled upstream and recorded in the
# *_missing flag columns — TODO confirm against the data-prep step).
missing = df.isna().mean().sort_values(ascending=False)
print("Top missingness should be ~0:")
display(missing.head(10))

# Quick descriptive stats, transposed so each feature is one row.
display(df.describe().T)

# Flag sanity: the missingness indicators must be strictly binary (0/1).
# Fail loudly here rather than let a malformed flag reach the model.
flag_cols = ["fico_missing", "emp_length_missing"]
assert df[flag_cols].isin([0, 1]).all().all(), "missingness flags must be 0/1"

# The mean of a 0/1 flag is the share of flagged rows, split here by label.
# NOTE(review): in the saved output fico_missing is ~0.67 for is_accepted == 0
# and ~0 for is_accepted == 1, so the flag almost encodes the label. This may
# reflect how the two classes were sourced rather than applicant behaviour —
# confirm it is legitimately available at prediction time (possible leakage).
print("\nFlag rates by class:")
display(df.groupby("is_accepted")[flag_cols].mean())


Top missingness should be ~0:


loan_amount           0.0
emp_length            0.0
dti                   0.0
fico_est              0.0
fico_missing          0.0
emp_length_missing    0.0
is_accepted           0.0
dtype: float64

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
loan_amount,500000.0,13285.239483,14623.166121,0.0,5000.0,10000.0,20000.0,300000.0
emp_length,500000.0,1.488603,2.352001,0.5,0.5,0.5,0.5,10.0
dti,500000.0,25.340552,22.573579,0.0,8.52,19.62,35.05,80.0
fico_est,500000.0,639.944702,44.805004,300.0,637.0,637.0,637.0,850.0
fico_missing,500000.0,0.619336,0.485551,0.0,0.0,1.0,1.0,1.0
emp_length_missing,500000.0,0.036474,0.187467,0.0,0.0,0.0,0.0,1.0
is_accepted,500000.0,0.075584,0.264332,0.0,0.0,0.0,0.0,1.0



Flag rates by class:


Unnamed: 0_level_0,fico_missing,emp_length_missing
is_accepted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.669973,0.034156
1,2.6e-05,0.064829


In [10]:
# Name of the binary label column.
TARGET = "is_accepted"

# Label as plain ints; features are every remaining column.
# Both are copies so later edits cannot write back into `df`.
y = df[TARGET].astype(int).copy()
X = df.drop(columns=[TARGET]).copy()

# Numeric and bool dtypes go to the numeric branch of the preprocessor;
# whatever is left is treated as categorical.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = [name for name in X.columns if name not in num_cols]

print(f"Numeric cols: {num_cols}")
print(f"Categorical cols: {cat_cols}")


Numeric cols: ['loan_amount', 'emp_length', 'dti', 'fico_est', 'fico_missing', 'emp_length_missing']
Categorical cols: []


In [11]:
# Two-step 70 / 15 / 15 split. Both steps stratify on the label so every
# partition keeps the same positive rate, and both reuse RANDOM_STATE.

# Step 1: hold out 30% of the rows.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Step 2: cut the held-out 30% in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

print(f"Train: {X_train.shape} Val: {X_val.shape} Test: {X_test.shape}")
print(f"Train pos rate: {float(y_train.mean())}")


Train: (350000, 6) Val: (75000, 6) Test: (75000, 6)
Train pos rate: 0.07558285714285715


In [12]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column list to its branch; any column in neither list is dropped.
# In the saved run `cat_cols` is empty, so the "cat" branch receives no columns.
preprocess = ColumnTransformer(
    [
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)


In [13]:
# Logistic regression with class re-weighting: the positive class is only
# ~7.6% of rows in the saved run, so "balanced" weights keep it from being
# swamped by the majority class.
# NOTE(review): n_jobs likely has no effect on a single binary fit with the
# default solver — confirm against the installed scikit-learn version.
clf = LogisticRegression(max_iter=500, class_weight="balanced", n_jobs=-1)

# Preprocessing and model travel together so the saved artifact accepts raw features.
pipe = Pipeline([("prep", preprocess), ("model", clf)])
pipe.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
val_proba = pipe.predict_proba(X_val)[:, 1]

# Threshold-free ranking metrics on the validation split.
val_auc = roc_auc_score(y_val, val_proba)
val_ap = average_precision_score(y_val, val_proba)

print(f"Validation ROC-AUC: {val_auc}")
print(f"Validation PR-AUC : {val_ap}")




Validation ROC-AUC: 0.9794713704105934
Validation PR-AUC : 0.7192873057937827


In [14]:
# Choose the decision threshold that maximises F1 on the validation split.
prec, rec, thr = precision_recall_curve(y_val, val_proba)

# F1 at every point of the PR curve; the epsilon avoids 0/0 where both
# precision and recall are zero.
f1 = (2 * prec * rec) / (prec + rec + 1e-12)

# precision_recall_curve returns one more (precision, recall) point than
# thresholds: the final point (precision=1, recall=0) has no threshold.
# Searching only f1[:-1] guarantees the winning index maps to a real
# threshold, so the reported F1 and the stored threshold always describe the
# same operating point (no silent fallback to an arbitrary 0.5).
best_idx = np.argmax(f1[:-1])
best_threshold = float(thr[best_idx])

print("Best threshold (val, max F1):", best_threshold)
print("Best F1:", float(f1[best_idx]))


Best threshold (val, max F1): 0.8371134281309226
Best F1: 0.7217800877045542


In [15]:
# Final, one-off evaluation on the untouched test split.
test_proba = pipe.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics.
test_auc = roc_auc_score(y_test, test_proba)
test_ap = average_precision_score(y_test, test_proba)

# Hard labels using the threshold tuned on the validation split
# (never re-tuned on test).
test_pred = np.greater_equal(test_proba, best_threshold).astype(int)

# Rows of the confusion matrix are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, test_pred)
report = classification_report(y_test, test_pred, digits=4)

print(f"Test ROC-AUC: {test_auc}")
print(f"Test PR-AUC : {test_ap}")
print(f"\nConfusion Matrix:\n {cm}")
print(f"\nClassification Report:\n {report}")


Test ROC-AUC: 0.9793928778881547
Test PR-AUC : 0.7319421961384944

Confusion Matrix:
 [[67182  2149]
 [ 1294  4375]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9811    0.9690    0.9750     69331
           1     0.6706    0.7717    0.7176      5669

    accuracy                         0.9541     75000
   macro avg     0.8259    0.8704    0.8463     75000
weighted avg     0.9576    0.9541    0.9556     75000



In [16]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact.
joblib.dump(pipe, MODEL_PATH)
print(f"Saved model: {MODEL_PATH}")

# Everything needed to reproduce or serve the model: the feature contract,
# the headline scores and the tuned threshold. Values are cast to built-in
# types so they are JSON-serialisable.
metrics = {
    # identification
    "stage": "stage1_acceptance",
    "model": "logistic_regression",
    # feature contract
    "features": X.columns.tolist(),
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    # ranking quality on validation and test
    "val_roc_auc": float(val_auc),
    "val_pr_auc": float(val_ap),
    "test_roc_auc": float(test_auc),
    "test_pr_auc": float(test_ap),
    # operating point and its test-set performance
    "threshold": float(best_threshold),
    "confusion_matrix": cm.tolist(),
    "classification_report": report,
}

METRICS_PATH.write_text(json.dumps(metrics, indent=2))

print(f"Saved metrics: {METRICS_PATH}")


Saved model: ..\artifacts\stage1_pipeline.pkl
Saved metrics: ..\artifacts\stage1_metrics.json


Create the UI metadata (per-feature value ranges computed from the training split) for the frontend.

In [17]:
def make_numeric_metadata(df_train: pd.DataFrame, columns: list[str]) -> dict:
    """Summarise numeric columns as value ranges.

    Each column is coerced to numeric (unparseable entries become NaN) and
    NaNs are dropped before the statistics are computed.

    Parameters
    ----------
    df_train : pd.DataFrame
        Frame to compute the statistics from.
    columns : list[str]
        Names of the columns to summarise.

    Returns
    -------
    dict
        Maps each column name to a dict of plain floats with keys ``min``,
        ``max``, ``p1``, ``p99``, ``recommended_min`` (10th percentile) and
        ``recommended_max`` (90th percentile). Columns with no usable
        numeric values are omitted.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df_train``.
    """
    # Output key -> percentile rank; insertion order fixes the key order of
    # every per-column entry.
    percentile_ranks = {
        "p1": 1,
        "p99": 99,
        "recommended_min": 10,
        "recommended_max": 90,
    }
    ranks = list(percentile_ranks.values())

    meta = {}
    for c in columns:
        s = pd.to_numeric(df_train[c], errors="coerce").dropna()
        if s.empty:
            # Nothing numeric to summarise for this column.
            continue

        # Cast to float so bool/int columns are handled uniformly, and
        # compute all percentiles in one pass instead of one call per rank.
        values = s.to_numpy(dtype=float)
        pct_values = np.percentile(values, ranks)

        entry = {"min": float(values.min()), "max": float(values.max())}
        entry.update(
            (key, float(value)) for key, value in zip(percentile_ranks, pct_values)
        )
        meta[c] = entry
    return meta

# Assemble the frontend payload. Ranges are computed from the training split
# only, so they describe the data the model was fitted on.
ui_meta = dict(
    stage="stage1_acceptance",
    numeric=make_numeric_metadata(X_train, num_cols),
    categorical={},  # none expected in Stage 1
)

UI_META_PATH.write_text(json.dumps(ui_meta, indent=2))

print(f"Saved UI metadata: {UI_META_PATH}")

# Last expression: echo the payload as the cell output for a quick visual check.
ui_meta


Saved UI metadata: ..\artifacts\stage1_ui_metadata.json


{'stage': 'stage1_acceptance',
 'numeric': {'loan_amount': {'min': 0.0,
   'max': 300000.0,
   'p1': 1000.0,
   'p99': 40000.0,
   'recommended_min': 2000.0,
   'recommended_max': 30000.0},
  'emp_length': {'min': 0.5,
   'max': 10.0,
   'p1': 0.5,
   'p99': 10.0,
   'recommended_min': 0.5,
   'recommended_max': 5.0},
  'dti': {'min': 0.0,
   'max': 80.0,
   'p1': 0.0,
   'p99': 80.0,
   'recommended_min': 1.01,
   'recommended_max': 61.97},
  'fico_est': {'min': 300.0,
   'max': 850.0,
   'p1': 501.0,
   'p99': 771.0,
   'recommended_min': 608.0,
   'recommended_max': 687.0},
  'fico_missing': {'min': 0.0,
   'max': 1.0,
   'p1': 0.0,
   'p99': 1.0,
   'recommended_min': 0.0,
   'recommended_max': 1.0},
  'emp_length_missing': {'min': 0.0,
   'max': 1.0,
   'p1': 0.0,
   'p99': 1.0,
   'recommended_min': 0.0,
   'recommended_max': 0.0}},
 'categorical': {}}