In [None]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix,
    precision_recall_curve, roc_curve
)

# Display: let pandas show up to 200 columns on a 140-character wide console.
pd.set_option("display.width", 140)
pd.set_option("display.max_columns", 200)

# Seed handed to the randomized steps of this notebook.
RANDOM_STATE = 123

# Input sample location (relative to the current working directory).
DATA_SAMPLE_DIR = Path("../data_sample")
SAMPLE_PATH = DATA_SAMPLE_DIR.joinpath("stage1_sample.parquet")

# Output folder for everything this notebook writes; created if it is missing.
ART_DIR = Path("../artifacts")
ART_DIR.mkdir(exist_ok=True, parents=True)

MODEL_PATH = ART_DIR.joinpath("stage1_pipeline.pkl")
METRICS_PATH = ART_DIR.joinpath("stage1_metrics.json")
UI_META_PATH = ART_DIR.joinpath("stage1_ui_metadata.json")


In [None]:
# Load the Stage 1 sample and take a first look: size, schema, label balance.
df = pd.read_parquet(SAMPLE_PATH)

# Relative frequency of each value of the target column.
label_share = df["is_accepted"].value_counts(normalize=True)

print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nLabel distribution:")
print(label_share)

# Preview of the first rows.
display(df.head())


In [None]:
# Fraction of missing values per column, largest first.
missing = df.isna().mean().sort_values(ascending=False)
print("Top missingness should be ~0:")
display(missing.iloc[:10])

# Descriptive statistics, transposed so each summarized column is a row.
summary_stats = df.describe().T
display(summary_stats)

# Sanity check on the missingness indicators (expected to be 0/1):
# their mean within each target class.
flag_cols = ["fico_missing", "emp_length_missing"]
print("\nFlag rates by class:")
display(df.groupby("is_accepted")[flag_cols].mean())


In [None]:
TARGET = "is_accepted"

# Label as integers; features are every remaining column.
y = df[TARGET].astype(int).copy()
X = df.drop(columns=[TARGET]).copy()

# Numeric and boolean dtypes are handled as numeric features;
# any other column is handled as categorical.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
numeric_names = set(num_cols)
cat_cols = [name for name in X.columns if name not in numeric_names]

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)


In [None]:
# Two stratified, seeded cuts: first hold out 30% of the rows, then split that
# hold-out in half, giving roughly 70% train / 15% validation / 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

# Share of positive labels in the training partition.
train_pos_rate = float(y_train.mean())

print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)
print("Train pos rate:", train_pos_rate)


In [None]:
# Numeric branch: median imputation followed by standardization.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding,
# with the encoder configured to ignore unknown categories.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Send each column group through its branch; columns in neither list are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)


In [None]:
# Logistic regression with balanced class weights and a raised iteration cap.
clf = LogisticRegression(class_weight="balanced", max_iter=500, n_jobs=-1)

# Preprocessing and classifier bundled together so they are fitted and applied as one unit.
pipe = Pipeline([("prep", preprocess), ("model", clf)])
pipe.fit(X_train, y_train)

# Second probability column is used as the score for the positive class
# (assumes the target takes the values 0/1 — TODO confirm).
val_scores = pipe.predict_proba(X_val)
val_proba = val_scores[:, 1]

# Threshold-free ranking quality on the validation partition.
val_auc = roc_auc_score(y_val, val_proba)
val_ap = average_precision_score(y_val, val_proba)

print("Validation ROC-AUC:", val_auc)
print("Validation PR-AUC :", val_ap)


In [None]:
# Pick the decision threshold that maximizes F1 on the validation partition.
prec, rec, thr = precision_recall_curve(y_val, val_proba)

# precision_recall_curve returns one more precision/recall point than thresholds:
# the final (precision=1, recall=0) entry has no threshold of its own. Drop it so
# the F1 array lines up index-for-index with `thr`. That entry's F1 is 0 and can
# never be the first maximum, so the former "fall back to 0.5" branch was dead code.
prec_at_thr = prec[:-1]
rec_at_thr = rec[:-1]

# The small epsilon keeps the ratio finite where precision and recall are both 0.
f1 = (2 * prec_at_thr * rec_at_thr) / (prec_at_thr + rec_at_thr + 1e-12)

best_idx = int(np.argmax(f1))
best_threshold = float(thr[best_idx])

print("Best threshold (val, max F1):", best_threshold)
print("Best F1:", float(f1[best_idx]))


In [None]:
# Score the test partition with the fitted pipeline (second probability column).
test_proba = pipe.predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics.
test_auc = roc_auc_score(y_test, test_proba)
test_ap = average_precision_score(y_test, test_proba)

# Hard 0/1 predictions using the threshold chosen on the validation partition.
test_pred = np.where(test_proba >= best_threshold, 1, 0)

# Threshold-dependent summaries.
cm = confusion_matrix(y_test, test_pred)
report = classification_report(y_test, test_pred, digits=4)

print("Test ROC-AUC:", test_auc)
print("Test PR-AUC :", test_ap)
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


In [None]:
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
joblib.dump(pipe, MODEL_PATH)
print("Saved model:", MODEL_PATH)

# Feature schema the pipeline was trained on.
feature_info = {
    "features": X.columns.tolist(),
    "num_cols": num_cols,
    "cat_cols": cat_cols,
}

# Ranking metrics, cast to plain floats so they serialize as JSON numbers.
score_info = {
    "val_roc_auc": float(val_auc),
    "val_pr_auc": float(val_ap),
    "test_roc_auc": float(test_auc),
    "test_pr_auc": float(test_ap),
}

metrics = {
    "stage": "stage1_acceptance",
    "model": "logistic_regression",
    **feature_info,
    **score_info,
    "threshold": float(best_threshold),
    "confusion_matrix": cm.tolist(),  # array -> nested lists for JSON
    "classification_report": report,
}

with METRICS_PATH.open("w") as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

print("Saved metrics:", METRICS_PATH)


Create the metadata (per-feature numeric ranges) that the frontend will use.

In [None]:
def make_numeric_metadata(df_train: pd.DataFrame, columns: list[str]) -> dict:
    """Summarize the observed range of each requested column.

    Each column is coerced to numeric (unparseable entries become missing),
    missing values are dropped, and the remainder is summarized as floats.

    Args:
        df_train: Frame to compute the statistics from.
        columns: Names of the columns to summarize; each must exist in ``df_train``.

    Returns:
        Mapping of column name to a dict with the keys ``min``, ``max``, ``p1``,
        ``p99``, ``recommended_min`` (10th percentile) and ``recommended_max``
        (90th percentile), all plain Python floats. Columns with no usable
        numeric values are omitted.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df_train``.
    """
    meta = {}
    for c in columns:
        numeric = pd.to_numeric(df_train[c], errors="coerce").dropna()
        if numeric.empty:
            continue
        # Cast to a plain float array first: boolean flag columns and nullable
        # extension dtypes pass through to_numeric unchanged, and percentile
        # interpolation is only reliably defined on real-valued data.
        values = numeric.to_numpy(dtype=float)
        # One pass over the data for all four percentiles.
        p1, p10, p90, p99 = np.percentile(values, [1, 10, 90, 99])
        meta[c] = {
            "min": float(values.min()),
            "max": float(values.max()),
            "p1": float(p1),
            "p99": float(p99),
            "recommended_min": float(p10),
            "recommended_max": float(p90),
        }
    return meta

# Range statistics for the numeric features, computed on the training partition only.
numeric_meta = make_numeric_metadata(X_train, num_cols)

ui_meta = {
    "stage": "stage1_acceptance",
    "numeric": numeric_meta,
    "categorical": {},  # none expected in Stage 1
}

with UI_META_PATH.open("w") as meta_file:
    json.dump(ui_meta, meta_file, indent=2)

print("Saved UI metadata:", UI_META_PATH)
# Trailing expression so the notebook renders the metadata as the cell output.
ui_meta
