# Machine Learning Model Design

## 1. Setup & Imports

In [None]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable so that
# project packages (e.g. the `src` package imported further down) resolve.
project_root = Path().resolve().parent

# Guard against duplicates: re-running this cell would otherwise append the
# same entry to sys.path again on every execution.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine
from src.db.connection import get_engine

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    RocCurveDisplay
)

import joblib
import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Single seed reused by every split and estimator below.
RANDOM_STATE = 42

## 2. Load Data

In [None]:
# Read the whole client_loan_features table into a DataFrame
engine = get_engine()
feature_query = "SELECT * FROM client_loan_features"
df = pd.read_sql(feature_query, engine)

# Preview the first rows
df.head()

In [None]:
# (rows, columns) of the loaded table
df.shape

## 3. Target & Feature Separation

In [None]:
# Separate the label (y) from the model inputs (X)

# Columns excluded from the inputs, including the target itself
non_feature_columns = ["client_id", "loan_date", "defaulted"]

y = df["defaulted"]
X = df.drop(columns=non_feature_columns)

## 4. Train–Validation–Test Split

In [None]:
# Hold out 20% of the rows as the test set; the remainder (X_temp / y_temp)
# is divided into train and validation afterwards.

X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    stratify=y,  # keep the class ratio identical in both parts
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Carve the validation set out of the non-test rows.
# 25% of the remaining 80% equals 20% of all rows (0.25 * 0.8 = 0.2).

X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.25,
    random_state=RANDOM_STATE,
)

## 5. Feature Preprocessing

In [None]:
# Every column of X is handed to the numeric preprocessing branch

numeric_features = X.columns.to_list()
numeric_features

In [None]:
# Preprocessing: median-impute the numeric features; add_indicator=True makes
# the imputer also emit missing-value indicator columns.

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(add_indicator=True, strategy="median")),
    ]
)

# Single branch: the numeric transformer applied to all listed features
numeric_branch = ("num", numeric_transformer, numeric_features)
preprocessor = ColumnTransformer(transformers=[numeric_branch])

## 6. Baseline Model

In [None]:
# Fraction of missing values per training column, largest first
missing_share = X_train.isna().mean()
missing_share.sort_values(ascending=False)

In [None]:
# Baseline: class-weighted logistic regression behind the shared preprocessor

baseline_model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=RANDOM_STATE,
)

baseline_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", baseline_model),
    ]
)

# Fit on the training split only; validation and test rows stay unseen
baseline_pipeline.fit(X_train, y_train)

In [None]:
# Baseline ROC-AUC on the validation split, scored on the second
# predict_proba column (presumably the defaulted == 1 class; confirm via classes_)
val_pred_proba = baseline_pipeline.predict_proba(X_val)[:, 1]
roc_auc_score(y_true=y_val, y_score=val_pred_proba)

## 7. Model Selection

In [None]:
# Candidate estimators, keyed by the label used in the comparison table
models = dict(
    logistic=LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        random_state=RANDOM_STATE,
    ),
    random_forest=RandomForestClassifier(
        class_weight="balanced",
        n_estimators=200,
        random_state=RANDOM_STATE,
    ),
    gradient_boosting=GradientBoostingClassifier(random_state=RANDOM_STATE),
)

In [None]:
# Fit each candidate on the training split and score it on validation

results = []

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    # fit() returns the pipeline itself, so the prediction can be chained
    val_proba = pipe.fit(X_train, y_train).predict_proba(X_val)[:, 1]

    results.append({"model": name, "roc_auc": roc_auc_score(y_val, val_proba)})

# Comparison table, best validation ROC-AUC first
pd.DataFrame(results).sort_values("roc_auc", ascending=False)

## 8. Hyperparameter Tuning

In [None]:
# Random-forest pipeline and the hyperparameter grid to search

rf_estimator = RandomForestClassifier(
    class_weight="balanced",
    random_state=RANDOM_STATE,
)

rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", rf_estimator),
    ]
)

# Keys follow the `<step>__<parameter>` form so they address the "model" step.
# 3 values for each of 5 parameters = 243 combinations.
param_grid = {
    "model__n_estimators": [200, 300, 500],
    "model__max_depth": [5, 8, 12],
    "model__min_samples_split": [10, 20, 50],
    "model__min_samples_leaf": [5, 10, 20],
    "model__max_features": ["sqrt", "log2", 0.5],
}

In [None]:
# Exhaustive grid search scored by ROC-AUC over shuffled, stratified 5-fold CV

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="roc_auc",
    verbose=2,
    n_jobs=-1,  # use all available cores
)

# Search on the training split only
rf_grid.fit(X_train, y_train)

In [None]:
# Keep the pipeline with the best cross-validated score for later evaluation
best_rf_model = rf_grid.best_estimator_
# Show the winning parameter combination
rf_grid.best_params_

## 9. Model Evaluation

In [None]:
# Score the tuned forest on the validation split
val_class_proba = best_rf_model.predict_proba(X_val)
val_proba = val_class_proba[:, 1]  # second predict_proba column

val_auc = roc_auc_score(y_true=y_val, y_score=val_proba)
print(f"Validation ROC-AUC: {val_auc:.4f}")

In [None]:
from sklearn.metrics import RocCurveDisplay

# ROC curve of the tuned forest on the validation split
val_roc_display = RocCurveDisplay.from_predictions(y_val, val_proba)
val_roc_display.ax_.set_title("Random Forest – Validation ROC Curve")
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Label a row positive when its predicted probability is at least 0.4
val_preds = (val_proba >= 0.4).astype(int)

val_report = classification_report(y_val, val_preds)
print(val_report)

## 10. Model Interpretation

In [None]:
# Rank the inputs of the tuned forest by impurity-based importance.
#
# The names come from the fitted preprocessor, not from X.columns: the imputer
# runs with add_indicator=True, so it can append missing-indicator columns and
# the forest may have more inputs (and importances) than X has columns. Pairing
# X.columns with the importances would then fail with a length mismatch.
fitted_preprocessor = best_rf_model.named_steps["preprocessor"]
importances = best_rf_model.named_steps["model"].feature_importances_

# Drop the ColumnTransformer's "<transformer>__" prefix so ordinary features
# keep their original column names (needs a scikit-learn release that provides
# get_feature_names_out on the imputer).
feature_names = [
    name.split("__", 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

feature_importance_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

feature_importance_df

In [None]:
# Horizontal bar chart of the ten highest-ranked features
top_features = feature_importance_df.head(10)

plt.figure(figsize=(8, 5))
sns.barplot(x="importance", y="feature", data=top_features)
plt.title("Top 10 Feature Importances – Random Forest")
plt.show()

## 11. Model Selection & Test Set Evaluation

In [None]:
# Score the tuned forest on the held-out test split
test_class_proba = best_rf_model.predict_proba(X_test)
test_proba = test_class_proba[:, 1]  # second predict_proba column

test_auc = roc_auc_score(y_true=y_test, y_score=test_proba)
print(f"Test ROC-AUC: {test_auc:.4f}")

In [None]:
# ROC curve of the tuned forest on the held-out test split

test_roc_display = RocCurveDisplay.from_predictions(y_test, test_proba)
test_roc_display.ax_.set_title("Random Forest – Test ROC Curve")
plt.show()

In [None]:
# Apply the same 0.4 probability cut-off that was used on the validation split
test_preds = (test_proba >= 0.4).astype(int)

test_report = classification_report(y_test, test_preds)
print(test_report)

## Summary

Random Forest was selected as the final model after hyperparameter tuning with stratified 5-fold cross-validation.
It achieved the highest ROC-AUC on the validation set, demonstrating strong discriminatory power between defaulters and non-defaulters.
Feature importances highlight the key risk factors behind the model's predictions.