In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn


In [3]:
data_path = r"C:\Users\ruham\Downloads\credit-risk-model-week-4\credit-risk-model-week-4\data\processed\processed_data_with_target.csv"
df = pd.read_csv(data_path)

# Drop non-feature columns
X = df.drop(columns=["CustomerId", "is_high_risk"])
y = df["is_high_risk"]


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [5]:
# Identify numeric columns (skip strings like TransactionId, AccountId)
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()


In [6]:
# Random Forest pipeline
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # optional for tree-based but safe
    ('classifier', RandomForestClassifier(random_state=42))
])

# Gradient Boosting pipeline
gbm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GradientBoostingClassifier(random_state=42))
])


In [7]:
rf_params = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5]
}

gbm_params = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [3, 5]
}


In [8]:
mlruns_path = r"C:\Users\ruham\Downloads\credit-risk-model-week-4\credit-risk-model-week-4\mlruns"
os.makedirs(mlruns_path, exist_ok=True)

# Use proper Windows path with file:/// and forward slashes
mlflow.set_tracking_uri(f"file:///{mlruns_path.replace(os.sep, '/')}")

# Create or set experiment
mlflow.set_experiment("Credit_Risk_Modeling")


  return FileStore(store_uri, store_uri)
2025/12/12 01:45:59 INFO mlflow.tracking.fluent: Experiment with name 'Credit_Risk_Modeling' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/ruham/Downloads/credit-risk-model-week-4/credit-risk-model-week-4/mlruns/330440290389774077', creation_time=1765493159942, experiment_id='330440290389774077', last_update_time=1765493159942, lifecycle_stage='active', name='Credit_Risk_Modeling', tags={}>

In [12]:
rf_grid = GridSearchCV(rf_pipeline, param_grid=rf_params, scoring='roc_auc', cv=3, n_jobs=-1)
rf_grid.fit(X_train[num_cols], y_train)

best_rf = rf_grid.best_estimator_

# Evaluate
y_pred = best_rf.predict(X_test[num_cols])
metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_pred)
}

# Log in MLflow
with mlflow.start_run(run_name="RandomForest_Run"):
    mlflow.log_params(rf_grid.best_params_)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(best_rf, name="random_forest_model")


In [16]:
from sklearn.impute import SimpleImputer

# Create Gradient Boosting pipeline with imputer
gbm_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # fill NaNs with median
    ('scaler', StandardScaler()),
    ('classifier', GradientBoostingClassifier(random_state=42))
])


In [17]:
gbm_grid = GridSearchCV(
    gbm_pipeline, 
    param_grid=gbm_params, 
    scoring='roc_auc', 
    cv=3, 
    n_jobs=-1
)
gbm_grid.fit(X_train[num_cols], y_train)


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'classifier__learning_rate': [0.05, 0.1], 'classifier__max_depth': [3, 5], 'classifier__n_estimators': [100, 200]}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,5
,min_impurity_decrease,0.0


In [21]:
# -------------------------------
# 1️⃣ Get best estimators
# -------------------------------
best_gbm = gbm_grid.best_estimator_

# -------------------------------
# 2️⃣ Make predictions
# -------------------------------
y_pred_rf = best_rf.predict(X_test[num_cols])
y_pred_gbm = best_gbm.predict(X_test[num_cols])

# -------------------------------
# 3️⃣ Compute metrics
# -------------------------------
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def compute_metrics(y_true, y_pred):
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_pred)
    }

metrics_rf = compute_metrics(y_test, y_pred_rf)
metrics_gbm = compute_metrics(y_test, y_pred_gbm)

# -------------------------------
# 4️⃣ Compare in a DataFrame
# -------------------------------
import pandas as pd

comparison_df = pd.DataFrame([metrics_rf, metrics_gbm], index=['RandomForest', 'GradientBoosting'])
print(comparison_df)

# -------------------------------
# 5️⃣ Log GBM to MLflow
# -------------------------------
with mlflow.start_run(run_name="GBM_Run"):
    mlflow.log_params(gbm_grid.best_params_)
    mlflow.log_metrics(metrics_gbm)
    mlflow.sklearn.log_model(best_gbm, name="gradient_boosting_model")

print("GBM run logged in MLflow")


                  accuracy  precision    recall        f1   roc_auc
RandomForest      0.999321   0.995928  0.998186  0.997055  0.998827
GradientBoosting  0.994460   0.965410  0.987302  0.976233  0.991347
GBM run logged in MLflow


In [22]:
# Decide the best model based on ROC-AUC
if metrics_rf['roc_auc'] >= metrics_gbm['roc_auc']:
    best_model = best_rf
    best_model_name = "RandomForest"
else:
    best_model = best_gbm
    best_model_name = "GradientBoosting"

print(f"Best model: {best_model_name} with ROC-AUC = {max(metrics_rf['roc_auc'], metrics_gbm['roc_auc']):.4f}")


Best model: RandomForest with ROC-AUC = 0.9988


In [23]:
import mlflow.sklearn

# Start a run for the best model
with mlflow.start_run(run_name=f"{best_model_name}_Final_Run"):
    # Log the model
    mlflow.sklearn.log_model(best_model, name=f"{best_model_name}_model")
    
    # Log the metrics
    best_metrics = metrics_rf if best_model_name == "RandomForest" else metrics_gbm
    mlflow.log_metrics(best_metrics)

    # Register the model in MLflow Model Registry
    model_uri = f"runs:/{mlflow.active_run().info.run_id}/{best_model_name}_model"
    mlflow.register_model(model_uri=model_uri, name=f"CreditRisk_{best_model_name}")

print(f"{best_model_name} registered in MLflow Model Registry successfully!")


  return FileStore(store_uri)
Successfully registered model 'CreditRisk_RandomForest'.


RandomForest registered in MLflow Model Registry successfully!


Created version '1' of model 'CreditRisk_RandomForest'.


In [26]:
from pathlib import Path

# Define the path where you want to save (outside the notebook folder)
save_path = Path("../data/processed/model_comparison_metrics.csv")  # example: one level up

# Create parent directories if they don't exist
save_path.parent.mkdir(parents=True, exist_ok=True)

# Save the CSV
comparison_df.to_csv(save_path, index=True)
print(f"Model comparison metrics saved at: {save_path.resolve()}")


Model comparison metrics saved at: C:\Users\ruham\Downloads\credit-risk-model-week-4\credit-risk-model-week-4\data\processed\model_comparison_metrics.csv
