# 04 — Hyperparameter Tuning

CV-based tuning while keeping preprocessing inside the pipeline.

In [None]:
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Directory holding the saved artifacts this notebook reads and writes.
OUT = Path("..", "outputs")

# NOTE: joblib.load unpickles its input; only load artifacts you trust.
X_train, X_test, y_train, y_test = joblib.load(OUT / "data_splits.joblib")
preprocessor = joblib.load(OUT / "preprocessor.joblib")

# One shared, seeded splitter so every search below is scored on the same folds.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

def evaluate_best(best_pipe, name):
    """Score a fitted estimator against the module-level ``X_test`` / ``y_test``.

    Args:
        best_pipe: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        name: Label stored under the ``"model"`` key of the result.

    Returns:
        A dict with keys ``model``, ``accuracy``, ``precision``, ``recall``,
        ``f1`` and ``roc_auc``. ``roc_auc`` is ``None`` when the estimator has
        no ``predict_proba``.
    """
    predictions = best_pipe.predict(X_test)

    # ROC-AUC is computed from the second predict_proba column; without
    # predict_proba it is reported as None.
    positive_scores = None
    if hasattr(best_pipe, "predict_proba"):
        positive_scores = best_pipe.predict_proba(X_test)[:, 1]

    metrics = {"model": name}
    metrics["accuracy"] = accuracy_score(y_test, predictions)
    # zero_division=0 makes the undefined cases score 0.
    metrics["precision"] = precision_score(y_test, predictions, zero_division=0)
    metrics["recall"] = recall_score(y_test, predictions, zero_division=0)
    metrics["f1"] = f1_score(y_test, predictions, zero_division=0)
    metrics["roc_auc"] = (
        None if positive_scores is None else roc_auc_score(y_test, positive_scores)
    )
    return metrics


In [None]:
from sklearn.linear_model import LogisticRegression

# One metrics dict per tuned model is appended to this list.
tuned_results = []

# The preprocessor is a pipeline step, so the search fits it together with
# the model on each training fold.
logreg = LogisticRegression(class_weight="balanced", max_iter=4000, random_state=42)
logreg_pipe = Pipeline([("preprocess", preprocessor), ("model", logreg)])

# Only C is actually searched; solver and penalty are pinned to single values.
param_grid_logreg = dict(
    model__C=[0.01, 0.1, 1.0, 5.0, 10.0],
    model__solver=["lbfgs"],
    model__penalty=["l2"],
)

gs_logreg = GridSearchCV(
    logreg_pipe,
    param_grid_logreg,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gs_logreg.fit(X_train, y_train)

print("Best LR CV ROC-AUC:", gs_logreg.best_score_)
print("Best LR Params:", gs_logreg.best_params_)

# Score the refit best pipeline on the test split and record it.
tuned_results.append(
    evaluate_best(gs_logreg.best_estimator_, "Tuned Logistic Regression")
)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# --- Random forest: randomized search over tree count, depth and split rules ---
rf = RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=42)
rf_pipe = Pipeline([("preprocess", preprocessor), ("model", rf)])

param_dist_rf = dict(
    model__n_estimators=[200, 400, 600, 800],
    model__max_depth=[None, 6, 10, 14, 18],
    model__min_samples_split=[2, 5, 10, 20],
    model__min_samples_leaf=[1, 2, 5, 10],
    model__max_features=["sqrt", "log2", None],
)

# 20 sampled candidates, each scored by ROC-AUC on the shared CV folds.
rs_rf = RandomizedSearchCV(
    rf_pipe,
    param_dist_rf,
    n_iter=20,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rs_rf.fit(X_train, y_train)

print("Best RF CV ROC-AUC:", rs_rf.best_score_)
print("Best RF Params:", rs_rf.best_params_)
tuned_results.append(evaluate_best(rs_rf.best_estimator_, "Tuned Random Forest"))

# --- Gradient boosting: same search setup with a boosting-specific space ---
gb = GradientBoostingClassifier(random_state=42)
gb_pipe = Pipeline([("preprocess", preprocessor), ("model", gb)])

param_dist_gb = dict(
    model__n_estimators=[100, 200, 300, 500],
    model__learning_rate=[0.01, 0.05, 0.1, 0.2],
    model__max_depth=[2, 3, 4],
    model__subsample=[0.7, 0.85, 1.0],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 5],
)

rs_gb = RandomizedSearchCV(
    gb_pipe,
    param_dist_gb,
    n_iter=20,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rs_gb.fit(X_train, y_train)

print("Best GB CV ROC-AUC:", rs_gb.best_score_)
print("Best GB Params:", rs_gb.best_params_)
tuned_results.append(evaluate_best(rs_gb.best_estimator_, "Tuned Gradient Boosting"))

# Last expression of the cell: show the results so far, best ROC-AUC first.
pd.DataFrame(tuned_results).sort_values("roc_auc", ascending=False)


In [None]:
# Optional: XGBoost tuning (may take time)
try:
    from xgboost import XGBClassifier
except Exception:
    !pip -q install xgboost
    from xgboost import XGBClassifier

neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
spw = float(neg) / float(pos)

xgb = XGBClassifier(random_state=42, eval_metric="logloss", n_jobs=-1, scale_pos_weight=spw)
xgb_pipe = Pipeline(steps=[("preprocess", preprocessor), ("model", xgb)])

param_dist_xgb = {
    "model__n_estimators": [300, 600, 900, 1200],
    "model__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "model__max_depth": [3, 4, 5, 6],
    "model__subsample": [0.7, 0.85, 1.0],
    "model__colsample_bytree": [0.7, 0.85, 1.0],
    "model__min_child_weight": [1, 3, 5, 10],
    "model__reg_lambda": [0.5, 1.0, 2.0, 5.0]
}

rs_xgb = RandomizedSearchCV(
    estimator=xgb_pipe,
    param_distributions=param_dist_xgb,
    n_iter=20,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

rs_xgb.fit(X_train, y_train)
print("Best XGB CV ROC-AUC:", rs_xgb.best_score_)
print("Best XGB Params:", rs_xgb.best_params_)
tuned_results.append(evaluate_best(rs_xgb.best_estimator_, "Tuned XGBoost"))

tuned_df = pd.DataFrame(tuned_results).sort_values(by="roc_auc", ascending=False)
tuned_df


In [None]:
# Rebuild the table from tuned_results so this cell does not rely on tuned_df
# from the optional XGBoost cell.
tuned_df = pd.DataFrame(tuned_results).sort_values("roc_auc", ascending=False)

# Persist the comparison table without the index column.
tuned_df.to_csv(OUT / "tuned_model_metrics.csv", index=False)
print("Saved:", OUT / "tuned_model_metrics.csv")
