# 06 — Hyperparameter Tuning & Export

Runs a hyperparameter search (RandomizedSearch / GridSearch) over a few candidate models, compares them by ROC AUC,
and exports the best **full pipeline** (preprocessor + estimator) to `models/final_pipeline.pkl`.

In [None]:
import numpy as np, pandas as pd, joblib
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import loguniform, randint
from sklearn.metrics import roc_auc_score

# Single seed so that every use of RANDOM_STATE yields reproducible results.
RANDOM_STATE = 42

train = pd.read_csv("../data/processed/train.csv")
test = pd.read_csv("../data/processed/test.csv")

# The label column may go by several names; use the first known candidate
# that is present in the training frame.
target = next((t for t in ["target", "num", "condition", "disease"] if t in train.columns), None)
if target is None:
    # Fail here with a clear message instead of an opaque KeyError from
    # DataFrame.drop(columns=[None]) further down.
    raise ValueError(
        "No target column found in train.csv; expected one of "
        "'target', 'num', 'condition', 'disease'."
    )
if target not in test.columns:
    raise ValueError(f"Target column {target!r} is missing from test.csv.")

# Split each frame into features (everything but the label) and the label.
X_train, y_train = train.drop(columns=[target]), train[target]
X_test, y_test = test.drop(columns=[target]), test[target]

# NOTE: joblib.load unpickles arbitrary objects -- only load artifacts that
# were produced by this project, never files from an untrusted source.
preprocessor = joblib.load("../models/preprocessor.pkl")

# Every candidate is the same two-step pipeline: the loaded preprocessor
# ("prep") followed by a classifier ("clf"). Only the classifier differs.
candidates = {
    label: Pipeline([("prep", preprocessor), ("clf", estimator)])
    for label, estimator in (
        ("LogReg", LogisticRegression(max_iter=3000, random_state=RANDOM_STATE)),
        ("RF", RandomForestClassifier(random_state=RANDOM_STATE)),
        # probability=True so the fitted SVC exposes predict_proba.
        ("SVM", SVC(probability=True, random_state=RANDOM_STATE)),
    )
}

# Sampling distributions for the randomized search, keyed like `candidates`.
# The "clf__" prefix routes each parameter to the pipeline's "clf" step.
param_dist = {
    "LogReg": {
        "clf__C": loguniform(1e-3, 1e2),
    },
    "RF": {
        "clf__n_estimators": randint(200, 800),
        "clf__max_depth": randint(3, 20),
    },
    "SVM": {
        "clf__C": loguniform(1e-2, 1e2),
        "clf__gamma": loguniform(1e-4, 1e-1),
    },
}

# Running record of the best candidate seen so far. AUC lies in [0, 1], so
# -1 guarantees the first successfully scored model replaces it.
best_auc = -1
best_name = None
best_model = None

for name, pipe in candidates.items():
    # Single braces / single backslash: the previous "{{name}}" and "\\n"
    # printed the literal text instead of the interpolated values.
    print(f"\nTuning {name} ...")
    search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist[name],
        n_iter=20,
        cv=5,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        scoring="roc_auc",
    )
    search.fit(X_train, y_train)

    # Column 1 is taken as the positive-class probability.
    # NOTE(review): this and scoring="roc_auc" assume a binary target --
    # confirm the label column has exactly two classes.
    y_prob = search.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
    print(f"{name} AUC: {auc:.4f}")

    # NOTE(review): the winner is chosen by AUC on the test split, so the
    # reported best AUC is an optimistic estimate. Consider selecting on
    # search.best_score_ (cross-validated) and keeping the test split for
    # a final, one-off evaluation.
    if auc > best_auc:
        best_auc, best_name, best_model = auc, name, search.best_estimator_

if best_model is None:
    # Reached only if no candidate produced a comparable AUC (e.g. NaN);
    # refuse to overwrite the exported pipeline with a pickled None.
    raise RuntimeError("No candidate model produced a valid AUC; nothing to export.")

print(f"\nBest model: {best_name} (AUC={best_auc:.4f})")
# Persist the full fitted pipeline (preprocessor + tuned estimator).
joblib.dump(best_model, "../models/final_pipeline.pkl")
print("Saved ../models/final_pipeline.pkl")