# 04_hyperparameter_search

In [1]:
# ==========================================================
# 0. Imports
# ==========================================================
import os
import sys
import json
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import randint, uniform

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# ==========================================================
# 1. Load preprocessed data + selected features
# ==========================================================
# Load processed data
df = pd.read_csv("../data/processed/diabetes_processed.csv")

# Load selected features
with open("../src/selected_features.json", "r") as f:
    selected_features = json.load(f)

# FIX: Keep only the features that actually exist in the dataframe
selected_features = [f for f in selected_features if f in df.columns]

# If empty â†’ fallback to all columns except target
if len(selected_features) == 0:
    print("Warning: No selected features found. Using all features.")
    selected_features = df.drop("diabetes", axis=1).columns.tolist()

# Now extract
X = df[selected_features]
y = df["diabetes"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================================================
# 2. Define models + parameter grids
# ==========================================================

# ----- Logistic Regression Grid Search -----
log_reg = LogisticRegression(max_iter=1000)
log_reg_grid = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "solver": ["lbfgs"],
}

# ----- Random Forest Grid Search -----
rf = RandomForestClassifier()
rf_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
}

# ----- Randomized Search Parameters -----
rf_random_grid = {
    "n_estimators": randint(100, 500),
    "max_depth": [None] + list(range(5, 30)),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 10),
}

# ==========================================================
# 3. Run GRID SEARCH
# ==========================================================
print("Running GridSearchCV...")

grid_results = {}

# Logistic Regression Grid Search
log_reg_grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=log_reg_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)
log_reg_grid_search.fit(X_train, y_train)

grid_results["LogisticRegression"] = {
    "best_params": log_reg_grid_search.best_params_,
    "best_score": log_reg_grid_search.best_score_
}

# Random Forest Grid Search
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)
rf_grid_search.fit(X_train, y_train)

grid_results["RandomForest_Grid"] = {
    "best_params": rf_grid_search.best_params_,
    "best_score": rf_grid_search.best_score_
}

# ==========================================================
# 4. Run RANDOMIZED SEARCH
# ==========================================================
print("Running RandomizedSearchCV...")

rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_random_grid,
    n_iter=20,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42
)
rf_random_search.fit(X_train, y_train)

grid_results["RandomForest_Random"] = {
    "best_params": rf_random_search.best_params_,
    "best_score": rf_random_search.best_score_
}

# ==========================================================
# 5. Evaluate best models on test set
# ==========================================================
def evaluate(model, name):
    preds = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, preds)
    print(f"{name} Test ROC-AUC:", score)
    return score

test_scores = {
    "LogReg_Test": evaluate(log_reg_grid_search.best_estimator_, "LogisticRegression"),
    "RF_Grid_Test": evaluate(rf_grid_search.best_estimator_, "RandomForest Grid"),
    "RF_Random_Test": evaluate(rf_random_search.best_estimator_, "RandomForest Random"),
}

# Save test scores
grid_results["TestScores"] = test_scores

# ==========================================================
# 6. Save best parameters to JSON
# ==========================================================
with open("../models/best_params.json", "w") as f:
    json.dump(grid_results, f, indent=4)

print("Best parameters saved to models/best_params.json")




Running GridSearchCV...
Running RandomizedSearchCV...
LogisticRegression Test ROC-AUC: 0.9502415057375256
RandomForest Grid Test ROC-AUC: 0.9707342692394161
RandomForest Random Test ROC-AUC: 0.970688658637323
Best parameters saved to models/best_params.json


In [None]:
# ==========================================================
# 7. (Optional) MLflow logging placeholder
# ==========================================================
# import mlflow
# mlflow.log_params(grid_results["RandomForest_Grid"]["best_params"])
# mlflow.log_metric("rf_grid_best_score", grid_results["RandomForest_Grid"]["best_score"])
# ...

  return FileStore(store_uri, store_uri)
