In [None]:
import pathlib

import optuna
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sqlalchemy import create_engine
from xgboost import XGBClassifier

from utils.config import load_config
from utils.data import initialize_project

# The project configuration file lives one directory above the current
# working directory (i.e. the directory the notebook kernel was started in).
config_path = pathlib.Path.cwd().parent.joinpath("config.yaml")
config = load_config(config_path)

# Have scikit-learn transformers return pandas DataFrames rather than arrays.
sklearn.set_config(transform_output="pandas")

# initialize_project returns the database URI used to build the engine.
# NOTE(review): init_db=False presumably skips (re)creating the database --
# confirm against utils.data.
db_uri = initialize_project(config=config, init_db=False)
engine = create_engine(db_uri)

Config loaded from /Users/haukesteffen/dev/TabularShenanigans/config.yaml.


In [2]:
# Load the preprocessed train/test tables and shuffle their rows.
# The shuffles are seeded: train_test_split below selects rows by position, so
# an unseeded shuffle would yield a different train/validation partition on
# every run. Because the Optuna studies in this notebook are persisted and
# resumed, that would score resumed trials on different data and let rows that
# are now in the hold-out set influence earlier tuning.
with engine.begin() as connection:
    train = (
        pd.read_sql(
            sql=f"SELECT * FROM [{config.competition_name}-train-preprocessed]",
            con=connection,
        )
        .convert_dtypes()
        .sample(frac=1.0, random_state=42)
    )
    test = (
        pd.read_sql(
            sql=f"SELECT * FROM [{config.competition_name}-test-preprocessed]",
            con=connection,
        )
        .convert_dtypes()
        .sample(frac=1.0, random_state=42)
    )

# Index by the id column and separate the features from the target.
train = train.set_index(config.id_column)
X_train = train.drop(columns=[config.target_column])
y_train = train[config.target_column]

# The test table is used as-is for features after indexing by id.
test = test.set_index(config.id_column)
X_test = test

# Hold out 10% of the training rows for the final validation of tuned models.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [6]:
def _xgboost_params(trial):
    """Build XGBClassifier kwargs: six tuned values plus fixed metric/seed/n_jobs."""
    return {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 9),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "eval_metric": "logloss",
        "random_state": 42,
        "n_jobs": -1,
    }


def _kneighbors_params(trial):
    """Build KNeighborsClassifier kwargs: neighbours, weighting and Minkowski p."""
    return {
        "n_neighbors": trial.suggest_int("n_neighbors", 3, 30),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "p": trial.suggest_int("p", 1, 2),
        "n_jobs": -1,
    }


def _svc_params(trial):
    """Build SVC kwargs: tuned C and gamma for an RBF kernel with probabilities on."""
    return {
        "C": trial.suggest_float("C", 1e-4, 1e3, log=True),
        "gamma": trial.suggest_float("gamma", 1e-4, 1e2, log=True),
        "kernel": "rbf",
        "probability": True,
        "random_state": 42,
    }


def _mlp_params(trial):
    """Build MLPClassifier kwargs for a single hidden layer of tuned width.

    The layer width is suggested under the name "n_units" and wrapped into the
    one-element tuple passed as ``hidden_layer_sizes``.
    """
    return {
        "hidden_layer_sizes": (trial.suggest_int("n_units", 50, 200),),
        "activation": trial.suggest_categorical("activation", ["relu", "tanh"]),
        "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
        "learning_rate_init": trial.suggest_float(
            "learning_rate_init", 1e-4, 1e-2, log=True
        ),
        "max_iter": 1000,
        "random_state": 42,
    }


def _gaussian_nb_params(trial):
    """Return no kwargs: nothing is tuned or fixed for GaussianNB."""
    return {}


def _logistic_regression_params(trial):
    """Build LogisticRegression kwargs: tuned penalty and C on the liblinear solver."""
    return {
        "solver": "liblinear",
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
        "C": trial.suggest_float("C", 1e-4, 1e2, log=True),
        "max_iter": 2000,
        "random_state": 42,
    }


# One entry per candidate model:
#   "name"   - label used in study names and printed reports
#   "model"  - estimator class to instantiate
#   "params" - callable taking an Optuna trial and returning constructor kwargs
model_configs = [
    {"name": "XGBoost", "model": XGBClassifier, "params": _xgboost_params},
    {"name": "KNeighbors", "model": KNeighborsClassifier, "params": _kneighbors_params},
    {"name": "SVC", "model": SVC, "params": _svc_params},
    {"name": "MLPClassifier", "model": MLPClassifier, "params": _mlp_params},
    {"name": "GaussianNB", "model": GaussianNB, "params": _gaussian_nb_params},
    {
        "name": "LogisticRegression",
        "model": LogisticRegression,
        "params": _logistic_regression_params,
    },
]

In [None]:
def tune_model(model_config, storage_location, X_train, y_train, n_trials=50):
    """
    Run (or resume) an Optuna study for one model configuration and return
    its best trial.

    Each trial builds the estimator from the configuration's parameter
    callable and scores it by mean 5-fold cross-validated ROC AUC on the
    given training data. The study is stored under the name
    ``study-<model name>`` so a later call picks up where this one stopped.

    Parameters
    ----------
    model_config : dict
        Must provide "name" (str), "model" (estimator class) and "params"
        (callable taking a trial and returning constructor kwargs).
    storage_location : str
        Optuna storage URL where the study is persisted.
    X_train, y_train
        Features and target passed to ``cross_val_score``.
    n_trials : int, default 50
        Total number of *completed* trials the study should contain.

    Returns
    -------
    The study's best trial (``study.best_trial``).

    Raises
    ------
    ValueError
        Per the Optuna documentation, ``study.best_trial`` raises if the
        study holds no completed trial (e.g. ``n_trials <= 0`` on a new study).
    """
    model_class = model_config["model"]
    param_lambda = model_config["params"]
    study_name = f"study-{model_config['name']}"  # e.g., "study-XGBoost"

    def objective(trial):
        # Sample hyperparameters, build the estimator, return mean CV ROC AUC.
        params = param_lambda(trial)
        model = model_class(**params)
        scores = cross_val_score(
            model, X_train, y_train, scoring="roc_auc", cv=5, n_jobs=1
        )
        return scores.mean()

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
        storage=storage_location,  # where the study is persisted
        study_name=study_name,  # unique per model
        load_if_exists=True,  # resume instead of failing on an existing study
    )

    # Only trials that actually finished count towards the budget. Failed
    # trials, or trials left in RUNNING state by an interrupted kernel, must
    # not make the study look finished earlier than it is.
    completed_trials = study.get_trials(
        deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
    )
    remaining_trials = n_trials - len(completed_trials)

    if remaining_trials <= 0:
        print(f"Study '{study_name}' already completed. Skipping.")
    else:
        print(f"Resuming study '{study_name}'. {remaining_trials} trials left to run.")
        study.optimize(objective, n_trials=remaining_trials)

    return study.best_trial


# --- 4. Run the Tuning for All Models ---
storage_db = "sqlite:///optuna.db"

best_models = []
# The loop variable is deliberately not named `config`: that name holds the
# project configuration loaded at the top of the notebook, and rebinding it
# here would break any later re-run of the data-loading cell.
for model_config in model_configs:
    print(f"--- Tuning {model_config['name']} ---")
    best_trial = tune_model(model_config, storage_db, X_train, y_train)

    best_models.append(
        {
            "name": model_config["name"],
            "best_score_cv": best_trial.value,
            "best_params": best_trial.params,
            "model_class": model_config["model"],
            # Kept so the full constructor kwargs can be rebuilt later.
            "param_builder": model_config["params"],
        }
    )
    print(f"Best CV ROC AUC: {best_trial.value:.4f}")
    print(f"Best params: {best_trial.params}\n")

# --- 5. Evaluate the Tuned Models on the Validation Set ---
print("\n--- Final Evaluation on Validation Set ---")
for model_info in best_models:
    # best_trial.params contains only the values Optuna suggested, keyed by
    # suggestion name. It lacks the fixed settings (e.g. SVC's
    # probability=True, LogisticRegression's solver) and uses "n_units" rather
    # than MLPClassifier's hidden_layer_sizes. Replaying the search-space
    # callable with a FixedTrial reproduces exactly the kwargs used during
    # tuning, and avoids forcing random_state onto estimators that do not
    # accept it (KNeighborsClassifier, GaussianNB).
    replay_trial = optuna.trial.FixedTrial(model_info["best_params"])
    final_params = model_info["param_builder"](replay_trial)

    # Refit the best configuration on the training split (hold-out excluded).
    FinalModel = model_info["model_class"]
    final_model = FinalModel(**final_params)
    final_model.fit(X_train, y_train)

    # Evaluate on the hold-out validation set
    y_pred_proba = final_model.predict_proba(X_val)[:, 1]
    val_score = roc_auc_score(y_val, y_pred_proba)

    print(f"{model_info['name']}:")
    print(f"  Validation ROC AUC: {val_score:.4f}")

--- Tuning XGBoost ---


[I 2025-08-19 19:47:42,762] A new study created in RDB with name: study-XGBoost


Resuming study 'study-XGBoost'. 50 trials left to run.


[I 2025-08-19 19:49:46,034] Trial 0 finished with value: 0.9631957528365922 and parameters: {'n_estimators': 500, 'learning_rate': 0.2536999076681771, 'max_depth': 8, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132}. Best is trial 0 with value: 0.9631957528365922.
[I 2025-08-19 19:50:31,872] Trial 1 finished with value: 0.9660641963106237 and parameters: {'n_estimators': 246, 'learning_rate': 0.19030368381735815, 'max_depth': 7, 'subsample': 0.8540362888980227, 'colsample_bytree': 0.5102922471479012, 'gamma': 4.8495492608099715}. Best is trial 1 with value: 0.9660641963106237.
[I 2025-08-19 19:53:02,674] Trial 2 finished with value: 0.9639123355508641 and parameters: {'n_estimators': 866, 'learning_rate': 0.020589728197687916, 'max_depth': 4, 'subsample': 0.5917022549267169, 'colsample_bytree': 0.6521211214797689, 'gamma': 2.6237821581611893}. Best is trial 1 with value: 0.9660641963106237.
[I 2025-08-19 19:55:16,727] Trial 3 finishe

Best CV ROC AUC: 0.9673
Best params: {'n_estimators': 928, 'learning_rate': 0.043587132140967134, 'max_depth': 7, 'subsample': 0.7381642880984616, 'colsample_bytree': 0.7845159696285168, 'gamma': 1.435507759148219}

--- Tuning KNeighbors ---
Resuming study 'study-KNeighbors'. 50 trials left to run.
