In [34]:
import os
import pathlib

import joblib
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import optuna
import pandas as pd
import sklearn
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sqlalchemy import create_engine
from xgboost import XGBClassifier

from utils.config import load_config
from utils.data import initialize_project

config_path = pathlib.Path.cwd().parent / "config.yaml"
config = load_config(config_path)
sklearn.set_config(transform_output="pandas")
joblib.parallel_backend("threading")
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("base-models")
print("Tracking URI:", mlflow.get_tracking_uri(), "CWD:", os.getcwd())

db_uri = initialize_project(config=config, init_db=False)
engine = create_engine(db_uri)

2025/08/26 22:33:20 INFO mlflow.tracking.fluent: Experiment with name 'base-models' does not exist. Creating a new experiment.


Config loaded from /Users/haukesteffen/dev/TabularShenanigans/config.yaml
Tracking URI: file:./mlruns CWD: /Users/haukesteffen/dev/TabularShenanigans/src


In [12]:
with engine.begin() as connection:
    train = (
        pd.read_sql(
            sql=f"SELECT * FROM [{config.competition_name}-train-preprocessed]",
            con=connection,
        )
        .convert_dtypes()
        .sample(frac=1.0)
    )
    test = (
        pd.read_sql(
            sql=f"SELECT * FROM [{config.competition_name}-test-preprocessed]",
            con=connection,
        )
        .convert_dtypes()
        .sample(frac=1.0)
    )

train = train.set_index(config.id_column)

# subsample for faster experiments
train = train.sample(n=200_000)

X_train = train.drop(columns=[config.target_column])
y_train = train[config.target_column]

test = test.set_index(config.id_column)
X_test = test

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42
)

In [None]:
import warnings

warnings.filterwarnings(
    "ignore",
    message="Choices for a categorical distribution should be a tuple of None, bool, int, float and str",
    category=UserWarning,
)

# ------------------
# Config
# ------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# ------------------
# Helper functions
# ------------------
def evaluate_with_logging(model_name, params, trial, model):
    score = cross_val_score(
        model, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1
    ).mean()

    # log this trial
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.log_metric("cv_score", score)
        mlflow.set_tag("trial_number", trial.number)
        mlflow.set_tag("model", model_name)

    return score


def make_estimator_and_logger(model_name: str, params: dict):
    """
    Returns (estimator, log_fn).
    log_fn(model, name, input_example) logs with the right MLflow flavor.
    """
    if model_name == "XGBoost":
        from xgboost import XGBClassifier

        est = XGBClassifier(**{**params, "n_jobs": -1, "random_state": 42})

        def log_fn(model, name, input_example=None):
            mlflow.xgboost.log_model(model, name=name, input_example=input_example)

        return est, log_fn

    elif model_name == "MLPClassifier":
        from sklearn.neural_network import MLPClassifier

        est = MLPClassifier(**{**params, "random_state": 42})

        def log_fn(model, name, input_example=None):
            mlflow.sklearn.log_model(model, name=name, input_example=input_example)

        return est, log_fn

    elif model_name == "KNeighborsClassifier":
        from sklearn.neighbors import KNeighborsClassifier

        est = KNeighborsClassifier(**params)

        def log_fn(model, name, input_example=None):
            mlflow.sklearn.log_model(model, name=name, input_example=input_example)

        return est, log_fn

    else:
        raise ValueError(f"Unknown model_name: {model_name}")


# ------------------
# Define search spaces
# ------------------
def xgb_objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 250),
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "eval_metric": "logloss",
        "random_state": 42,
        "n_jobs": -1,
    }
    model = XGBClassifier(**params)
    return evaluate_with_logging("XGBoost", params, trial, model)


def mlp_objective(trial):
    hidden_layer_sizes = trial.suggest_categorical(
        "hidden_layer_sizes",
        [
            (50,),
            (100,),
            (150,),
            (200,),
            (50, 50),
            (100, 100),
            (150, 150),
            (200, 100),
            (100, 50),
            (200, 50),
        ],
    )
    params = {
        "hidden_layer_sizes": hidden_layer_sizes,
        "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
        "learning_rate_init": trial.suggest_float(
            "learning_rate_init", 1e-4, 1e-1, log=True
        ),
        "max_iter": 500,
        "random_state": 42,
    }
    model = MLPClassifier(**params)
    return evaluate_with_logging("MLPClassifier", params, trial, model)


def knn_objective(trial):
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 2, 24),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "p": trial.suggest_int("p", 1, 2),
    }
    model = KNeighborsClassifier(**params)
    return evaluate_with_logging("KNeighborsClassifier", params, trial, model)


# ------------------
# Run studies + MLflow logging
# ------------------
def run_study(model_name, objective, n_trials=30):
    with mlflow.start_run(run_name=f"{model_name}_tuning") as parent_run:
        mlflow.set_tag("phase", "tuning_started")
        mlflow.log_param("n_trials", n_trials)

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)

        # best params/score
        best_params = study.best_trial.params
        mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})
        mlflow.log_metric("best_cv_score", study.best_value)

        # fit full data + log with correct flavor
        est, log_fn = make_estimator_and_logger(model_name, best_params)
        est.fit(X_train, y_train)
        log_fn(est, name=f"{model_name}_best_model", input_example=X_train[:5])

        # (optional) persist study
        # os.makedirs("studies", exist_ok=True)
        # joblib.dump(study, f"studies/{model_name}_study.pkl")

    return best_params, study.best_value


# ------------------
# Run all models
# ------------------
best_mlp, mlp_score = run_study("MLPClassifier", mlp_objective, n_trials=50)
best_xgb, xgb_score = run_study("XGBoost", xgb_objective, n_trials=50)
best_knn, knn_score = run_study("KNeighborsClassifier", knn_objective, n_trials=50)

print(f"Best XGB: {best_xgb} with score {xgb_score:.4f}")
print(f"Best MLP: {best_mlp} with score {mlp_score:.4f}")
print(f"Best KNN: {best_knn} with score {knn_score:.4f}")

[I 2025-08-26 22:33:24,967] A new study created in memory with name: no-name-99bafc94-e97a-4d0d-bc59-03ed8e99dec8


[I 2025-08-26 22:42:55,348] Trial 0 finished with value: 0.9293730781868526 and parameters: {'hidden_layer_sizes': (100, 100), 'alpha': 0.0006628544610972243, 'learning_rate_init': 0.00012870445586976405}. Best is trial 0 with value: 0.9293730781868526.
[I 2025-08-26 22:51:40,775] Trial 1 finished with value: 0.9272695929912406 and parameters: {'hidden_layer_sizes': (100, 100), 'alpha': 0.001212893565513204, 'learning_rate_init': 0.00025345304505937686}. Best is trial 0 with value: 0.9293730781868526.
[I 2025-08-26 22:54:34,686] Trial 2 finished with value: 0.9348790877646133 and parameters: {'hidden_layer_sizes': (150,), 'alpha': 0.000441144253835466, 'learning_rate_init': 0.002907448745661375}. Best is trial 2 with value: 0.9348790877646133.
[I 2025-08-26 22:54:58,196] Trial 3 finished with value: 0.937531827308365 and parameters: {'hidden_layer_sizes': (50,), 'alpha': 0.043495439678340984, 'learning_rate_init': 0.03936709169767397}. Best is trial 3 with value: 0.937531827308365.
[I 

Best XGB: {'n_estimators': 229, 'max_depth': 5, 'learning_rate': 0.07209577934066372, 'subsample': 0.860986688255628, 'colsample_bytree': 0.8711552099849538} with score 0.9617
Best MLP: {'hidden_layer_sizes': (50,), 'alpha': 0.05585037847660343, 'learning_rate_init': 0.00010513356077035176} with score 0.9567
Best KNN: {'n_neighbors': 24, 'weights': 'distance', 'p': 1} with score 0.9319
