In [None]:
import pathlib

import joblib

# ==== imports / setup ====
import mlflow
import optuna
import pandas as pd
import sklearn
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sqlalchemy import create_engine
from xgboost import XGBClassifier

from utils.config import load_config
from utils.data import initialize_project

# Notebook-wide setup: MLflow tracking location, experiment names, the shared
# CV splitter / scoring metric, project config, and the database engine.

# point to the same tracking URI you used during base tuning
# ("file:./mlruns" is a path relative to the current working directory)
mlflow.set_tracking_uri("file:./mlruns")
# Experiment the tuned base models are looked up in (see load_base_estimator calls).
BASE_EXPERIMENT = "base-models"
# Experiment the stacking / meta-learner runs are logged to (see run_meta_study).
STACKING_EXPERIMENT = "meta-model"
# NOTE(review): parallel_backend is called as a bare statement rather than in a
# `with` block; presumably this relies on the constructor switching the joblib
# backend as a side effect -- confirm against the installed joblib version.
joblib.parallel_backend("threading")

# Single seeded splitter; later cells pass it both to StackingClassifier (inner
# folds) and to cross_val_score (outer folds).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Metric name handed to cross_val_score's `scoring` argument.
SCORING = "roc_auc"


# config.yaml is expected one directory above the notebook's working directory.
config_path = pathlib.Path.cwd().parent / "config.yaml"
config = load_config(config_path)
# Ask scikit-learn to produce pandas output from transformers.
sklearn.set_config(transform_output="pandas")

# initialize_project returns the database URI used to build the engine.
# NOTE(review): init_db=False presumably skips (re)creating the database -- confirm in utils.data.
db_uri = initialize_project(config=config, init_db=False)
engine = create_engine(db_uri)

  from .autonotebook import tqdm as notebook_tqdm


Config loaded from /Users/haukesteffen/dev/TabularShenanigans/config.yaml


In [2]:
# ==== helpers: find + load base models from MLflow ====


def find_parent_run_id(model_name: str, experiment_name: str) -> str:
    """
    Return the run id of the newest '<model_name>_tuning' run in an experiment.

    Args:
        model_name: Model prefix of the run name, e.g. 'XGBoost' for the run
            named 'XGBoost_tuning'.
        experiment_name: Name of the MLflow experiment to search.

    Returns:
        The `run_id` of the matching run with the latest start time.

    Raises:
        ValueError: If the experiment does not exist, or if it contains no run
            with the expected name.
    """
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise ValueError(f"Experiment not found: {experiment_name}")

    parent_run_name = f"{model_name}_tuning"
    # Newest first, and only one row requested: row 0 is the most recent run.
    matches = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        filter_string=f"attributes.run_name = '{parent_run_name}'",
        order_by=["attributes.start_time DESC"],
        max_results=1,
    )
    if matches.empty:
        raise ValueError(f"No parent run found for {parent_run_name}")

    newest = matches.iloc[0]
    return newest["run_id"]


def load_base_estimator(model_name: str, experiment_name: str):
    """
    Load the best-model artifact logged by the newest '<model_name>_tuning' run.

    The artifact is expected under '<model_name>_best_model' in that run.
    'XGBoost' is loaded through the MLflow xgboost flavor; every other model
    name (MLP, KNN) goes through the sklearn flavor.

    The loaded object is returned as-is: this function does not clone or
    reset it.

    Args:
        model_name: Model prefix used for both the run name and the artifact name.
        experiment_name: MLflow experiment that holds the tuning run.

    Returns:
        Whatever the chosen flavor's `load_model` returns for the artifact URI.

    Raises:
        ValueError: Propagated from find_parent_run_id when the experiment or
            the run cannot be found.
    """
    parent_run_id = find_parent_run_id(model_name, experiment_name)
    # Artifact naming convention from base tuning: "<model>_best_model".
    model_uri = f"runs:/{parent_run_id}/{model_name}_best_model"

    # Pick the flavor module lazily so only the one that is needed is touched.
    flavor = mlflow.xgboost if model_name == "XGBoost" else mlflow.sklearn
    return flavor.load_model(model_uri)


In [3]:
# Seed for every random step in this cell, so that Restart & Run All reproduces
# the same shuffles and the same subsample (the unseeded version drew different
# rows on every execution).
DATA_SEED = 42
# Upper bound on the number of training rows kept for faster experiments.
N_SUBSAMPLE = 200_000

with engine.begin() as connection:
    # Preprocessed tables are named "<competition>-train/test-preprocessed".
    train = (
        pd.read_sql(
            sql=f"SELECT * FROM [{config.competition_name}-train-preprocessed]",
            con=connection,
        )
        .convert_dtypes()
        .sample(frac=1.0, random_state=DATA_SEED)  # seeded shuffle
    )
    test = (
        pd.read_sql(
            sql=f"SELECT * FROM [{config.competition_name}-test-preprocessed]",
            con=connection,
        )
        .convert_dtypes()
        .sample(frac=1.0, random_state=DATA_SEED)  # seeded shuffle
    )

train = train.set_index(config.id_column)

# subsample for faster experiments; cap at the number of available rows so a
# table smaller than N_SUBSAMPLE does not raise in DataFrame.sample
train = train.sample(n=min(N_SUBSAMPLE, len(train)), random_state=DATA_SEED)

X_train = train.drop(columns=[config.target_column])
y_train = train[config.target_column]

test = test.set_index(config.id_column)
X_test = test

# 50/50 hold-out split. Stratify on the target so both halves keep the class
# balance, consistent with the StratifiedKFold used for all downstream CV.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42, stratify=y_train
)

In [4]:
# ==== build base estimators from MLflow ====
# Load the tuned base learners (in this order) from the base-tuning experiment.
xgb_base, mlp_base, knn_base = (
    load_base_estimator(model_name, BASE_EXPERIMENT)
    for model_name in ("XGBoost", "MLPClassifier", "KNeighborsClassifier")
)

# (short name, estimator) pairs in the format StackingClassifier's `estimators` expects.
BASE_ESTIMATORS = list(
    zip(("xgb", "mlp", "knn"), (xgb_base, mlp_base, knn_base))
)


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 3101.64it/s] 
Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 3472.11it/s] 
Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 168.21it/s]  


In [5]:
# ==== Optuna objective: tune XGBoost meta learner ====
def meta_objective(trial):
    """
    Optuna objective: cross-validated score of a stack with an XGBoost meta learner.

    Samples the meta learner's hyperparameters from `trial`, wraps the
    module-level BASE_ESTIMATORS in a StackingClassifier, scores it with
    cross_val_score on the module-level X_train / y_train using the shared
    `cv` splitter and SCORING metric, and logs the trial as a nested MLflow run.

    Args:
        trial: Optuna trial used to suggest hyperparameters.

    Returns:
        Mean cross-validation score across folds.
    """
    # Suggestions are evaluated top to bottom, which fixes the key order of
    # trial.params (and therefore of the params logged to MLflow).
    meta_kwargs = {
        "n_estimators": trial.suggest_int("meta_n_estimators", 50, 400),
        "max_depth": trial.suggest_int("meta_max_depth", 2, 6),
        "learning_rate": trial.suggest_float("meta_lr", 1e-3, 0.3, log=True),
        "subsample": trial.suggest_float("meta_subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("meta_colsample_bytree", 0.6, 1.0),
        "reg_lambda": trial.suggest_float("meta_reg_lambda", 1e-3, 10.0, log=True),
        "reg_alpha": trial.suggest_float("meta_reg_alpha", 1e-4, 1.0, log=True),
    }
    meta = XGBClassifier(
        **meta_kwargs,
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
    )

    stack = StackingClassifier(
        estimators=BASE_ESTIMATORS,
        final_estimator=meta,
        stack_method="predict_proba",  # AUC needs probabilities
        # passthrough=False: the meta learner is trained on the base models'
        # predictions only, NOT on the original features.
        passthrough=False,
        cv=cv,
        n_jobs=-1,
    )

    # X_train, y_train come from the data-prep cell earlier in the notebook.
    fold_scores = cross_val_score(
        stack, X_train, y_train, cv=cv, scoring=SCORING, n_jobs=-1
    )
    score = fold_scores.mean()

    # One nested MLflow run per trial.
    with mlflow.start_run(nested=True):
        for tag_key, tag_value in (
            ("model", "StackingClassifier"),
            ("meta_family", "XGBoost"),
        ):
            mlflow.set_tag(tag_key, tag_value)
        mlflow.log_params(trial.params)
        mlflow.log_metric("cv_roc_auc", score)

    return score


In [None]:
import warnings

from sklearn.exceptions import ConvergenceWarning

# Silence ConvergenceWarning (any message, any module) for the rest of the session.
warnings.simplefilter("ignore", category=ConvergenceWarning)


# ==== run the study and log the final fitted stack ====
def run_meta_study(n_trials=50, seed=42):
    """
    Tune the XGBoost meta learner with Optuna, then fit and log the best stack.

    Opens a parent MLflow run named 'stacking_meta_xgb' in STACKING_EXPERIMENT,
    optimises `meta_objective` for `n_trials` trials (each trial logs its own
    nested run), logs the best score and parameters, rebuilds the stack with
    the best meta-learner parameters, fits it on the module-level
    X_train / y_train and logs the fitted model.

    Args:
        n_trials: Number of Optuna trials to run.
        seed: Seed for Optuna's TPE sampler so the sequence of suggested
            hyperparameters is reproducible across kernel restarts. Pass
            None for an unseeded sampler (the previous behaviour).

    Returns:
        The finished optuna Study.
    """
    mlflow.set_experiment(STACKING_EXPERIMENT)
    with mlflow.start_run(run_name="stacking_meta_xgb"):
        # NOTE(review): this tag is never advanced past "started"; confirm
        # whether a "finished" phase should be set when the run completes.
        mlflow.set_tag("phase", "stacking_tuning_started")
        mlflow.log_param("n_trials", n_trials)
        mlflow.log_param("sampler_seed", seed)
        mlflow.log_param("base_models", ",".join(name for name, _ in BASE_ESTIMATORS))
        mlflow.log_param("passthrough", False)
        mlflow.set_tag("meta_family", "XGBoost")

        # Seeded TPE sampler: without a seed every run explores a different
        # sequence of hyperparameters and results cannot be reproduced.
        sampler = optuna.samplers.TPESampler(seed=seed)
        study = optuna.create_study(direction="maximize", sampler=sampler)
        study.optimize(meta_objective, n_trials=n_trials)

        # summarize best
        best_params = study.best_trial.params
        mlflow.log_metric("best_cv_roc_auc", study.best_value)
        mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})

        # Rebuild the meta learner with the winning parameters. The fixed
        # arguments mirror the ones used inside meta_objective.
        best_meta = XGBClassifier(
            n_estimators=best_params["meta_n_estimators"],
            max_depth=best_params["meta_max_depth"],
            learning_rate=best_params["meta_lr"],
            subsample=best_params["meta_subsample"],
            colsample_bytree=best_params["meta_colsample_bytree"],
            reg_lambda=best_params["meta_reg_lambda"],
            reg_alpha=best_params["meta_reg_alpha"],
            eval_metric="logloss",
            random_state=42,
            n_jobs=-1,
        )
        best_stack = StackingClassifier(
            estimators=BASE_ESTIMATORS,
            final_estimator=best_meta,
            stack_method="predict_proba",
            passthrough=False,
            cv=cv,
            n_jobs=-1,
        )

        # Fit on X_train, i.e. the training half of the earlier hold-out split
        # (not the full training table), then log the fitted stack.
        best_stack.fit(X_train, y_train)
        mlflow.sklearn.log_model(
            best_stack,
            name="StackingClassifier_best_model",
            # .iloc makes the "first five rows" intent explicit and independent
            # of the index dtype.
            input_example=X_train.iloc[:5],
        )

    return study


# Run the 50-trial search, then report the winning meta-learner parameters and score.
study = run_meta_study(n_trials=50)
print(f"Best meta params: {study.best_trial.params} AUC: {study.best_value}")


[I 2025-08-27 12:31:52,850] A new study created in memory with name: no-name-cc131cd8-6852-4519-8c4e-1dea2b6c6c44
[I 2025-08-27 12:51:41,573] Trial 0 finished with value: 0.962938404834141 and parameters: {'meta_n_estimators': 374, 'meta_max_depth': 2, 'meta_lr': 0.016898405694014965, 'meta_subsample': 0.6927097358536879, 'meta_colsample_bytree': 0.9377691910068838, 'meta_reg_lambda': 0.15549239597877748, 'meta_reg_alpha': 0.003055869878616567}. Best is trial 0 with value: 0.962938404834141.
[I 2025-08-27 13:11:35,568] Trial 1 finished with value: 0.9629531696320882 and parameters: {'meta_n_estimators': 110, 'meta_max_depth': 3, 'meta_lr': 0.07830184497345524, 'meta_subsample': 0.8176498232304393, 'meta_colsample_bytree': 0.9168373789461088, 'meta_reg_lambda': 0.001757597264298947, 'meta_reg_alpha': 0.9786016024534376}. Best is trial 1 with value: 0.9629531696320882.
[I 2025-08-27 13:31:45,642] Trial 2 finished with value: 0.9588099592564857 and parameters: {'meta_n_estimators': 187, '