In [None]:
import json

import pandas as pd
from scipy.stats import loguniform, randint, uniform
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from utils.config import load_config

# Project settings loaded once up front; later cells read `data_path`, the
# `*_file` names and `target_column` from this object.
config = load_config("config.yaml")

Config loaded from config.yaml.


In [2]:
def _read_split(file_name):
    """Read one CSV split from ``config.data_path``, indexed by its ``id`` column.

    The loaded frame is passed through ``convert_dtypes()`` before being returned.
    """
    frame = pd.read_csv(config.data_path / file_name, index_col="id")
    return frame.convert_dtypes()


# Features and targets for the train/validation splits, plus the test features.
X_train = _read_split(config.X_train_file)
y_train = _read_split(config.y_train_file)
X_val = _read_split(config.X_val_file)
y_val = _read_split(config.y_val_file)
X_test = _read_split(config.X_test_file)

In [None]:
# Search spaces consumed by the hyper-parameter search loop. Each entry of
# `models_and_params` maps a short model name to an
# (estimator, param_distributions) pair. Lists are sampled as discrete
# choices; scipy distributions are sampled via their `rvs` method.

# ===== LOGISTIC REGRESSION =====
# Given as a list of sub-spaces so that every penalty is only ever drawn
# together with the solvers listed next to it; `l1_ratio` is only part of
# the elastic-net sub-space, and the unpenalised sub-space has no `C`.
logreg_space = [
    dict(
        penalty=["l2"],
        C=loguniform(1e-2, 1e4),
        solver=["lbfgs", "newton-cg", "sag", "saga"],
    ),
    dict(
        penalty=["l1"],
        C=loguniform(1e-2, 1e4),
        solver=["liblinear", "saga"],
    ),
    dict(
        penalty=["elasticnet"],
        C=loguniform(1e-2, 1e4),
        solver=["saga"],
        l1_ratio=uniform(0, 1),
    ),
    dict(
        penalty=[None],
        solver=["lbfgs", "newton-cg", "sag", "saga"],
    ),
]

# ===== MLP CLASSIFIER =====
mlp_space = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 50), (100, 100)],
    activation=["tanh", "relu"],
    solver=["sgd", "adam"],
    alpha=loguniform(1e-5, 1e-1),
    learning_rate=["constant", "invscaling", "adaptive"],
    learning_rate_init=loguniform(1e-4, 1e-2),
)

# ===== LINEAR MODEL =====
sgd_space = dict(
    loss=["log_loss", "modified_huber"],
    penalty=["l2", "l1", "elasticnet"],
    alpha=loguniform(1e-6, 1e-1),
    l1_ratio=uniform(0, 1),
    learning_rate=["optimal", "invscaling", "adaptive"],
    eta0=loguniform(1e-4, 1e0),
)

# ===== K-NEAREST NEIGHBORS =====
# NOTE(review): `p` presumably only affects metric="minkowski"; if so, the
# "euclidean"/"manhattan" choices duplicate p=2/p=1 -- confirm before pruning.
knn_space = dict(
    n_neighbors=randint(1, 20),  # scipy randint excludes the upper bound
    weights=["uniform", "distance"],
    algorithm=["auto", "kd_tree", "brute"],
    p=[1, 2],
    leaf_size=randint(10, 50),
    metric=["minkowski", "euclidean", "manhattan"],
)

# ===== GRADIENT BOOSTING CLASSIFIER =====
gbdt_space = dict(
    n_estimators=randint(100, 1000),
    learning_rate=loguniform(0.01, 1.0),
    max_depth=randint(3, 10),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 20),
    subsample=uniform(0.6, 0.4),  # uniform(loc, scale): values in [0.6, 1.0]
    max_features=["sqrt", "log2", None],
)

models_and_params = {
    "logreg": (LogisticRegression(max_iter=5000, random_state=42), logreg_space),
    "mlp": (MLPClassifier(max_iter=5000, random_state=42), mlp_space),
    "linear": (SGDClassifier(max_iter=5000, random_state=42), sgd_space),
    "knn": (KNeighborsClassifier(), knn_space),
    "gbdt": (GradientBoostingClassifier(random_state=42), gbdt_space),
}


In [None]:
best_models = {}
folds = 3
factor = 2

# Values that do not change between models are computed once up front.
# Resource budget, in training rows: the first halving round uses
# len(X_train) / factor**5 rows and no round may exceed 30% of the rows.
n_train_rows = len(X_train)
smallest_budget = int(n_train_rows / factor**5)
largest_budget = int(n_train_rows * 0.3)
train_target = y_train[config.target_column]
banner = "=" * 40

# Run one successive-halving random search per model family and keep the
# best estimator of each search under the model's short name.
for name, (model, search_space) in models_and_params.items():
    print(f"{banner}\n{name}\n{banner}")

    stratified_cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    search = HalvingRandomSearchCV(
        estimator=model,
        param_distributions=search_space,
        factor=factor,
        min_resources=smallest_budget,
        max_resources=largest_budget,
        cv=stratified_cv,
        scoring="roc_auc",
        random_state=42,
        n_jobs=-1,
        verbose=3,
    )
    search.fit(X_train, train_target)

    best_models[name] = search.best_estimator_
    print(f"{name} best score: {search.best_score_:.4f}")
    print("\nBest Hyperparameters Found:")
    print(search.best_params_)

logreg
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 21093
max_resources_: 202500
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 9
n_resources: 21093
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 3/3] END penalty=None, solver=lbfgs;, score=(train=0.940, test=0.941) total time=   0.1s
[CV 1/3] END penalty=None, solver=lbfgs;, score=(train=0.938, test=0.927) total time=   0.2s
[CV 2/3] END penalty=None, solver=lbfgs;, score=(train=0.941, test=0.930) total time=   0.1s
[CV 3/3] END C=0.08632008168602544, penalty=l2, solver=sag;, score=(train=0.939, test=0.940) total time=   1.9s
[CV 1/3] END C=0.022310108018679227, l1_ratio=0.8661761457749352, penalty=elasticnet, solver=saga;, score=(train=0.931, test=0.924) total time=   0.7s
[CV 1/3] END C=0.08632008168602544, penalty=l2, solver=sag;, score=(train=0.936, test=0.927) total time=   1.7s
[CV 2/3] END C=0.08632008168602544, penalty=l2, solver=sag;, score=(train=



[CV 3/3] END learning_rate=0.05611516415334506, max_depth=7, max_features=None, min_samples_leaf=11, min_samples_split=9, n_estimators=800, subsample=0.8387400631785948;, score=(train=1.000, test=0.953) total time=  50.7s
[CV 2/3] END learning_rate=0.05611516415334506, max_depth=7, max_features=None, min_samples_leaf=11, min_samples_split=9, n_estimators=800, subsample=0.8387400631785948;, score=(train=1.000, test=0.951) total time=  51.1s
[CV 1/3] END learning_rate=0.01901024531987036, max_depth=6, max_features=None, min_samples_leaf=15, min_samples_split=13, n_estimators=666, subsample=0.9932923543227152;, score=(train=0.988, test=0.953) total time=  42.9s
[CV 2/3] END learning_rate=0.01901024531987036, max_depth=6, max_features=None, min_samples_leaf=15, min_samples_split=13, n_estimators=666, subsample=0.9932923543227152;, score=(train=0.991, test=0.954) total time=  43.0s
[CV 1/3] END learning_rate=0.7663082680255849, max_depth=8, max_features=log2, min_samples_leaf=9, min_samples

In [13]:
print("Validation ROC AUC scores:\n" + "-" * 40)

# Score against the same target column the models were fitted on. Passing
# the whole `y_val` frame only works while the CSV holds exactly one
# non-index column; selecting the column makes the label vector explicit.
y_val_target = y_val[config.target_column]

# Report the hold-out ROC AUC of every tuned model, one line per model.
for name, model in best_models.items():
    # Column 1 of predict_proba is used as the score of the positive class
    # (assumes a binary target -- TODO confirm).
    val_preds = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val_target, val_preds)
    print(f"{name:<10} : {auc:.4f}")

Validation ROC AUC scores:
----------------------------------------
logreg     : 0.9379
mlp        : 0.9621
linear     : 0.9358
knn        : 0.9368
gbdt       : 0.9704


In [14]:
# Gather the complete parameter set (`get_params()`) of every tuned
# estimator, keyed by the model's short name.
best_params = {}
for name, model in best_models.items():
    best_params[name] = model.get_params()

# Write the collected parameters as indented JSON into the data directory.
params_file = config.data_path / "best_params.json"
with open(params_file, "w") as f:
    json.dump(best_params, f, indent=4)