In [None]:
import json
from datetime import datetime

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from utils.config import load_config

# Load project settings; later cells read data_path, the per-split CSV file
# names and target_column from this object.
config = load_config("config.yaml")

Config loaded from config.yaml.


In [2]:
def _read_split(file_name):
    """Read one CSV split from config.data_path, indexed by its "id" column.

    The frame is passed through convert_dtypes() before being returned.
    """
    split_path = config.data_path / file_name
    return pd.read_csv(split_path, index_col="id").convert_dtypes()


# Train and validation splits come as feature/target pairs; the test split has
# features only.
X_train = _read_split(config.X_train_file)
y_train = _read_split(config.y_train_file)
X_val = _read_split(config.X_val_file)
y_val = _read_split(config.y_val_file)
X_test = _read_split(config.X_test_file)

In [3]:
# Per-model constructor keyword arguments, keyed by base-model name.
with open("data/best_params.json", "r") as params_file:
    best_params = json.loads(params_file.read())

In [None]:
# Base-model name -> estimator class. The order here is the insertion order of
# base_models, and therefore the column order of every prediction frame built
# from base_models.keys().
_estimator_classes = (
    ("logreg", LogisticRegression),
    ("mlp", MLPClassifier),
    ("knn", KNeighborsClassifier),
    ("gbdt", GradientBoostingClassifier),
    ("linear", SGDClassifier),
)

# Each estimator is constructed with the stored parameters under the same key.
base_models = {
    label: estimator_cls(**best_params[label])
    for label, estimator_cls in _estimator_classes
}


In [None]:
# Out-of-fold predictions: one column per base model, holding column 1 of
# predict_proba for every training row, produced by 5-fold stratified CV.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_target = y_train[config.target_column]

oof_preds = pd.DataFrame(index=X_train.index, columns=base_models.keys(), dtype=float)
for name, model in base_models.items():
    fold_proba = cross_val_predict(
        model,
        X_train,
        y_target,
        cv=cv,
        method="predict_proba",
        n_jobs=-1,
        verbose=3,
    )
    oof_preds[name] = fold_proba[:, 1]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  8.3min remaining: 12.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.2min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.3min remaining:  4.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 31.1min remaining: 46.7min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 31.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_

In [6]:
# Persist the out-of-fold predictions (index included) in the data directory.
oof_path = config.data_path / "oof_preds.csv"
oof_preds.to_csv(oof_path)

In [None]:
from scipy.stats import loguniform, randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

# Meta-learner: an XGBoost classifier fitted on the original features plus the
# base models' out-of-fold columns, tuned by randomized search on ROC AUC.

# Search space; scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(100, 1000),
    learning_rate=loguniform(1e-3, 0.3),
    max_depth=randint(2, 8),
    min_child_weight=randint(1, 10),
    subsample=uniform(0.6, 0.4),
    colsample_bytree=uniform(0.6, 0.4),
    gamma=uniform(0, 5),
    reg_alpha=loguniform(1e-4, 10),
    reg_lambda=loguniform(1e-4, 10),
)

xgb_clf = XGBClassifier(objective="binary:logistic", eval_metric="auc", random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=50,
    scoring="roc_auc",
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

# Meta-feature layout: raw feature columns first, then one column per base model.
meta_X_train = pd.concat([X_train, oof_preds], axis=1)
meta_y_train = y_train[config.target_column]
search.fit(meta_X_train, meta_y_train)

print("Best AUC:", search.best_score_)
print("Best params:", search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.06504856968981275, max_depth=6, min_child_weight=5, n_estimators=714, reg_alpha=0.01694976823788473, reg_lambda=0.0003161364550529618, subsample=0.7836995567863468; total time=  27.0s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.06504856968981275, max_depth=6, min_child_weight=5, n_estimators=714, reg_alpha=0.01694976823788473, reg_lambda=0.0003161364550529618, subsample=0.7836995567863468; total time=  27.2s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.06504856968981275, max_depth=6, min_child_weight=5, n_estimators=714, reg_alpha=0.01694976823788473, reg_lambda=0.0003161364550529618, subsample=0.7836995567863468; total time=  27.6s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.06504856968981275, max_depth=6, min_child_weig



[CV] END colsample_bytree=0.6923575302488596, gamma=1.2051273301300585, learning_rate=0.04926364988526881, max_depth=5, min_child_weight=8, n_estimators=134, reg_alpha=3.5204810455260365, reg_lambda=0.0019674328025306126, subsample=0.8650089137415928; total time=   9.8s
[CV] END colsample_bytree=0.6923575302488596, gamma=1.2051273301300585, learning_rate=0.04926364988526881, max_depth=5, min_child_weight=8, n_estimators=134, reg_alpha=3.5204810455260365, reg_lambda=0.0019674328025306126, subsample=0.8650089137415928; total time=   7.8s
[CV] END colsample_bytree=0.8430179407605753, gamma=0.8526206184364576, learning_rate=0.0014492412389916862, max_depth=5, min_child_weight=9, n_estimators=415, reg_alpha=0.06553013900933982, reg_lambda=0.008454394570685001, subsample=0.6063865008880857; total time=  26.6s
[CV] END colsample_bytree=0.6923575302488596, gamma=1.2051273301300585, learning_rate=0.04926364988526881, max_depth=5, min_child_weight=8, n_estimators=134, reg_alpha=3.520481045526036

In [None]:
# Fit each base model in place on the full training split; the prediction cells
# below call predict_proba on these fitted instances.
for name, model in base_models.items():
    start = datetime.now()
    print(f"Training {name}...")
    model.fit(X_train, y_train[config.target_column])
    # total_seconds() covers the whole interval (fractional seconds and any
    # whole days); timedelta.seconds is only the truncated 0-86399 s component.
    elapsed_minutes = (datetime.now() - start).total_seconds() / 60.0
    print(f"{name} training complete after {elapsed_minutes:.2f} minutes.")

Training logreg...
logreg training complete after 9.20 minutes.
Training mlp...
mlp training complete after 1.63 minutes.
Training knn...
knn training complete after 0.00 minutes.
Training gbdt...
gbdt training complete after 34.02 minutes.
Training linear...
linear training complete after 0.10 minutes.


In [None]:
from sklearn.metrics import roc_auc_score

# Score every fitted base model on the validation split, then the stacked
# meta-model. Each val_preds column holds column 1 of that model's predict_proba.
val_preds = pd.DataFrame(index=X_val.index, columns=base_models.keys(), dtype=float)

for name, model in base_models.items():
    start = datetime.now()
    print(f"Predicting with {name}...")
    val_preds[name] = model.predict_proba(X_val)[:, 1]
    # total_seconds() covers the whole interval (fractional seconds and any
    # whole days); timedelta.seconds is only the truncated 0-86399 s component.
    elapsed_minutes = (datetime.now() - start).total_seconds() / 60.0
    print(f"{name} prediction complete after {elapsed_minutes:.2f} minutes.")
    print(
        f"{name} AUC: {roc_auc_score(y_val[config.target_column], val_preds[name]):.4f}"
    )

# Same meta-feature layout as at fit time: raw features, then base-model columns.
meta_val_preds = search.best_estimator_.predict_proba(
    pd.concat([X_val, val_preds], axis=1)
)[:, 1]
print(
    f"Meta model AUC: {roc_auc_score(y_val[config.target_column], meta_val_preds):.4f}"
)

Predicting with logreg...
logreg prediction complete after 0.00 minutes.
logreg AUC: 0.9379
Predicting with mlp...
mlp prediction complete after 0.00 minutes.
mlp AUC: 0.9621
Predicting with knn...
knn prediction complete after 0.35 minutes.
knn AUC: 0.9368
Predicting with gbdt...
gbdt prediction complete after 0.00 minutes.
gbdt AUC: 0.9704
Predicting with linear...
linear prediction complete after 0.00 minutes.
linear AUC: 0.9358
Meta model AUC: 0.9713


In [None]:
# Build the test-split meta-features from the fitted base models, then score
# them with the tuned meta-model. Each test_preds column holds column 1 of that
# model's predict_proba.
test_preds = pd.DataFrame(index=X_test.index, columns=base_models.keys(), dtype=float)

for name, model in base_models.items():
    start = datetime.now()
    print(f"Predicting with {name}...")
    test_preds[name] = model.predict_proba(X_test)[:, 1]
    # total_seconds() covers the whole interval (fractional seconds and any
    # whole days); timedelta.seconds is only the truncated 0-86399 s component.
    elapsed_minutes = (datetime.now() - start).total_seconds() / 60.0
    print(f"{name} predictions complete after {elapsed_minutes:.2f} minutes.")

# Same meta-feature layout as at fit time: raw features, then base-model columns.
meta_test_preds = search.best_estimator_.predict_proba(
    pd.concat([X_test, test_preds], axis=1)
)[:, 1]

Predicting with logreg...
logreg predictions complete after 0.00 minutes.
Predicting with mlp...
mlp predictions complete after 0.00 minutes.
Predicting with knn...
knn predictions complete after 1.15 minutes.
Predicting with gbdt...
gbdt predictions complete after 0.03 minutes.
Predicting with linear...
linear predictions complete after 0.00 minutes.


In [None]:
# Two-column submission file: the test ids and the meta-model's column-1
# probabilities under the target column name; the frame's own index is not written.
submission_path = config.data_path / "submission_5.csv"
submission = pd.DataFrame(
    data={"id": X_test.index, config.target_column: meta_test_preds}
)
submission.to_csv(submission_path, index=False)