In [None]:
import joblib
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold

enable_halving_search_cv

from utils.config import load_config

config = load_config("config.yaml")

Config loaded from config.yaml.


In [3]:
X_train = pd.read_csv(config.data_path / config.X_train_file).convert_dtypes()
y_train = pd.read_csv(config.data_path / config.y_train_file).convert_dtypes()
X_val = pd.read_csv(config.data_path / config.X_val_file).convert_dtypes()
y_val = pd.read_csv(config.data_path / config.y_val_file).convert_dtypes()
X_test = pd.read_csv(config.data_path / config.X_test_file).convert_dtypes()
test_ids = pd.read_csv(config.data_path / config.test_ids).convert_dtypes()

In [4]:
def make_submission_file(preds, filename="submission.csv"):
    submission = pd.DataFrame({"id": test_ids["id"], config.target_column: preds})
    submission.to_csv(f"./data/{filename}", index=False)
    print(f"Submission file saved as {filename}")

In [10]:
folds = 5
factor = 3

stratified_cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

param_distributions = {
    "n_estimators": randint(100, 500),
    "learning_rate": uniform(0.01, 0.2),
    "max_depth": randint(3, 10),
    "subsample": uniform(0.7, 0.3),
    "min_samples_leaf": randint(1, 5),
    "min_samples_split": randint(2, 10),
}

gbc_base = GradientBoostingClassifier(random_state=42)

halving_search = HalvingRandomSearchCV(
    estimator=gbc_base,
    param_distributions=param_distributions,
    cv=stratified_cv,
    scoring="roc_auc",
    n_jobs=-1,
    random_state=42,
    verbose=3,
    factor=factor,
    min_resources=int(len(X_train) / factor**5),
)


with joblib.parallel_backend("threading"):
    halving_search.fit(X_train, y_train[config.target_column])
best_gbc = halving_search.best_estimator_

print("\nBest Hyperparameters Found:")
print(halving_search.best_params_)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 2777
max_resources_: 675000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 243
n_resources: 2777
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV 2/5] END learning_rate=0.0412037280884873, max_depth=5, min_samples_leaf=3, min_samples_split=4, n_estimators=187, subsample=0.8001125833417065;, score=(train=0.999, test=0.952) total time=   4.4s
[CV 3/5] END learning_rate=0.0412037280884873, max_depth=5, min_samples_leaf=3, min_samples_split=4, n_estimators=187, subsample=0.8001125833417065;, score=(train=0.998, test=0.934) total time=   4.5s
[CV 1/5] END learning_rate=0.0412037280884873, max_depth=5, min_samples_leaf=3, min_samples_split=4, n_estimators=187, subsample=0.8001125833417065;, score=(train=1.000, test=0.951) total time=   4.6s
[CV 4/5] END learning_rate=0.0849080237694725, max_depth=7, min_samples_leaf=3, min_samples_split=4, n_estimators=171, subsample

In [11]:
val_preds = best_gbc.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, val_preds)

0.9647495435832092

In [12]:
# refit with complete training data
X_train_complete = pd.concat([X_train, X_val]).reset_index(drop=True)
y_train_complete = pd.concat([y_train, y_val]).reset_index(drop=True)
best_gbc.fit(X_train_complete, y_train_complete)
test_preds = best_gbc.predict_proba(X_test)[:, 1]

  y = column_or_1d(y, warn=True)


In [14]:
make_submission_file(test_preds, filename="submission_3.csv")

Submission file saved as submission_3.csv
