In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score

from preprocessing import TrainingPreProcessor

RANDOM_STATE = 0

train_set = pd.read_csv("input/train.csv")

training_preprocessor = TrainingPreProcessor()
training_preprocessor.fit(train_set, ignore_columns=["CLIENTNUM"])

X, y = training_preprocessor.transform(train_set)

class_weight = training_preprocessor.class_weight
xgb_class_weight = compute_sample_weight(class_weight='balanced', y=y)

In [2]:
xgboost_model = xgb.XGBClassifier(seed=RANDOM_STATE)

## Grid search

In [3]:
kfold = StratifiedKFold(3, shuffle=True, random_state=RANDOM_STATE)

grid_parameters = {
    "n_estimators": [100, 300, 500, 700, 900],
    "subsample": [0.2, 0.4, 0.6, 0.8, 1.0],
    "min_child_weight": [1, 3, 5],
    "gamma": [0, 0.5, 1, 5, 10],
    "colsample_bytree": [0.9, 0.95, 1.0],
    "max_depth": [3, 7, 11, 15],
}

grid_search = GridSearchCV(
    xgboost_model,
    grid_parameters,
    scoring="balanced_accuracy",
    cv=kfold,
    verbose=3,
    n_jobs=-1,
)

In [11]:
grid_search.fit(X, y, sample_weight=xgb_class_weight)
print(grid_search.best_params_)
print(grid_search.best_score_)
grid_search.best_estimator_

Fitting 3 folds for each of 4500 candidates, totalling 13500 fits
{'colsample_bytree': 1.0, 'gamma': 1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}
0.9595274202376629


In [12]:
grid_parameters_2 = {
    "n_estimators": [100, 150, 200, 250, 300],
    "subsample": [0.7, 0.75, 0.8, 0.85, 0.9],
    "min_child_weight": [2, 3, 4],
    "gamma": [0.75, 1, 3],
    "colsample_bytree": [0.95, 0.975, 1.0],
    "max_depth": [2, 3, 4],
}

grid_search_2 = GridSearchCV(
    xgboost_model,
    grid_parameters_2,
    scoring="balanced_accuracy",
    cv=kfold,
    verbose=3,
    n_jobs=-1,
)

In [14]:
grid_search_2.fit(X, y, sample_weight=xgb_class_weight)
print(grid_search_2.best_params_)
print(grid_search_2.best_score_)
grid_search_2.best_estimator_

Fitting 3 folds for each of 2025 candidates, totalling 6075 fits
{'colsample_bytree': 1.0, 'gamma': 3, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 250, 'subsample': 0.9}
0.9605273251543056


In [4]:
# {'colsample_bytree': 1.0, 'gamma': 3, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 250, 'subsample': 0.9}
best_model = xgb.XGBClassifier(seed=RANDOM_STATE, colsample_bytree=1.0, gamma=3, max_depth=3, min_child_weight=2, n_estimators=250, subsample=0.9)

In [7]:
metrics = {
    'balanced_accuracy':[],
    'accuracy':[],
    'f1':[],
    'roc_auc':[],
}
for train_index, val_index in kfold.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    xgb_class_weight = compute_sample_weight(class_weight='balanced', y=y_train)
    best_model.fit(X_train, y_train, sample_weight=xgb_class_weight)
    y_val_pred = best_model.predict(X_val)
    metrics['balanced_accuracy'].append(balanced_accuracy_score(y_val, y_val_pred))
    metrics['accuracy'].append(accuracy_score(y_val, y_val_pred))
    metrics['f1'].append(f1_score(y_val, y_val_pred))
    metrics['roc_auc'].append(roc_auc_score(y_val, y_val_pred))
for metric, values in metrics.items():
    print(f"{metric}: {np.array(metrics[metric]).mean():.4f} +- {np.array(metrics[metric]).std():.4f}")

balanced_accuracy: 0.9558 +- 0.0042
accuracy: 0.9635 +- 0.0034
f1: 0.9780 +- 0.0021
roc_auc: 0.9558 +- 0.0042
