In [8]:
import catboost
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from preprocessing import TrainingPreProcessor

# Single seed reused below by the CatBoost models and the StratifiedKFold splitter.
RANDOM_STATE = 0

# Raw training table, read from a path relative to the notebook's working directory.
train_set = pd.read_csv("input/train.csv")

# Fit the project-local preprocessor on the training table; CLIENTNUM is passed
# as a column to ignore (presumably a row identifier — confirm in preprocessing.py).
training_preprocessor = TrainingPreProcessor()
training_preprocessor.fit(train_set, ignore_columns=["CLIENTNUM"])

# transform() returns a pair that is unpacked into the feature matrix X and target y.
X, y = training_preprocessor.transform(train_set)
# Class weights exposed by the preprocessor; they are forwarded to CatBoost's
# `class_weights`. NOTE(review): whether fit() or transform() populates this
# attribute is not visible here, so keep this line after both calls.
class_weight = training_preprocessor.class_weight

In [6]:
# Untuned base estimator handed to the grid search: only the seed and the
# class weights taken from the preprocessor are fixed; every other
# hyper-parameter keeps CatBoost's default until the search overrides it.
catboost_model = catboost.CatBoostClassifier(
    class_weights=class_weight,
    random_state=RANDOM_STATE,
)

## Grid search

In [7]:
# Stratified 3-fold splitter, shuffled with the notebook-wide seed so the same
# folds are produced every time it is iterated.
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Search space: 3 depths x 4 learning rates x 6 iteration budgets = 72 candidates.
grid_parameters = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.03, 0.05, 0.1],
    iterations=[10, 30, 50, 100, 300, 500],
)

In [4]:
# Exhaustive search over `grid_parameters`, scored by balanced accuracy on the
# stratified folds defined above. All available cores are used (n_jobs=-1) and
# per-fit progress is reported at sklearn's most detailed level (verbose=3).
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=grid_parameters,
    cv=kfold,
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=3,
)

In [5]:
# Run the full search on the preprocessed training data.
grid_search.fit(X, y)

# Report the winning hyper-parameters and their mean cross-validated score,
# one per line.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

# Bare last expression so the notebook displays the best estimator.
grid_search.best_estimator_

Fitting 3 folds for each of 72 candidates, totalling 216 fits
0:	learn: 0.6457202	total: 150ms	remaining: 44.9s
1:	learn: 0.5998631	total: 155ms	remaining: 23.1s
2:	learn: 0.5620370	total: 159ms	remaining: 15.8s
3:	learn: 0.5271172	total: 164ms	remaining: 12.1s
4:	learn: 0.4996344	total: 168ms	remaining: 9.94s
5:	learn: 0.4762069	total: 173ms	remaining: 8.48s
6:	learn: 0.4543191	total: 177ms	remaining: 7.4s
7:	learn: 0.4325027	total: 181ms	remaining: 6.6s
8:	learn: 0.4160071	total: 186ms	remaining: 6s
9:	learn: 0.4013742	total: 191ms	remaining: 5.53s
10:	learn: 0.3901922	total: 195ms	remaining: 5.12s
11:	learn: 0.3763558	total: 200ms	remaining: 4.79s
12:	learn: 0.3672238	total: 204ms	remaining: 4.51s
13:	learn: 0.3597148	total: 208ms	remaining: 4.26s
14:	learn: 0.3497061	total: 212ms	remaining: 4.03s
15:	learn: 0.3378960	total: 217ms	remaining: 3.85s
16:	learn: 0.3305672	total: 222ms	remaining: 3.69s
17:	learn: 0.3223792	total: 226ms	remaining: 3.54s
18:	learn: 0.3160604	total: 230ms	r

<catboost.core.CatBoostClassifier at 0x1ad67a5a470>

In [4]:
# Best configuration reported by the grid search, hard-coded so this cell can be
# run without repeating the search:
# {'iterations': 300, 'learning_rate': 0.05, 'max_depth': 5}
best_model = catboost.CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    max_depth=5,
    class_weights=class_weight,
    random_state=RANDOM_STATE,
)

### Avaliação do melhor modelo encontrado

In [9]:
# Cross-validated evaluation of the tuned model on the same stratified splitter
# used by the grid search. One value per fold is collected for every metric and
# the result is reported as "mean +- standard deviation".
metrics = {
    'balanced_accuracy': [],
    'accuracy': [],
    'f1': [],
    'roc_auc': [],
}
for train_index, val_index in kfold.split(X, y):
    # Positional indexing, as in the original cell; presumably X and y are
    # NumPy arrays — TODO confirm against TrainingPreProcessor.transform.
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # CatBoost retrains from scratch on every fit() call, so reusing the same
    # estimator object across folds does not leak information between them.
    best_model.fit(X_train, y_train)

    # Hard labels feed the threshold-based metrics.
    y_val_pred = best_model.predict(X_val)
    # ROC AUC is a ranking metric and must be computed from continuous scores.
    # With hard 0/1 predictions it reduces to balanced accuracy in the binary
    # case, so the previously reported value only duplicated that metric.
    # Column 1 is the probability of the class with the greater label; this
    # assumes a binary target, as the default (binary) f1_score call below
    # already does.
    y_val_score = best_model.predict_proba(X_val)[:, 1]

    metrics['balanced_accuracy'].append(balanced_accuracy_score(y_val, y_val_pred))
    metrics['accuracy'].append(accuracy_score(y_val, y_val_pred))
    metrics['f1'].append(f1_score(y_val, y_val_pred))
    metrics['roc_auc'].append(roc_auc_score(y_val, y_val_score))

for metric, values in metrics.items():
    print(f"{metric}: {np.mean(values):.4f} +- {np.std(values):.4f}")

0:	learn: 0.6452426	total: 139ms	remaining: 41.5s
1:	learn: 0.6019457	total: 143ms	remaining: 21.3s
2:	learn: 0.5677523	total: 147ms	remaining: 14.6s
3:	learn: 0.5339141	total: 151ms	remaining: 11.1s
4:	learn: 0.5064719	total: 154ms	remaining: 9.08s
5:	learn: 0.4871195	total: 157ms	remaining: 7.71s
6:	learn: 0.4649477	total: 161ms	remaining: 6.75s
7:	learn: 0.4426029	total: 165ms	remaining: 6.02s
8:	learn: 0.4243832	total: 168ms	remaining: 5.45s
9:	learn: 0.4120213	total: 172ms	remaining: 4.98s
10:	learn: 0.4016261	total: 176ms	remaining: 4.61s
11:	learn: 0.3857883	total: 179ms	remaining: 4.3s
12:	learn: 0.3779608	total: 182ms	remaining: 4.02s
13:	learn: 0.3646275	total: 186ms	remaining: 3.79s
14:	learn: 0.3535983	total: 189ms	remaining: 3.6s
15:	learn: 0.3466183	total: 194ms	remaining: 3.43s
16:	learn: 0.3409775	total: 197ms	remaining: 3.28s
17:	learn: 0.3336431	total: 200ms	remaining: 3.13s
18:	learn: 0.3280504	total: 204ms	remaining: 3.01s
19:	learn: 0.3176354	total: 208ms	remaining