In [68]:
import catboost as ctb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

# Чтение данных

In [7]:
data = pd.read_csv('../data/processed/bank-additional-encoded-and-scaled.csv',
                   delimiter=';', index_col=0)
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,-0.980752,1,1,2,0,2,0,0,6,0,-0.209228,999,-0.351356,1,-1.206054,-1.185448,-1.240939,-1.331707,-0.914779,0
1,-0.107991,7,2,3,0,0,0,1,6,0,0.569634,999,-0.351356,1,0.649441,0.715193,0.892269,0.711698,0.332862,0
2,-1.465619,7,1,3,0,2,0,1,4,4,-0.59866,999,-0.351356,1,0.841389,1.528273,-0.283172,0.773427,0.836535,0
3,-0.204965,7,1,2,0,1,1,1,4,0,0.180203,999,-0.351356,1,0.841389,1.528273,-0.283172,0.771697,0.836535,0
4,0.667795,0,1,6,0,2,0,0,7,1,-0.59866,999,-0.351356,1,-0.11835,-0.655478,-0.326707,0.328632,0.398028,0


In [8]:
y = data['y'].copy()
X = data.drop('y', axis=1).copy()

Посмотрим на сбалансированность классов:

In [19]:
y.value_counts()

0    3668
1     451
Name: y, dtype: int64

Видно, что классы сильно не сбалансированные. Разделим выборку на обучающую и тестовую, при этом учтем, что классы несбалансированные с помощью параметра `stratify`. 

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, 
                                                    random_state=42, stratify=y)

# Catboost

С полным списком параметров можно ознакомиться $\href{https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html}{здесь}$. Нас будут интересовать следующие:

* iterations 
* learning_rate
* loss_function
* eval_metric
* bootstrap_type
* class_weights/auto_class_weights	
* boosting_type

* depth
* max_leaves
* min_data_in_leaf


Подберем гиперпараметры с помощью GridSearch. В качестве метрики будем использовать F1, поскольку  классы несбалансированные. Также добавим параметр, отвечающий за балансировку классов

In [None]:
ctb_classifier = ctb.CatBoostClassifier(eval_metric='F1',
                                       loss_function='Logloss',
                                       auto_class_weights='SqrtBalanced',
                                       logging_level='Silent')

grid = {
    'iterations': np.arange(50, 1000, 100),
    'learning_rate': np.linspace(0.001, 0.05, 10),
    'depth': np.arange(4, 10),
    'l2_leaf_reg': np.linspace(1, 10, 10)

}

search_result = ctb_classifier.grid_search(
                            grid,
                            X=X_train, 
                            y=y_train, 
                            plot=True
)

![image.png](attachment:image.png)

Обучим классификатор с лучшими параметрами:

In [193]:
ctb_classifier = ctb.CatBoostClassifier(eval_metric='F1',
                                       loss_function='Logloss',
                                       auto_class_weights='SqrtBalanced',
                                       logging_level='Silent',
                                       depth = 8,
                                       iterations = 150,
                                       l2_leaf_reg = 6.0,
                                       learning_rate = 0.023,
                                       min_data_in_leaf = 5
                                       )

In [194]:
ctb_classifier.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f75b6607510>

In [195]:
y_pred = ctb_classifier.predict(X_test)

In [196]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1211
           1       0.47      0.43      0.45       149

    accuracy                           0.88      1360
   macro avg       0.70      0.68      0.69      1360
weighted avg       0.88      0.88      0.88      1360



# XGBoost

Сначала посмотрим на классификатор с параметрами, аналогичными полученным в CatBoost

In [253]:
xgb_classifier = xgb.XGBClassifier(silent=True, 
                      scale_pos_weight=1211/149,
                      learning_rate=0.023,  
                      colsample_bytree = 0.4,
                      subsample = 0.9,
                      objective='binary:logistic',
                      eval_metric='auc',
                      n_estimators=150, 
                      reg_alpha = 6,
                      max_depth=8, 
                      gamma=20)

In [254]:
xgb_classifier.fit(X_train, y_train)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, eval_metric='auc',
              gamma=20, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.023, max_delta_step=0,
              max_depth=8, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=150, n_jobs=0,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=6, reg_lambda=1, scale_pos_weight=8.12751677852349,
              silent=True, subsample=0.9, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [255]:
y_pred = xgb_classifier.predict(X_test)

In [256]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.88      0.91      1211
           1       0.38      0.60      0.46       149

    accuracy                           0.85      1360
   macro avg       0.66      0.74      0.69      1360
weighted avg       0.88      0.85      0.86      1360



А теперь подберем гиперпараметры с помощью GridSearch

In [275]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

params = {
            'eta': np.arange(0.1, 0.26, 0.05),
            'subsample': np.arange(0.5, 1.0, 0.15),
            'n_estimators': np.arange(50, 500, 100),
            'reg_alpha': [1, 3, 6, 10],
            'max_depth': [4, 6, 8]
    
        }

skf = StratifiedKFold(n_splits=3, shuffle = True, random_state=42)


xgb_model = xgb.XGBClassifier(silent=True, 
                      scale_pos_weight=1211/149,
                      objective='binary:logistic')


grid = GridSearchCV(xgb_model, 
                    param_grid = params,  
                    n_jobs = -1, 
                    cv = skf.split(X_train, y_train),
                    scoring='f1_macro')


In [276]:
grid.fit(X_train, y_train)
best_params = grid.best_params_
best_params

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




{'eta': 0.1,
 'max_depth': 8,
 'n_estimators': 50,
 'reg_alpha': 3,
 'subsample': 0.65}

In [277]:
best_classifier = grid.best_estimator_

In [278]:
y_pred = best_classifier.predict(X_test)

In [279]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1211
           1       0.39      0.46      0.42       149

    accuracy                           0.86      1360
   macro avg       0.66      0.68      0.67      1360
weighted avg       0.87      0.86      0.87      1360



Результат получился хуже, чем изначальные параметры, подобранные по аналогии с кэтбустом.