In [58]:
import pandas as pd
import sklearn
import catboost as ctb

from sklearn import metrics

from sklearn.metrics import classification_report
from matplotlib import pyplot as plt

# Постановка задачи

# Предобработка

Предобработка данных заключалась в следующих шагах:

* закодированы категориальные признаки
* нормализованы числовые признаки
* удалены выбросы
* удалены пары признаков с большой корреляцией

Подробнее с предварительным анализом можно ознакомиться [здесь](https://github.com/katyalatypova/Methods-of-artificial-intelligence-in-data-analysis-2020/blob/master/notebooks/Preliminary%20analysis.ipynb).

# Исследованные модели

На данный момент были опробованы следующие методы:
* SVM с различными вариантами ядер — [ноутбук](https://github.com/katyalatypova/Methods-of-artificial-intelligence-in-data-analysis-2020/blob/master/notebooks/SVM.ipynb)
* RandomForest — [ноутбук](https://github.com/katyalatypova/Methods-of-artificial-intelligence-in-data-analysis-2020/blob/master/notebooks/RandomForest.ipynb)
* GradientBoosting (xgboost, catboost) — [ноутбук](https://github.com/katyalatypova/Methods-of-artificial-intelligence-in-data-analysis-2020/blob/master/notebooks/Gradient%20Boosting.ipynb)


Для сравнения моделей между собой в этом ноутбуке были выбраны лучшие модели, полученные в каждом семействе моделей, исследованных в отдельных ноутбуках.

# Загрузка данных и обучение моделей

In [50]:
from sklearn.model_selection import train_test_split

In [52]:
# Read the preprocessed dataset; the file is ';'-separated and its first
# column is used as the row index.
data = pd.read_csv(
    '../data/processed/bank-additional-prepared.csv',
    index_col=0,
    delimiter=';',
)

# Split the frame into the feature matrix and the `y_yes` target column.
X = data.drop(columns='y_yes').copy()
y = data['y_yes'].copy()

# Hold out 33% of the rows for testing; the split is stratified on `y`
# and seeded so that it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [61]:
from sklearn.svm import SVC

# Linear-kernel SVM with balanced class weights.
svm = SVC(kernel='linear',
          class_weight='balanced',
          C=0.51,
          probability=True,
          # With probability=True the estimator shuffles the data for an
          # internal cross-validation; pin the seed (same value as the
          # train/test split) so probability estimates are reproducible
          # after a kernel restart.
          random_state=42)

svm.fit(X_train, y_train)

# Hard class predictions on both splits; they feed the metric tables.
y_pred_train_svm = svm.predict(X_train)
y_pred_test_svm = svm.predict(X_test)

In [62]:
# CatBoost gradient-boosting classifier with fixed hyperparameters.
ctb_classifier = ctb.CatBoostClassifier(
    # objective and the metric tracked during training
    loss_function='Logloss',
    eval_metric='F1',
    # class-imbalance handling
    auto_class_weights='SqrtBalanced',
    # ensemble size, tree depth, step size and regularisation
    iterations=150,
    depth=4,
    learning_rate=0.04,
    l2_leaf_reg=5.0,
    # no training log output
    logging_level='Silent',
)
ctb_classifier.fit(X_train, y_train)

# Class-label predictions for the train and the test split, in that order.
y_pred_train_ctb, y_pred_test_ctb = (
    ctb_classifier.predict(split) for split in (X_train, X_test)
)

# Результаты экспериментов

In [64]:
def _scorer(score_fn, **fixed_kwargs):
    """Wrap ``score_fn`` into a ``(y_true, y_pred)`` callable with ``fixed_kwargs`` bound."""
    def score(y_true, y_pred):
        return score_fn(y_true, y_pred, **fixed_kwargs)
    return score


# Metric name -> callable(y_true, y_pred) returning a single score.
# The "-0" / "-1" variants treat class 0 / class 1 as the positive label.
metrics_to_eval = {
    'Accuracy': _scorer(metrics.accuracy_score),
    'Balanced Accuracy': _scorer(metrics.balanced_accuracy_score),
    'Precision-0': _scorer(metrics.precision_score, pos_label=0),
    'Precision-1': _scorer(metrics.precision_score, pos_label=1),
    'Recall-0': _scorer(metrics.recall_score, pos_label=0),
    'Recall-1': _scorer(metrics.recall_score, pos_label=1),
    'F1-score-0': _scorer(metrics.f1_score, pos_label=0),
    'F1-score-1': _scorer(metrics.f1_score, pos_label=1),
    'F1-macro': _scorer(metrics.f1_score, average='macro'),
}

# Model name -> [(y_true, y_pred) on train, (y_true, y_pred) on test].
methods = {
    'SVM': [
        (y_train, y_pred_train_svm),
        (y_test, y_pred_test_svm),
    ],
    'GradientBoosting': [
        (y_train, y_pred_train_ctb),
        (y_test, y_pred_test_ctb),
    ],
}

## Сравним модели на train

In [65]:
# Empty (all-NaN) results table for the train split:
# one row per metric, one column per model family.
metrics_train = pd.DataFrame(
    index=[
        'Accuracy',
        'Balanced Accuracy',
        'AUC',
        'Precision-0',
        'Precision-1',
        'Recall-0',
        'Recall-1',
        'F1-score-0',
        'F1-score-1',
        'F1-macro',
    ],
    columns=['SVM', 'RandomForest', 'GradientBoosting'],
)

In [78]:
# Fill the train table: score every (metric, model) pair on the train split,
# which is stored at position 0 of each `methods[...]` list.
# Rows/columns that have no entry in `metrics_to_eval` / `methods`
# (the 'AUC' row and the 'RandomForest' column) are left as NaN.
for metric, metric_func in metrics_to_eval.items():
    for method, predictions in methods.items():
        # Built-in round() handles both numpy scalars and plain Python floats;
        # calling `.round(2)` on the score raises AttributeError when the
        # metric returns a plain float.
        metrics_train.loc[metric, method] = round(metric_func(*predictions[0]), 2)

In [79]:
# `+ 1` accounts for the 'AUC' row, which is in the table index but has no
# entry in `metrics_to_eval`.
metrics_train.head(n=len(metrics_to_eval) + 1)

Unnamed: 0,SVM,RandomForest,GradientBoosting
Accuracy,0.84,,0.89
Balanced Accuracy,0.75,,0.74
AUC,,,
Precision-0,0.95,,0.94
Precision-1,0.37,,0.52
Recall-0,0.86,,0.94
Recall-1,0.64,,0.55
F1-score-0,0.91,,0.94
F1-score-1,0.47,,0.53
F1-macro,0.69,,0.74


## Сравним модели на test

In [68]:
# Empty (all-NaN) results table for the test split:
# one row per metric, one column per model family.
metrics_test = pd.DataFrame(
    index=[
        'Accuracy',
        'Balanced Accuracy',
        'AUC',
        'Precision-0',
        'Precision-1',
        'Recall-0',
        'Recall-1',
        'F1-score-0',
        'F1-score-1',
        'F1-macro',
    ],
    columns=['SVM', 'RandomForest', 'GradientBoosting'],
)

In [76]:
# Fill the test table: score every (metric, model) pair on the test split,
# which is stored at position 1 of each `methods[...]` list.
# Rows/columns that have no entry in `metrics_to_eval` / `methods`
# (the 'AUC' row and the 'RandomForest' column) are left as NaN.
for metric, metric_func in metrics_to_eval.items():
    for method, predictions in methods.items():
        # Built-in round() handles both numpy scalars and plain Python floats;
        # calling `.round(2)` on the score raises AttributeError when the
        # metric returns a plain float.
        metrics_test.loc[metric, method] = round(metric_func(*predictions[1]), 2)

In [77]:
# `+ 1` accounts for the 'AUC' row, which is in the table index but has no
# entry in `metrics_to_eval`.
metrics_test.head(n=len(metrics_to_eval) + 1)

Unnamed: 0,SVM,RandomForest,GradientBoosting
Accuracy,0.81,,0.88
Balanced Accuracy,0.71,,0.7
AUC,,,
Precision-0,0.94,,0.93
Precision-1,0.31,,0.45
Recall-0,0.84,,0.93
Recall-1,0.58,,0.48
F1-score-0,0.89,,0.93
F1-score-1,0.41,,0.46
F1-macro,0.65,,0.7
