***Пример подбора оптимальных гиперпараметров для ансамблевых методов при помощи байесовской оптимизации
на данных о сердечно-сосудистых заболеваниях***

In [31]:
import os
import warnings
import pprint
from time import time
from datetime import datetime

# Files
import pickle
import pandas as pd
import numpy as np

from sklearn.metrics import (roc_auc_score, recall_score,
                             f1_score, precision_score)
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import category_encoders as ce

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV
from skopt.callbacks import VerboseCallback, DeadlineStopper


In [34]:
# Load the cardio dataset (semicolon-separated CSV hosted on GitHub).
DATA_URL = 'https://raw.githubusercontent.com/iakubovskii7/DataScience/main/DataAnalysis/Data/cardio.csv'
df = pd.read_csv(DATA_URL, sep=';')
# The "id" column is not used as a feature.
df = df.drop(columns="id")

# "cardio" is the target; everything else is a feature.
# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="cardio"), df['cardio'],
    test_size=0.3, random_state=17)
# Sanity-check the resulting shapes.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(49000, 11) (49000,) (21000, 11) (21000,)


In [33]:
# Number of distinct values in each training feature column.
X_train.nunique()

age            7791
gender            2
height          101
weight          251
ap_hi           146
ap_lo           139
cholesterol       3
gluc              3
smoke             2
alco              2
active            2
dtype: int64

In [17]:
# TRANSFORMATION PIPELINE

# Columns that are one-hot encoded instead of being scaled.
categorical_features = ['cholesterol',
                        'gluc',
                        'smoke',
                        'alco',
                        'active',
                        'gender',
                        ]
# Cast all categorical columns in one non-mutating call. X_train / X_test are
# produced by train_test_split, and assigning into such frames column by
# column can raise pandas' SettingWithCopyWarning.
category_dtypes = {col: "category" for col in categorical_features}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

# Every column that is not listed as categorical is treated as numeric.
numeric_features = [col for col in X_train.columns if col not in categorical_features]

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_std=True, with_mean=True))])

# Categorical branch: one-hot encoding (use_cat_names=True).
categorical_transformer = ce.OneHotEncoder(use_cat_names=True)
# Alternative encoders kept for experimentation:
# categorical_transformer = OneHotEncoder(drop='if_binary')
# categorical_transformer = DataFrameOneHotEncoder(col_overrule_params={"in_app_purchase":{"drop":"first"}})
# categorical_transformer = ce.GLMMEncoder()
# categorical_transformer = ce.CatBoostEncoder()

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [40]:
# BAYESIAN VALIDATION

def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Fit a hyperparameter search, print a timing/score summary and return the result.

    optimizer = a sklearn or a skopt optimizer exposing fit(), cv_results_,
                best_score_, best_index_ and best_params_
    X = the training set
    y = our target
    title = a string label for the experiment (printed verbatim)
    callbacks = optional callable or list of callables forwarded as
                optimizer.fit(..., callback=callbacks); when falsy, fit()
                is called without the callback argument

    Returns a tuple (best_params, optimizer) where optimizer is the fitted
    object that was passed in.
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    # Stop the clock right after fitting so reporting overhead is not counted.
    elapsed = time() - start

    results = optimizer.cv_results_
    best_score = optimizer.best_score_
    best_score_std = results['std_test_score'][optimizer.best_index_]
    best_params = optimizer.best_params_
    # The title is passed as a %s argument rather than concatenated into the
    # format string, so a label containing '%' cannot break the formatting.
    print("%s took %.2f seconds,  candidates checked: %d, best CV score: %.3f \u00B1 %.3f"
          % (title,
             elapsed,
             len(results['params']),
             best_score,
             best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    return best_params, optimizer


# Define scoring function: recall computed on predicted labels.
# make_scorer's defaults already mean "greater is better, use predict()".
# The `needs_threshold` keyword is deprecated in recent scikit-learn releases,
# so it is deliberately not passed.
recall_opt = make_scorer(recall_score)

# Cross-validation: 5 stratified folds, shared by every search below.
skf = StratifiedKFold(n_splits=5)

# Random forest: preprocessing and classifier combined in a single pipeline.
pipe_forest = Pipeline([('scl', preprocessor),
                        ('clf', RandomForestClassifier(n_jobs=-1, oob_score=False)),
                        ])
# Search space for the Bayesian search; the "clf__" prefix routes each
# parameter to the 'clf' step of the pipeline.
forest_search_spaces = {'clf__max_depth': Integer(3, 15),
                        'clf__n_estimators': Integer(100, 1000),
                        'clf__max_features': ['sqrt', 'log2'],
                        'clf__min_samples_leaf': Integer(1, 15),
                        }

# XGBoost: same preprocessing, gradient-boosted classifier.
pipe_xgboost = Pipeline([('scl', preprocessor),
                         ('clf', XGBClassifier(eval_metric="logloss", n_jobs=-1, use_label_encoder=False)),
                         ])
# Keys carry the "clf__" prefix directly so they address the 'clf' step.
xgboost_search_spaces = {'clf__subsample': Real(0.5, 0.9, 'log-uniform'),
                         'clf__max_depth': Integer(1, 5),
                         'clf__colsample_bytree': Real(0.75, 0.9, 'log-uniform'),
                         'clf__colsample_bylevel': Real(0.75, 0.9, 'log-uniform'),
                         'clf__colsample_bynode': Real(0.75, 0.9, 'log-uniform'),
                         'clf__learning_rate': Real(0.01, 0.5, 'log-uniform'),
                         'clf__alpha': Real(0.01, 5, 'log-uniform'),
                         'clf__lambda': Real(0.01, 5, 'log-uniform'),
                         'clf__n_estimators': Integer(100, 300),
                         }

# Подбор гиперпараметров для случайного леса

In [35]:
# Number of parameter settings to evaluate. The same value is given to
# VerboseCallback as its total, so the progress log stops after the last
# evaluation instead of announcing an extra iteration that never runs.
forest_n_iter = 3

# Bayesian search over the random-forest pipeline, scored by recall.
model_param_search = BayesSearchCV(pipe_forest,
                                   forest_search_spaces,
                                   scoring=recall_opt,
                                   cv=skf,
                                   n_iter=forest_n_iter,
                                   n_jobs=1,
                                   return_train_score=True,
                                   refit=True,
                                   random_state=17
                                   )

best_params, optimizer_model = report_perf(model_param_search,
                                           title="cardio",
                                           X=X_train, y=y_train,
                                           callbacks=[VerboseCallback(forest_n_iter),
                                                      # time budget: 10 minutes
                                                      DeadlineStopper(60 * 10)
                                                      ])
# Recall of the refitted best pipeline on the held-out test set.
print(recall_score(y_test, optimizer_model.predict(X_test)))

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 14.4397
Function value obtained: -0.6714
Current minimum: -0.6714
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 6.2953
Function value obtained: -0.6979
Current minimum: -0.6979
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 13.5004
Function value obtained: -0.6760
Current minimum: -0.6979
Iteration No: 4 started. Searching for the next optimal point.
cardio took 35.53 seconds,  candidates checked: 3, best CV score: 0.698 ± 0.007
Best parameters:
OrderedDict([('clf__max_depth', 11),
             ('clf__max_features', 'sqrt'),
             ('clf__min_samples_leaf', 13),
             ('clf__n_estimators', 236)])
0.6923882017126546


# XGBoost

In [41]:
# Number of parameter settings to evaluate. The same value is given to
# VerboseCallback as its total, so the progress log stops after the last
# evaluation instead of announcing an extra iteration that never runs.
xgboost_n_iter = 10

# Bayesian search over the XGBoost pipeline, scored by recall.
model_param_search = BayesSearchCV(pipe_xgboost,
                                   xgboost_search_spaces,
                                   scoring=recall_opt,
                                   cv=skf,
                                   n_iter=xgboost_n_iter,
                                   n_jobs=1,
                                   return_train_score=True,
                                   refit=True,
                                   random_state=17
                                   )

best_params, optimizer_model = report_perf(model_param_search,
                                           title="cardio",
                                           X=X_train, y=y_train,
                                           callbacks=[VerboseCallback(xgboost_n_iter),
                                                      # time budget: 10 minutes
                                                      DeadlineStopper(60 * 10)
                                                      ])
# Recall of the refitted best pipeline on the held-out test set.
print(recall_score(y_test, optimizer_model.predict(X_test)))

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 4.0731
Function value obtained: -0.6797
Current minimum: -0.6797
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 4.6732
Function value obtained: -0.6814
Current minimum: -0.6814
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 3.9285
Function value obtained: -0.6960
Current minimum: -0.6960
Iteration No: 4 started. Searching for the next optimal point.
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 6.3372
Function value obtained: -0.6974
Current minimum: -0.6974
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 2.6316
Function value obtained

In [39]:
# Show the best hyperparameters returned by the most recently executed search.
best_params

OrderedDict([('clf__alpha', 1.380272949988061),
             ('clf__colsample_bylevel', 0.8186574048164795),
             ('clf__colsample_bynode', 0.8622995982444401),
             ('clf__colsample_bytree', 0.8452140717058321),
             ('clf__lambda', 2.746079488438246),
             ('clf__learning_rate', 0.21293969214950273),
             ('clf__max_depth', 4),
             ('clf__n_estimators', 124),
             ('clf__subsample', 0.6593625754470993)])