# Cлучайный лес в реальной задаче

Для этого будем использовать пример с задачей fraud detection. Это задача классификации, поэтому будем использовать метрику accuracy для оценки точности. Для начала построим самый простой классификатор, который будет нашим бейслайном. Возьмем только числовые признаки для упрощения.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Загружаем данные
df = pd.read_csv("/Users/iakubovskii/Machine_Learning/RANEPA/Fintech_2020/Анализ данных/Данные/telecom_churn.csv")

# Выбираем сначала только колонки с числовым типом данных
cols = []
[cols.append(i) for i in df.columns if (df[i].dtype == "float64") or (df[i].dtype == 'int64')]
        
# Разделяем на признаки и объекты
X, y = df[cols], df["Churn"].values

# Инициализируем стратифицированную разбивку нашего датасета для валидации
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Инициализируем наш классификатор с дефолтными параметрами
rfc = RandomForestClassifier(random_state=42, n_jobs=-1, oob_score=True)

# Обучаем на тренировочном датасете
results = cross_val_score(rfc, X, y, cv=skf)

# Оцениваем точность на тестовом датасете
print("CV accuracy score: {:.2f}%".format(results.mean()*100))

CV accuracy score: 92.50%


Получили точность 92.5%, теперь попробуем улучшить этот результат и посмотреть, как ведут себя кривые обучения при изменении
основных параметров.

Начнем с количества деревьев:

In [None]:
import warnings
warnings.filterwarnings("ignore")
# Инициализируем валидацию
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Создаем списки для сохранения точности на тренировочном и тестовом датасете
train_acc = []
test_acc = []
temp_train_acc = []
temp_test_acc = []
trees_grid = [100, 150, 200, 250]

# Обучаем на тренировочном датасете
for ntrees in trees_grid:
    rfc = RandomForestClassifier(n_estimators=ntrees, random_state=42, n_jobs=-1, oob_score=True)
    temp_train_acc = []
    temp_test_acc = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        rfc.fit(X_train, y_train)
        temp_train_acc.append(rfc.score(X_train, y_train))
        temp_test_acc.append(rfc.score(X_test, y_test))
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)
    
train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)
print("Best accuracy on CV is {:.2f}% with {} trees".format(max(test_acc.mean(axis=1))*100, 
                                                        trees_grid[np.argmax(test_acc.mean(axis=1))]))

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(trees_grid, train_acc.mean(axis=1), alpha=0.5, color='blue', label='train')
ax.plot(trees_grid, test_acc.mean(axis=1), alpha=0.5, color='red', label='cv')
ax.fill_between(trees_grid, test_acc.mean(axis=1) - test_acc.std(axis=1), test_acc.mean(axis=1) + test_acc.std(axis=1), color='#888888', alpha=0.4)
ax.fill_between(trees_grid, test_acc.mean(axis=1) - 2*test_acc.std(axis=1), test_acc.mean(axis=1) + 2*test_acc.std(axis=1), color='#888888', alpha=0.2)
ax.legend(loc='best')
ax.set_ylim([0.88,1.02])
ax.set_ylabel("Accuracy")
ax.set_xlabel("N_estimators");

На рисунке видно, что при увеличении количества деревьев, наша точность модели на кросс-валидации немного снижается. Также видим, что на тренировочной выборке мы достигаем 100% точности, это говорит нам о переобучении нашей модели. Чтобы избежать переобучения, мы должны добавить параметры регуляризации в модель.

Начнем с параметра максимальной глубины – `max_depth`. (зафиксируем к-во деревьев 100)

In [None]:
# Создаем списки для сохранения точности на тренировочном и тестовом датасете
train_acc = []
test_acc = []
temp_train_acc = []
temp_test_acc = []
max_depth_grid = [3, 5, 7, 9, 11, 13, 15, 17, 20, 22, 24]

# Обучаем на тренировочном датасете
for max_depth in max_depth_grid:
    rfc = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, oob_score=True, 
                                 max_depth=max_depth)
    temp_train_acc = []
    temp_test_acc = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        rfc.fit(X_train, y_train)
        temp_train_acc.append(rfc.score(X_train, y_train))
        temp_test_acc.append(rfc.score(X_test, y_test))
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)
    
train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)
print("Best accuracy on CV is {:.2f}% with {} max_depth".format(max(test_acc.mean(axis=1))*100, 
                                                        max_depth_grid[np.argmax(test_acc.mean(axis=1))]))

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(max_depth_grid, train_acc.mean(axis=1), alpha=0.5, color='blue', label='train')
ax.plot(max_depth_grid, test_acc.mean(axis=1), alpha=0.5, color='red', label='cv')
ax.fill_between(max_depth_grid, test_acc.mean(axis=1) - test_acc.std(axis=1), test_acc.mean(axis=1) + test_acc.std(axis=1), color='#888888', alpha=0.4)
ax.fill_between(max_depth_grid, test_acc.mean(axis=1) - 2*test_acc.std(axis=1), test_acc.mean(axis=1) + 2*test_acc.std(axis=1), color='#888888', alpha=0.2)
ax.legend(loc='best')
ax.set_ylim([0.88,1.02])
ax.set_ylabel("Accuracy")
ax.set_xlabel("Max_depth");

Параметр `max_depth` хорошо справляется с регуляризацией модели, и мы уже не так сильно переобучаемся. Точность нашей модели немного возросла.

Еще важный параметр `min_samples_leaf`, он так же выполняет функцию регуляризатора.


In [None]:
# Создаем списки для сохранения точности на тренировочном и тестовом датасете
train_acc = []
test_acc = []
temp_train_acc = []
temp_test_acc = []
min_samples_leaf_grid = [1, 3, 5, 7, 9, 11, 13, 15, 17, 20, 22, 24]

# Обучаем на тренировочном датасете
for min_samples_leaf in min_samples_leaf_grid:
    rfc = RandomForestClassifier(n_estimators=100, max_depth=15,
                                 random_state=42, n_jobs=-1, 
                                 oob_score=True, min_samples_leaf=min_samples_leaf)
    temp_train_acc = []
    temp_test_acc = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        rfc.fit(X_train, y_train)
        temp_train_acc.append(rfc.score(X_train, y_train))
        temp_test_acc.append(rfc.score(X_test, y_test))
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)
    
train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)
print("Best accuracy on CV is {:.2f}% with {} min_samples_leaf".format(max(test_acc.mean(axis=1))*100, 
                                                        min_samples_leaf_grid[np.argmax(test_acc.mean(axis=1))]))

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(min_samples_leaf_grid, train_acc.mean(axis=1), alpha=0.5, color='blue', label='train')
ax.plot(min_samples_leaf_grid, test_acc.mean(axis=1), alpha=0.5, color='red', label='cv')
ax.fill_between(min_samples_leaf_grid, test_acc.mean(axis=1) - test_acc.std(axis=1), test_acc.mean(axis=1) + test_acc.std(axis=1), color='#888888', alpha=0.4)
ax.fill_between(min_samples_leaf_grid, test_acc.mean(axis=1) - 2*test_acc.std(axis=1), test_acc.mean(axis=1) + 2*test_acc.std(axis=1), color='#888888', alpha=0.2)
ax.legend(loc='best')
ax.set_ylim([0.88,1.02])
ax.set_ylabel("Accuracy")
ax.set_xlabel("Min_samples_leaf");

В данном случае мы не выигрываем в точности на валидации, но зато можем сильно уменьшить переобучение до 2% при сохранении точности около 90%.

Рассмотрим такой параметр как `max_features`. Для задач классификации по умолчанию используется $\large \sqrt{n}$, где n — число признаков. Давайте проверим, оптимально ли в нашем случае использовать 4 признака или нет.


In [None]:
# Создаем списки для сохранения точности на тренировочном и тестовом датасете
train_acc = []
test_acc = []
temp_train_acc = []
temp_test_acc = []
max_features_grid = [2, 4, 6, 8, 10, 12, 14, 16]

# Обучаем на тренировочном датасете
for max_features in max_features_grid:
    rfc = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, 
                                 oob_score=True, max_features=max_features)
    temp_train_acc = []
    temp_test_acc = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        rfc.fit(X_train, y_train)
        temp_train_acc.append(rfc.score(X_train, y_train))
        temp_test_acc.append(rfc.score(X_test, y_test))
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)
    
train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)
print("Best accuracy on CV is {:.2f}% with {} max_features".format(max(test_acc.mean(axis=1))*100, 
                                                        max_features_grid[np.argmax(test_acc.mean(axis=1))]))

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(max_features_grid, train_acc.mean(axis=1), alpha=0.5, color='blue', label='train')
ax.plot(max_features_grid, test_acc.mean(axis=1), alpha=0.5, color='red', label='cv')
ax.fill_between(max_features_grid, test_acc.mean(axis=1) - test_acc.std(axis=1), test_acc.mean(axis=1) + test_acc.std(axis=1), color='#888888', alpha=0.4)
ax.fill_between(max_features_grid, test_acc.mean(axis=1) - 2*test_acc.std(axis=1), test_acc.mean(axis=1) + 2*test_acc.std(axis=1), color='#888888', alpha=0.2)
ax.legend(loc='best')
ax.set_ylim([0.88,1.02])
ax.set_ylabel("Accuracy")
ax.set_xlabel("Max_features");

В нашем случае оптимальное число признаков — 4, именно с таким значением достигается наилучший результат.

Мы рассмотрели, как ведут себя кривые обучения в зависимости от изменения основных параметров. Давайте теперь с помощью `GridSearch` найдем оптимальные параметры для нашего примера.

In [None]:
# Сделаем инициализацию параметров, по которым хотим сделать полный перебор
parameters = {'max_features': [4, 7, 10, 13], 
              'min_samples_leaf': [1, 3, 5, 7], 
              'max_depth': [5,10,15,20]}
rfc = RandomForestClassifier(n_estimators=100, random_state=42, 
                             n_jobs=-1, oob_score=True)
gcv = GridSearchCV(rfc, parameters, n_jobs=-1, cv=skf, verbose=1)
gcv.fit(X, y)

gcv.best_estimator_, gcv.best_score_

In [None]:
# Важность признаков
rf_best = RandomForestClassifier(n_estimators=100, random_state=42, 
                             n_jobs=-1, oob_score=True,
                                max_features = 10, max_depth=20, min_samples_leaf=3)
rf_best.fit(X, y)

pd.DataFrame(dict(zip(X.columns, rf_best.feature_importances_)), 
             index=[0]).T.sort_values(0, ascending=False)

# Бустинг

## CatBoost

In [None]:
# CatBoost classifier

from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

X, y = load_wine()['data'], load_wine()['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

feature_wine = load_wine()['feature_names']
print(pd.DataFrame(X_train, columns = feature_wine).nunique())
print("Categorical features are absent in this dataset")

from catboost import CatBoostClassifier, Pool

# initialize Pool
train_pool = Pool(X_train, 
                  label = y_train, 
                  cat_features=None)

test_pool = Pool(X_test, 
                 label = y_test,
                 cat_features=None)

model = CatBoostClassifier(iterations=5,
                           depth=5,
                           learning_rate=0.3,
                           loss_function='MultiClass',
                           verbose=True)
# train the model
model.fit(train_pool)
# make the prediction using the resulting model
preds_class = model.predict(test_pool)
preds_proba = model.predict_proba(test_pool)
print("class = ", preds_class[:5])
print("proba = \n", preds_proba[:5])
print(confusion_matrix(y_test, preds_class))

In [None]:
# CatBoost regressor 
from sklearn.metrics import r2_score
from sklearn.datasets import fetch_california_housing
from catboost import CatBoostRegressor
X, y = fetch_california_housing()['data'], fetch_california_housing()['target']

# initialize data (california house prices)
feature_names = fetch_california_housing()['feature_names']
print(f"features are : {feature_names} \n")

df = pd.DataFrame(X, columns = feature_names)
print(df.nunique())
print("Categorical variables are absent in this dataset")


X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X,y, test_size = 0.2, random_state = 17) 

X_labels = np.arange(0, X_train.shape[0])

# initialize Pool
train_pool_reg = Pool(X_train_reg, 
                  y_train_reg, 
                  cat_features=None)

test_pool_reg = Pool(X_test_reg, 
                 cat_features=None)

model = CatBoostRegressor(iterations=500, 
                          depth=6, 
                          learning_rate=0.2, 
                          loss_function='RMSE')
#train the model
model.fit(train_pool_reg)
# make the prediction using the resulting model
preds_reg = model.predict(test_pool_reg)
print(r2_score(y_test_reg, preds_reg))

## LightGB

In [None]:
import lightgbm
train_data_lgb = lightgbm.Dataset(X_train_reg, label=y_train_reg, categorical_feature=None)
test_data_lgb = lightgbm.Dataset(X_test_reg, label=y_test_reg)

parameters = {
    'metric': 'MAE',
    'boosting': 'gbdt',
    'num_leaves': 50,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 1
}

model = lightgbm.train(parameters,
                       train_data_lgb,
                       valid_sets=test_data_lgb,
                       num_boost_round=500,
                       early_stopping_rounds=35)
preds_reg_lgbm = model.predict(X_test_reg)
print(f"R squared on test sample = {r2_score(y_test_reg, preds_reg_lgbm)}")

## XGBoost

In [None]:
from xgboost import XGBRegressor
my_model = XGBRegressor(max_depth=4, 
                        random_state=17, 
                        reg_lambda=25,
                        reg_alpha=14,
                        n_estimators=250,
                        learning_rate=0.35)
# Add silent=True to avoid printing out updates with each cycle
my_model.fit(X_train_reg, y_train_reg, verbose=False)

preds_reg_xgboost = my_model.predict(X_test_reg)
print(f"R squared on test sample = {r2_score(y_test_reg, preds_reg_xgboost)}")

# Классификация текстов при помощи эмбеддингов

In [3]:
import datetime
import os
import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords

# nltk.download("punkt")
# nltk.download('stopwords')

VECTOR_SIZE = 50 # (мы можем увеличить до 300 и более, но здесь ограничимся 50 для экономии времени обучения)

## 3.1 Train w2vec model

KING - QUEEN = MAN - WOMAN
![](https://i1.wp.com/www.lifestyletrading101.com/wp-content/uploads/2017/03/word2-vec-king-queen.png?resize=498%2C505)

In [4]:
project_path = "/Users/iakubovskii/Machine_Learning/Datasets/Amazon_food"
os.chdir(project_path)
with open("unlabeled_150k.csv", encoding="utf8") as file:
    text_df = pd.read_csv(file)

In [5]:
def clean_and_tokenize_text(text_df):
    stop_words = stopwords.words('english')
    text_list = [[x.lower() for x in nltk.word_tokenize(x) if x not in stop_words and x.isalnum()] for x in
                 text_df['Text']]
    return text_list
text_df_cleaned = clean_and_tokenize_text(text_df)

In [6]:
model = gensim.models.Word2Vec(text_df_cleaned,
                               window=50, # максимальное расстояние между текущим и предсказанным словом внутри предложения
                               vector_size=VECTOR_SIZE,
                               epochs=5, # количество итераций по корпусу
                               min_count=3, # игнорирует слова, которые встречаются реже, чем 3 раза
                               workers=3) # количество потоков
model.save("model1")

In [7]:
def get_vector(word, model):
    """Трансформируем каждое слово в вектор"""
    try:
        return model.wv.get_vector(word)
    except:
        return np.zeros(VECTOR_SIZE)
def calculate_avg_vectors(text_list, model):
    """Считаем средний вектор по всему документу с учетом значений векторов для каждого слова"""
    vectors = []
    for word in text_list:
        vectors.append(get_vector(word, model))
    if len(vectors) == 0:
        return np.zeros(VECTOR_SIZE)
    vec_avg = np.mean(vectors, axis=0)
    return vec_avg
def create_word_embeddings(text_list, model):
    """Создаем датафрейм с эмбеддингами (векторами документов)"""
    vectors = [calculate_avg_vectors(x, model) for x in text_list]
    vectors_df = pd.DataFrame(vectors).apply(pd.Series).reset_index()
    vectors_df.drop("index", axis=1, inplace=True)
    return vectors_df

## 3.2 Train w2v model

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# %%time
with open("train_40k.csv", encoding="utf8") as file:
    text_df_train = pd.read_csv(file)
text_df_train_cleaned = clean_and_tokenize_text(text_df_train)
embeddings_train = create_word_embeddings(text_df_train_cleaned, model)
target = np.where(text_df_train['Score'] > 3, 1, 0)
X_train, X_test, y_train, y_test = train_test_split(embeddings_train, target,
                                                            stratify=target,
                                                            test_size=0.3)

In [60]:
# Пример со схожестью отзывов (находим косинусную схожесть между векторами)
from sklearn.metrics.pairwise import cosine_similarity
review_vector_1 = embeddings_train.iloc[200].values
def get_similariry(vector):
    return cosine_similarity(review_vector_1.reshape(1,-1), vector.reshape(1,-1))
cosine_matrix = np.apply_along_axis(get_similariry, 1, embeddings_train.values)
pd.Series(cosine_matrix[:, 0, 0]).sort_values(ascending=False)

In [91]:
text_df_train.iloc[200]['Text']

"I've used Aussie shampoos and conditioners in the past, but hadn't tried their hairspray. It holds well without being sticky and has a pleasant aroma. I priced it and that's the only thing I can find negative about this particular product. I can purchase a similar product at a lower cost."

In [62]:
text_df_train.iloc[28314]['Text']

'Over the years I have tried numerous deodorant and antiperspirant products in an effort to strike a balance between reducing sweat and also not irritating my skin. Virtually no products you find in drug stores fit the bill; I have tried them. I have been using a combination of a spray-on and a hypoallergenic mineral deodorant for a few years, but this winter that combo was just not handling the sweat (and I am not an unnaturally sweaty person). I tried Pit Boss after doing a bit of research and reading customer reviews. This product is EXCELLENT and has surpassed my expectations. I can wear it daily, it has a pleasant smell but is not overpowering like many spray-on products. The only minor gripe is the tackiness on skin and hair, which you get from any antiperspirant. But after a month+ of use it has not irritated my skin once, which is more than I can say for even most non-antiperspirant deodorants I have used. At $12-15 a pop it is not cheap, but I find it well worth the extra cost

In [63]:
text_df_train.iloc[22704]['Text']

'I now know why this product is less expensive than that found in department stores. I have used this product for several years and know the potentcy should be stronger than that in the product I received from Amazon. This cologne is much much weaker and almost no fragrance within a few hours. I truly feel that this is watered down and will not buy again. Usually I have great satisfactiion with product purchased from Amazon but his product is inferior and not the real thing.Bob LepperSpringstead, Wi'

In [67]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score
lgbm_w2v = LGBMClassifier(boosting_type='gbdt',
                                    metric='binary_loglass',
                                    max_depth=20,
                                    n_jobs=3,
                                    num_leaves=25,
                                    n_estimators=80,
                                    reg_alpha=0.3,
                                    reg_lambda=0.5,
                                    is_unbalance=True,
                                    random_state=12345)
lgbm_w2v.fit(X_train, y_train)
predictions_w2v = lgbm_w2v.predict(X_test)
predictions_w2v_proba = lgbm_w2v.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, predictions_w2v_proba)
f_score = f1_score(y_test, predictions_w2v, average='weighted')

print(f"ROC-AUC score = {auc_score}, F_score = {f_score}")

ROC-AUC score = 0.7913840368019018, F_score = 0.8074137235923218


## 3.3 Predict new data

In [68]:
with open("val_10k.csv", encoding="utf8") as file:
    text_df_validated = pd.read_csv(file)
    
text_df_validated_cleaned = clean_and_tokenize_text(text_df_validated)
embeddings_validate = create_word_embeddings(text_df_validated_cleaned, model)
y_validated = np.where(text_df_validated['Score'] > 3, 1, 0)

predictions_w2v_validated = lgbm_w2v.predict(embeddings_validate)
predictions_w2v_validated_proba = lgbm_w2v.predict_proba(embeddings_validate)[:, 1]
auc_score_validated = roc_auc_score(y_validated, predictions_w2v_validated)
f_score_validated = f1_score(y_validated, predictions_w2v_validated, average='weighted')

In [69]:
print(f"ROC-AUC score validated = {auc_score_validated}, F_score validated = {f_score_validated}")

ROC-AUC score validated = 0.7832728291979412, F_score validated = 0.8109887859697004


## 3.4 Байесовская оптимизация гиперпараметров 

In [90]:
from time import time
import pprint
from sklearn.metrics import make_scorer
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback
from skopt.space import Real, Integer
roc_auc_opt = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

# Defining your search space
search_spaces = {"num_leaves": Integer(10,100),
                 'max_depth': Integer(5, 30),
                 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
                 'n_estimators': Integer(100, 200)
                 }
lgbm_clf = LGBMClassifier(boosting_type = "gbdt", objective='binary', n_jobs=-1)
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 17)
# Setting up BayesSearchCV
opt = BayesSearchCV(lgbm_clf,
                    search_spaces,
                    scoring=roc_auc_opt,
                    cv=skf,
                    n_iter=30,
                    n_jobs=-1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    random_state=42)
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Функция для оценки времени и параметров оптимизации
    
    optimizer = skopt объект
    X = признаки
    y = целевая переменная
    title = название эксперимента
    """
    start = time()
    if callbacks:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)
    d=pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = d.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_
    print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
           +u"\u00B1"+" %.3f") % (time() - start, 
                                  len(optimizer.cv_results_['params']),
                                  best_score,
                                  best_score_std))    
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

best_params = report_perf(opt, X_train, y_train,'LGBM', 
                          callbacks=[VerboseCallback(100), 
                                     DeadlineStopper(60*10)])

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 1.9641
Function value obtained: -0.8425
Current minimum: -0.8425
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 0.7094
Function value obtained: -0.8389
Current minimum: -0.8425
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 1.1598
Function value obtained: -0.8431
Current minimum: -0.8431
Iteration No: 4 started. Searching for the next optimal point.
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 0.7632
Function value obtained: -0.8399
Current minimum: -0.8431
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 0.8462
Function value obtained

In [100]:
best_params

OrderedDict([('learning_rate', 0.2406383605989789),
             ('max_depth', 22),
             ('n_estimators', 200),
             ('num_leaves', 58)])

In [101]:
lgbm_best_params = LGBMClassifier(learning_rate=best_params['learning_rate'],
                                  max_depth=best_params['max_depth'],
                                  n_estimators=best_params['n_estimators'],
                                 num_leaves=best_params['num_leaves']).fit(X_train, y_train)
predictions_w2v_validated = lgbm_best_params.predict(embeddings_validate)
predictions_w2v_validated_proba = lgbm_best_params.predict_proba(embeddings_validate)[:, 1]
auc_score_validated = roc_auc_score(y_validated, predictions_w2v_validated_proba)
f_score_validated = f1_score(y_validated, predictions_w2v_validated, average='weighted')
print(f"ROC-AUC score validated = {auc_score_validated}, F_score validated = {f_score_validated}")

ROC-AUC score validated = 0.7269914961219484, F_score validated = 0.8392017153649144


Возможна ошибка из-за несовместимости пакета scikit-optimize c новой версией sklearn 0.24. Чтобы успешно юзать
байесовскую оптимизацию, нам нужно откатить sklearn до версии 0.23.4. Чтобы откатиться на старую версию, в терминале введите следующее:

pip install scikit-learn==0.23.4

In [86]:
import sklearn
sklearn.__version__

'0.23.1'