In [32]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
     ---------------------------------------- 0.0/100.3 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/100.3 kB ? eta -:--:--
     ------------------------------------ 100.3/100.3 kB 968.0 kB/s eta 0:00:00
Collecting pyaml>=16.9
  Downloading pyaml-23.12.0-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.12.0 scikit-optimize-0.9.0



[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [116]:
import pandas as pd
import numpy as np
import random
import os
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
seed_everything()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from skopt import BayesSearchCV

import warnings
warnings.filterwarnings('ignore')
from bayes_opt import BayesianOptimization

In [97]:
train = pd.read_csv('preprocessed_csv/train_preprocessed.csv')
test = pd.read_csv('preprocessed_csv/test_preprocessed.csv')

In [98]:
train[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']] = train[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']].replace(np.nan, 0)
test[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']] = test[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']].replace(np.nan, 0)

In [100]:
def encode_categorical_variables(train, test):
    categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
    
    label_encoders = {}
    
    for col in categorical_columns:
        le = LabelEncoder()
        le = le.fit(train[col])
        train[col] = le.transform(train[col])
        
        for label in np.unique(test[col]):
            if label not in le.classes_:
                le.classes_ = np.append(le.classes_, label)
        test[col] = le.transform(test[col])
    
    return train, test

train, test = encode_categorical_variables(train, test)

In [102]:
X = train.drop(columns='is_converted', axis=1)
y = train['is_converted']

# RandomForestClassifier

In [123]:
def stratified_kfold_score(clf, X, y, n_fold):
    X, y = X.values, y.values
    strat_kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
    roc_auc_list = []

    for train_index, test_index in strat_kfold.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        clf.fit(x_train_fold, y_train_fold)
        preds = clf.predict(x_test_fold)
        roc_auc_test = roc_auc_score(y_test_fold, preds)
        roc_auc_list.append(roc_auc_test)

    return np.mean(roc_auc_list)

In [124]:
def bo_params_rf(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    params = {
        'n_estimators' : int(n_estimators),
        'max_depth' : int(max_depth),
        'min_samples_split' : int(min_samples_split),
        'min_samples_leaf' : int(min_samples_leaf),
        'random_state': 42
    }
    clf = DecisionTreeClassifier(**params)
    return stratified_kfold_score(clf, X, y, 5)

In [126]:
rf_bo = BayesianOptimization(
    f=bo_params_rf,
    pbounds={
        'n_estimators': (100, 1000),
        'max_depth': (10, 100),
        'min_samples_split': (2, 100),
        'min_samples_leaf': (1, 50)
    },
    random_state=42,
)

rf_bo.maximize(
    n_iter=50,
    init_points=2,
)

print(rf_bo.max)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.9761   [0m | [0m43.71    [0m | [0m47.59    [0m | [0m73.74    [0m | [0m638.8    [0m |
| [95m2        [0m | [95m0.9817   [0m | [95m24.04    [0m | [95m8.644    [0m | [95m7.692    [0m | [95m879.6    [0m |
| [0m3        [0m | [0m0.9806   [0m | [0m24.02    [0m | [0m14.21    [0m | [0m11.79    [0m | [0m878.6    [0m |
| [95m4        [0m | [95m0.985    [0m | [95m26.45    [0m | [95m1.937    [0m | [95m14.74    [0m | [95m902.4    [0m |
| [0m5        [0m | [0m0.9818   [0m | [0m10.4     [0m | [0m2.978    [0m | [0m47.71    [0m | [0m416.9    [0m |
| [0m6        [0m | [0m0.9767   [0m | [0m97.37    [0m | [0m35.77    [0m | [0m99.4     [0m | [0m436.5    [0m |
| [0m7        [0m | [0m0.9839   [0m | [0m24.41    [0m | [0m1.121    [0m | [0m19.32    [0m | [0m90

# XGB

In [130]:
def stratified_kfold_score(xgb, X, y, n_fold):
    X, y = X.values, y.values
    strat_kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
    roc_auc_list = []

    for train_index, test_index in strat_kfold.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        xgb.fit(x_train_fold, y_train_fold)
        preds = xgb.predict(x_test_fold)
        roc_auc_test = roc_auc_score(y_test_fold, preds)
        roc_auc_list.append(roc_auc_test)

    return np.mean(roc_auc_list)

In [131]:
def bo_params_xgb(n_estimators, max_depth, learning_rate, min_child_weight, subsample, colsample_bytree, gamma):
    params = {
        'n_estimators' : int(n_estimators),
        'max_depth' : int(max_depth),
        'learning_rate' : learning_rate,
        'min_child_weight' : int(min_child_weight),
        'subsample' : subsample,
        'colsample_bytree' : colsample_bytree,
        'gamma' : gamma,
        'random_state': 42
    }
    xgb = XGBClassifier(**params)
    return stratified_kfold_score(xgb, X, y, 5)

In [132]:
xgb_bo = BayesianOptimization(
    f=bo_params_xgb,
    pbounds={
        'n_estimators': (100, 1000),
        'max_depth': (3, 10),
        'learning_rate' : (0.01, 0.3),
        'min_child_weight' : (1, 10),
        'subsample' : (0.5, 1.0),
        'colsample_bytree' : (0.5, 1.0),
        'gamma' : (0, 5)
    },
    random_state=42,
)

xgb_bo.maximize(
    n_iter=50,
    init_points=2,
)

print(xgb_bo.max)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9885   [0m | [0m0.6873   [0m | [0m4.754    [0m | [0m0.2223   [0m | [0m7.191    [0m | [0m2.404    [0m | [0m240.4    [0m | [0m0.529    [0m |
| [0m2        [0m | [0m0.9883   [0m | [0m0.9331   [0m | [0m3.006    [0m | [0m0.2153   [0m | [0m3.144    [0m | [0m9.729    [0m | [0m849.2    [0m | [0m0.6062   [0m |
| [0m3        [0m | [0m0.9884   [0m | [0m0.9813   [0m | [0m4.18     [0m | [0m0.2118   [0m | [0m5.863    [0m | [0m2.56     [0m | [0m240.8    [0m | [0m0.6251   [0m |
| [0m4        [0m | [0m0.9862   [0m | [0m0.7316   [0m | [0m4.132    [0m | [0m0.01213  [0m | [0m7.498    [0m | [0m3.51     [0m | [0m240.7    [0m | [0m0.8691   [0m |
| [0m5        [0m | [0m0.988    [0m | [0m0.7128

| [0m45       [0m | [0m0.9863   [0m | [0m0.7275   [0m | [0m3.345    [0m | [0m0.03186  [0m | [0m8.745    [0m | [0m2.843    [0m | [0m101.5    [0m | [0m0.6912   [0m |
| [0m46       [0m | [0m0.9894   [0m | [0m0.8935   [0m | [0m4.451    [0m | [0m0.1475   [0m | [0m6.713    [0m | [0m1.595    [0m | [0m239.9    [0m | [0m0.846    [0m |
| [0m47       [0m | [0m0.9888   [0m | [0m0.9163   [0m | [0m4.467    [0m | [0m0.2813   [0m | [0m4.383    [0m | [0m1.977    [0m | [0m240.6    [0m | [0m0.6545   [0m |
| [0m48       [0m | [0m0.9891   [0m | [0m0.7652   [0m | [0m3.152    [0m | [0m0.297    [0m | [0m4.895    [0m | [0m6.994    [0m | [0m844.5    [0m | [0m0.795    [0m |
| [0m49       [0m | [0m0.9893   [0m | [0m0.854    [0m | [0m3.922    [0m | [0m0.1702   [0m | [0m6.744    [0m | [0m6.088    [0m | [0m586.3    [0m | [0m0.8601   [0m |
| [0m50       [0m | [0m0.989    [0m | [0m0.9675   [0m | [0m4.783    [0m | [0m0.13