# ✏️ Basic Setting

## 🔎 Importing Libraries

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
from sklearn.utils.class_weight import compute_class_weight
import warnings
import datetime

In [None]:
# Disabled setup cell: the shell/magic commands below are wrapped in a string
# literal, so running this cell executes none of them (it only echoes the text).
# The commands clone LightGBM, configure it with -DUSE_GPU=1, build it and
# install the Python package -- presumably what device='gpu' needs on Colab.
# To actually run them, remove the surrounding triple quotes.
'''%cd /content/
#lgbm with gpu on colab
!git clone https://github.com/Microsoft/LightGBM
%cd /content/LightGBM
!mkdir build
!cmake -DUSE_GPU=1
!make -j$(nproc)
!sudo apt-get -y install python-pip
!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
%cd /content/LightGBM/python-package
!sudo python setup.py install'''

'%cd /content/\n#lgbm with gpu on colab\n!git clone https://github.com/Microsoft/LightGBM\n%cd /content/LightGBM\n!mkdir build\n!cmake -DUSE_GPU=1\n!make -j$(nproc)\n!sudo apt-get -y install python-pip\n!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U\n%cd /content/LightGBM/python-package\n!sudo python setup.py install'

In [None]:
# Disabled install cell: the pip command is wrapped in a string literal, so it
# does not run. Remove the quotes to install bayesian-optimization (the package
# behind the `bayes_opt` import used by this notebook).
'''!pip install bayesian-optimization'''

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp36-none-any.whl size=11685 sha256=feebf53c5e87f200968d50262ef01b3e632ee5029a1a0f11f4cfbff5188741b4
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


# ✏️ Hyperparameter tuning (Bayesian Optimization)

## 🔎 Samsung Card version

In [None]:
def LGB_opt(max_depth,
           gamma,
           min_child_weight,
           max_delta_step,
           subsample,
           colsample_bytree,
           learning_rate,
         ):
    """Objective for Bayesian optimisation: fit one LightGBM model, return its Gini.

    The model is trained on the module-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test`` with ``custom_eval`` (metric name
    ``'roc_auc'``). The module-level trackers ``AUCbest`` and ``ITERbest`` are
    updated whenever this call beats the best AUC seen so far.

    Parameters
    ----------
    max_depth : float
        Truncated to int; also sets ``num_leaves = min(2**max_depth, 131072)``.
    gamma, max_delta_step : float
        Accepted so the optimiser's search space matches this signature, but
        not forwarded to LightGBM (they are never used in the body).
    min_child_weight, learning_rate : float
        Forwarded to ``LGBMClassifier`` unchanged.
    subsample, colsample_bytree : float
        Clamped to [0, 1] before being forwarded.

    Returns
    -------
    float
        ``2 * AUC - 1`` (Gini) for the best validation round.

    NOTE(review): the same hold-out set drives early stopping and is the score
    being optimised, so the reported value is optimistic.
    """
    global AUCbest
    global ITERbest

    depth = int(max_depth)

    lgb = LGBMClassifier(class_weight=None,
                         colsample_bytree=max(min(colsample_bytree, 1), 0),
                         learning_rate=learning_rate,
                         max_depth=depth,
                         min_child_weight=min_child_weight,
                         subsample=max(min(subsample, 1), 0),
                         # LightGBM ignores `subsample` while subsample_freq is 0
                         # (its default); bag every iteration so the tuned value
                         # actually takes effect.
                         subsample_freq=1,
                         num_leaves=min(2 ** depth, 131072),
                         n_estimators=20000,
                         n_jobs=-1,
                         device='gpu',
                         random_state=42)

    lgb.fit(X_train, y_train,
            verbose=False,
            early_stopping_rounds=100,
            eval_set=[(X_test, y_test)],
            eval_metric=custom_eval)

    auc_history = lgb.evals_result_['valid_0']['roc_auc']
    # With early stopping the last recorded round is the one where training
    # gave up, not the best one, so score the best round instead of [-1].
    best_round = int(np.argmax(auc_history))
    val_score = auc_history[best_round]
    print(' Stopped after %d iterations with val-auc = %f val-gini = %f' % ( len(auc_history), val_score, (val_score*2-1)) )
    if val_score > AUCbest:
        AUCbest = val_score
        ITERbest = best_round + 1  # 1-based round that achieved the best AUC

    return (val_score * 2) - 1

In [None]:
def custom_eval(y_true, y_pred):
    """Custom LightGBM eval metric: ROC AUC over one-hot encoded labels.

    Keep the signature at exactly ``(y_true, y_pred)``: LightGBM calls custom
    metrics positionally, so extra parameters would receive other arguments.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_pred : array-like
        Per-class scores, either already shaped ``(n_samples, n_classes)`` or
        flattened class-major (all samples of class 0, then class 1, ...),
        which is the layout the reshape below undoes.

    Returns
    -------
    tuple
        ``('roc_auc', score, True)`` -- metric name, value, higher-is-better.

    Raises
    ------
    ValueError
        If a flat ``y_pred`` cannot be split evenly across the samples.
    """
    y_true = np.asarray(y_true)
    scores = np.asarray(y_pred)

    if scores.ndim == 1:
        n_samples = y_true.shape[0]
        if n_samples == 0 or scores.size % n_samples:
            raise ValueError(
                'y_pred of size %d cannot be split across %d samples'
                % (scores.size, n_samples))
        # Derive the class count from the sizes instead of hard-coding it.
        n_classes = scores.size // n_samples
        scores = scores.reshape(n_classes, -1).T

    lb = LabelBinarizer()
    lb.fit(y_true)
    label = lb.transform(y_true)

    if scores.shape[1] == 1:
        # Single score column (binary problem): hand plain 1-D vectors to sklearn.
        scores = scores.ravel()
        label = label.ravel()

    return 'roc_auc', roc_auc_score(label, scores), True

In [None]:
# Search space for LGB_opt. Every key must match a parameter name of LGB_opt,
# which is why 'gamma' and 'max_delta_step' are listed even though LGB_opt does
# not forward them to LightGBM.
LGB_BO = BayesianOptimization(LGB_opt, {
                                    'max_depth': (2, 20),
                                    'gamma': (0.001, 10.0),
                                    'min_child_weight': (0, 20),
                                    'max_delta_step': (0, 10),
                                    'subsample': (0.4, 1.0),
                                    'colsample_bytree' :(0.4, 1.0),
                                    'learning_rate' : (0.01, 0.1)
                                    },
                              # Seed the optimiser so the sampled points (and
                              # therefore the whole search) are reproducible.
                              random_state=42)

In [None]:
# Change into the project's preprocess folder so the relative data path below
# resolves. NOTE(review): this absolute path is environment-specific (looks like
# a Colab Drive mount) -- adjust it when running elsewhere.
%cd /content/drive/My Drive/samsung_card/preprocess  

# Read the merged dataset, indexed by customer id. Despite its name,
# `test_file` is the full dataset that later gets split into train and test.
path = '../data/'
test_file = 'df_merged.csv'
df = pd.read_csv(path+test_file, index_col='cst_id_di')

/content/drive/My Drive/samsung_card/preprocess


In [None]:
# The first column is used as the target; every remaining column is a feature.
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)

In [None]:
# Reset the best-so-far trackers that LGB_opt updates through `global`.
AUCbest, ITERbest = -1., 0

# Run the search (2 random starts, then 30 guided steps) with all warnings
# silenced; the previous warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(
        init_points=2,
        n_iter=30,
        acq='ei',
        xi=0.0,
    )

|   iter    |  target   | colsam... |   gamma   | learni... | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
 Stopped after 166 iterations with val-auc = 0.883284 val-gini = 0.766569
| [0m 1       [0m | [0m 0.7666  [0m | [0m 0.7363  [0m | [0m 1.363   [0m | [0m 0.07834 [0m | [0m 4.224   [0m | [0m 10.48   [0m | [0m 3.574   [0m | [0m 0.6068  [0m |
 Stopped after 133 iterations with val-auc = 0.876236 val-gini = 0.752472
| [0m 2       [0m | [0m 0.7525  [0m | [0m 0.4192  [0m | [0m 6.497   [0m | [0m 0.08306 [0m | [0m 3.924   [0m | [0m 14.8    [0m | [0m 1.103   [0m | [0m 0.637   [0m |
 Stopped after 1475 iterations with val-auc = 0.887828 val-gini = 0.775655
| [95m 3       [0m | [95m 0.7757  [0m | [95m 0.4664  [0m | [95m 2.01    [0m | [95m 0.01934 [0m | [95m 9.227   [0m | [95m 3.089   [0m | [95m 1.592   [0m | [95m 0.9715  [0m |
 S

## 🔎 Titanic version

In [None]:
# Load the Titanic data, encode Sex as 0/1, and keep the target plus five
# features. `.copy()` gives an independent frame so the column assignment below
# does not act on a slice of the original.
titanic = pd.read_csv("titanic.csv")
titanic['Sex'] = titanic['Sex'].map({"male":0,'female':1})
titanic = titanic[['Survived', 'Pclass','Sex', 'Age', 'SibSp', 'Parch']].copy()

# Impute missing ages with the rounded mean age. Only the Age column is filled:
# a frame-wide fillna would also write the mean age into any other column that
# happens to contain a missing value (e.g. an unmapped Sex label).
mean_age = round(titanic['Age'].mean())
titanic['Age'] = titanic['Age'].fillna(mean_age).astype(int)

In [None]:
# First column (Survived) is the target; the remaining columns are the features.
y = titanic.iloc[:, 0]
X = titanic.iloc[:, 1:]

# 80/20 hold-out split with a fixed seed (not stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def LGB_opt(max_depth,
           gamma,
           min_child_weight,
           max_delta_step,
           subsample,
           colsample_bytree,
           learning_rate,
         ):
    """Objective for Bayesian optimisation: fit one LightGBM model, return its Gini.

    This definition reuses the name of the earlier ``LGB_opt`` in this notebook
    and replaces it once the cell runs. It differs by training on CPU and by
    using LightGBM's built-in ``'auc'`` metric.

    The model is trained on the module-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test``. The module-level trackers ``AUCbest``
    and ``ITERbest`` are updated whenever this call beats the best AUC so far.

    Parameters
    ----------
    max_depth : float
        Truncated to int; also sets ``num_leaves = min(2**max_depth, 131072)``.
    gamma, max_delta_step : float
        Accepted so the optimiser's search space matches this signature, but
        not forwarded to LightGBM (they are never used in the body).
    min_child_weight, learning_rate : float
        Forwarded to ``LGBMClassifier`` unchanged.
    subsample, colsample_bytree : float
        Clamped to [0, 1] before being forwarded.

    Returns
    -------
    float
        ``2 * AUC - 1`` (Gini) for the best validation round.

    NOTE(review): the same hold-out set drives early stopping and is the score
    being optimised, so the reported value is optimistic.
    """
    global AUCbest
    global ITERbest

    depth = int(max_depth)

    # device='gpu' is deliberately not set here: this version trains on CPU.
    lgb = LGBMClassifier(class_weight=None,
                         colsample_bytree=max(min(colsample_bytree, 1), 0),
                         learning_rate=learning_rate,
                         max_depth=depth,
                         min_child_weight=min_child_weight,
                         subsample=max(min(subsample, 1), 0),
                         # LightGBM ignores `subsample` while subsample_freq is 0
                         # (its default); bag every iteration so the tuned value
                         # actually takes effect.
                         subsample_freq=1,
                         num_leaves=min(2 ** depth, 131072),
                         n_estimators=20000,
                         n_jobs=-1,
                         random_state=42)

    lgb.fit(X_train, y_train,
            verbose=False,
            early_stopping_rounds=100,
            eval_set=[(X_test, y_test)],
            eval_metric=['auc'])

    auc_history = lgb.evals_result_['valid_0']['auc']
    # With early stopping the last recorded round is the one where training
    # gave up, not the best one, so score the best round instead of [-1].
    best_round = int(np.argmax(auc_history))
    val_score = auc_history[best_round]
    print(' Stopped after %d iterations with val-auc = %f val-gini = %f' % ( len(auc_history), val_score, (val_score*2-1)) )
    if val_score > AUCbest:
        AUCbest = val_score
        ITERbest = best_round + 1  # 1-based round that achieved the best AUC

    return (val_score * 2) - 1

In [None]:
# Search space for the Titanic run. Keys must match LGB_opt's parameter names,
# so 'gamma' and 'max_delta_step' stay listed although LGB_opt ignores them.
LGB_BO = BayesianOptimization(LGB_opt, {
                                    'max_depth': (2, 20),
                                    'gamma': (0.001, 10.0),
                                    'min_child_weight': (0, 20),
                                    'max_delta_step': (0, 10),
                                    'subsample': (0.4, 1.0),
                                    'colsample_bytree' :(0.4, 1.0),
                                    'learning_rate' : (0.01, 0.1)
                                    },
                              # Fixed seed: without it each run samples different
                              # points and the search cannot be reproduced.
                              random_state=42)

In [None]:
# Start the best-AUC / best-iteration trackers from scratch for this run;
# LGB_opt overwrites them through `global` as better scores appear.
AUCbest, ITERbest = -1., 0

# 2 random initial points followed by 30 optimisation steps, with every
# warning suppressed for the duration of the `with` block only.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(
        init_points=2,
        n_iter=30,
        acq='ei',
        xi=0.0,
    )

|   iter    |  target   | colsam... |   gamma   | learni... | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
 Stopped after 140 iterations with val-auc = 0.873810 val-gini = 0.747619
| [0m 1       [0m | [0m 0.7476  [0m | [0m 0.9522  [0m | [0m 5.931   [0m | [0m 0.08519 [0m | [0m 1.429   [0m | [0m 9.256   [0m | [0m 5.171   [0m | [0m 0.848   [0m |
 Stopped after 210 iterations with val-auc = 0.880695 val-gini = 0.761390
| [95m 2       [0m | [95m 0.7614  [0m | [95m 0.9933  [0m | [95m 9.006   [0m | [95m 0.06377 [0m | [95m 2.197   [0m | [95m 2.862   [0m | [95m 8.896   [0m | [95m 0.7947  [0m |
 Stopped after 159 iterations with val-auc = 0.866795 val-gini = 0.733591
| [0m 3       [0m | [0m 0.7336  [0m | [0m 0.4893  [0m | [0m 8.692   [0m | [0m 0.09656 [0m | [0m 2.31    [0m | [0m 2.747   [0m | [0m 8.883   [0m | [0m 0.7896  [0m |
 St

In [None]:
# Report the best point the optimiser found: a dict holding the best 'target'
# value (the Gini returned by LGB_opt) and the 'params' that produced it.
print(LGB_BO.max)

{'target': 0.781981981981982, 'params': {'colsample_bytree': 0.8161923815519119, 'gamma': 2.100636409041735, 'learning_rate': 0.02254700212192725, 'max_delta_step': 8.474820710132052, 'max_depth': 10.19077333497031, 'min_child_weight': 7.728551995575659, 'subsample': 0.4674282842643168}}
