# ✏️ Basic Setting

## 🔎 Importing Libraries

In [None]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, label_binarize
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from bayes_opt import BayesianOptimization
import warnings
import time
import datetime

# ✏️ Hyperparameter tuning (Bayesian Optimization)

## 🔎 Samsung Card version

In [None]:
# Left-join the columns of `train` onto `feat` by index, keeping every row of `feat`.
df = feat.merge(train, how='left', left_index=True, right_index=True)
# Keep only the rows whose MRC_ID_DI value is present (non-null).
# NOTE(review): presumably MRC_ID_DI is the target column coming from `train` — confirm.
df_nn = df[df['MRC_ID_DI'].notnull()]

In [None]:
def XGB_CV(max_depth,
           gamma,
           min_child_weight,
           max_delta_step,
           subsample,
           colsample_bytree,
           learning_rate,
         ):
    """Objective for the Bayesian optimiser: fit XGBoost and return validation Gini.

    Trains an ``XGBClassifier`` on the module-level ``X_train``/``y_train`` with
    early stopping evaluated on ``X_test``/``y_test`` through ``custom_eval``,
    which logs the *negated* ROC AUC under the key ``'roc_auc'``.

    Args:
        max_depth: Proposed tree depth; truncated to ``int``.
        gamma: Passed through unchanged.
        min_child_weight: Passed through unchanged.
        max_delta_step: Proposed value; truncated to ``int``.
        subsample: Row subsampling ratio; clamped to ``[0, 1]``.
        colsample_bytree: Column subsampling ratio; clamped to ``[0, 1]``.
        learning_rate: Passed through unchanged.

    Returns:
        ``2 * AUC - 1`` (Gini), where AUC is the best validation AUC reached
        during training.

    Side effects:
        Prints a one-line summary and, when this run beats the best AUC seen so
        far, updates the globals ``AUCbest`` and ``ITERbest``.
    """
    global AUCbest
    global ITERbest

    # Named `model` so the module-level `xgb` alias (import xgboost as xgb)
    # is not shadowed inside this function.
    model = XGBClassifier(max_depth=int(max_depth),
                          gamma=gamma,
                          learning_rate=learning_rate,
                          subsample=max(min(subsample, 1), 0),
                          colsample_bytree=max(min(colsample_bytree, 1), 0),
                          min_child_weight=min_child_weight,
                          max_delta_step=int(max_delta_step),
                          n_estimators=20000,
                          random_state=42,
                          # tree_method='gpu_hist',  # optional GPU training
                          silent=True)

    # Early stopping is monitored on the hold-out set with the custom metric.
    model.fit(X_train, y_train,
              early_stopping_rounds=100,
              eval_set=[(X_test, y_test)],
              eval_metric=custom_eval, verbose=False)

    # Fetch the metric history once; each entry is -AUC for one boosting round.
    neg_auc_history = model.evals_result()['validation_0']['roc_auc']
    n_rounds = len(neg_auc_history)

    # Score the run by its best round, not the last one: with early stopping
    # the final round comes after the rounds in which the metric stopped
    # improving. -min(-AUC) == max(AUC), matching the Titanic objective.
    val_score = -min(neg_auc_history)

    print(' Stopped after %d iterations with val-auc = %f val-gini = %f' % ( n_rounds, val_score, (val_score*2-1)) )
    if val_score > AUCbest:
        AUCbest = val_score
        ITERbest = n_rounds

    return (val_score*2) - 1

In [None]:
def custom_eval(pred, dtrain):
    """Evaluation callback that reports the negated ROC AUC as ``'roc_auc'``.

    Args:
        pred: Model scores for the evaluation rows.
        dtrain: Evaluation data object exposing ``get_label()``.

    Returns:
        The tuple ``('roc_auc', -auc)``.
        NOTE(review): the sign flip presumably makes "lower is better" for a
        minimising early-stopping rule — confirm against the xgboost version in use.
    """
    y_true = dtrain.get_label()
    # Binarise the labels with a binarizer fitted on these same labels.
    y_true_bin = LabelBinarizer().fit(y_true).transform(y_true)
    auc = roc_auc_score(y_true_bin, pred)
    return 'roc_auc', -auc

In [None]:
# Bayesian optimiser over the XGB_CV objective.
# Each pbounds entry is the (lower, upper) search range for one XGB_CV argument.
XGB_BO = BayesianOptimization(
    XGB_CV,
    pbounds={
        'max_depth': (2, 12),
        'gamma': (0.001, 10.0),
        'min_child_weight': (0, 20),
        'max_delta_step': (0, 10),
        'subsample': (0.4, 1.0),
        'colsample_bytree': (0.4, 1.0),
        'learning_rate': (0.01, 0.1),
    },
    verbose=2,
    random_state=42,
)

In [None]:
# Features: every column except the last. Target: the last column.
X = df_nn.iloc[:, :-1].values
y = df_nn.iloc[:, -1].values

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Reset the running-best trackers that XGB_CV updates through `global`.
AUCbest, ITERbest = -1., 0

# Run the search (2 initial points, then 30 iterations) with warnings
# silenced only inside this block.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    XGB_BO.maximize(
        init_points=2,
        n_iter=30,
        acq='ei',
        xi=0.0,
    )

|   iter    |  target   | colsam... |   gamma   | learni... | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
 Stopped after 408 iterations with val-auc = 0.886572 val-gini = 0.773144
| [0m 1       [0m | [0m 0.7731  [0m | [0m 0.4939  [0m | [0m 4.715   [0m | [0m 0.06635 [0m | [0m 5.155   [0m | [0m 9.919   [0m | [0m 9.163   [0m | [0m 0.4351  [0m |
 Stopped after 330 iterations with val-auc = 0.886273 val-gini = 0.772546
| [0m 2       [0m | [0m 0.7725  [0m | [0m 0.9182  [0m | [0m 1.465   [0m | [0m 0.09187 [0m | [0m 7.554   [0m | [0m 6.131   [0m | [0m 0.1714  [0m | [0m 0.7001  [0m |
 Stopped after 333 iterations with val-auc = 0.887737 val-gini = 0.775474
| [95m 3       [0m | [95m 0.7755  [0m | [95m 0.4828  [0m | [95m 9.944   [0m | [95m 0.07899 [0m | [95m 6.583   [0m | [95m 10.98   [0m | [95m 15.32   [0m | [95m 0.6641  [0m |
 St

In [None]:
# Best hyperparameters: print the optimiser's best result recorded so far
print(XGB_BO.max)

## 🔎 Titanic version

In [None]:
# Load the Titanic data and keep only the target plus the numeric-friendly features.
titanic = pd.read_csv("titanic.csv")
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the original DataFrame.
titanic = titanic[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']].copy()

# Encode sex as 0/1.
titanic['Sex'] = titanic['Sex'].map({"male": 0, 'female': 1})

# Impute missing ages with the rounded mean age, then store ages as integers.
# The fill is restricted to the Age column: filling the whole frame would
# silently write the mean age into any other column that contains NaN.
mean_age = round(titanic['Age'].mean())
titanic['Age'] = titanic['Age'].fillna(mean_age).astype(int)

In [None]:
# Target is the first column (Survived); the remaining columns are the features.
X = titanic.iloc[:, 1:]
y = titanic.iloc[:, 0]

# 80/20 hold-out split with a fixed seed (not stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def XGB_CV(max_depth,
           gamma,
           min_child_weight,
           max_delta_step,
           subsample,
           colsample_bytree,
           learning_rate
         ):
    """Objective for the Bayesian optimiser: fit XGBoost and return validation Gini.

    Trains an ``XGBClassifier`` on the module-level ``X_train``/``y_train`` with
    early stopping on the built-in ``auc`` metric evaluated on
    ``X_test``/``y_test``. Despite the ``_CV`` suffix, a single hold-out
    evaluation is performed; no cross-validation takes place.

    Args:
        max_depth: Proposed tree depth; truncated to ``int``.
        gamma: Passed through unchanged.
        min_child_weight: Passed through unchanged.
        max_delta_step: Proposed value; truncated to ``int``.
        subsample: Row subsampling ratio; clamped to ``[0, 1]``.
        colsample_bytree: Column subsampling ratio; clamped to ``[0, 1]``.
        learning_rate: Passed through unchanged.

    Returns:
        ``2 * AUC - 1`` (Gini), where AUC is the highest validation AUC
        recorded during training.

    Side effects:
        Prints a one-line summary and, when this run beats the best AUC seen so
        far, updates the globals ``AUCbest`` and ``ITERbest``.
    """
    global AUCbest
    global ITERbest

    # Named `model` so the module-level `xgb` alias (import xgboost as xgb)
    # is not shadowed inside this function.
    model = XGBClassifier(max_depth=int(max_depth),
                          gamma=gamma,
                          learning_rate=learning_rate,
                          subsample=max(min(subsample, 1), 0),
                          colsample_bytree=max(min(colsample_bytree, 1), 0),
                          min_child_weight=min_child_weight,
                          max_delta_step=int(max_delta_step),
                          n_estimators=20000,
                          random_state=42,
                          # tree_method='gpu_hist',  # optional GPU training
                          )

    # Early stopping is monitored on the hold-out set.
    model.fit(X_train, y_train,
              early_stopping_rounds=100,
              eval_set=[(X_test, y_test)],
              eval_metric=["auc"], verbose=False)

    # Fetch the metric history once; one AUC value per boosting round.
    auc_history = model.evals_result()['validation_0']['auc']
    n_rounds = len(auc_history)
    val_score = max(auc_history)

    print(' Stopped after %d iterations with val-auc = %f val-gini = %f' % ( n_rounds, val_score, (val_score*2-1)) )
    if val_score > AUCbest:
        AUCbest = val_score
        ITERbest = n_rounds

    return (val_score*2) - 1

In [None]:
# Target is the first column (Survived); the remaining columns are the features.
X = titanic.iloc[:, 1:]
y = titanic.iloc[:, 0]

# 80/20 hold-out split with a fixed seed (not stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Bayesian optimiser over the XGB_CV objective.
# Each pbounds entry is the (lower, upper) search range for one XGB_CV argument.
XGB_BO = BayesianOptimization(
    XGB_CV,
    pbounds={
        'max_depth': (2, 12),
        'gamma': (0.001, 10.0),
        'min_child_weight': (0, 20),
        'max_delta_step': (0, 10),
        'subsample': (0.4, 1.0),
        'colsample_bytree': (0.4, 1.0),
        'learning_rate': (0.01, 0.1),
    },
    verbose=2,
    random_state=42,
)

In [None]:
# Reset the running-best trackers that XGB_CV updates through `global`.
AUCbest, ITERbest = -1., 0

# Run the search (2 initial points, then 30 iterations) with warnings
# silenced only inside this block.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    XGB_BO.maximize(
        init_points=2,
        n_iter=30,
        acq='ei',
        xi=0.0,
    )

|   iter    |  target   | colsam... |   gamma   | learni... | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
 Stopped after 233 iterations with val-auc = 0.872265 val-gini = 0.744530
| [0m 1       [0m | [0m 0.7445  [0m | [0m 0.6247  [0m | [0m 9.507   [0m | [0m 0.07588 [0m | [0m 5.987   [0m | [0m 3.56    [0m | [0m 3.12    [0m | [0m 0.4349  [0m |
 Stopped after 198 iterations with val-auc = 0.864607 val-gini = 0.729214
| [0m 2       [0m | [0m 0.7292  [0m | [0m 0.9197  [0m | [0m 6.012   [0m | [0m 0.07373 [0m | [0m 0.2058  [0m | [0m 11.7    [0m | [0m 16.65   [0m | [0m 0.5274  [0m |
 Stopped after 186 iterations with val-auc = 0.869241 val-gini = 0.738482
| [0m 3       [0m | [0m 0.7385  [0m | [0m 0.4136  [0m | [0m 8.958   [0m | [0m 0.04861 [0m | [0m 6.012   [0m | [0m 3.946   [0m | [0m 2.839   [0m | [0m 0.4521  [0m |
 Stopped aft

 Stopped after 365 iterations with val-auc = 0.872844 val-gini = 0.745688
| [0m 32      [0m | [0m 0.7457  [0m | [0m 0.7978  [0m | [0m 6.43    [0m | [0m 0.09319 [0m | [0m 4.536   [0m | [0m 8.566   [0m | [0m 8.388   [0m | [0m 0.9078  [0m |


In [None]:
# Best hyperparameters: XGB_BO.max is a dict with the highest 'target' seen and the 'params' that produced it
print(XGB_BO.max)

{'target': 0.780308, 'params': {'colsample_bytree': 0.8804393682155138, 'gamma': 4.085442381163929, 'learning_rate': 0.07257219838465799, 'max_delta_step': 4.972513912107027, 'max_depth': 7.1560019437361335, 'min_child_weight': 2.466140801376737, 'subsample': 0.5868904406920552}}
