In [1]:
import xgboost
import pickle
import numpy as np
import gc
import pandas as pd
import operator

from bayes_opt import BayesianOptimization

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score, recall_score, precision_score

from general_functions import create_balanced_dataset

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that were produced by this project.

# Raw (unbalanced) zone datasets; used below as the held-out test sets.
# Context managers guarantee the file handles are closed after loading.
with open("dataset/zone_4.pickle", "rb") as file:
    zone_4 = pickle.load(file)

with open("dataset/zone_7.pickle", "rb") as file:
    zone_7 = pickle.load(file)

# One-off generation of the resampled pickles that are loaded below.
# Kept commented out so the cached files are reused on every run; uncomment to
# rebuild them (presumably create_balanced_dataset balances the classes —
# TODO confirm against general_functions).

# zone_7_resampled = create_balanced_dataset(["dataset/zone_7.pickle"])

# with open("dataset/zone_7_resampled.pickle", "wb") as file:
#     pickle.dump(zone_7_resampled, file)

# zone_4_resampled = create_balanced_dataset(["dataset/zone_4.pickle"])

# with open("dataset/zone_4_resampled.pickle", "wb") as file:
#     pickle.dump(zone_4_resampled, file)

# Cached resampled datasets; used below as the training sets.
with open("dataset/zone_4_resampled.pickle", "rb") as file:
    zone_4_resampled = pickle.load(file)

with open("dataset/zone_7_resampled.pickle", "rb") as file:
    zone_7_resampled = pickle.load(file)

# Each entry is (training dataset, test dataset): train on one zone's resampled
# data and evaluate on the other zone's raw data, in both directions.
experiment_arr = [(zone_4_resampled, zone_7), (zone_7_resampled, zone_4)]

In [2]:
# Column names selected from each zone DataFrame to build the model inputs.
# The order is preserved because it determines the column order of the feature
# matrix.
# NOTE(review): 'impoundment_amplified_no_streams' is spelled "impoundment"
# while every other name uses "impundment" — confirm it matches the dataset
# column, since DataFrame.filter(items=...) silently skips names that are absent.
most_important_features = [
    'impundment_mean_3',
    'hpmf_median_4',
    'impundment_mean_4',
    'impundment_mean_2',
    'impundment_median_4',
    'slope_non_ditch',
    'impundment_std_4',
    'impoundment_amplified_no_streams',
    'skyview_gabor',
    'impundment_amplified',
    'skyview_max_6',
    'skyview_non_ditch',
    'hpmf_min_2',
    'impundment_median_2',
    'hpmf_min_6',
    'hpmf_min_4',
    'hpmf_mean_4',
    'impundment_mean_6',
    'skyview_max_4',
    'impundment_max_6',
    'slope_mean_6',
    'slope_std_6',
    'skyview_gabor_no_streams',
    'hpmf_filter_no_streams',
    'hpmf_median_2',
    'skyview_median_6',
    'slope_median_6',
    'hpmf_median_6',
    'impundment_std_6',
    'skyview_mean_6',
    'hpmf_mean_6',
    'slope_min_6',
    'hpmf_filter',
    'skyview_std_6',
    'skyview_min_6',
]

In [3]:
def _features_and_labels(dataset):
    """Split a zone DataFrame into a float32 feature matrix and int8 labels.

    Features are the columns named in ``most_important_features`` (names that
    are missing from ``dataset`` are skipped by ``DataFrame.filter``); the
    ``label_3m`` column is always excluded from the features and returned
    separately as the target vector.
    """
    features = dataset.filter(items=most_important_features)
    # Guard against the label ever being listed among the feature names.
    features = features.loc[:, features.columns != "label_3m"]
    X = np.asarray(features).astype(np.float32)
    y = np.asarray(dataset["label_3m"]).astype(np.int8)
    return X, y


def optim_function(learning_rate=.1,
                   n_estimators=100,
                   max_depth=5,
                   min_child_weight=1,
                   gamma=0,
                   subsample=.8,
                   colsample_bytree=.8,
                   scale_pos_weight=2,
                   reg_alpha=0,
                   reg_lambda=0):
    """Objective for the Bayesian search: cross-zone Cohen's kappa of XGBoost.

    For every ``(training_dataset, test_dataset)`` pair in the notebook-level
    ``experiment_arr``, an ``XGBClassifier`` is fitted on the training data and
    used to predict the test data. Labels and predictions of all pairs are
    pooled and scored with a single ``cohen_kappa_score``.

    Parameters
    ----------
    learning_rate, gamma, subsample, colsample_bytree, scale_pos_weight,
    reg_alpha, reg_lambda : float
        Forwarded unchanged to ``XGBClassifier``.
    n_estimators, max_depth, min_child_weight : float or int
        The optimizer proposes floats; they are truncated with ``int()`` before
        being passed to ``XGBClassifier``.

    Returns
    -------
    float
        Cohen's kappa computed over the concatenated test sets.

    Notes
    -----
    The classifier is configured with ``gpu_id=0`` and the ``gpu_predictor``
    predictor, and with a fixed ``seed=41``.
    """
    # The optimizer works in a continuous space; these three must be integers.
    max_depth = int(max_depth)
    min_child_weight = int(min_child_weight)
    n_estimators = int(n_estimators)

    # Collect per-experiment results in lists instead of a pre-sized
    # (2, len(first test set)) buffer, so any number of experiments and test
    # sets of different lengths are handled.
    y_test_parts = []
    pred_parts = []

    for training_dataset, test_dataset in experiment_arr:
        X_train, y_train = _features_and_labels(training_dataset)
        X_test, y_test = _features_and_labels(test_dataset)

        clf = xgboost.sklearn.XGBClassifier(max_depth=max_depth,
                                            learning_rate=learning_rate,
                                            n_estimators=n_estimators,
                                            gamma=gamma,
                                            min_child_weight=min_child_weight,
                                            subsample=subsample,
                                            colsample_bytree=colsample_bytree,
                                            scale_pos_weight=scale_pos_weight,
                                            reg_alpha=reg_alpha,
                                            reg_lambda=reg_lambda,
                                            seed=41,
                                            gpu_id=0,
                                            predictor="gpu_predictor"
                                           )

        clf.fit(X_train, y_train)

        pred_parts.append(np.asarray(clf.predict(X_test)).astype(np.int8))
        y_test_parts.append(y_test)

    # Pool all experiments so one kappa summarizes both transfer directions.
    y_test_all = np.concatenate(y_test_parts)
    pred_all = np.concatenate(pred_parts)

    kappa = cohen_kappa_score(y_test_all, pred_all)
    return kappa

In [4]:
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events


# Search space handed to the optimizer: one (lower, upper) bound per
# hyper-parameter of optim_function. Key order is kept as-is on purpose.
pbounds = dict(
    learning_rate=(1e-4, 1e0),
    n_estimators=(50.0, 500.0),
    gamma=(0.0, 1.0),
    min_child_weight=(1e-3, 30.0),
    subsample=(.2, 1.0),
    colsample_bytree=(.2, 1.0),
    scale_pos_weight=(1.0, 4.0),
    max_depth=(3.0, 30),
    reg_alpha=(0.0, 1e-1),
    reg_lambda=(0.0, 1e-1),
)

# Fixed random_state so the sequence of sampled points is repeatable.
optimizer = BayesianOptimization(f=optim_function,
                                 pbounds=pbounds,
                                 random_state=1,
                                 verbose=2)

# Queue one hand-picked configuration; with lazy=True it is not evaluated here
# but when optimizer.maximize() is called.
optimizer.probe(
    params=dict(
        learning_rate=0.3135,
        n_estimators=452,
        gamma=0.9683,
        min_child_weight=26.29,
        subsample=0.9025,
        colsample_bytree=0.8406,
        scale_pos_weight=1.509,
        max_depth=21,
        reg_alpha=0.008504,
        reg_lambda=0.003905,
    ),
    lazy=True,
)

#from bayes_opt.util import load_logs
#load_logs(optimizer, logs=["./xgboost_log.json"]);

#logger = JSONLogger(path="./xgboost_log.json")
#optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

In [5]:
# Run the Bayesian search over pbounds, maximizing the kappa returned by
# optim_function. The point queued with probe(lazy=True) is evaluated first
# (iteration 1 in the log below matches it), followed by the optimizer's own
# points; n_iter=120 requests 120 optimization steps (plus the library's initial
# random points — TODO confirm the default init_points).
# NOTE: the recorded run below was stopped manually (KeyboardInterrupt), so only
# the first iterations appear in the output.
optimizer.maximize(n_iter=120)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.4133  [0m | [0m 0.8406  [0m | [0m 0.9683  [0m | [0m 0.3135  [0m | [0m 21.0    [0m | [0m 26.29   [0m | [0m 452.0   [0m | [0m 0.008504[0m | [0m 0.003905[0m | [0m 1.509   [0m | [0m 0.9025  [0m |
| [0m 2       [0m | [0m 0.2364  [0m | [0m 0.5336  [0m | [0m 0.7203  [0m | [0m 0.000214[0m | [0m 11.16   [0m | [0m 4.404   [0m | [0m 91.55   [0m | [0m 0.01863 [0m | [0m 0.03456 [0m | [0m 2.19    [0m | [0m 0.6311  [0m |
| [95m 3       [0m | [95m 0.4181  [0m | [95m 0.5354  [0m | [95m 0.6852  [0m | [95m 0.2045  [0m | [95m 26.71   [0m | [95m 0.8226  [0m | [95m 351.7   [0m | [95m 0.04173 [0m | [95m 0.05587 [0m | [95m 1.421 

KeyboardInterrupt: 

In [6]:
# Best target (the kappa returned by optim_function) and the parameters that
# produced it, among the points evaluated so far. The values are the raw floats
# proposed by the optimizer; optim_function truncates max_depth,
# min_child_weight and n_estimators with int() before training.
optimizer.max

{'target': 0.41814894011216863,
 'params': {'colsample_bytree': 0.5353556115226359,
  'gamma': 0.6852195003967595,
  'learning_rate': 0.20453180450654426,
  'max_depth': 26.709170782555525,
  'min_child_weight': 0.8226004083445869,
  'n_estimators': 351.710379580281,
  'reg_alpha': 0.0417304802367127,
  'reg_lambda': 0.05586898284457517,
  'scale_pos_weight': 1.4211608157857012,
  'subsample': 0.35848119126790307}}