In [1]:
import xgboost
import pickle
import numpy as np
import gc
import pandas as pd
import operator

from bayes_opt import BayesianOptimization

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score, recall_score, precision_score

from general_functions import create_balanced_dataset

# Experiment

In [2]:
# Load the raw per-zone datasets. The context managers close each file handle
# as soon as unpickling finishes (or fails), instead of leaving that to the
# garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("dataset/zone_4.pickle", "rb") as file:
    zone_4 = pickle.load(file)

with open("dataset/zone_7.pickle", "rb") as file:
    zone_7 = pickle.load(file)

In [3]:
# One-off preprocessing, kept for reference only: builds a resampled dataset
# per zone with create_balanced_dataset (presumably class-balanced, judging by
# the helper's name) and caches each result as a pickle. The code is left
# commented out; the cached "*_resampled.pickle" files are read from disk in
# the following cell instead.

# zone_7_resampled = create_balanced_dataset(["dataset/zone_7.pickle"])

# with open("dataset/zone_7_resampled.pickle", "wb") as file:
#     pickle.dump(zone_7_resampled, file)

# zone_4_resampled = create_balanced_dataset(["dataset/zone_4.pickle"])

# with open("dataset/zone_4_resampled.pickle", "wb") as file:
#     pickle.dump(zone_4_resampled, file)

In [4]:
# Read the cached resampled datasets for both zones from disk.
with open("dataset/zone_4_resampled.pickle", "rb") as zone_4_file:
    zone_4_resampled = pickle.load(zone_4_file)

with open("dataset/zone_7_resampled.pickle", "rb") as zone_7_file:
    zone_7_resampled = pickle.load(zone_7_file)

In [5]:
# Cross-zone evaluation folds as (training set, test set) pairs: train on the
# resampled data of one zone and test on the raw data of the other zone.
experiment_arr = [(zone_4_resampled, zone_7), (zone_7_resampled, zone_4)]

In [6]:
# Iterative feature elimination over the folds in `experiment_arr`.
# Each round trains one classifier per fold on the current feature subset,
# averages the per-feature importances over the folds, re-ranks the features
# and keeps only the top `num_features` of that ranking for the next round.
# NOTE(review): the hyper-parameters below look like rounded values from an
# earlier `optimizer.max` result, except that scale_pos_weight is 1.0 and
# min_child_weight is not truncated with int() the way the optimisation
# objective does it. Confirm that this is intended.
most_important_features = zone_4.columns.tolist()[1:]  # every column except the first
for num_features in [81, 50, 35, 25, 20, 15, 10]:

    most_important_features = most_important_features[:num_features]
    print(f"Features used in experiment:\n{most_important_features}")
    print("\n")
    feature_importances = {i: 0 for i in most_important_features}
    y_test_all = []
    pred_all = []

    for (training_dataset, test_dataset) in experiment_arr:
        # Select the candidate columns once, then drop the label if it is among them.
        train_features = training_dataset.filter(items=most_important_features)
        X_train = train_features.loc[:, train_features.columns != "label_3m"]
        y_train = training_dataset["label_3m"]

        clf = xgboost.XGBClassifier(tree_method="gpu_hist",
                                    colsample_bytree=0.9125599,
                                    gamma=0.4130997,
                                    learning_rate=0.2495447,
                                    max_depth=3,
                                    min_child_weight=0.347961,
                                    n_estimators=155,
                                    reg_alpha=0.000689,
                                    scale_pos_weight=1.0,
                                    subsample=0.426999)

        clf.fit(X_train, y_train)

        test_features = test_dataset.filter(items=most_important_features)
        X_test = test_features.loc[:, test_features.columns != "label_3m"]
        y_test = test_dataset["label_3m"]

        print(f"Amount of features X_train: {len(X_train.columns)}, X_test: {len(X_test.columns)}")

        pred = clf.predict(X_test)

        # Pool labels and predictions positionally. `y_test[i]` would be a
        # label-based lookup and is only correct for a 0..n-1 index.
        y_test_all.extend(y_test.to_numpy())
        pred_all.extend(pred)

        # Importances are ordered like the columns the model was trained on,
        # so pair them with X_train.columns rather than with the candidate
        # list, which may contain names that were filtered out.
        for feature_name, importance in zip(X_train.columns, clf.feature_importances_):
            feature_importances[feature_name] += importance

    # Mean importance over however many folds were evaluated.
    for importance_name in most_important_features:
        feature_importances[importance_name] /= len(experiment_arr)

    # Rank features by mean importance, highest first (ties keep their previous order).
    most_important_features = sorted(feature_importances,
                                     key=feature_importances.get,
                                     reverse=True)

    print("\n")
    print("Importances for experiment:")
    for key in most_important_features:
        print(key, " - ", feature_importances[key])
    print("\n")
    print(f"Amount of features used:   {num_features}")
    print("Cohen's kappa score        ", cohen_kappa_score(y_test_all, pred_all))
    print("Accuracy score             ", accuracy_score(y_test_all, pred_all))
    print("Recall score               ", recall_score(y_test_all, pred_all))
    print("Precision score            ", precision_score(y_test_all, pred_all))
    print("\n------------------------------------------------------\n")

Features used in experiment:
['hpmf_raw', 'skyview_raw', 'impundment_raw', 'slope_raw', 'DEM_ditch_detection', 'DEM_ditch_detection_no_streams', 'conic_mean', 'skyview_non_ditch', 'skyview_gabor', 'conic_mean_no_streams', 'skyview_gabor_no_streams', 'skyview_mean_2', 'skyview_mean_3', 'skyview_mean_4', 'skyview_mean_6', 'skyview_median_2', 'skyview_median_4', 'skyview_median_6', 'skyview_min_2', 'skyview_min_4', 'skyview_min_6', 'skyview_max_2', 'skyview_max_4', 'skyview_max_6', 'skyview_std_2', 'skyview_std_4', 'skyview_std_6', 'impundment_amplified', 'impoundment_amplified_no_streams', 'impundment_mean_2', 'impundment_mean_3', 'impundment_mean_4', 'impundment_mean_6', 'impundment_median_2', 'impundment_median_4', 'impundment_median_6', 'impundment_min_2', 'impundment_min_4', 'impundment_min_6', 'impundment_max_2', 'impundment_max_4', 'impundment_max_6', 'impundment_std_2', 'impundment_std_4', 'impundment_std_6', 'hpmf_filter', 'hpmf_gabor', 'hpmf_gabor_no_streams', 'hpmf_filter_no_st

# Optimization

In [7]:
# Candidate feature names: every zone_4 column except the first one.
# optim_function reads this module-level list when it selects columns.
most_important_features = zone_4.columns.tolist()[1:]

def optim_function(learning_rate=.1,
                   n_estimators=100,
                   max_depth=5,
                   min_child_weight=1,
                   gamma=0,
                   subsample=.8,
                   colsample_bytree=.8,
                   scale_pos_weight=2,
                   reg_alpha=0,
                   reg_lambda=0):
    """Objective for the hyper-parameter search: pooled Cohen's kappa.

    For every (training set, test set) pair in the module-level
    ``experiment_arr`` an XGBoost classifier is trained on the columns named
    in the module-level ``most_important_features`` (without ``label_3m``)
    and used to predict the test set. Labels and predictions of all folds are
    concatenated and scored once.

    Parameters
    ----------
    learning_rate, gamma, subsample, colsample_bytree, scale_pos_weight,
    reg_alpha, reg_lambda : float
        Forwarded to ``XGBClassifier`` unchanged.
    n_estimators, max_depth, min_child_weight : float
        Truncated with ``int()`` before being forwarded.

    Returns
    -------
    float
        ``cohen_kappa_score`` of the pooled test labels and predictions.
    """
    max_depth = int(max_depth)
    min_child_weight = int(min_child_weight)
    n_estimators = int(n_estimators)

    # Collect per-fold results in lists and concatenate at the end. A
    # pre-allocated (2, n) buffer breaks as soon as the folds have different
    # test-set sizes or a third fold is added.
    y_test_folds = []
    pred_folds = []

    for training_dataset, test_dataset in experiment_arr:
        train_features = training_dataset.filter(items=most_important_features)
        X_train = np.array(
            train_features.loc[:, train_features.columns != "label_3m"]
        ).astype(np.float32)
        y_train = np.array(training_dataset["label_3m"]).astype(np.int8)

        test_features = test_dataset.filter(items=most_important_features)
        X_test = np.array(
            test_features.loc[:, test_features.columns != "label_3m"]
        ).astype(np.float32)
        y_test = np.array(test_dataset["label_3m"]).astype(np.int8)

        clf = xgboost.sklearn.XGBClassifier(max_depth=max_depth,
                                            learning_rate=learning_rate,
                                            n_estimators=n_estimators,
                                            gamma=gamma,
                                            min_child_weight=min_child_weight,
                                            subsample=subsample,
                                            colsample_bytree=colsample_bytree,
                                            scale_pos_weight=scale_pos_weight,
                                            reg_alpha=reg_alpha,
                                            reg_lambda=reg_lambda,
                                            tree_method="gpu_hist",
                                            seed=41,
                                            gpu_id=0,
                                            **{"predictor": "gpu_predictor"}
                                           )

        clf.fit(X_train, y_train)

        pred_folds.append(np.asarray(clf.predict(X_test)).astype(np.int8))
        y_test_folds.append(y_test)

    # Fold order is preserved, so the pooled vectors stay aligned.
    kappa = cohen_kappa_score(np.concatenate(y_test_folds),
                              np.concatenate(pred_folds))
    return kappa

In [9]:
# Search space as (lower, upper) bounds per hyper-parameter of optim_function.
pbounds = dict(
    learning_rate=(1e-4, 1e0),
    n_estimators=(50.0, 500.0),
    gamma=(0.0, 1.0),
    min_child_weight=(1e-3, 30.0),
    subsample=(.2, 1.0),
    colsample_bytree=(.2, 1.0),
    scale_pos_weight=(1.0, 4.0),
    max_depth=(3.0, 30),
    reg_alpha=(0.0, 1e-1),
    reg_lambda=(0.0, 1e-1),
)

# Optimizer that searches `pbounds` using optim_function as the objective.
optimizer = BayesianOptimization(f=optim_function,
                                 pbounds=pbounds,
                                 random_state=1,
                                 verbose=2)

In [None]:
# Run the search with n_iter=120; every evaluation of optim_function trains
# one classifier per fold, so this cell is expensive.
optimizer.maximize(n_iter=120)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.339   [0m | [0m 0.4996  [0m | [0m 0.9507  [0m | [0m 0.732   [0m | [0m 19.16   [0m | [0m 4.681   [0m | [0m 120.2   [0m | [0m 0.005808[0m | [0m 0.08662 [0m | [0m 2.803   [0m | [0m 0.7665  [0m |
| [0m 2       [0m | [0m 0.2759  [0m | [0m 0.2165  [0m | [0m 0.9699  [0m | [0m 0.8325  [0m | [0m 8.733   [0m | [0m 5.456   [0m | [0m 132.5   [0m | [0m 0.03042 [0m | [0m 0.05248 [0m | [0m 2.296   [0m | [0m 0.433   [0m |
| [0m 3       [0m | [0m 0.3269  [0m | [0m 0.6895  [0m | [0m 0.1395  [0m | [0m 0.2922  [0m | [0m 12.89   [0m | [0m 13.68   [0m | [0m 403.3   [0m | [0m 0.01997 [0m | [0m 0.05142 [0m | [0m 2.777   [0m | [

In [None]:
# Display the optimizer's `max` attribute (presumably the best target value
# and the parameters that produced it; verify against the bayes_opt docs).
optimizer.max

In [6]:
# Iterative feature elimination over the folds in `experiment_arr`, using a
# second set of hyper-parameters (this one includes reg_lambda).
# Each round trains one classifier per fold on the current feature subset,
# averages the per-feature importances over the folds, re-ranks the features
# and keeps only the top `num_features` of that ranking for the next round.
# NOTE(review): min_child_weight=26.29 is passed untruncated here, whereas
# optim_function applies int() to it before training. Confirm which value is
# intended when reproducing the optimiser's score.
most_important_features = zone_4.columns.tolist()[1:]  # every column except the first
for num_features in [81, 50, 35, 25, 20, 15, 10]:

    most_important_features = most_important_features[:num_features]
    print(f"Features used in experiment:\n{most_important_features}")
    print("\n")
    feature_importances = {i: 0 for i in most_important_features}
    y_test_all = []
    pred_all = []

    for (training_dataset, test_dataset) in experiment_arr:
        # Select the candidate columns once, then drop the label if it is among them.
        train_features = training_dataset.filter(items=most_important_features)
        X_train = train_features.loc[:, train_features.columns != "label_3m"]
        y_train = training_dataset["label_3m"]

        clf = xgboost.XGBClassifier(tree_method="gpu_hist",
                                    colsample_bytree=0.8406,
                                    gamma=0.9683,
                                    learning_rate=0.3135,
                                    max_depth=21,
                                    min_child_weight=26.29,
                                    n_estimators=452,
                                    reg_alpha=0.008504,
                                    reg_lambda=0.003905,
                                    scale_pos_weight=1.509,
                                    subsample=0.9025)

        clf.fit(X_train, y_train)

        test_features = test_dataset.filter(items=most_important_features)
        X_test = test_features.loc[:, test_features.columns != "label_3m"]
        y_test = test_dataset["label_3m"]

        print(f"Amount of features X_train: {len(X_train.columns)}, X_test: {len(X_test.columns)}")

        pred = clf.predict(X_test)

        # Pool labels and predictions positionally. `y_test[i]` would be a
        # label-based lookup and is only correct for a 0..n-1 index.
        y_test_all.extend(y_test.to_numpy())
        pred_all.extend(pred)

        # Importances are ordered like the columns the model was trained on,
        # so pair them with X_train.columns rather than with the candidate
        # list, which may contain names that were filtered out.
        for feature_name, importance in zip(X_train.columns, clf.feature_importances_):
            feature_importances[feature_name] += importance

    # Mean importance over however many folds were evaluated.
    for importance_name in most_important_features:
        feature_importances[importance_name] /= len(experiment_arr)

    # Rank features by mean importance, highest first (ties keep their previous order).
    most_important_features = sorted(feature_importances,
                                     key=feature_importances.get,
                                     reverse=True)

    print("\n")
    print("Importances for experiment:")
    for key in most_important_features:
        print(key, " - ", feature_importances[key])
    print("\n")
    print(f"Amount of features used:   {num_features}")
    print("Cohen's kappa score        ", cohen_kappa_score(y_test_all, pred_all))
    print("Accuracy score             ", accuracy_score(y_test_all, pred_all))
    print("Recall score               ", recall_score(y_test_all, pred_all))
    print("Precision score            ", precision_score(y_test_all, pred_all))
    print("\n------------------------------------------------------\n")

Features used in experiment:
['hpmf_raw', 'skyview_raw', 'impundment_raw', 'slope_raw', 'DEM_ditch_detection', 'DEM_ditch_detection_no_streams', 'conic_mean', 'skyview_non_ditch', 'skyview_gabor', 'conic_mean_no_streams', 'skyview_gabor_no_streams', 'skyview_mean_2', 'skyview_mean_3', 'skyview_mean_4', 'skyview_mean_6', 'skyview_median_2', 'skyview_median_4', 'skyview_median_6', 'skyview_min_2', 'skyview_min_4', 'skyview_min_6', 'skyview_max_2', 'skyview_max_4', 'skyview_max_6', 'skyview_std_2', 'skyview_std_4', 'skyview_std_6', 'impundment_amplified', 'impoundment_amplified_no_streams', 'impundment_mean_2', 'impundment_mean_3', 'impundment_mean_4', 'impundment_mean_6', 'impundment_median_2', 'impundment_median_4', 'impundment_median_6', 'impundment_min_2', 'impundment_min_4', 'impundment_min_6', 'impundment_max_2', 'impundment_max_4', 'impundment_max_6', 'impundment_std_2', 'impundment_std_4', 'impundment_std_6', 'hpmf_filter', 'hpmf_gabor', 'hpmf_gabor_no_streams', 'hpmf_filter_no_st

In [None]:
# Reset the candidate feature list to every zone_4 column except the first one.
most_important_features = zone_4.columns.tolist()[1:]
def optimizer_function(features=None, num_features=None):
    """Train on every fold and return the features ranked by mean importance.

    One XGBoost classifier is trained per (training set, test set) pair of
    the module-level ``experiment_arr``, using the first ``num_features``
    names of ``features``. The per-feature importances are averaged over the
    folds and the feature names are returned sorted by that average.

    Parameters
    ----------
    features : list of str, optional
        Candidate feature names in priority order. Defaults to the
        module-level ``most_important_features``.
    num_features : int, optional
        How many leading entries of ``features`` to use. ``None`` uses all.

    Returns
    -------
    list of str
        The selected feature names, highest mean importance first.
    """
    # Work on a separate local name. Assigning to `most_important_features`
    # inside the function would make it local and raise UnboundLocalError
    # when its own previous value is read.
    if features is None:
        features = most_important_features
    selected_features = list(features)[:num_features]

    print(f"Features used in experiment:\n{selected_features}")
    print("\n")
    feature_importances = {i: 0 for i in selected_features}
    y_test_all = []
    pred_all = []

    for (training_dataset, test_dataset) in experiment_arr:
        # Select the candidate columns once, then drop the label if it is among them.
        train_features = training_dataset.filter(items=selected_features)
        X_train = train_features.loc[:, train_features.columns != "label_3m"]
        y_train = training_dataset["label_3m"]

        clf = xgboost.XGBClassifier(tree_method="gpu_hist",
                                    colsample_bytree=0.9125599,
                                    gamma=0.4130997,
                                    learning_rate=0.2495447,
                                    max_depth=3,
                                    min_child_weight=0.347961,
                                    n_estimators=155,
                                    reg_alpha=0.000689,
                                    scale_pos_weight=1.0,
                                    subsample=0.426999)

        clf.fit(X_train, y_train)

        test_features = test_dataset.filter(items=selected_features)
        X_test = test_features.loc[:, test_features.columns != "label_3m"]
        y_test = test_dataset["label_3m"]

        print(f"Amount of features X_train: {len(X_train.columns)}, X_test: {len(X_test.columns)}")

        pred = clf.predict(X_test)

        # Pooled positionally; kept for parity with the experiment cells even
        # though no score is computed from them here.
        y_test_all.extend(y_test.to_numpy())
        pred_all.extend(pred)

        # Importances follow the order of the training columns.
        for feature_name, importance in zip(X_train.columns, clf.feature_importances_):
            feature_importances[feature_name] += importance

    # Mean importance over however many folds were evaluated.
    for importance_name in selected_features:
        feature_importances[importance_name] /= len(experiment_arr)

    # Highest mean importance first (ties keep their input order).
    return sorted(feature_importances, key=feature_importances.get, reverse=True)

In [15]:
# Load zone 4 as training data and zone 7 as test data. The context managers
# close each file handle as soon as unpickling finishes (or fails).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("dataset/zone_4.pickle", "rb") as file:
    train_data = pickle.load(file)

with open("dataset/zone_7.pickle", "rb") as file:
    test_data = pickle.load(file)

In [16]:
# Split each frame into a feature matrix (every column but the label) and an
# integer label vector, then drop the reference to the DataFrame.
X_train = np.array(train_data.loc[:, train_data.columns != "label_3m"])
y_train = np.array(train_data["label_3m"]).astype(int)
train_data = None

X_test = np.array(test_data.loc[:, test_data.columns != "label_3m"])
y_test = np.array(test_data["label_3m"]).astype(int)
test_data = None

In [5]:
# Randomly under-sample the training data. The ratio is passed by keyword
# because current imbalanced-learn releases accept the constructor arguments
# as keyword-only; the keyword form also works on the older releases that
# provide fit_resample.
# NOTE(review): RandomUnderSampler is not imported in any visible cell. It is
# presumably imblearn.under_sampling.RandomUnderSampler; add that import to
# the import cell at the top so this cell runs on a fresh kernel.
rus = RandomUnderSampler(sampling_strategy=.2, random_state=0)
X_train, y_train = rus.fit_resample(X_train, y_train)
print(len(X_train))

922182


In [19]:
# Baseline classifier: only tree_method is set, everything else is left at
# the library defaults.
clf = xgboost.XGBClassifier(tree_method="gpu_hist")

In [22]:
# Fit before reading feature_importances_. With the fit commented out this
# cell only worked when a fitted `clf` was still alive in the kernel, and
# failed on a fresh Restart & Run All.
clf.fit(X_train, y_train)
# Column indices of the 10 features with the highest importance, ordered from
# lowest to highest importance among those 10.
most_important_features = np.argsort(clf.feature_importances_)[-10:]

In [23]:
# Display the selected column indices.
most_important_features

array([69, 43, 57,  5, 52, 30, 31, 32, 35, 34], dtype=int64)

In [7]:
# Predict labels for the test matrix with the current `clf`.
# NOTE(review): this cell's execution count is lower than that of the cell
# creating `clf`, so the saved output may stem from a different model state.
pred = clf.predict(X_test)

In [8]:
# Cohen's kappa between the test labels and the predictions in `pred`.
cohen_kappa_score(y_test, pred)

0.5029843078079197

In [2]:
def optim_function(learning_rate=.1,
                   n_estimators=100,
                   max_depth=5,
                   min_child_weight=1,
                   gamma=0,
                   subsample=.8,
                   colsample_bytree=.8,
                   scale_pos_weight=2,
                   reg_alpha=0):
    """Objective for the hyper-parameter search: Cohen's kappa on the test set.

    Trains an XGBoost classifier on the columns of the module-level
    ``X_train`` selected by the module-level ``most_important_features``
    and scores its predictions for the same columns of ``X_test`` against
    ``y_test``.

    Parameters
    ----------
    learning_rate, gamma, subsample, colsample_bytree, scale_pos_weight,
    reg_alpha : float
        Forwarded to the classifier unchanged.
    n_estimators, max_depth, min_child_weight : float
        Truncated with ``int()`` before being forwarded.

    Returns
    -------
    float
        ``cohen_kappa_score(y_test, predictions)``.
    """
    max_depth = int(max_depth)
    min_child_weight = int(min_child_weight)
    n_estimators = int(n_estimators)

    # Use the plain scikit-learn wrapper. The dask estimator needs a running
    # dask client and dask collections, but no client is created in this
    # notebook and X_train / X_test are indexed as in-memory arrays.
    clf = xgboost.sklearn.XGBClassifier(max_depth=max_depth,
                                        learning_rate=learning_rate,
                                        n_estimators=n_estimators,
                                        gamma=gamma,
                                        min_child_weight=min_child_weight,
                                        subsample=subsample,
                                        colsample_bytree=colsample_bytree,
                                        scale_pos_weight=scale_pos_weight,
                                        reg_alpha=reg_alpha,
                                        n_jobs=4,
                                        tree_method="gpu_hist",
                                        seed=41,
                                        gpu_id=0
                                       )

    clf.fit(X_train[:, most_important_features], y_train)

    pred = clf.predict(X_test[:, most_important_features])
    kappa = cohen_kappa_score(y_test, pred)
    return kappa

In [3]:
# Search space as (lower, upper) bounds per hyper-parameter of optim_function.
pbounds = dict(
    learning_rate=(1e-4, 1e0),
    n_estimators=(50, 500),
    gamma=(0, .9),
    min_child_weight=(0, 10),
    subsample=(.1, .95),
    colsample_bytree=(.1, .95),
    scale_pos_weight=(1, 5),
    max_depth=(3, 15),
    reg_alpha=(0, 1e-3),
)

# Optimizer that searches `pbounds` using optim_function as the objective.
optimizer = BayesianOptimization(f=optim_function,
                                 pbounds=pbounds,
                                 random_state=1,
                                 verbose=2)

NameError: name 'BayesianOptimization' is not defined

In [26]:
# Run the search with n_iter=100.
# NOTE(review): the saved output of the cell that builds `optimizer` is a
# NameError, and this cell's saved output ends in a KeyboardInterrupt, so the
# recorded results presumably come from an earlier, interrupted run. Confirm
# before relying on them.
optimizer.maximize(n_iter=100)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.4422  [0m | [0m 0.4545  [0m | [0m 0.6483  [0m | [0m 0.000214[0m | [0m 6.628   [0m | [0m 1.468   [0m | [0m 91.55   [0m | [0m 0.000186[0m | [0m 2.382   [0m | [0m 0.4373  [0m |
| [95m 2       [0m | [95m 0.481   [0m | [95m 0.558   [0m | [95m 0.3773  [0m | [95m 0.6853  [0m | [95m 5.453   [0m | [95m 8.781   [0m | [95m 62.32   [0m | [95m 0.000670[0m | [95m 2.669   [0m | [95m 0.5749  [0m |
| [0m 3       [0m | [0m 0.3576  [0m | [0m 0.2193  [0m | [0m 0.1783  [0m | [0m 0.8008  [0m | [0m 14.62   [0m | [0m 3.134   [0m | [0m 361.5   [0m | [0m 0.000876[0m | [0m 4.578   [0m | [0m 0.1723  [0m |
| [0m 4       [0m | [0m 0.4776  [0m | [0m 0.1332  

KeyboardInterrupt: 

In [32]:
# Display the best result so far: per the saved output, a dict with the
# 'target' score and the 'params' that produced it.
optimizer.max

{'target': 0.5004687061446937,
 'params': {'colsample_bytree': 0.9125598867458771,
  'gamma': 0.41309970007591623,
  'learning_rate': 0.2495446719246513,
  'max_depth': 3.27021719586084,
  'min_child_weight': 0.34796142621455917,
  'n_estimators': 155.73578545537174,
  'reg_alpha': 0.0006891651003411668,
  'scale_pos_weight': 4.1957470427759915,
  'subsample': 0.42699958983347897}}

In [None]:
def optim_function(learning_rate=.1,
                   n_estimators=100,
                   max_depth=5,
                   min_child_weight=1,
                   gamma=0,
                   subsample=.8,
                   colsample_bytree=.8,
                   scale_pos_weight=2,
                   reg_alpha=0):
    """Score one hyper-parameter combination with Cohen's kappa.

    A classifier is trained on the columns of the module-level ``X_train``
    picked by the module-level ``most_important_features`` and evaluated on
    the same columns of ``X_test``. ``n_estimators``, ``max_depth`` and
    ``min_child_weight`` are truncated to integers first; all other
    arguments are forwarded as given.

    Returns the kappa between ``y_test`` and the predictions.
    """
    # Integer-valued settings, truncated from the values passed in.
    tree_depth = int(max_depth)
    child_weight = int(min_child_weight)
    boosting_rounds = int(n_estimators)

    model = xgboost.sklearn.XGBClassifier(
        max_depth=tree_depth,
        learning_rate=learning_rate,
        n_estimators=boosting_rounds,
        gamma=gamma,
        min_child_weight=child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        scale_pos_weight=scale_pos_weight,
        reg_alpha=reg_alpha,
        n_jobs=4,
        tree_method="gpu_hist",
        seed=41,
    )

    # Restrict both matrices to the selected feature columns.
    train_subset = X_train[:, most_important_features]
    test_subset = X_test[:, most_important_features]

    model.fit(train_subset, y_train)
    predictions = model.predict(test_subset)

    return cohen_kappa_score(y_test, predictions)