In [5]:
import ast
import json
import os

import numpy as np
import pandas as pd
from sklearn.calibration import LabelEncoder
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from utils import ModelSuplier, DataLoader, DataSaver, get_best_params_overall
from utils import get_best_params_per_dataset_for_measuring_param_tunability


In [2]:
# Load every file found in `data_path` into DATA, one DataFrame per file.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the dataset index (the position in DATA) stable across runs and machines;
# that index ends up as the `dataset` column of the search histories.
# NOTE(review): the index must match the dataset numbering used in the CSVs under
# ../history — confirm those were produced with the same ordering.
data_path = "../data"
DATA = [
    pd.read_csv(os.path.join(data_path, file_name))
    for file_name in sorted(os.listdir(data_path))
]

In [3]:
# Project-provided supplier of the models under study.
ms = ModelSuplier()

# Indexed below as pipelines[0][1], pipelines[1][1] and pipelines[2][1] for the
# decision tree, random forest and XGBoost searches respectively.
# NOTE(review): presumably a sequence of (name, pipeline) pairs — confirm in utils.
pipelines = ms.pipelines

In [4]:
# Project-provided loader, constructed with its default arguments.
dl = DataLoader()

# NOTE(review): `data_as_X_and_y` is not referenced by any other cell visible in
# this notebook (the searches below read the raw frames in DATA) — confirm it is
# still needed.
data_as_X_and_y = dl.transformed_data

Using default path


In [6]:
# Project DataSaver pointed at ../history_bayes.
# NOTE(review): `ds` is not used by any cell visible in this notebook (the CSVs
# below are written with DataFrame.to_csv directly) — confirm whether it is still
# needed.
ds = DataSaver(os.path.join("..", "history_bayes"))

In [5]:
# Search spaces, one dict per model, in the order they are picked up below:
# [0] decision tree, [1] random forest, [2] XGBoost.
# Every key carries the `model__` prefix; keyword order defines the order in
# which the parameters are studied later on.
param_distributions = [
    # [0] Decision tree
    dict(
        model__max_depth=Integer(1, 30),
        model__min_samples_split=Integer(2, 60),
        model__criterion=Categorical(["gini", "entropy"]),
        model__min_samples_leaf=Integer(1, 60),
    ),
    # [1] Random forest
    dict(
        model__n_estimators=Integer(100, 500),
        model__min_samples_leaf=Integer(1, 250),
        model__max_samples=Real(0.5, 1),
        model__max_features=Real(1e-6, 1),
    ),
    # [2] XGBoost
    dict(
        model__max_depth=Integer(1, 19),
        model__min_child_weight=Integer(0, 19),
        model__eta=Real(0.01, 0.101),
        model__alpha=Real(1e-4, 10, prior="log-uniform"),
    ),
]

# Bayesian Search and Measuring the Tunability of Hyperparameters


In [9]:
# Search histories stored under ../history, one CSV per model. They are used
# below to pick the overall-best parameters and are passed as the second argument
# to the tunability helper.
history_DecisionTree, history_RandomForest, history_XGBoost = (
    pd.read_csv(f'../history/history_dataset_{model_name}.csv')
    for model_name in ('DecisionTree', 'RandomForest', 'XGBoost')
)

In [37]:
def do_bayesian_search(clf, param_distributions, datasets=None, n_iter=50, cv=5,
                       scoring="roc_auc", random_state=42, n_jobs=-1):
    """Run a Bayesian hyper-parameter search of `clf` on every dataset.

    For each dataset the last column is label-encoded and used as the target;
    all remaining columns are used as features.

    Parameters
    ----------
    clf : estimator
        Estimator (or pipeline) handed to ``BayesSearchCV``.
    param_distributions : dict
        Search space passed to ``BayesSearchCV`` as ``search_spaces``.
    datasets : iterable of pandas.DataFrame, optional
        Frames to search on. Defaults to the notebook-level ``DATA`` list.
    n_iter, cv, scoring, random_state, n_jobs
        Forwarded unchanged to ``BayesSearchCV``; the defaults reproduce the
        settings that were previously hard-coded.

    Returns
    -------
    list
        The ``cv_results_`` of the fitted search, one entry per dataset, in
        the order the datasets were given.
    """
    if datasets is None:
        # Reading a module-level name needs no `global` declaration.
        datasets = DATA

    history = []
    for data in datasets:
        features = data.iloc[:, :-1]
        target = LabelEncoder().fit_transform(data.iloc[:, -1])
        bs = BayesSearchCV(
            clf,
            search_spaces=param_distributions,
            random_state=random_state,
            cv=cv,
            n_iter=n_iter,
            n_jobs=n_jobs,
            scoring=scoring,
        )
        bs.fit(features, target)
        history.append(bs.cv_results_)
    return history
        

## **Decision Tree**

In [None]:
# Search space of the decision tree: the first entry of param_distributions.
param_distributions_Decision_Tree = param_distributions[0]
# Names of the hyper-parameters studied one at a time below. This is a dict_keys
# view, iterated in the insertion order of the dict.
tunable_parameters_DT = param_distributions_Decision_Tree.keys()
tunable_parameters_DT

dict_keys(['model__max_depth', 'model__min_samples_split', 'model__criterion', 'model__min_samples_leaf'])

In [39]:
# The best parameter set comes back as the string form of a Python dict.
# ast.literal_eval parses that text directly; swapping quotes to feed it through
# json.loads breaks on True/False/None and on values that contain quotes.
best_params_DecisionTree, _ = get_best_params_overall(history_DecisionTree)
# Each fixed value becomes a one-element list, the form in which it is later
# passed as a search-space entry.
best_params_DT_dict = {
    key: [value]
    for key, value in ast.literal_eval(best_params_DecisionTree).items()
}
best_params_DT_dict

{'model__criterion': ['gini'],
 'model__max_depth': [17],
 'model__min_samples_leaf': [10],
 'model__min_samples_split': [58]}

In [40]:
# One-at-a-time study: every parameter except the one under test stays pinned to
# its overall-best value, and only the tested one gets its full search dimension.
param_history = {}
for param in tunable_parameters_DT:
    temp_param_grid = {
        **best_params_DT_dict,
        param: param_distributions_Decision_Tree[param],
    }
    print(temp_param_grid)
    print("Testing param", param)
    history = do_bayesian_search(pipelines[0][1], temp_param_grid)
    param_history[param] = history

{'model__criterion': ['gini'], 'model__max_depth': Integer(low=1, high=30, prior='uniform', transform='normalize'), 'model__min_samples_leaf': [10], 'model__min_samples_split': [58]}
Testing param model__max_depth




{'model__criterion': ['gini'], 'model__max_depth': [17], 'model__min_samples_leaf': [10], 'model__min_samples_split': Integer(low=2, high=60, prior='uniform', transform='normalize')}
Testing param model__min_samples_split




{'model__criterion': Categorical(categories=('gini', 'entropy'), prior=None), 'model__max_depth': [17], 'model__min_samples_leaf': [10], 'model__min_samples_split': [58]}
Testing param model__criterion




{'model__criterion': ['gini'], 'model__max_depth': [17], 'model__min_samples_leaf': Integer(low=1, high=60, prior='uniform', transform='normalize'), 'model__min_samples_split': [58]}
Testing param model__min_samples_leaf




In [41]:
# Turn each parameter's list of per-dataset search results into one long frame
# whose `dataset` column holds the position of the dataset in that list.
params_history_frames_DT = {}
for param, history in param_history.items():
    per_dataset_frames = [pd.DataFrame(cv_results) for cv_results in history]
    df = (
        pd.concat(per_dataset_frames, keys=range(len(history)), names=['dataset'])
        .reset_index()
        # The unnamed inner index level (row number within a dataset) is not needed.
        .drop(columns='level_1')
    )
    params_history_frames_DT[param] = df

In [42]:
# Persist one CSV per tuned decision-tree parameter, printing each frame's shape.
for param, history_frame in params_history_frames_DT.items():
    csv_path = f'../history_bayes/history_bayes_hyperparameter_tuning_DT_{param}.csv'
    print(history_frame.shape)
    history_frame.to_csv(csv_path, index=False)

(200, 18)
(200, 18)
(200, 18)
(200, 18)


In [2]:
# Reload the saved decision-tree search histories from disk. A parameter whose
# CSV is missing is skipped, so the dict may lack some of the listed keys.
params_history_frames_DT = {}
for param in ("model__max_depth", "model__min_samples_split", "model__criterion", "model__min_samples_leaf"):
    file_path = f'../history_bayes/history_bayes_hyperparameter_tuning_DT_{param}.csv'
    if not os.path.exists(file_path):
        continue
    params_history_frames_DT[param] = pd.read_csv(file_path)

# The best parameters per dataset and the resulting tunability are presented below

In [15]:
# For every tuned decision-tree parameter, show the per-dataset tunability table
# and the mean relative tunability over all datasets. The table is computed once
# per parameter and reused for both outputs instead of calling the helper twice.
for param in tunable_parameters_DT:
    print("Results for param:", param)
    tunability = get_best_params_per_dataset_for_measuring_param_tunability(
        params_history_frames_DT[param], history_DecisionTree
    )
    # Show at most ten rows, as the random-forest and XGBoost cells do.
    display(tunability.head(10))
    print(tunability['rel_tunability (%)'].mean())

Results for param: model__max_depth


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__criterion', 'gini'), ('m...",0.870597,0.869608,0.000988,0.113646
1,1,"OrderedDict([('model__criterion', 'gini'), ('m...",0.971864,0.971264,0.0006,0.061789
2,2,"OrderedDict([('model__criterion', 'gini'), ('m...",0.980213,0.976445,0.003769,0.385972
3,3,"OrderedDict([('model__criterion', 'gini'), ('m...",0.809291,0.797114,0.012177,1.527617


0.5222560418969984
Results for param: model__min_samples_split


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__criterion', 'gini'), ('m...",0.86964,0.869608,3.2e-05,0.003673
1,1,"OrderedDict([('model__criterion', 'gini'), ('m...",0.971417,0.971264,0.000153,0.015719
2,2,"OrderedDict([('model__criterion', 'gini'), ('m...",0.98364,0.976445,0.007195,0.736905
3,3,"OrderedDict([('model__criterion', 'gini'), ('m...",0.798019,0.797114,0.000905,0.11355


0.21746157632411292
Results for param: model__criterion


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__criterion', 'gini'), ('m...",0.869761,0.869608,0.000153,0.017587
1,1,"OrderedDict([('model__criterion', 'entropy'), ...",0.975552,0.971264,0.004287,0.441426
2,2,"OrderedDict([('model__criterion', 'entropy'), ...",0.978964,0.976445,0.00252,0.258047
3,3,"OrderedDict([('model__criterion', 'gini'), ('m...",0.797117,0.797114,3e-06,0.000377


0.17935928739085044
Results for param: model__min_samples_leaf


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__criterion', 'gini'), ('m...",0.873393,0.869608,0.003785,0.435218
1,1,"OrderedDict([('model__criterion', 'gini'), ('m...",0.973079,0.971264,0.001815,0.18683
2,2,"OrderedDict([('model__criterion', 'gini'), ('m...",0.981541,0.976445,0.005096,0.5219
3,3,"OrderedDict([('model__criterion', 'gini'), ('m...",0.814801,0.797114,0.017688,2.218949


0.8407240814595924


## **Random Forest**

In [16]:
# Search space of the random forest: the second entry of param_distributions.
param_distributions_Random_Forest = param_distributions[1]
# Names of the hyper-parameters studied one at a time below. This is a dict_keys
# view, iterated in the insertion order of the dict.
tunable_parameters_RF = param_distributions_Random_Forest.keys()
tunable_parameters_RF

dict_keys(['model__n_estimators', 'model__min_samples_leaf', 'model__max_samples', 'model__max_features'])

In [46]:
# The best parameter set comes back as the string form of a Python dict.
# ast.literal_eval parses that text directly; swapping quotes to feed it through
# json.loads breaks on True/False/None and on values that contain quotes.
best_params_RandomForest, _ = get_best_params_overall(history_RandomForest)
# Each fixed value becomes a one-element list, the form in which it is later
# passed as a search-space entry.
best_params_RF_dict = {
    key: [value]
    for key, value in ast.literal_eval(best_params_RandomForest).items()
}
best_params_RF_dict

{'model__max_features': [0.49816568848070625],
 'model__max_samples': [0.738105348394507],
 'model__min_samples_leaf': [3],
 'model__n_estimators': [478]}

In [47]:
# One-at-a-time study: every parameter except the one under test stays pinned to
# its overall-best value, and only the tested one gets its full search dimension.
param_history = {}
for param in tunable_parameters_RF:
    temp_param_grid = {
        **best_params_RF_dict,
        param: param_distributions_Random_Forest[param],
    }
    print(temp_param_grid)
    print("Testing param", param)
    history = do_bayesian_search(pipelines[1][1], temp_param_grid)
    param_history[param] = history

{'model__max_features': [0.49816568848070625], 'model__max_samples': [0.738105348394507], 'model__min_samples_leaf': [3], 'model__n_estimators': Integer(low=100, high=500, prior='uniform', transform='normalize')}
Testing param model__n_estimators




{'model__max_features': [0.49816568848070625], 'model__max_samples': [0.738105348394507], 'model__min_samples_leaf': Integer(low=1, high=250, prior='uniform', transform='normalize'), 'model__n_estimators': [478]}
Testing param model__min_samples_leaf




{'model__max_features': [0.49816568848070625], 'model__max_samples': Real(low=0.5, high=1, prior='uniform', transform='normalize'), 'model__min_samples_leaf': [3], 'model__n_estimators': [478]}
Testing param model__max_samples
{'model__max_features': Real(low=1e-06, high=1, prior='uniform', transform='normalize'), 'model__max_samples': [0.738105348394507], 'model__min_samples_leaf': [3], 'model__n_estimators': [478]}
Testing param model__max_features


In [48]:
# Turn each parameter's list of per-dataset search results into one long frame
# whose `dataset` column holds the position of the dataset in that list.
params_history_frames_RF = {}
for param, history in param_history.items():
    per_dataset_frames = [pd.DataFrame(cv_results) for cv_results in history]
    df = (
        pd.concat(per_dataset_frames, keys=range(len(history)), names=['dataset'])
        .reset_index()
        # The unnamed inner index level (row number within a dataset) is not needed.
        .drop(columns='level_1')
    )
    params_history_frames_RF[param] = df

In [49]:
# Persist one CSV per tuned random-forest parameter, printing each frame's shape.
for param, history_frame in params_history_frames_RF.items():
    csv_path = f'../history_bayes/history_bayes_hyperparameter_tuning_RF_{param}.csv'
    print(history_frame.shape)
    history_frame.to_csv(csv_path, index=False)

(200, 18)
(200, 18)
(200, 18)
(200, 18)


In [18]:
# Reload the saved random-forest search histories from disk. A parameter whose
# CSV is missing is skipped, so the dict may lack some of the listed keys.
params_history_frames_RF = {}
for param in ("model__n_estimators", "model__min_samples_leaf", "model__max_samples", "model__max_features"):
    file_path = f'../history_bayes/history_bayes_hyperparameter_tuning_RF_{param}.csv'
    if not os.path.exists(file_path):
        continue
    params_history_frames_RF[param] = pd.read_csv(file_path)

# The best parameters per dataset and the resulting tunability are presented below

In [21]:
# For every tuned random-forest parameter, show the per-dataset tunability table
# (first ten rows) and the mean relative tunability over all datasets. The table
# is computed once per parameter and reused instead of calling the helper twice.
for param in tunable_parameters_RF:
    print("Results for param:", param)
    tunability = get_best_params_per_dataset_for_measuring_param_tunability(
        params_history_frames_RF[param], history_RandomForest
    )
    display(tunability.head(10))
    print(tunability['rel_tunability (%)'].mean())

Results for param: model__n_estimators


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__max_features', 0.4981656...",0.925586,0.925189,0.000397,0.042895
1,1,"OrderedDict([('model__max_features', 0.4981656...",0.9866,0.98636,0.00024,0.024288
2,2,"OrderedDict([('model__max_features', 0.4981656...",0.999984,0.999973,1.1e-05,0.001053
3,3,"OrderedDict([('model__max_features', 0.4981656...",0.851506,0.849848,0.001658,0.195069


0.06582613050666147
Results for param: model__min_samples_leaf


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__max_features', 0.4981656...",0.925552,0.925189,0.000363,0.039233
1,1,"OrderedDict([('model__max_features', 0.4981656...",0.987257,0.98636,0.000897,0.090894
2,2,"OrderedDict([('model__max_features', 0.4981656...",0.999993,0.999973,1.9e-05,0.001944
3,3,"OrderedDict([('model__max_features', 0.4981656...",0.851081,0.849848,0.001233,0.145036


0.06927676073755618
Results for param: model__max_samples


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__max_features', 0.4981656...",0.925535,0.925189,0.000346,0.03735
1,1,"OrderedDict([('model__max_features', 0.4981656...",0.987079,0.98636,0.000719,0.072864
2,2,"OrderedDict([('model__max_features', 0.4981656...",0.999989,0.999973,1.5e-05,0.001539
3,3,"OrderedDict([('model__max_features', 0.4981656...",0.851124,0.849848,0.001276,0.150151


0.06547581937391314
Results for param: model__max_features


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__max_features', 0.5645816...",0.925669,0.925189,0.00048,0.051893
1,1,"OrderedDict([('model__max_features', 0.4660361...",0.986676,0.98636,0.000316,0.032027
2,2,"OrderedDict([('model__max_features', 0.2317930...",0.999994,0.999973,2.1e-05,0.002106
3,3,"OrderedDict([('model__max_features', 0.2003765...",0.851738,0.849848,0.001889,0.222315


0.07708495085468642


## **XGBoost**

In [22]:
# Search space of XGBoost: the third entry of param_distributions.
param_distributions_XGBoost = param_distributions[2]
# Names of the hyper-parameters studied one at a time below. This is a dict_keys
# view, iterated in the insertion order of the dict.
tunable_parameters_XGB = param_distributions_XGBoost.keys()
tunable_parameters_XGB

dict_keys(['model__max_depth', 'model__min_child_weight', 'model__eta', 'model__alpha'])

In [53]:
# The best parameter set comes back as the string form of a Python dict.
# ast.literal_eval parses that text directly; swapping quotes to feed it through
# json.loads breaks on True/False/None and on values that contain quotes.
best_params_XGBoost, _ = get_best_params_overall(history_XGBoost)
# Each fixed value becomes a one-element list, the form in which it is later
# passed as a search-space entry.
best_params_XGB_dict = {
    key: [value]
    for key, value in ast.literal_eval(best_params_XGBoost).items()
}
best_params_XGB_dict

{'model__alpha': [1.2481751282245537],
 'model__eta': [0.09798219139516953],
 'model__max_depth': [16],
 'model__min_child_weight': [0]}

In [54]:
# One-at-a-time study: every parameter except the one under test stays pinned to
# its overall-best value, and only the tested one gets its full search dimension.
param_history = {}
for param in tunable_parameters_XGB:
    temp_param_grid = {
        **best_params_XGB_dict,
        param: param_distributions_XGBoost[param],
    }
    print(temp_param_grid)
    print("Testing param", param)
    history = do_bayesian_search(pipelines[2][1], temp_param_grid)
    param_history[param] = history

{'model__alpha': [1.2481751282245537], 'model__eta': [0.09798219139516953], 'model__max_depth': Integer(low=1, high=19, prior='uniform', transform='normalize'), 'model__min_child_weight': [0]}
Testing param model__max_depth




{'model__alpha': [1.2481751282245537], 'model__eta': [0.09798219139516953], 'model__max_depth': [16], 'model__min_child_weight': Integer(low=0, high=19, prior='uniform', transform='normalize')}
Testing param model__min_child_weight




{'model__alpha': [1.2481751282245537], 'model__eta': Real(low=0.01, high=0.101, prior='uniform', transform='normalize'), 'model__max_depth': [16], 'model__min_child_weight': [0]}
Testing param model__eta
{'model__alpha': Real(low=0.0001, high=10, prior='log-uniform', transform='normalize'), 'model__eta': [0.09798219139516953], 'model__max_depth': [16], 'model__min_child_weight': [0]}
Testing param model__alpha


In [55]:
# Turn each parameter's list of per-dataset search results into one long frame
# whose `dataset` column holds the position of the dataset in that list.
params_history_frames_XGB = {}
for param, history in param_history.items():
    per_dataset_frames = [pd.DataFrame(cv_results) for cv_results in history]
    df = (
        pd.concat(per_dataset_frames, keys=range(len(history)), names=['dataset'])
        .reset_index()
        # The unnamed inner index level (row number within a dataset) is not needed.
        .drop(columns='level_1')
    )
    params_history_frames_XGB[param] = df

In [56]:
# Persist one CSV per tuned XGBoost parameter, printing each frame's shape.
for param, history_frame in params_history_frames_XGB.items():
    csv_path = f'../history_bayes/history_bayes_hyperparameter_tuning_XGB_{param}.csv'
    print(history_frame.shape)
    history_frame.to_csv(csv_path, index=False)

(200, 18)
(200, 18)
(200, 18)
(200, 18)


In [24]:
# Reload the saved XGBoost search histories from disk. A parameter whose CSV is
# missing is skipped, so the dict may lack some of the listed keys.
params_history_frames_XGB = {}
for param in ("model__max_depth", "model__min_child_weight", "model__eta", "model__alpha"):
    file_path = f'../history_bayes/history_bayes_hyperparameter_tuning_XGB_{param}.csv'
    if not os.path.exists(file_path):
        continue
    params_history_frames_XGB[param] = pd.read_csv(file_path)

# The best parameters per dataset and the resulting tunability are presented below

In [25]:
# For every tuned XGBoost parameter, show the per-dataset tunability table (first
# ten rows) and the mean relative tunability over all datasets. The table is
# computed once per parameter and reused instead of calling the helper twice.
for param in tunable_parameters_XGB:
    print("Results for param:", param)
    tunability = get_best_params_per_dataset_for_measuring_param_tunability(
        params_history_frames_XGB[param], history_XGBoost
    )
    display(tunability.head(10))
    print(tunability['rel_tunability (%)'].mean())

Results for param: model__max_depth


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__alpha', 1.24817512822455...",0.920403,0.920403,0.0,0.0
1,1,"OrderedDict([('model__alpha', 1.24817512822455...",0.990258,0.99013,0.000128,0.012959
2,2,"OrderedDict([('model__alpha', 1.24817512822455...",0.999989,0.999985,4e-06,0.000405
3,3,"OrderedDict([('model__alpha', 1.24817512822455...",0.848711,0.848066,0.000645,0.076106


0.022367356149698613
Results for param: model__min_child_weight


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__alpha', 1.24817512822455...",0.920403,0.920403,0.0,0.0
1,1,"OrderedDict([('model__alpha', 1.24817512822455...",0.990179,0.99013,4.9e-05,0.004921
2,2,"OrderedDict([('model__alpha', 1.24817512822455...",0.999985,0.999985,0.0,0.0
3,3,"OrderedDict([('model__alpha', 1.24817512822455...",0.848066,0.848066,0.0,0.0


0.0012302735882373588
Results for param: model__eta


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__alpha': 1.2481751282245537, 'model__e...",0.920403,0.920403,0.0,0.0
1,1,"OrderedDict([('model__alpha', 1.24817512822455...",0.990309,0.99013,0.000179,0.018044
2,2,"OrderedDict([('model__alpha', 1.24817512822455...",0.999988,0.999985,2e-06,0.000243
3,3,"OrderedDict([('model__alpha', 1.24817512822455...",0.849691,0.848066,0.001626,0.191713


0.052499923044785284
Results for param: model__alpha


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__alpha', 3.45702827377128...",0.923042,0.920403,0.002639,0.286682
1,1,"OrderedDict([('model__alpha', 3.77619559032166...",0.990724,0.99013,0.000594,0.059955
2,2,"OrderedDict([('model__alpha', 0.04016662067481...",0.999995,0.999985,1e-05,0.000972
3,3,"OrderedDict([('model__alpha', 9.60246037606447...",0.851275,0.848066,0.003209,0.378449


0.1815145418229563
