In [None]:
import pandas as pd
import numpy as np
import os
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


from skopt import BayesSearchCV

from utils import ModelSuplier, DataLoader, DataSaver, get_best_params_overall

In [2]:
# Build the model supplier and take its pipelines. Later cells iterate
# `pipelines` as (name, pipeline) pairs and also index it positionally.
ms = ModelSuplier()

pipelines = ms.pipelines

In [3]:
# Load the data. `transformed_data` is iterated in the training loop as
# (X, y) pairs, one pair per dataset.
dl = DataLoader()

data_as_X_and_y = dl.transformed_data

Using default path


In [4]:
# Saver used for the search histories, pointed at the relative path ../history_bayes.
ds = DataSaver(os.path.join("..", "history_bayes"))

# Bayes Search

In [5]:
# One search space per pipeline. The training loop picks a space by the
# pipeline's position, so the order of the final list must match `pipelines`
# (decision tree, random forest, XGBoost).
decision_tree_space = {
    "model__max_depth": Integer(1, 30),
    "model__min_samples_split": Integer(2, 60),
    "model__criterion": Categorical(["gini", "entropy"]),
    "model__min_samples_leaf": Integer(1, 60),
}

random_forest_space = {
    "model__n_estimators": Integer(100, 500),
    "model__min_samples_leaf": Integer(1, 250),
    "model__max_samples": Real(0.5, 1),
    "model__max_features": Real(1e-6, 1),
}

xgboost_space = {
    "model__max_depth": Integer(1, 19),
    "model__min_child_weight": Integer(0, 19),
    "model__eta": Real(0.01, 0.101),
    # Regularisation strength is searched on a log scale.
    "model__alpha": Real(1e-4, 10, prior="log-uniform"),
}

param_distributions = [decision_tree_space, random_forest_space, xgboost_space]

In [6]:
# Run one Bayesian hyperparameter search for every (pipeline, dataset) pair.
#
# The result containers are sized from the inputs rather than hard-coded, so
# the cell keeps working when a pipeline or a dataset is added or removed.
search_datasets = list(data_as_X_and_y)

best_params = [[] for _ in search_datasets]  # best_params[j]: one entry per pipeline for dataset j
pipe_best_models = []                        # flat, in (pipeline, dataset) iteration order
pipe_best_scores = []                        # flat, same order as pipe_best_models
history = [[] for _ in pipelines]            # history[i]: one cv_results_ per dataset for pipeline i

for i, (name, pipe) in enumerate(pipelines):
    print("Training:", name)
    for j, (X, y) in enumerate(search_datasets):
        # param_distributions is aligned with `pipelines` by position.
        bayes = BayesSearchCV(
            pipe,
            search_spaces=param_distributions[i],
            cv=5,
            random_state=42,
            n_jobs=-1,
            scoring="roc_auc",
            n_iter=100,
        )
        bayes.fit(X, y)
        pipe_best_scores.append(bayes.best_score_)
        pipe_best_models.append(bayes.best_estimator_)
        best_params[j].append(bayes.best_params_)
        history[i].append(bayes.cv_results_)

Training: <class 'sklearn.tree._classes.DecisionTreeClassifier'>
Training: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Training: <class 'xgboost.sklearn.XGBClassifier'>


In [7]:
# Stack the per-dataset search results of each model into a single frame,
# with the dataset's position stored in a `dataset` column.
history_datasets = []
for model_results in history:
    per_dataset_frames = [pd.DataFrame(cv_results) for cv_results in model_results]
    stacked = pd.concat(
        per_dataset_frames,
        keys=range(len(per_dataset_frames)),
        names=['dataset'],
    )
    # reset_index() turns the unnamed inner index level into `level_1`,
    # which carries no information once `dataset` is a regular column.
    history_datasets.append(stacked.reset_index().drop(columns='level_1'))

for model_idx in range(3):
    print(f"{pipelines[model_idx][0]} shape: {history_datasets[model_idx].shape}")

<class 'sklearn.tree._classes.DecisionTreeClassifier'> shape: (400, 18)
<class 'sklearn.ensemble._forest.RandomForestClassifier'> shape: (400, 18)
<class 'xgboost.sklearn.XGBClassifier'> shape: (400, 18)


In [8]:
# Persist the three per-model history frames through the DataSaver.
# NOTE(review): the next cell reads ../history_bayes/history_bayes_all_<Model>.csv,
# presumably the files written by this call -- confirm against DataSaver.save.
ds.save(history_datasets,"history_bayes_all", ['DecisionTree','RandomForest','XGBoost'])

In [9]:
# Read the saved Bayes-search histories back from disk, one frame per model.
bayes_history_files = (
    '../history_bayes/history_bayes_all_DecisionTree.csv',
    '../history_bayes/history_bayes_all_RandomForest.csv',
    '../history_bayes/history_bayes_all_XGBoost.csv',
)
history_DecisionTree, history_RandomForest, history_XGBoost = map(pd.read_csv, bayes_history_files)

In [10]:
# Preview the first rows of every reloaded history frame.
histories = [history_DecisionTree, history_RandomForest, history_XGBoost]
display(*(frame.head() for frame in histories))

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.051749,0.00985,0.012026,0.003668,gini,22,56,20,"OrderedDict([('model__criterion', 'gini'), ('m...",0.881774,0.867468,0.8529,0.841401,0.84143,0.856995,0.015658,75
1,0,0.05535,0.003888,0.012694,0.004222,entropy,27,19,57,"OrderedDict([('model__criterion', 'entropy'), ...",0.846139,0.887877,0.843574,0.873271,0.875938,0.86536,0.017467,41
2,0,0.054376,0.007966,0.008077,0.002496,gini,28,7,27,"OrderedDict([('model__criterion', 'gini'), ('m...",0.816065,0.884611,0.820518,0.848472,0.833217,0.840577,0.024733,89
3,0,0.035938,0.005676,0.004239,0.005202,entropy,6,36,49,"OrderedDict([('model__criterion', 'entropy'), ...",0.875657,0.871553,0.881774,0.858611,0.858002,0.869119,0.009411,10
4,0,0.040676,0.007762,0.003847,0.004722,entropy,14,32,44,"OrderedDict([('model__criterion', 'entropy'), ...",0.879577,0.886454,0.847136,0.854619,0.856414,0.86484,0.015318,44


Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_features,param_model__max_samples,param_model__min_samples_leaf,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.781497,0.002486,0.039199,0.003153,0.410105,0.863863,233,226,"OrderedDict([('model__max_features', 0.4101045...",0.867507,0.861564,0.863548,0.82724,0.848666,0.853705,0.014662,91
1,0,4.122937,0.103305,0.049606,0.006193,0.837389,0.941658,77,480,"OrderedDict([('model__max_features', 0.8373885...",0.910416,0.886914,0.881591,0.854807,0.886324,0.88401,0.017718,78
2,0,1.842538,0.032945,0.034816,0.002351,0.444833,0.959361,27,273,"OrderedDict([('model__max_features', 0.4448330...",0.924432,0.907251,0.896352,0.882229,0.910997,0.904252,0.014207,71
3,0,2.033847,0.040292,0.045258,0.008035,0.812396,0.585936,150,421,"OrderedDict([('model__max_features', 0.8123961...",0.868518,0.855988,0.863606,0.827995,0.845733,0.852368,0.01441,92
4,0,2.207882,0.037947,0.038629,0.001503,0.799554,0.719015,132,386,"OrderedDict([('model__max_features', 0.7995536...",0.886217,0.868489,0.874946,0.837442,0.861665,0.865752,0.016303,86


Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__eta,param_model__max_depth,param_model__min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.243194,0.011578,0.0159,0.002357,0.011234,0.076223,18,6,"OrderedDict([('model__alpha', 0.01123362169089...",0.93809,0.917521,0.905344,0.893235,0.915788,0.913996,0.014846,67
1,0,0.138391,0.003582,0.013792,0.00405,1.537948,0.090382,6,18,"OrderedDict([('model__alpha', 1.53794844658007...",0.934257,0.911094,0.899052,0.879035,0.90848,0.906384,0.017917,83
2,0,0.124196,0.006529,0.009029,0.003336,0.016756,0.093604,3,8,"OrderedDict([('model__alpha', 0.01675569944093...",0.931508,0.897978,0.900427,0.874108,0.908742,0.902552,0.0185,89
3,0,0.179649,0.011137,0.01402,0.002437,1.1534,0.02564,12,15,"OrderedDict([('model__alpha', 1.15339998595595...",0.922641,0.890718,0.897857,0.863485,0.896579,0.894256,0.018886,93
4,0,0.184225,0.007301,0.011428,0.002775,0.994872,0.049861,10,14,"OrderedDict([('model__alpha', 0.99487199982341...",0.937877,0.913465,0.904512,0.878687,0.908035,0.908515,0.018935,76


# Tunability of algorithms with the best random-search configuration as the default

In [2]:
# Histories stored under ../history; they supply the default configuration
# that the Bayes-search results are compared against below.
default_history_files = (
    '../history/history_dataset_DecisionTree.csv',
    '../history/history_dataset_RandomForest.csv',
    '../history/history_dataset_XGBoost.csv',
)
history_DecisionTree, history_RandomForest, history_XGBoost = map(pd.read_csv, default_history_files)

In [3]:
# Bayes-search histories, one frame per model.
bayes_history_files = (
    '../history_bayes/history_bayes_all_DecisionTree.csv',
    '../history_bayes/history_bayes_all_RandomForest.csv',
    '../history_bayes/history_bayes_all_XGBoost.csv',
)
history_bayes_DecisionTree, history_bayes_RandomForest, history_bayes_XGBoost = map(
    pd.read_csv, bayes_history_files
)

In [4]:
def get_best_params_per_dataset_bayes(bayes_df, random_df):
    """Compare the best Bayes-search result on each dataset with a shared default.

    For every value of ``dataset`` in ``bayes_df`` the row with the lowest
    ``rank_test_score`` is taken as the best configuration. Its
    ``mean_test_score`` is then compared with the ``mean_test_score`` that the
    default configuration has on the same dataset in ``random_df``. The default
    configuration is the first value returned by
    ``get_best_params_overall(random_df)``.

    Parameters
    ----------
    bayes_df : pandas.DataFrame
        Must contain ``dataset``, ``params``, ``rank_test_score`` and
        ``mean_test_score``. A ``params_str`` column (``str`` of ``params``)
        is added to, or overwritten on, this frame in place.
    random_df : pandas.DataFrame
        Must contain ``dataset``, ``mean_test_score`` and ``params_str``.
        NOTE(review): ``params_str`` is not created here; presumably
        ``get_best_params_overall`` adds it or the stored CSV already has
        it -- confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``dataset``, ``best_params``, ``best_score``,
        ``default_score``, ``abs_tunability`` and ``rel_tunability (%)``.
        There is one row per dataset, unless the default configuration matches
        several ``random_df`` rows for a dataset, in which case the merge
        repeats that dataset. ``default_score`` is NaN for a dataset with no
        matching row (left merge).
    """
    bayes_df['params_str'] = bayes_df['params'].apply(str)

    # Take the best-ranked ROW of every dataset. groupby(...).first() is not
    # used because it returns the first non-null value of each column
    # separately, which can combine values from different rows when the
    # best-ranked row contains a NaN.
    best_params_per_dataset = (
        bayes_df
        .dropna(subset=['dataset'])  # groupby ignored rows without a dataset key
        .sort_values(['dataset', 'rank_test_score'], ascending=[True, True])
        .drop_duplicates(subset='dataset', keep='first')
        .rename(columns={'params_str': 'best_params', 'mean_test_score': 'best_score'})
        .loc[:, ['dataset', 'best_params', 'best_score']]
        .reset_index(drop=True)
    )

    default_params, _ = get_best_params_overall(random_df)
    # Score of the default configuration on every dataset it was evaluated on.
    is_default = random_df['params_str'] == default_params
    score_for_default_params = (
        random_df.loc[is_default, ['dataset', 'mean_test_score']]
        .rename(columns={'mean_test_score': 'default_score'})
    )

    best_params_per_dataset = best_params_per_dataset.merge(
        score_for_default_params, on='dataset', how='left'
    )
    best_params_per_dataset['abs_tunability'] = (
        best_params_per_dataset['best_score'] - best_params_per_dataset['default_score']
    )
    best_params_per_dataset['rel_tunability (%)'] = (
        best_params_per_dataset['abs_tunability'] / best_params_per_dataset['default_score'] * 100
    )
    return best_params_per_dataset

### The best hyperparameter configuration for each dataset is presented below, together with its tunability percentage

In [5]:
# Decision tree: best Bayes-search configuration per dataset versus the
# default configuration derived from `history_DecisionTree`.
best_params_per_dataset_DecisionTree = get_best_params_per_dataset_bayes(history_bayes_DecisionTree, history_DecisionTree)
best_params_per_dataset_DecisionTree

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__criterion', 'entropy'), ...",0.872846,0.869608,0.003238,0.372328
1,1,"OrderedDict([('model__criterion', 'entropy'), ...",0.977406,0.971264,0.006141,0.632311
2,2,"OrderedDict([('model__criterion', 'entropy'), ...",0.988745,0.976445,0.012301,1.259758
3,3,"OrderedDict([('model__criterion', 'entropy'), ...",0.815506,0.797114,0.018392,2.30733


In [6]:
# Random forest: best Bayes-search configuration per dataset versus the
# default configuration derived from `history_RandomForest`.
best_params_per_dataset_RandomForest = get_best_params_per_dataset_bayes(history_bayes_RandomForest, history_RandomForest)
best_params_per_dataset_RandomForest

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__max_features', 0.3887534...",0.927016,0.925189,0.001827,0.197422
1,1,"OrderedDict([('model__max_features', 0.3491819...",0.988389,0.98636,0.002029,0.205665
2,2,"OrderedDict([('model__max_features', 0.0691272...",1.0,0.999973,2.7e-05,0.002672
3,3,"OrderedDict([('model__max_features', 0.2164098...",0.850654,0.849848,0.000806,0.09481


In [7]:
# XGBoost: best Bayes-search configuration per dataset versus the
# default configuration derived from `history_XGBoost`.
best_params_per_dataset_XGBoost = get_best_params_per_dataset_bayes(history_bayes_XGBoost, history_XGBoost)
best_params_per_dataset_XGBoost

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"OrderedDict([('model__alpha', 3.81673014884859...",0.922421,0.920403,0.002017,0.219166
1,1,"OrderedDict([('model__alpha', 10.0), ('model__...",0.990251,0.99013,0.000121,0.012221
2,2,"OrderedDict([('model__alpha', 0.00093187786868...",0.999996,0.999985,1.1e-05,0.001053
3,3,"OrderedDict([('model__alpha', 7.30378140198421...",0.850581,0.848066,0.002515,0.296564
