In [2]:
import ast
import json
import os

import numpy as np
import pandas as pd
from IPython.display import display
from scipy.stats import randint, uniform, loguniform

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import cross_validate
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Data loading


In [40]:
# Load every CSV found in the data directory into DATA.
# os.listdir() returns entries in arbitrary, platform-dependent order, and the
# position of a frame in DATA is used later as its dataset id (in the saved
# search histories and in the tunability merges). The listing is therefore
# sorted so that those ids stay the same across machines and runs.
# Entries that are not CSV files are skipped instead of being passed to read_csv.
data_path = "../data"
data_files = sorted(
    file for file in os.listdir(data_path) if file.lower().endswith(".csv")
)
DATA = [pd.read_csv(os.path.join(data_path, file)) for file in data_files]

In [24]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after each report.
for data in DATA:
    data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           5000 non-null   object 
 1   Location       5000 non-null   object 
 2   MinTemp        4975 non-null   float64
 3   MaxTemp        4993 non-null   float64
 4   Rainfall       4955 non-null   float64
 5   Evaporation    2853 non-null   float64
 6   Sunshine       2612 non-null   float64
 7   WindGustDir    4665 non-null   object 
 8   WindGustSpeed  4667 non-null   float64
 9   WindDir9am     4608 non-null   object 
 10  WindDir3pm     4868 non-null   object 
 11  WindSpeed9am   4961 non-null   float64
 12  WindSpeed3pm   4914 non-null   float64
 13  Humidity9am    4943 non-null   float64
 14  Humidity3pm    4882 non-null   float64
 15  Pressure9am    4493 non-null   float64
 16  Pressure3pm    4496 non-null   float64
 17  Cloud9am       3103 non-null   float64
 18  Cloud3pm

# Creating pipelines

In [41]:
# Numeric columns: impute with SimpleImputer's default settings, then rescale
# with MinMaxScaler.
num_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer()),
        ('scale', MinMaxScaler()),
    ]
)

# Categorical columns: replace gaps with the literal category "missing", then
# one-hot encode; categories not seen while fitting are ignored at transform time.
cat_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(fill_value="missing", strategy="constant")),
        ('one-hot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Dispatch each column to the matching sub-pipeline based on its dtype.
col_trans = ColumnTransformer(
    [
        (
            'num_pipeline',
            num_pipeline,
            make_column_selector(dtype_include=np.number),
        ),
        (
            'cat_pipeline',
            cat_pipeline,
            make_column_selector(dtype_include=np.object_),
        ),
    ]
)

# Defining algorithms for the experiments

In [42]:
# Models under comparison, each constructed with its library defaults.
# The order here fixes the model index used by `pipelines` and by
# `param_distributions` further down.
# NOTE(review): no random_state is passed to any of the estimators, so scores
# are not guaranteed to be identical between runs.
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
]

# Will hold one (classifier type, full pipeline) pair per entry of `classifiers`.
pipelines = []

In [43]:
# Build one (classifier type, preprocessing + model pipeline) pair per classifier.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell cannot leave duplicate entries in `pipelines`.
# The model step is named "model", which is what the "model__" prefix of the
# search-space keys refers to.
pipelines = [
    (
        type(classifier),
        Pipeline([("transformer", col_trans), ("model", classifier)]),
    )
    for classifier in classifiers
]

In [62]:
pipelines

[(sklearn.tree._classes.DecisionTreeClassifier,
  Pipeline(steps=[('transformer',
                   ColumnTransformer(transformers=[('num_pipeline',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer()),
                                                                    ('scale',
                                                                     MinMaxScaler())]),
                                                    <sklearn.compose._column_transformer.make_column_selector object at 0x00000227F24DB190>),
                                                   ('cat_pipeline',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer(fill_value='missing',
                                                                                   strategy='constant')),
                  

In [63]:
# Baseline: mean 5-fold ROC AUC of every untuned pipeline on every dataset.
# In each frame the last column is used as the target and all preceding
# columns as features.
scores = []
for model_type, model_pipeline in pipelines:
    mean_aucs = []
    for frame in DATA:
        features = frame.iloc[:, :-1]
        target = LabelEncoder().fit_transform(frame.iloc[:, -1])
        cv_result = cross_validate(
            model_pipeline, features, target, cv=5, scoring="roc_auc"
        )
        mean_aucs.append(cv_result["test_score"].mean())
    # One entry per model: (classifier type, [mean AUC per dataset]).
    scores.append((model_type, mean_aucs))

In [29]:
os.listdir("../data")

['banking_final.csv',
 'flights_final.csv',
 'mushrooms_final.csv',
 'weather_final.csv']

In [64]:
scores

[(sklearn.tree._classes.DecisionTreeClassifier,
  [0.6856017268248299,
   0.9118589079864705,
   0.9854175523838447,
   0.6788933190280879]),
 (sklearn.ensemble._forest.RandomForestClassifier,
  [0.9175702490538278, 0.9863045570267867, 1.0, 0.8444766611174457]),
 (xgboost.sklearn.XGBClassifier,
  [0.9103290065917472,
   0.9891123481904669,
   0.9999846138273105,
   0.8337667489284263])]

# Random Searching - searching for new defaults

In [3]:
# One search space per model, indexed in the same order as `pipelines`
# (DecisionTree, RandomForest, XGBoost). Every key carries the "model__"
# prefix so that it addresses the "model" step of the pipeline.
# scipy conventions used below:
#   randint(low, high)  -> integers in [low, high), i.e. `high` is excluded
#   uniform(loc, scale) -> floats in [loc, loc + scale]
#   loguniform(a, b)    -> floats in [a, b], uniform on a log scale
param_distributions = [
    # DecisionTreeClassifier
    {
        "model__max_depth": randint(1, 31),
        "model__min_samples_split": randint(2, 61),
        "model__criterion": ["gini", "entropy"],
        "model__min_samples_leaf": randint(1, 61)
    },
    # RandomForestClassifier
    {
        "model__n_estimators": randint(100, 501),      # 100..500
        "model__min_samples_leaf": randint(1, 251),    # 1..250
        "model__max_samples": uniform(0.5, 0.5),        # [0.5, 1.0]
        "model__max_features": uniform(1e-6, 1 - 1e-6)   # [1e-6, 1.0]
    },
    # XGBClassifier
    {
        "model__max_depth": randint(1, 20),
        "model__min_child_weight": randint(0, 20),
        "model__eta": uniform(0.01, 0.1),  # [0.01, 0.11]
        "model__alpha": loguniform(1e-4, 10)
}
]

In [None]:
# Randomised hyperparameter search: for every model pipeline and every dataset,
# sample 1000 configurations and score each one with 5-fold ROC AUC.
#
# The result containers are sized from DATA / pipelines instead of being
# hard-coded for 4 datasets and 3 models, so adding a dataset or a model cannot
# raise an IndexError here.
best_params = [[] for _ in DATA]       # best_params[dataset] -> one entry per model
pipe_best_models = []                  # flat list, in (model, dataset) visiting order
pipe_best_scores = []                  # flat list, in (model, dataset) visiting order
history = [[] for _ in pipelines]      # history[model] -> one cv_results_ per dataset
for i, pipe in enumerate(pipelines):
    for j, data in enumerate(DATA):
        rs = RandomizedSearchCV(pipe[1],
                                param_distributions=param_distributions[i],
                                # Very large value: presumably meant as "maximum verbosity".
                                verbose=766751,
                                # The same seed is used for every dataset, so the same
                                # configurations are sampled each time; this is what lets
                                # scores be averaged per configuration across datasets later.
                                random_state=42,
                                cv=5,
                                n_iter=1000,
                                n_jobs=-1,
                                scoring="roc_auc"
                                )
        # Last column is the target, everything before it is a feature.
        rs.fit(data.iloc[:, :-1], LabelEncoder().fit_transform(data.iloc[:, -1]))
        pipe_best_scores.append(rs.best_score_)
        pipe_best_models.append(rs.best_estimator_)
        best_params[j].append(rs.best_params_)
        history[i].append(rs.cv_results_)


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


In [None]:
# Turn the raw search results into one DataFrame per model: the cv_results_ of
# every dataset are stacked and tagged with a 'dataset' column.
history_datasets = []
for per_dataset_results in history:
    frames = [pd.DataFrame(cv_results) for cv_results in per_dataset_results]
    stacked = pd.concat(frames, keys=range(len(frames)), names=['dataset'])
    # reset_index() exposes the 'dataset' key and the inner row index
    # ('level_1'); only the former is kept.
    history_datasets.append(stacked.reset_index().drop(columns='level_1'))

for model_idx in range(3):
    print(f"{pipelines[model_idx][0]} shape: {history_datasets[model_idx].shape}")

<class 'sklearn.tree._classes.DecisionTreeClassifier'> shape: (4000, 18)
<class 'sklearn.ensemble._forest.RandomForestClassifier'> shape: (4000, 18)
<class 'xgboost.sklearn.XGBClassifier'> shape: (4000, 18)


In [112]:
model_names = ['DecisionTree','RandomForest','XGBoost']

# Saving history to csv files

In [25]:
# Write each model's stacked search history to its own CSV file.
for idx in range(len(history_datasets)):
    out_path = f'../history/history_dataset_{model_names[idx]}.csv'
    history_datasets[idx].to_csv(out_path, index=False)

# Reading history from csv

In [10]:
# Load the per-model search histories from the CSV files under ../history,
# one frame per model.
history_DecisionTree = pd.read_csv('../history/history_dataset_DecisionTree.csv')
history_RandomForest = pd.read_csv('../history/history_dataset_RandomForest.csv')
history_XGBoost = pd.read_csv('../history/history_dataset_XGBoost.csv')

In [118]:
pd.set_option('display.max_colwidth', None)

# For each model, count the configurations whose mean ROC AUC is exactly 0.5.
for model_history in (history_DecisionTree, history_RandomForest, history_XGBoost):
    at_chance = model_history['mean_test_score'] == 0.5
    print(len(model_history[at_chance]))

0
0
0


In [119]:
history_XGBoost.shape

(4000, 18)

In [89]:
history_RandomForest.head()

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_features,param_model__max_samples,param_model__min_samples_leaf,param_model__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,1.28959,0.006801,0.043627,0.006232,0.374541,0.975357,107,171,"{'model__max_features': 0.3745407443072436, 'model__max_samples': 0.9753571532049581, 'model__min_samples_leaf': 107, 'model__n_estimators': 171}",0.899256,0.880371,0.878696,0.852988,0.882239,0.87871,0.014826,319
1,0,2.010566,0.072784,0.064181,0.010854,0.598659,0.578009,211,314,"{'model__max_features': 0.5986588855385524, 'model__max_samples': 0.5780093202212182, 'model__min_samples_leaf': 211, 'model__n_estimators': 314}",0.858273,0.854135,0.854909,0.819419,0.844958,0.846339,0.014165,899
2,0,1.672137,0.018012,0.075803,0.002932,0.058085,0.933088,100,459,"{'model__max_features': 0.05808455408458729, 'model__max_samples': 0.9330880728874675, 'model__min_samples_leaf': 100, 'model__n_estimators': 459}",0.847102,0.862086,0.859705,0.83479,0.858834,0.852504,0.010264,794
3,0,8.327508,0.266172,0.105625,0.01053,0.708073,0.510292,2,443,"{'model__max_features': 0.7080728697234677, 'model__max_samples': 0.5102922471479012, 'model__min_samples_leaf': 2, 'model__n_estimators': 443}",0.932011,0.941071,0.910503,0.913722,0.92813,0.925087,0.011442,5
4,0,3.010676,0.071347,0.065256,0.007643,0.832443,0.60617,192,376,"{'model__max_features': 0.832442808357781, 'model__max_samples': 0.6061695553391381, 'model__min_samples_leaf': 192, 'model__n_estimators': 376}",0.862619,0.852963,0.852562,0.8194,0.840124,0.845533,0.014891,917


In [10]:
history_XGBoost.head()

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__alpha,param_model__eta,param_model__max_depth,param_model__min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.165986,0.013078,0.015945,0.005451,383.530058,0.950762,11,72,"{'model__alpha': 383.5300582621992, 'model__eta': 0.9507624369700627, 'model__max_depth': 11, 'model__min_child_weight': 72}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,714
1,0,0.1605,0.013494,0.011362,0.004381,613.027264,0.156843,3,87,"{'model__alpha': 613.0272643802655, 'model__eta': 0.15684284098887946, 'model__max_depth': 3, 'model__min_child_weight': 87}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,714
2,0,0.134376,0.005874,0.01431,0.003767,59.478595,0.866307,4,104,"{'model__alpha': 59.47859542273625, 'model__eta': 0.8663068331325768, 'model__max_depth': 4, 'model__min_child_weight': 104}",0.840699,0.839393,0.830057,0.802664,0.842297,0.831022,0.014805,160
3,0,0.140599,0.005001,0.016829,0.010731,725.067296,0.021541,2,88,"{'model__alpha': 725.0672962256506, 'model__eta': 0.0215409547505917, 'model__max_depth': 2, 'model__min_child_weight': 88}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,714
4,0,0.145276,0.009771,0.017556,0.002008,852.422241,0.213108,12,21,"{'model__alpha': 852.4222407421319, 'model__eta': 0.2131083107655044, 'model__max_depth': 12, 'model__min_child_weight': 21}",0.5,0.5,0.5,0.5,0.5,0.5,0.0,714


# New defaults below

In [13]:
def get_best_params_overall(df):
    """Return the configuration with the highest score averaged over all rows.

    Rows are grouped by the string form of their 'params' value and the
    'mean_test_score' of each group is averaged; the group with the largest
    average wins.

    Side effect: a 'params_str' column holding ``str(params)`` is added to
    (or overwritten in) ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'params' and 'mean_test_score'.

    Returns
    -------
    tuple
        ``(params_str, averaged_score)`` of the winning configuration.

    Raises
    ------
    IndexError
        If ``df`` has no rows.
    """
    df['params_str'] = df['params'].map(str)
    ranked = (
        df.groupby(['params_str'])['mean_test_score']
        .mean()
        .reset_index()
        .sort_values(by='mean_test_score', ascending=False)
    )
    winner = ranked.head(1)
    return winner['params_str'].iloc[0], winner['mean_test_score'].iloc[0]

In [121]:
best_params_DecisionTree, best_params_DecisionTree_score = get_best_params_overall(history_DecisionTree)
print(f"Best params for DecisionTree: {best_params_DecisionTree}") 
print(f"with score: {best_params_DecisionTree_score}")

Best params for DecisionTree: {'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}
with score: 0.9036077061469168


In [122]:
best_params_RandomForest, best_params_RandomForest_score = get_best_params_overall(history_RandomForest)
print(f"Best params for RandomForest: {best_params_RandomForest}") 
print(f"with score: {best_params_RandomForest_score}")

Best params for RandomForest: {'model__max_features': 0.49816568848070625, 'model__max_samples': 0.738105348394507, 'model__min_samples_leaf': 3, 'model__n_estimators': 478}
with score: 0.9403427016521764


In [123]:
best_params_XGBoost, best_params_XGBoost_score = get_best_params_overall(history_XGBoost)
print(f"Best params for XGBoost: {best_params_XGBoost}")
print(f"With score: {best_params_XGBoost_score}")

Best params for XGBoost: {'model__alpha': 1.2481751282245537, 'model__eta': 0.09798219139516953, 'model__max_depth': 16, 'model__min_child_weight': 0}
With score: 0.9396460899136064


# Tunability

##### Now let's compute the tunability of each ML algorithm. We start by finding the optimal hyperparameter configuration for each dataset.

In [12]:
def get_best_params_per_dataset(df):
    """Compare each dataset's best configuration with the overall default.

    The "default" is the configuration returned by get_best_params_overall(df).
    For every dataset the top-ranked row (lowest 'rank_test_score') is taken as
    the best configuration, and its score is compared with the score the
    default configuration obtained on that same dataset.

    Side effect: a 'params_str' column is added to (or overwritten in) ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'dataset', 'params', 'rank_test_score' and
        'mean_test_score'.

    Returns
    -------
    pandas.DataFrame
        Columns: 'dataset', 'best_params', 'best_score', 'default_score',
        'abs_tunability' (best minus default) and 'rel_tunability (%)'
        (abs_tunability relative to default_score, in percent).
    """
    df['params_str'] = df['params'].map(str)

    # Top-ranked row of every dataset. Note that groupby().first() picks the
    # first non-null value of each column within the group.
    top_rows = (
        df.sort_values(['dataset', 'rank_test_score'], ascending=[True, True])
        .groupby('dataset')
        .first()
        .reset_index()
        .rename(columns={'params_str': 'best_params', 'mean_test_score': 'best_score'})
    )
    per_dataset_best = top_rows[['dataset', 'best_params', 'best_score']]

    # Score of the overall-best ("default") configuration on each dataset.
    default_params, _ = get_best_params_overall(df)
    uses_default = df['params_str'] == default_params
    default_scores = df.loc[uses_default, ['dataset', 'mean_test_score']].rename(
        columns={'mean_test_score': 'default_score'}
    )

    result = per_dataset_best.merge(default_scores, on='dataset', how='left')
    result['abs_tunability'] = result['best_score'] - result['default_score']
    result['rel_tunability (%)'] = result['abs_tunability'] / result['default_score'] * 100
    return result

### The best hyperparameter configuration for each dataset is presented below, together with its tunability percentage

In [125]:
best_params_per_dataset_DecisionTree = get_best_params_per_dataset(history_DecisionTree)
best_params_per_dataset_DecisionTree

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth': 16, 'model__min_samples_leaf': 1, 'model__min_samples_split': 54}",0.875509,0.869608,0.005901,0.678539
1,1,"{'model__criterion': 'entropy', 'model__max_depth': 27, 'model__min_samples_leaf': 36, 'model__min_samples_split': 20}",0.976032,0.971264,0.004768,0.490882
2,2,"{'model__criterion': 'gini', 'model__max_depth': 27, 'model__min_samples_leaf': 3, 'model__min_samples_split': 18}",0.98783,0.976445,0.011385,1.16596
3,3,"{'model__criterion': 'entropy', 'model__max_depth': 7, 'model__min_samples_leaf': 45, 'model__min_samples_split': 47}",0.816965,0.797114,0.019851,2.490372


In [126]:
best_params_per_dataset_RandomForest = get_best_params_per_dataset(history_RandomForest)
best_params_per_dataset_RandomForest

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__max_features': 0.2487150238733573, 'model__max_samples': 0.8085724933020151, 'model__min_samples_leaf': 1, 'model__n_estimators': 412}",0.925725,0.925189,0.000536,0.057961
1,1,"{'model__max_features': 0.33700383942810985, 'model__max_samples': 0.9144416829413047, 'model__min_samples_leaf': 1, 'model__n_estimators': 232}",0.988073,0.98636,0.001713,0.173679
2,2,"{'model__max_features': 0.08962724952860296, 'model__max_samples': 0.8588178929543148, 'model__min_samples_leaf': 2, 'model__n_estimators': 176}",0.999999,0.999973,2.6e-05,0.002591
3,3,"{'model__max_features': 0.5387022592509163, 'model__max_samples': 0.7489062540171452, 'model__min_samples_leaf': 6, 'model__n_estimators': 266}",0.851208,0.849848,0.001359,0.159969


In [127]:
best_params_per_dataset_XGBoost = get_best_params_per_dataset(history_XGBoost)
best_params_per_dataset_XGBoost

Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__alpha': 1.2481751282245537, 'model__eta': 0.09798219139516953, 'model__max_depth': 16, 'model__min_child_weight': 0}",0.920403,0.920403,0.0,0.0
1,1,"{'model__alpha': 4.384474294125061, 'model__eta': 0.10963982936790555, 'model__max_depth': 12, 'model__min_child_weight': 2}",0.990395,0.99013,0.000266,0.02682
2,2,"{'model__alpha': 0.005033310605507513, 'model__eta': 0.10404485057361726, 'model__max_depth': 11, 'model__min_child_weight': 0}",0.999994,0.999985,9e-06,0.000891
3,3,"{'model__alpha': 0.04733513937713775, 'model__eta': 0.057661948412641204, 'model__max_depth': 13, 'model__min_child_weight': 11}",0.850615,0.848066,0.00255,0.300633


# Testing the tunability of individual hyperparameters

In [44]:
# Read the per-model search histories from disk again (same files as in the
# "Reading history from csv" section); presumably so that this section starts
# from unmodified frames — TODO confirm.
history_DecisionTree = pd.read_csv('../history/history_dataset_DecisionTree.csv')
history_RandomForest = pd.read_csv('../history/history_dataset_RandomForest.csv')
history_XGBoost = pd.read_csv('../history/history_dataset_XGBoost.csv')

In [64]:
def do_random_search(clf, param_distributions, datasets=None, n_iter=200, cv=5, random_state=42):
    """Run a randomised hyperparameter search on every dataset.

    For each dataset the last column is label-encoded and used as the target,
    all preceding columns are used as features, and candidates are scored with
    ROC AUC.

    Parameters
    ----------
    clf : estimator
        Estimator (here: a pipeline) handed to RandomizedSearchCV.
    param_distributions : dict
        Search space handed to RandomizedSearchCV.
    datasets : iterable of pandas.DataFrame, optional
        Frames to search on. Defaults to the module-level ``DATA`` list.
    n_iter : int, default 200
        Number of sampled configurations per dataset.
    cv : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed of the sampler; the same seed is used for every dataset so that
        the same configurations are drawn each time.

    Returns
    -------
    list
        One ``cv_results_`` dict per dataset, in the order of ``datasets``.
    """
    # DATA is only read, never rebound, so no `global` declaration is needed.
    if datasets is None:
        datasets = DATA

    history = []
    for data in datasets:
        features = data.iloc[:, :-1]
        target = LabelEncoder().fit_transform(data.iloc[:, -1])
        rs = RandomizedSearchCV(clf,
                                param_distributions=param_distributions,
                                random_state=random_state,
                                cv=cv,
                                n_iter=n_iter,
                                n_jobs=-1,
                                scoring="roc_auc"
                                )
        rs.fit(features, target)
        history.append(rs.cv_results_)
    return history

## Decision tree

In [6]:
param_distributions_Decision_Tree = param_distributions[0]
tunable_parameters_DT = param_distributions_Decision_Tree.keys()
tunable_parameters_DT

dict_keys(['model__max_depth', 'model__min_samples_split', 'model__criterion', 'model__min_samples_leaf'])

In [48]:
param_distributions_Decision_Tree

{'model__max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen at 0x133824f10>,
 'model__min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen at 0x132188e90>,
 'model__criterion': ['gini', 'entropy'],
 'model__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_discrete_frozen at 0x133826a10>}

In [21]:
# best_params_DecisionTree is the string form of a Python dict. Parse it with
# ast.literal_eval instead of swapping quote characters and calling json.loads:
# the JSON route fails on values such as None/True/False and on strings that
# themselves contain quotes.
best_params_DT_dict = ast.literal_eval(best_params_DecisionTree)
# Wrap every fixed value in a one-element list so the dict can be used directly
# as a search space in which only a single parameter is later replaced.
best_params_DT_dict = {key: [value] for key, value in best_params_DT_dict.items()}
best_params_DT_dict

{'model__criterion': ['gini'],
 'model__max_depth': [17],
 'model__min_samples_leaf': [10],
 'model__min_samples_split': [58]}

In [None]:
# Vary one Decision Tree hyperparameter at a time: every other parameter is
# pinned to its value from best_params_DT_dict, and only `param` keeps its
# original search distribution.
# The search result is stored straight into param_history rather than in a
# variable called `history`, which would overwrite the module-level `history`
# holding the full-search results.
param_history = {}
for param in tunable_parameters_DT:
    temp_param_grid = {**best_params_DT_dict, param: param_distributions_Decision_Tree[param]}
    print(temp_param_grid)
    print("Testing param", param)
    param_history[param] = do_random_search(pipelines[0][1], temp_param_grid)

{'model__criterion': ['gini'], 'model__max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000208E79B4550>, 'model__min_samples_leaf': [10], 'model__min_samples_split': [58]}
Testing param model__max_depth


  _data = np.array(data, dtype=dtype, copy=copy,


{'model__criterion': ['gini'], 'model__max_depth': [17], 'model__min_samples_leaf': [10], 'model__min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000208E6845090>}
Testing param model__min_samples_split
{'model__criterion': ['gini', 'entropy'], 'model__max_depth': [17], 'model__min_samples_leaf': [10], 'model__min_samples_split': [58]}
Testing param model__criterion




{'model__criterion': ['gini'], 'model__max_depth': [17], 'model__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000208E79B6710>, 'model__min_samples_split': [58]}
Testing param model__min_samples_leaf


In [62]:
# Number of result fields recorded for the first dataset's max_depth search.
# (The original empty subscript `[]` was a SyntaxError.)
len(param_history["model__max_depth"][0])

17

In [None]:
# One DataFrame per varied hyperparameter: the cv_results_ of every dataset are
# stacked and tagged with a 'dataset' column.
# The loop variable is deliberately not named `history`, so the module-level
# `history` from the full search is left untouched.
params_history_frames_DT = {}
for param, per_dataset_results in param_history.items():
    frames = [pd.DataFrame(cv_results) for cv_results in per_dataset_results]
    stacked = pd.concat(frames, keys=range(len(frames)), names=['dataset'])
    # reset_index() exposes the 'dataset' key and the inner row index
    # ('level_1'); only the former is kept.
    params_history_frames_DT[param] = stacked.reset_index().drop(columns='level_1')

In [None]:
params_history_frames_DT["model__criterion"]

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__min_samples_split,param_model__min_samples_leaf,param_model__max_depth,param_model__criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.105099,0.00878,0.015429,0.003671,58,10,17,gini,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.787883,0.800885,0.789166,0.804345,0.807687,0.797993,0.008035,1
1,0,0.114604,0.012346,0.01294,0.002294,58,10,17,entropy,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.771594,0.789281,0.775129,0.783114,0.793746,0.782573,0.00832,2
2,1,0.040952,0.007055,0.006948,0.00158,58,10,17,gini,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.975798,0.971953,0.96436,0.975191,0.968272,0.971114,0.004312,2
3,1,0.04423,0.005163,0.005204,0.000452,58,10,17,entropy,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.981627,0.97352,0.967011,0.976724,0.973755,0.974527,0.004761,1
4,2,0.046389,0.003247,0.011586,0.004546,58,10,17,gini,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.861568,0.902895,0.847828,0.873421,0.863093,0.869761,0.01846,1
5,2,0.045041,0.002042,0.00671,0.001326,58,10,17,entropy,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.855224,0.894338,0.847901,0.851003,0.86877,0.863447,0.01701,2
6,3,0.077533,0.007429,0.011376,0.002776,58,10,17,gini,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'gini'}",0.97886,0.988254,0.971594,0.969296,0.973894,0.97638,0.006729,2
7,3,0.080284,0.009185,0.00884,0.000378,58,10,17,entropy,"{'model__min_samples_split': 58, 'model__min_samples_leaf': 10, 'model__max_depth': 17, 'model__criterion': 'entropy'}",0.975028,0.980824,0.973653,0.986539,0.975463,0.978301,0.004788,1


In [None]:
params_history_frames_DT['model__max_depth']

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.107094,0.016469,0.012695,0.002983,gini,7,10,58,"{'model__criterion': 'gini', 'model__max_depth': 7, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.787423,0.812503,0.797988,0.813216,0.821115,0.806449,0.012092,14
1,0,0.122068,0.007226,0.017022,0.004310,gini,20,10,58,"{'model__criterion': 'gini', 'model__max_depth': 20, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.788119,0.795348,0.789710,0.801270,0.799576,0.794804,0.005206,125
2,0,0.122834,0.010963,0.014593,0.004015,gini,29,10,58,"{'model__criterion': 'gini', 'model__max_depth': 29, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.787998,0.798676,0.789166,0.797537,0.796837,0.794043,0.004512,147
3,0,0.102597,0.011336,0.012477,0.003648,gini,15,10,58,"{'model__criterion': 'gini', 'model__max_depth': 15, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.783157,0.794033,0.788769,0.808174,0.801999,0.795226,0.008970,107
4,0,0.094956,0.013095,0.009644,0.001571,gini,11,10,58,"{'model__criterion': 'gini', 'model__max_depth': 11, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.791269,0.798647,0.799395,0.804118,0.793543,0.797394,0.004542,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,3,0.083718,0.008882,0.012290,0.002154,gini,29,10,58,"{'model__criterion': 'gini', 'model__max_depth': 29, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.983842,0.989135,0.977992,0.969296,0.980356,0.980124,0.006586,57
796,3,0.052561,0.008390,0.012371,0.003251,gini,4,10,58,"{'model__criterion': 'gini', 'model__max_depth': 4, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.777016,0.772864,0.771021,0.777688,0.753694,0.770457,0.008746,172
797,3,0.093595,0.002104,0.009494,0.001203,gini,30,10,58,"{'model__criterion': 'gini', 'model__max_depth': 30, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.983842,0.989135,0.977992,0.969430,0.980170,0.980114,0.006541,65
798,3,0.050612,0.004125,0.010582,0.002098,gini,5,10,58,"{'model__criterion': 'gini', 'model__max_depth': 5, 'model__min_samples_leaf': 10, 'model__min_samples_split': 58}",0.816716,0.815860,0.822976,0.829819,0.790297,0.815134,0.013395,166


In [None]:
params_history_frames_DT['model__min_samples_leaf']

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.084143,0.006159,0.015894,0.003470,gini,17,39,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 39, 'model__min_samples_split': 58}",0.797292,0.810846,0.798794,0.826523,0.826962,0.812083,0.012859,18
1,0,0.082901,0.002799,0.019535,0.004657,gini,17,52,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 52, 'model__min_samples_split': 58}",0.797548,0.806310,0.795543,0.828683,0.820422,0.809701,0.012922,59
2,0,0.118521,0.006652,0.016944,0.003846,gini,17,29,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 29, 'model__min_samples_split': 58}",0.793556,0.798264,0.800928,0.813783,0.825703,0.806447,0.011732,119
3,0,0.111362,0.003045,0.012652,0.002128,gini,17,15,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 15, 'model__min_samples_split': 58}",0.784932,0.801552,0.798437,0.825930,0.808774,0.803925,0.013448,147
4,0,0.102354,0.007219,0.014424,0.003492,gini,17,43,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 43, 'model__min_samples_split': 58}",0.801661,0.810440,0.796803,0.829411,0.824846,0.812632,0.012700,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,3,0.087806,0.010458,0.013088,0.002083,gini,17,43,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 43, 'model__min_samples_split': 58}",0.927517,0.936200,0.944952,0.922405,0.952967,0.936808,0.011150,141
796,3,0.089958,0.005880,0.014912,0.003691,gini,17,29,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 29, 'model__min_samples_split': 58}",0.951572,0.962508,0.950054,0.954153,0.957318,0.955121,0.004441,98
797,3,0.090991,0.012452,0.013759,0.002187,gini,17,36,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 36, 'model__min_samples_split': 58}",0.948896,0.947442,0.935759,0.938638,0.939536,0.942054,0.005167,121
798,3,0.091313,0.007543,0.014628,0.003508,gini,17,13,58,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 13, 'model__min_samples_split': 58}",0.973505,0.987007,0.963525,0.970254,0.972125,0.973283,0.007670,41


In [None]:
params_history_frames_DT['model__min_samples_split']

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__criterion,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,0.104002,0.008128,0.015526,0.002978,gini,17,10,40,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 40}",0.764943,0.796521,0.765789,0.787509,0.794672,0.781887,0.013824,57
1,0,0.125067,0.012540,0.013333,0.003034,gini,17,10,53,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 53}",0.775253,0.789669,0.781699,0.801932,0.797884,0.789287,0.009884,31
2,0,0.160459,0.020617,0.014780,0.004172,gini,17,10,30,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 30}",0.760258,0.780376,0.753331,0.773156,0.780169,0.769458,0.010884,103
3,0,0.141070,0.010011,0.015746,0.002139,gini,17,10,16,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 16}",0.752134,0.776199,0.745130,0.760574,0.780668,0.762941,0.013636,143
4,0,0.119967,0.012509,0.015516,0.003821,gini,17,10,44,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 44}",0.770317,0.795258,0.768850,0.789057,0.797720,0.784240,0.012305,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,3,0.115553,0.017846,0.029131,0.022153,gini,17,10,29,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 29}",0.982429,0.990588,0.979028,0.972740,0.982205,0.981398,0.005775,98
796,3,0.130162,0.012242,0.016124,0.006800,gini,17,10,3,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 3}",0.983239,0.990349,0.980472,0.975890,0.982976,0.982585,0.004694,65
797,3,0.099085,0.004707,0.015498,0.004152,gini,17,10,43,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 43}",0.978965,0.990485,0.975483,0.971258,0.977482,0.978735,0.006421,144
798,3,0.093473,0.007794,0.012530,0.002267,gini,17,10,46,"{'model__criterion': 'gini', 'model__max_depth': 17, 'model__min_samples_leaf': 10, 'model__min_samples_split': 46}",0.978767,0.990485,0.975483,0.971254,0.976722,0.978542,0.006457,161


In [8]:
def get_best_params_per_dataset_for_measuring_param_tunability(df, history):
    """Measure, per dataset, how much a single-parameter search beats the reference configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Search results for one hyperparameter. Must contain the columns
        'dataset', 'params', 'rank_test_score' and 'mean_test_score'.
        The frame is left untouched (no helper column is added to it).
    history : pandas.DataFrame
        Results of the full search. It is passed to ``get_best_params_overall``
        (defined elsewhere in the notebook); the first value that call returns is
        used as the reference ("default") configuration string. After that call
        ``history`` must expose 'params_str', 'dataset' and 'mean_test_score'.
        NOTE(review): 'params_str' is presumably added to ``history`` by
        ``get_best_params_overall`` - confirm.

    Returns
    -------
    pandas.DataFrame
        One row per dataset with the columns 'dataset', 'best_params',
        'best_score', 'default_score', 'abs_tunability' and 'rel_tunability (%)'.
        When the reference configuration scores higher than the best search
        result, the reference score and parameters are reported instead, so the
        tunability is never negative. Datasets without a reference score get NaN
        tunability.
    """
    # Build the string form of the parameters on a derived frame so the caller's
    # frame does not silently gain a 'params_str' column.
    candidates = df.assign(params_str=df['params'].apply(str))

    # Keep the genuinely top-ranked row of every dataset. groupby().first() would
    # instead pick the first non-null value column by column.
    best_params_per_dataset = (
        candidates
        .sort_values(['dataset', 'rank_test_score'], ascending=[True, True])
        .drop_duplicates(subset='dataset', keep='first')
        .rename(columns={'params_str': 'best_params', 'mean_test_score': 'best_score'})
        .loc[:, ['dataset', 'best_params', 'best_score']]
        .reset_index(drop=True)
    )

    default_params, _ = get_best_params_overall(history)

    # One reference score per dataset: if the reference configuration was evaluated
    # more than once for a dataset, keep the first occurrence so the merge below
    # cannot duplicate datasets.
    score_for_default_params = (
        history.loc[history['params_str'] == default_params, ['dataset', 'mean_test_score']]
        .rename(columns={'mean_test_score': 'default_score'})
        .drop_duplicates(subset='dataset', keep='first')
    )
    best_params_per_dataset = best_params_per_dataset.merge(score_for_default_params, on='dataset', how='left')

    # The reference configuration is itself a valid candidate, so fall back to it
    # whenever the single-parameter search did not beat it.
    condition = best_params_per_dataset['best_score'] < best_params_per_dataset['default_score']
    best_params_per_dataset.loc[condition, 'best_score'] = best_params_per_dataset['default_score']
    best_params_per_dataset.loc[condition, 'best_params'] = default_params

    best_params_per_dataset['abs_tunability'] = (best_params_per_dataset['best_score'] - best_params_per_dataset['default_score'])
    best_params_per_dataset['rel_tunability (%)'] = best_params_per_dataset['abs_tunability'] / best_params_per_dataset['default_score'] * 100
    return best_params_per_dataset

### Saving results to file

In [None]:
# Write one CSV per tuned Decision Tree hyperparameter under ../history.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs('../history', exist_ok=True)
for param, history in params_history_frames_DT.items():
    print(history.shape)
    history.to_csv(f'../history/history_hyperparameter_tuning_DT_{param}.csv', index=False)

(800, 19)
(800, 19)
(8, 19)
(800, 19)


In [4]:
# Reload the per-parameter Decision Tree tuning histories from disk.
# A parameter whose CSV is missing is skipped, so the dict may hold fewer
# than four entries.
params_history_frames_DT = {}
for param in (
    "model__max_depth",
    "model__min_samples_split",
    "model__criterion",
    "model__min_samples_leaf",
):
    file_path = f'../history/history_hyperparameter_tuning_DT_{param}.csv'
    if not os.path.exists(file_path):
        continue
    params_history_frames_DT[param] = pd.read_csv(file_path)

# Tunability of the chosen hyperparameters is presented below

## Reminder: the first results are for the **Decision Tree**

In [16]:
# For every tuned Decision Tree hyperparameter: show the per-dataset tunability
# table and print the mean relative tunability across datasets.
for param in tunable_parameters_DT:
    print("Results for param:", param)
    # Compute the table once and reuse it for both the display and the mean,
    # instead of running the whole comparison twice.
    tunability_DT = get_best_params_per_dataset_for_measuring_param_tunability(params_history_frames_DT[param], history_DecisionTree)
    display(tunability_DT.head(10))
    print(tunability_DT['rel_tunability (%)'].mean())

Results for param: model__max_depth


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth...",0.870655,0.869608,0.001046,0.120325
1,1,"{'model__criterion': 'gini', 'model__max_depth...",0.972119,0.971264,0.000855,0.088001
2,2,"{'model__criterion': 'gini', 'model__max_depth...",0.980219,0.976445,0.003774,0.386553
3,3,"{'model__criterion': 'gini', 'model__max_depth...",0.809058,0.797114,0.011944,1.498442


0.5233300674738862
Results for param: model__min_samples_split


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth...",0.869703,0.869608,9.5e-05,0.010908
1,1,"{'model__criterion': 'gini', 'model__max_depth...",0.971474,0.971264,0.00021,0.021614
2,2,"{'model__criterion': 'gini', 'model__max_depth...",0.983828,0.976445,0.007383,0.756104
3,3,"{'model__criterion': 'gini', 'model__max_depth...",0.797682,0.797114,0.000568,0.071285


0.21497769336818326
Results for param: model__criterion


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__min_samples_split': 58, 'model__min_s...",0.869608,0.869608,0.0,0.0
1,1,"{'model__min_samples_split': 58, 'model__min_s...",0.975514,0.971264,0.00425,0.437538
2,2,"{'model__min_samples_split': 58, 'model__min_s...",0.979113,0.976445,0.002669,0.273307
3,3,"{'model__criterion': 'gini', 'model__max_depth...",0.797114,0.797114,0.0,0.0


0.17771128119431467
Results for param: model__min_samples_leaf


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__criterion': 'gini', 'model__max_depth...",0.873927,0.869608,0.004319,0.49666
1,1,"{'model__criterion': 'gini', 'model__max_depth...",0.973532,0.971264,0.002267,0.233443
2,2,"{'model__criterion': 'gini', 'model__max_depth...",0.981708,0.976445,0.005264,0.539067
3,3,"{'model__criterion': 'gini', 'model__max_depth...",0.813505,0.797114,0.016391,2.056296


0.8313664969773633


## **Random Forest**

In [17]:
# Search space for the Random Forest, taken by position (index 1) from
# `param_distributions`, which is defined earlier in the notebook.
# NOTE(review): index 1 presumably lines up with pipelines[1] used later - confirm.
param_distributions_Random_Forest = param_distributions[1]
# Names of the hyperparameters that have a search distribution (a dict keys view).
tunable_parameters_RF = param_distributions_Random_Forest.keys()
# Bare expression so the notebook shows the keys as the cell output.
tunable_parameters_RF

dict_keys(['model__n_estimators', 'model__min_samples_leaf', 'model__max_samples', 'model__max_features'])

In [25]:
import ast

# `best_params_RandomForest` holds the text form of a Python dict (single-quoted
# keys), not JSON. Parse it as a Python literal: swapping quotes and feeding it to
# json.loads breaks as soon as a value is None/True/False or contains an apostrophe.
best_params_RF_dict = ast.literal_eval(best_params_RandomForest)
# Wrap every value in a one-element list so each fixed parameter can be used as a
# single-choice entry in a search grid.
best_params_RF_dict = {key: [value] for key, value in best_params_RF_dict.items()}
best_params_RF_dict

{'model__max_features': [0.49816568848070625],
 'model__max_samples': [0.738105348394507],
 'model__min_samples_leaf': [3],
 'model__n_estimators': [478]}

In [None]:
# One-at-a-time search for the Random Forest: every parameter is searched over its
# own distribution while the remaining ones stay fixed at the single values stored
# in `best_params_RF_dict`.
param_history = {}
for param in tunable_parameters_RF:
    # Copy the fixed grid, then widen only the parameter under study.
    temp_param_grid = dict(best_params_RF_dict)
    temp_param_grid[param] = param_distributions_Random_Forest[param]
    print(temp_param_grid)
    print("Testing param", param)
    # do_random_search is defined elsewhere in the notebook; its result is stored as-is.
    history = do_random_search(pipelines[1][1], temp_param_grid)
    param_history[param] = history

{'model__max_features': [0.49816568848070625], 'model__max_samples': [0.738105348394507], 'model__min_samples_leaf': [3], 'model__n_estimators': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000208E79B6CD0>}
Testing param model__n_estimators


{'model__max_features': [0.49816568848070625], 'model__max_samples': [0.738105348394507], 'model__min_samples_leaf': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000208E79B72D0>, 'model__n_estimators': [478]}
Testing param model__min_samples_leaf
{'model__max_features': [0.49816568848070625], 'model__max_samples': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x00000208E79B7890>, 'model__min_samples_leaf': [3], 'model__n_estimators': [478]}
Testing param model__max_samples


  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


{'model__max_features': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x00000208E79B7E90>, 'model__max_samples': [0.738105348394507], 'model__min_samples_leaf': [3], 'model__n_estimators': [478]}
Testing param model__max_features


  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
# Turn each parameter's search history into one flat frame with a leading
# 'dataset' column (the position of the entry inside that history).
# NOTE(review): assumes history[i] holds the search results of the i-th dataset - confirm.
params_history_frames_RF = {}
for param, history in param_history.items():
    n_datasets = len(history)
    per_dataset_frames = [pd.DataFrame(history[i]) for i in range(n_datasets)]
    df = (
        pd.concat(per_dataset_frames, keys=range(n_datasets), names=['dataset'])
        .reset_index()              # 'dataset' and the unnamed inner level become columns
        .drop(columns='level_1')    # the inner level is only the old row number
    )
    params_history_frames_RF[param] = df

In [None]:
# Write one CSV per tuned Random Forest hyperparameter under ../history.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs('../history', exist_ok=True)
for param, history in params_history_frames_RF.items():
    print(history.shape)
    history.to_csv(f'../history/history_hyperparameter_tuning_RF_{param}.csv', index=False)

In [19]:
# Reload the per-parameter Random Forest tuning histories from disk.
# A parameter whose CSV is missing is skipped, so the dict may hold fewer
# than four entries.
params_history_frames_RF = {}
for param in (
    "model__n_estimators",
    "model__min_samples_leaf",
    "model__max_samples",
    "model__max_features",
):
    file_path = f'../history/history_hyperparameter_tuning_RF_{param}.csv'
    if not os.path.exists(file_path):
        continue
    params_history_frames_RF[param] = pd.read_csv(file_path)

In [20]:
# For every tuned Random Forest hyperparameter: show the per-dataset tunability
# table and print the mean relative tunability across datasets.
for param in tunable_parameters_RF:
    print("Results for param:", param)
    # Compute the table once and reuse it for both the display and the mean,
    # instead of running the whole comparison twice.
    tunability_RF = get_best_params_per_dataset_for_measuring_param_tunability(params_history_frames_RF[param], history_RandomForest)
    display(tunability_RF.head(10))
    print(tunability_RF['rel_tunability (%)'].mean())

Results for param: model__n_estimators


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__max_features': 0.49816568848070625, '...",0.926011,0.925189,0.000822,0.088824
1,1,"{'model__max_features': 0.49816568848070625, '...",0.986698,0.98636,0.000338,0.03425
2,2,"{'model__max_features': 0.49816568848070625, '...",0.999983,0.999973,1e-05,0.000972
3,3,"{'model__max_features': 0.49816568848070625, '...",0.851273,0.849848,0.001425,0.167688


0.07293348135510604
Results for param: model__min_samples_leaf


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__max_features': 0.49816568848070625, '...",0.926471,0.925189,0.001282,0.13852
1,1,"{'model__max_features': 0.49816568848070625, '...",0.987534,0.98636,0.001174,0.119052
2,2,"{'model__max_features': 0.49816568848070625, '...",0.99999,0.999973,1.7e-05,0.001701
3,3,"{'model__max_features': 0.49816568848070625, '...",0.851094,0.849848,0.001245,0.146538


0.1014524194347725
Results for param: model__max_samples


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__max_features': 0.49816568848070625, '...",0.925849,0.925189,0.00066,0.071352
1,1,"{'model__max_features': 0.49816568848070625, '...",0.987003,0.98636,0.000642,0.065124
2,2,"{'model__max_features': 0.49816568848070625, '...",0.999988,0.999973,1.5e-05,0.001458
3,3,"{'model__max_features': 0.49816568848070625, '...",0.851302,0.849848,0.001454,0.171035


0.07724223682261862
Results for param: model__max_features


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__max_features': 0.4894532708248027, 'm...",0.925715,0.925189,0.000526,0.05681
1,1,"{'model__max_features': 0.3745407443072436, 'm...",0.986765,0.98636,0.000405,0.041084
2,2,"{'model__max_features': 0.09767301633426986, '...",0.999994,0.999973,2.1e-05,0.002106
3,3,"{'model__max_features': 0.11586994365607019, '...",0.851665,0.849848,0.001817,0.213761


0.07843993039905967


## **XGBoost**

In [21]:
# Search space for XGBoost, taken by position (index 2) from
# `param_distributions`, which is defined earlier in the notebook.
# NOTE(review): index 2 presumably lines up with pipelines[2] used later - confirm.
param_distributions_XGBoost = param_distributions[2]
# Names of the hyperparameters that have a search distribution (a dict keys view).
tunable_parameters_XGB = param_distributions_XGBoost.keys()
# Bare expression so the notebook shows the keys as the cell output.
tunable_parameters_XGB

dict_keys(['model__max_depth', 'model__min_child_weight', 'model__eta', 'model__alpha'])

In [66]:
import ast

# `best_params_XGBoost` holds the text form of a Python dict (single-quoted keys),
# not JSON. Parse it as a Python literal: swapping quotes and feeding it to
# json.loads breaks as soon as a value is None/True/False or contains an apostrophe.
best_params_XGB_dict = ast.literal_eval(best_params_XGBoost)
# Wrap every value in a one-element list so each fixed parameter can be used as a
# single-choice entry in a search grid.
best_params_XGB_dict = {key: [value] for key, value in best_params_XGB_dict.items()}
best_params_XGB_dict

{'model__alpha': [3.963066389815045],
 'model__eta': [0.10674596881842251],
 'model__max_depth': [12],
 'model__min_child_weight': [1]}

In [67]:
# One-at-a-time search for XGBoost: every parameter is searched over its own
# distribution while the remaining ones stay fixed at the single values stored
# in `best_params_XGB_dict`.
param_history = {}
for param in tunable_parameters_XGB:
    # Copy the fixed grid, then widen only the parameter under study.
    temp_param_grid = dict(best_params_XGB_dict)
    temp_param_grid[param] = param_distributions_XGBoost[param]
    print(temp_param_grid)
    print("Testing param", param)
    # do_random_search is defined elsewhere in the notebook; its result is stored as-is.
    history = do_random_search(pipelines[2][1], temp_param_grid)
    param_history[param] = history

{'model__alpha': [3.963066389815045], 'model__eta': [0.10674596881842251], 'model__max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001AC3E0C2610>, 'model__min_child_weight': [1]}
Testing param model__max_depth


  _data = np.array(data, dtype=dtype, copy=copy,


{'model__alpha': [3.963066389815045], 'model__eta': [0.10674596881842251], 'model__max_depth': [12], 'model__min_child_weight': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001AC3E0C34D0>}
Testing param model__min_child_weight


  _data = np.array(data, dtype=dtype, copy=copy,


{'model__alpha': [3.963066389815045], 'model__eta': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001AC3E0C3890>, 'model__max_depth': [12], 'model__min_child_weight': [1]}
Testing param model__eta


  _data = np.array(data, dtype=dtype, copy=copy,


{'model__alpha': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x000001AC3E0C1690>, 'model__eta': [0.10674596881842251], 'model__max_depth': [12], 'model__min_child_weight': [1]}
Testing param model__alpha


  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


In [97]:
# Turn each parameter's search history into one flat frame with a leading
# 'dataset' column (the position of the entry inside that history).
# NOTE(review): assumes history[i] holds the search results of the i-th dataset - confirm.
params_history_frames_XGB = {}
for param, history in param_history.items():
    n_datasets = len(history)
    per_dataset_frames = [pd.DataFrame(history[i]) for i in range(n_datasets)]
    df = (
        pd.concat(per_dataset_frames, keys=range(n_datasets), names=['dataset'])
        .reset_index()              # 'dataset' and the unnamed inner level become columns
        .drop(columns='level_1')    # the inner level is only the old row number
    )
    params_history_frames_XGB[param] = df

In [98]:
# Write one CSV per tuned XGBoost hyperparameter under ../history.
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs('../history', exist_ok=True)
for param, history in params_history_frames_XGB.items():
    print(history.shape)
    history.to_csv(f'../history/history_hyperparameter_tuning_XGB_{param}.csv', index=False)

(800, 18)
(800, 18)
(800, 18)
(800, 18)


In [22]:
# Reload the per-parameter XGBoost tuning histories from disk.
# A parameter whose CSV is missing is skipped, so the dict may hold fewer
# than four entries.
params_history_frames_XGB = {}
for param in (
    "model__max_depth",
    "model__min_child_weight",
    "model__eta",
    "model__alpha",
):
    file_path = f'../history/history_hyperparameter_tuning_XGB_{param}.csv'
    if not os.path.exists(file_path):
        continue
    params_history_frames_XGB[param] = pd.read_csv(file_path)

In [23]:
# For every tuned XGBoost hyperparameter: show the per-dataset tunability
# table and print the mean relative tunability across datasets.
for param in tunable_parameters_XGB:
    print("Results for param:", param)
    # Compute the table once and reuse it for both the display and the mean,
    # instead of running the whole comparison twice.
    tunability_XGB = get_best_params_per_dataset_for_measuring_param_tunability(params_history_frames_XGB[param], history_XGBoost)
    display(tunability_XGB.head(10))
    print(tunability_XGB['rel_tunability (%)'].mean())

Results for param: model__max_depth


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__alpha': 3.963066389815045, 'model__et...",0.921431,0.920403,0.001028,0.111686
1,1,"{'model__alpha': 3.963066389815045, 'model__et...",0.990526,0.99013,0.000396,0.040025
2,2,"{'model__alpha': 1.2481751282245537, 'model__e...",0.999985,0.999985,0.0,0.0
3,3,"{'model__alpha': 3.963066389815045, 'model__et...",0.848334,0.848066,0.000268,0.031644


0.045838851005807944
Results for param: model__min_child_weight


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__alpha': 3.963066389815045, 'model__et...",0.921714,0.920403,0.001311,0.142395
1,1,"{'model__alpha': 3.963066389815045, 'model__et...",0.990526,0.99013,0.000396,0.040025
2,2,"{'model__alpha': 1.2481751282245537, 'model__e...",0.999985,0.999985,0.0,0.0
3,3,"{'model__alpha': 1.2481751282245537, 'model__e...",0.848066,0.848066,0.0,0.0


0.045604903458540576
Results for param: model__eta


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__alpha': 3.963066389815045, 'model__et...",0.922175,0.920403,0.001771,0.192454
1,1,"{'model__alpha': 3.963066389815045, 'model__et...",0.99052,0.99013,0.00039,0.039369
2,2,"{'model__alpha': 1.2481751282245537, 'model__e...",0.999985,0.999985,0.0,0.0
3,3,"{'model__alpha': 3.963066389815045, 'model__et...",0.850173,0.848066,0.002107,0.248461


0.12007089310680277
Results for param: model__alpha


Unnamed: 0,dataset,best_params,best_score,default_score,abs_tunability,rel_tunability (%)
0,0,"{'model__alpha': 3.477998816092334, 'model__et...",0.921544,0.920403,0.00114,0.123886
1,1,"{'model__alpha': 2.7293781650374753, 'model__e...",0.990449,0.99013,0.000319,0.032233
2,2,"{'model__alpha': 1.2481751282245537, 'model__e...",0.999985,0.999985,0.0,0.0
3,3,"{'model__alpha': 6.732248920775331, 'model__et...",0.849878,0.848066,0.001812,0.213652


0.0924425447962211
