# AutoML - Praca Domowa nr 1

Załadowanie niezbędnych bibliotek

In [1]:
import json
from typing import Tuple, Dict, Any, List, Optional, Literal

import pandas as pd
import numpy as np
import numpy.typing as npt

from sklearn.datasets import load_iris, load_diabetes, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from skopt import BayesSearchCV
from skopt.space import Real, Integer
from scipy.stats import uniform
from xgboost import XGBClassifier

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Compatibility shim needed for BayesSearchCV to work: (re)define the `np.int`
# attribute as an alias of `np.int64`.
# NOTE(review): presumably scikit-optimize still references `np.int`, which newer
# NumPy releases no longer provide - confirm against the installed versions.
np.int = np.int64

In [3]:
# Path to the directory that holds the saved results files; it is used as the
# prefix of the JSON file paths when the results are loaded later on.
base_path = '.'

Testowane będą 3 algorytmy uczenia maszynowego tj:
- Random Forest
- XGBoostClassifier
- K-Nearest Neighbours
  
Testowanie odbędzie się na 4 różnych zbiorach danych tj:
- iris (klasyfikacja)
- diabetes z zakodowaną zmienną celu (klasyfikacja)
- wine (klasyfikacja)
- breast cancer (klasyfikacja)

# Datasets

## Iris

In [4]:
# Load Iris as pandas objects and hold out 33% of the rows as a test set;
# the fixed random_state keeps the split reproducible between runs.
X_iris, y_iris = load_iris(return_X_y=True, as_frame=True)
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.33, random_state=123
)

In [5]:
X_train_iris.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 110 to 109
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  100 non-null    float64
 1   sepal width (cm)   100 non-null    float64
 2   petal length (cm)  100 non-null    float64
 3   petal width (cm)   100 non-null    float64
dtypes: float64(4)
memory usage: 3.9 KB


In [6]:
X_train_iris.head(n=10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
110,6.5,3.2,5.1,2.0
23,5.1,3.3,1.7,0.5
9,4.9,3.1,1.5,0.1
86,6.7,3.1,4.7,1.5
91,6.1,3.0,4.6,1.4
89,5.5,2.5,4.0,1.3
79,5.7,2.6,3.5,1.0
101,5.8,2.7,5.1,1.9
65,6.7,3.1,4.4,1.4
115,6.4,3.2,5.3,2.3


In [7]:
X_train_iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,100.0,100.0,100.0,100.0
mean,5.892,3.05,3.854,1.228
std,0.796289,0.403144,1.708471,0.725868
min,4.5,2.2,1.0,0.1
25%,5.1,2.8,1.6,0.4
50%,5.85,3.0,4.45,1.35
75%,6.425,3.3,5.1,1.8
max,7.7,4.4,6.9,2.5


## Diabetes

In [8]:
X_diabetes, y_diabetes = load_diabetes(return_X_y=True, as_frame=True)
# The target is binarized around its median so that the dataset can be used for
# classification: class 1 for values >= median, class 0 otherwise.
# This is done in a single vectorized step (instead of two in-place masked
# assignments, where the second mask was evaluated on already modified data)
# and yields integer class labels rather than floats.
y_median = np.median(y_diabetes)
y_diabetes = (y_diabetes >= y_median).astype(int)
# Hold out 33% of the rows as a test set; fixed random_state for reproducibility.
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(
    X_diabetes, y_diabetes, test_size=0.33, random_state=123
)

In [9]:
X_train_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 296 entries, 150 to 365
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     296 non-null    float64
 1   sex     296 non-null    float64
 2   bmi     296 non-null    float64
 3   bp      296 non-null    float64
 4   s1      296 non-null    float64
 5   s2      296 non-null    float64
 6   s3      296 non-null    float64
 7   s4      296 non-null    float64
 8   s5      296 non-null    float64
 9   s6      296 non-null    float64
dtypes: float64(10)
memory usage: 25.4 KB


In [10]:
X_train_diabetes.head(n=10)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
150,0.023546,-0.044642,0.070319,0.025315,-0.034592,-0.014466,-0.032356,-0.002592,-0.019198,-0.009362
185,-0.074533,0.05068,-0.018062,0.008101,-0.019456,-0.0248,-0.065491,0.034309,0.067318,-0.017646
12,0.016281,-0.044642,-0.02884,-0.009113,-0.004321,-0.009769,0.044958,-0.039493,-0.030748,-0.042499
30,-0.060003,-0.044642,0.044451,-0.019442,-0.009825,-0.007577,0.022869,-0.039493,-0.027129,-0.009362
144,0.030811,0.05068,0.046607,-0.015999,0.020446,0.050669,-0.058127,0.07121,0.006207,0.007207
192,0.056239,0.05068,-0.030996,0.008101,0.01907,0.021233,0.033914,-0.039493,-0.029526,-0.059067
157,-0.001882,0.05068,-0.033151,-0.018306,0.031454,0.04284,-0.013948,0.019917,0.010227,0.027917
241,0.030811,0.05068,-0.008362,0.004658,0.014942,0.027496,0.008142,-0.008127,-0.029526,0.056912
287,0.045341,-0.044642,-0.006206,-0.015999,0.125019,0.125198,0.019187,0.034309,0.032432,-0.00522
236,0.027178,-0.044642,0.006728,0.035644,0.079612,0.07071,0.015505,0.034309,0.040673,0.011349


In [11]:
X_train_diabetes.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0,296.0
mean,-0.000839,0.001409,0.00084,-3.3e-05,-0.002452,-0.001886,-0.001678,-0.000111,-0.00073,0.000602
std,0.046059,0.047714,0.048117,0.048085,0.0494,0.049198,0.048399,0.049292,0.047973,0.050952
min,-0.103593,-0.044642,-0.084886,-0.112399,-0.126781,-0.115613,-0.102307,-0.076395,-0.126097,-0.137767
25%,-0.035483,-0.044642,-0.033421,-0.040099,-0.035968,-0.034508,-0.036038,-0.039493,-0.034522,-0.034215
50%,0.005383,-0.044642,-0.006206,-0.00567,-0.005697,-0.00836,-0.010266,-0.002592,-0.004682,-0.001078
75%,0.034443,0.05068,0.03529,0.033062,0.024574,0.026556,0.023789,0.034309,0.031967,0.032059
max,0.096197,0.05068,0.170555,0.132044,0.153914,0.198788,0.181179,0.185234,0.133597,0.135612


## Wine

In [12]:
# Load the Wine dataset as pandas objects and hold out 33% of the rows as a
# test set; the fixed random_state keeps the split reproducible between runs.
X_wine, y_wine = load_wine(return_X_y=True, as_frame=True)
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    X_wine, y_wine, test_size=0.33, random_state=123
)

In [13]:
X_train_wine.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119 entries, 30 to 109
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       119 non-null    float64
 1   malic_acid                    119 non-null    float64
 2   ash                           119 non-null    float64
 3   alcalinity_of_ash             119 non-null    float64
 4   magnesium                     119 non-null    float64
 5   total_phenols                 119 non-null    float64
 6   flavanoids                    119 non-null    float64
 7   nonflavanoid_phenols          119 non-null    float64
 8   proanthocyanins               119 non-null    float64
 9   color_intensity               119 non-null    float64
 10  hue                           119 non-null    float64
 11  od280/od315_of_diluted_wines  119 non-null    float64
 12  proline                       119 non-null    float64
dtypes: float6

In [14]:
X_train_wine.head(n=10)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
30,13.73,1.5,2.7,22.5,101.0,3.0,3.25,0.29,2.38,5.7,1.19,2.71,1285.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0
19,13.64,3.1,2.56,15.2,116.0,2.7,3.03,0.17,1.66,5.1,0.96,3.36,845.0
104,12.51,1.73,1.98,20.5,85.0,2.2,1.92,0.32,1.48,2.94,1.04,3.57,672.0
29,14.02,1.68,2.21,16.0,96.0,2.65,2.33,0.26,1.98,4.7,1.04,3.59,1035.0
35,13.48,1.81,2.41,20.5,100.0,2.7,2.98,0.26,1.86,5.1,1.04,3.47,920.0
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
174,13.4,3.91,2.48,23.0,102.0,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750.0
45,14.21,4.04,2.44,18.9,111.0,2.85,2.65,0.3,1.25,5.24,0.87,3.33,1080.0
112,11.76,2.68,2.92,20.0,103.0,1.75,2.03,0.6,1.05,3.8,1.23,2.5,607.0


In [15]:
X_train_wine.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0,119.0
mean,13.034622,2.260084,2.33479,19.09916,99.596639,2.308571,2.110588,0.347563,1.652017,4.999244,0.962571,2.643613,755.142857
std,0.81004,1.110269,0.273619,3.146117,14.321561,0.634066,0.948818,0.122467,0.550935,2.288184,0.227941,0.707503,326.409
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.42,1.28,0.56,1.29,278.0
25%,12.37,1.52,2.2,16.8,88.0,1.745,1.305,0.26,1.35,3.19,0.79,2.035,497.5
50%,13.05,1.78,2.32,19.0,98.0,2.41,2.19,0.32,1.62,4.68,0.98,2.83,680.0
75%,13.715,2.9,2.5,21.0,107.5,2.8,2.92,0.43,1.98,6.2,1.115,3.19,1027.5
max,14.39,5.8,3.22,30.0,151.0,3.85,3.93,0.66,2.96,13.0,1.71,4.0,1680.0


## Breast Cancer

In [16]:
# Load the Breast Cancer dataset as pandas objects and hold out 33% of the rows
# as a test set; the fixed random_state keeps the split reproducible between runs.
X_breast, y_breast = load_breast_cancer(return_X_y=True, as_frame=True)
X_train_breast, X_test_breast, y_train_breast, y_test_breast = train_test_split(X_breast, y_breast, test_size=0.33, random_state=123)

In [17]:
X_train_breast.info()

<class 'pandas.core.frame.DataFrame'>
Index: 381 entries, 184 to 510
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              381 non-null    float64
 1   mean texture             381 non-null    float64
 2   mean perimeter           381 non-null    float64
 3   mean area                381 non-null    float64
 4   mean smoothness          381 non-null    float64
 5   mean compactness         381 non-null    float64
 6   mean concavity           381 non-null    float64
 7   mean concave points      381 non-null    float64
 8   mean symmetry            381 non-null    float64
 9   mean fractal dimension   381 non-null    float64
 10  radius error             381 non-null    float64
 11  texture error            381 non-null    float64
 12  perimeter error          381 non-null    float64
 13  area error               381 non-null    float64
 14  smoothness error         381 

In [18]:
X_train_breast.head(n=10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
184,15.28,22.41,98.92,710.6,0.09057,0.1052,0.05375,0.03263,0.1727,0.06317,...,17.8,28.03,113.8,973.1,0.1301,0.3299,0.363,0.1226,0.3175,0.09772
142,11.43,17.31,73.66,398.0,0.1092,0.09486,0.02031,0.01861,0.1645,0.06562,...,12.78,26.76,82.66,503.0,0.1413,0.1792,0.07708,0.06402,0.2584,0.08096
237,20.48,21.46,132.5,1306.0,0.08355,0.08348,0.09042,0.06022,0.1467,0.05177,...,24.22,26.17,161.7,1750.0,0.1228,0.2311,0.3158,0.1445,0.2238,0.07127
361,13.3,21.57,85.24,546.1,0.08582,0.06373,0.03344,0.02424,0.1815,0.05696,...,14.2,29.2,92.94,621.2,0.114,0.1667,0.1212,0.05614,0.2637,0.06658
30,18.63,25.11,124.8,1088.0,0.1064,0.1887,0.2319,0.1244,0.2183,0.06197,...,23.15,34.01,160.5,1670.0,0.1491,0.4257,0.6133,0.1848,0.3444,0.09782
474,10.88,15.62,70.41,358.9,0.1007,0.1069,0.05115,0.01571,0.1861,0.06837,...,11.94,19.35,80.78,433.1,0.1332,0.3898,0.3365,0.07966,0.2581,0.108
148,14.44,15.18,93.97,640.1,0.0997,0.1021,0.08487,0.05532,0.1724,0.06081,...,15.85,19.85,108.6,766.9,0.1316,0.2735,0.3103,0.1599,0.2691,0.07683
120,11.41,10.82,73.34,403.3,0.09373,0.06685,0.03512,0.02623,0.1667,0.06113,...,12.82,15.97,83.74,510.5,0.1548,0.239,0.2102,0.08958,0.3016,0.08523
191,12.77,21.41,82.02,507.4,0.08749,0.06601,0.03112,0.02864,0.1694,0.06287,...,13.75,23.5,89.04,579.5,0.09388,0.08978,0.05186,0.04773,0.2179,0.06871
217,10.2,17.48,65.05,321.2,0.08054,0.05907,0.05774,0.01071,0.1964,0.06315,...,11.48,24.47,75.4,403.7,0.09527,0.1397,0.1925,0.03571,0.2868,0.07809


In [19]:
y_train_breast

184    0
142    1
237    0
361    1
30     0
      ..
98     1
322    1
382    1
365    0
510    1
Name: target, Length: 381, dtype: int32

In [20]:
X_train_breast.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,...,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0
mean,14.105399,19.42168,91.805643,651.738845,0.095946,0.103926,0.088372,0.048608,0.180774,0.062802,...,16.178268,25.65916,106.658819,868.461155,0.130507,0.247398,0.26585,0.113193,0.286329,0.083099
std,3.466119,4.470236,23.839378,341.324729,0.013811,0.05217,0.079589,0.03796,0.028648,0.007435,...,4.733025,6.22966,32.815385,555.419856,0.021851,0.14637,0.204311,0.063814,0.061804,0.017046
min,7.691,10.72,47.92,170.4,0.05263,0.0265,0.0,0.0,0.106,0.04996,...,8.678,12.49,54.49,223.6,0.08409,0.04327,0.0,0.0,0.1565,0.05504
25%,11.61,16.21,74.34,412.6,0.08588,0.0663,0.02995,0.0207,0.1601,0.05754,...,12.97,21.1,83.74,510.5,0.1144,0.146,0.1144,0.06402,0.2471,0.07061
50%,13.48,18.9,87.19,561.0,0.09578,0.09218,0.06181,0.03326,0.1781,0.0614,...,15.01,25.41,97.96,694.4,0.1298,0.207,0.2247,0.09744,0.278,0.07944
75%,15.75,21.84,104.3,788.5,0.1046,0.1294,0.1204,0.07064,0.1953,0.06623,...,18.51,29.43,123.8,1035.0,0.1431,0.3214,0.3779,0.1599,0.3105,0.09158
max,27.42,39.28,186.9,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2184,0.9379,1.252,0.291,0.6638,0.173


## Combining datasets

In [21]:
# Registry of all benchmark datasets: dataset name -> its train/test split,
# stored under the keys 'X_train', 'y_train', 'X_test' and 'y_test'.
datasets = {
    name: {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    }
    for name, (X_train, y_train, X_test, y_test) in (
        ('iris', (X_train_iris, y_train_iris, X_test_iris, y_test_iris)),
        ('diabetes', (X_train_diabetes, y_train_diabetes, X_test_diabetes, y_test_diabetes)),
        ('wine', (X_train_wine, y_train_wine, X_test_wine, y_test_wine)),
        ('breast', (X_train_breast, y_train_breast, X_test_breast, y_test_breast)),
    )
}

# Parameter search spaces

| Algorytm         | Hiperparametr     | Typ        | Dolna Granica | Górna Granica |
|------------------|-------------------|------------|---------------|---------------|
| kknn             | k                 | integer    | 1             | 30            |
| XGBoost          | n_estimators      | integer    | 1             | 5000          |
| XGBoost          | eta               | float      | $2^{-10}$     | 1             |
| XGBoost          | subsample         | float      | 0.1           | 1             |
| XGBoost          | booster           | string     | -             | -             |
| XGBoost          | max_depth         | integer    | 1             | 16            |
| XGBoost          | min_child_weight  | float      | 1             | $2^7$         |
| XGBoost          | colsample_bytree  | float      | 0             | 1             |
| XGBoost          | alpha             | float      | $2^{-10}$     | $2^{10}$      |
| RandomForest     | n_estimators      | integer    | 1             | 2000          |
| RandomForest     | max_depth         | integer    | 3             | p             |
| RandomForest     | max_samples       | float      | 0.1           | 1             |
| RandomForest     | max_features      | float      | 0             | 1             |
| RandomForest     | min_samples_leaf  | float      | 0.1           | 0.5           |

In [22]:
# Search spaces for RandomizedSearchCV, keyed by model name and following the
# table of bounds above.
#
# Reminders for the continuous parameters:
# * scipy's uniform(loc, scale) samples from [loc, loc + scale], so `scale` is
#   "upper - lower", not the upper bound itself;
# * powers of two are written with `**` (`^` is bitwise XOR in Python, so
#   `2^7` evaluates to 5) and 2**-10 is not the same number as 2e-10.
hyperparameters_random = {
    "XGboost": {
        "n_estimators": np.arange(1, 5001),
        "eta": uniform(loc=2**-10, scale=1 - 2**-10),
        "subsample": uniform(loc=0.1, scale=0.8999),
        "booster": ["gbtree"],
        # NOTE(review): the table lists an upper bound of 16, while this range is
        # 1..15 (same as the Bayesian space) - confirm which one is intended.
        "max_depth": np.arange(1, 16),
        "min_child_weight": uniform(loc=1, scale=2**7 - 1),
        "colsample_bytree": uniform(loc=0, scale=1),
        "alpha": uniform(loc=2**-10, scale=2**10 - 2**-10),
    },
    "Random Forest": {
        "n_estimators": [int(x) for x in np.linspace(start=1, stop=2000, num=1000)],
        # Range 3..p inclusive (np.arange excludes its stop value, hence the +1).
        # NOTE(review): p is taken from the Iris training frame for every dataset,
        # although the other datasets have more features - confirm this is intended.
        "max_depth": np.arange(3, len(X_train_iris.columns) + 1),
        "max_features": np.linspace(start=0.1, stop=1., num=20),
        "max_samples": np.linspace(start=0.1, stop=1., num=20),
        "min_samples_leaf": np.linspace(start=0.1, stop=0.5, num=20),
    },
    "KNN": {
        "n_neighbors": np.arange(1, 31)
    },
}

In [23]:
# Search spaces for BayesSearchCV, keyed by model name and following the table
# of bounds above. The dimensions are given as (lower, upper) bounds.
#
# Powers of two are written with `**`: `^` is bitwise XOR in Python (so `2^7`
# evaluates to 5), and 2**-10 is not the same number as 2e-10.
hyperparameters_bayes = {
    "XGboost": {
        "n_estimators": Integer(1, 5000),
        "eta": Real(2**-10, 1.0),
        "subsample": Real(0.1, 1.0),
        "booster": ["gbtree"],
        # NOTE(review): the table lists an upper bound of 16, while this range is
        # 1..15 (same as the random-search space) - confirm which one is intended.
        "max_depth": Integer(1, 15),
        "min_child_weight": Real(1, 2**7),
        "colsample_bytree": Real(0., 1.),
        "alpha": Real(2**-10, 2**10),
    },
    "Random Forest": {
        "n_estimators": Integer(1, 2000),
        # NOTE(review): the upper bound p is taken from the Iris training frame for
        # every dataset, although the other datasets have more features - confirm.
        "max_depth": Integer(3, len(X_train_iris.columns)),
        "max_features": Real(1e-6, 1),
        # Lower bound 0.1 as in the table and in the random-search space.
        "max_samples": Real(0.1, 1),
        "min_samples_leaf": Real(0.1, 0.5),
    },
    "KNN": {
        "n_neighbors": Integer(1, 30),
    },
}

# Utility functions

## Hyperparameter optimisation

In [24]:
def random_search_tuning(model, params: dict,
                         X_train: pd.DataFrame, y_train: pd.Series,
                         scoring: str = None, n_iter: int = 100,
                         random_state: int = None) -> Tuple[list, list, list, dict, Any]:
    """Tune ``model`` with a 5-fold cross-validated randomized search.

    Args:
        model (BaseEstimator): The estimator whose hyperparameters are tuned.
        params: Maps each hyperparameter name to the distribution or list of
            values it is sampled from.
        X_train: Training features.
        y_train: Training targets.
        scoring: Scoring metric handed to ``RandomizedSearchCV``.
        n_iter: Number of hyperparameter configurations to sample.
        random_state: Seed handed to ``RandomizedSearchCV`` for reproducibility.

    Returns:
        A tuple ``(params, mean_test_score, mean_train_score, best_params, best_estimator)``:
        - every evaluated hyperparameter configuration,
        - the mean test score of each configuration over the CV folds,
        - the mean train score of each configuration over the CV folds,
        - the best configuration found,
        - the best estimator found by the search.
    """
    search_settings = dict(
        n_iter=n_iter,
        scoring=scoring,
        random_state=random_state,
        cv=5,
        n_jobs=-1,
        return_train_score=True,
    )
    searcher = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        **search_settings,
    )
    searcher.fit(X_train, y_train)

    cv_results = searcher.cv_results_
    return (
        cv_results['params'],
        cv_results['mean_test_score'],
        cv_results['mean_train_score'],
        searcher.best_params_,
        searcher.best_estimator_,
    )

In [25]:
def bayes_search_tuning(model, params: dict,
                        X_train: pd.DataFrame, y_train: pd.Series,
                        scoring: str = None, n_iter: int = 100,
                        random_state: int = None) -> Tuple[list, list, list, dict, Any]:
    """Tune ``model`` with a 5-fold cross-validated Bayesian search.

    Args:
        model (BaseEstimator): The estimator whose hyperparameters are tuned.
        params: Maps each hyperparameter name to its search-space dimension
            (or list of values).
        X_train: Training features.
        y_train: Training targets.
        scoring: Scoring metric handed to ``BayesSearchCV``.
        n_iter: Number of hyperparameter configurations to evaluate.
        random_state: Seed handed to ``BayesSearchCV`` for reproducibility.

    Returns:
        A tuple ``(params, mean_test_score, mean_train_score, best_params, best_estimator)``:
        - every evaluated hyperparameter configuration,
        - the mean test score of each configuration over the CV folds,
        - the mean train score of each configuration over the CV folds,
        - the best configuration found,
        - the best estimator found by the search.
    """
    search_settings = dict(
        n_iter=n_iter,
        scoring=scoring,
        random_state=random_state,
        cv=5,
        n_jobs=-1,
        return_train_score=True,
    )
    searcher = BayesSearchCV(
        estimator=model,
        search_spaces=params,
        **search_settings,
    )
    searcher.fit(X_train, y_train)

    cv_results = searcher.cv_results_
    return (
        cv_results['params'],
        cv_results['mean_test_score'],
        cv_results['mean_train_score'],
        searcher.best_params_,
        searcher.best_estimator_,
    )

## Tunability (general)

In [26]:
def _train_model_with_params(theta: dict, X_train, y_train, X_test, y_test, model_class):
    """Compute R_(j)(theta): the test accuracy of one configuration on one dataset.

    A fresh ``model_class(**theta)`` is fitted on the training split and its
    predictions on the test split are scored with ``accuracy_score``.
    """
    estimator = model_class(**theta)
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return accuracy_score(y_test, predictions)

In [27]:
def find_optimal_default_conf(params_hist: List[dict], datasets_: Dict[str, dict], model_class):
    """Find the optimal default configuration theta* across all datasets.

    Every configuration in ``params_hist`` is trained and evaluated on each
    dataset; the configuration with the highest mean test accuracy wins.

    Returns:
        mean(R_(1)(theta*), ..., R_(m)(theta*)) - the mean accuracy of the winner,
        theta* - the winning configuration.
    """
    mean_score_per_config = np.array([
        np.mean([
            _train_model_with_params(
                candidate,
                split['X_train'], split['y_train'],
                split['X_test'], split['y_test'],
                model_class,
            )
            for split in datasets_.values()
        ])
        for candidate in params_hist
    ])

    winner_idx = np.argmax(mean_score_per_config)
    return mean_score_per_config[winner_idx], params_hist[winner_idx]

In [28]:
def measure_tunability_algorithm(theta_star: dict, thetas_star_per_df: dict,
                                 datasets_: Dict[str, dict], model_class):
    """Compute the tunability d_(1), ..., d_(m) of an algorithm, one value per dataset.

    For each dataset, d_(j) is the test accuracy obtained with that dataset's own
    best configuration (``thetas_star_per_df[name]``) minus the test accuracy
    obtained with the global default ``theta_star``.

    Returns:
        Array with one difference per dataset, in the iteration order of ``datasets_``.
    """
    differences = []
    for dataset_name, split in datasets_.items():
        split_args = (split['X_train'], split['y_train'], split['X_test'], split['y_test'])
        # The default configuration is evaluated first, then the dataset-specific one.
        default_score = _train_model_with_params(theta_star, *split_args, model_class)
        tuned_score = _train_model_with_params(
            thetas_star_per_df[dataset_name], *split_args, model_class
        )
        differences.append(tuned_score - default_score)
    return np.array(differences)

## Tunability (per hyperparameter)

In [29]:
def find_optimal_default_conf_param(params_hist: List[dict], param_i_name: str, theta_star: dict,
                                    datasets_: Dict[str, dict], model_class) -> List[dict]:
    """Find, per dataset, the best value of a single hyperparameter.

    All other hyperparameters stay fixed at their ``theta_star`` values; only
    ``param_i_name`` (theta_i) is varied over the distinct values it takes in
    ``params_hist``.

    Returns:
        theta_i_(1)_star, ..., theta_i_(m)_star - one copy of ``theta_star`` per
        dataset with ``param_i_name`` replaced by the value that gave the highest
        test accuracy on that dataset.
    """
    # Distinct values only: duplicates would just repeat identical trainings.
    candidate_values = list(set(entry[param_i_name] for entry in params_hist))

    # Test accuracy of every candidate value, collected separately per dataset.
    accuracy_per_dataset = {dataset_name: [] for dataset_name in datasets_.keys()}
    for candidate in candidate_values:
        # theta_star with only the examined hyperparameter swapped out.
        trial_theta = {**theta_star, param_i_name: candidate}
        for dataset_name, split in datasets_.items():
            accuracy = _train_model_with_params(
                trial_theta,
                split['X_train'], split['y_train'],
                split['X_test'], split['y_test'],
                model_class,
            )
            accuracy_per_dataset[dataset_name].append(accuracy)

    thetas_i_star = []
    for dataset_name in datasets_.keys():
        best_value = candidate_values[np.argmax(accuracy_per_dataset[dataset_name])]
        thetas_i_star.append({**theta_star, param_i_name: best_value})
    return thetas_i_star

In [30]:
def measure_param_tunability(thetas_i_star_per_df: List[dict], theta_star: dict,
                             datasets_: Dict[str, dict], model_class) -> npt.NDArray[float]:
    """Compute the tunability d_i_(1), ..., d_i_(m) of one hyperparameter.

    For the dataset at position j, d_i_(j) is the test accuracy obtained with
    ``thetas_i_star_per_df[j]`` minus the test accuracy obtained with the global
    default ``theta_star``.

    Returns:
        Array with one difference per dataset, in the iteration order of ``datasets_``.
    """
    differences = []
    for position, split in enumerate(datasets_.values()):
        split_args = (split['X_train'], split['y_train'], split['X_test'], split['y_test'])
        # The default configuration is evaluated first, then the per-dataset one.
        default_score = _train_model_with_params(theta_star, *split_args, model_class)
        tuned_score = _train_model_with_params(
            thetas_i_star_per_df[position], *split_args, model_class
        )
        differences.append(tuned_score - default_score)
    return np.array(differences)

## Plots

In [31]:
def plot_boxplot_algo_tunability(algo_tunability: Dict[str, npt.NDArray[float]],
                                 optimization_type: Literal["Bayes", "Random"]) -> None:
    """Show one box plot of tunability values per algorithm.

    Args:
        algo_tunability: Tunability values keyed by algorithm name.
        optimization_type: Which optimisation produced the values (Bayes or
            Random); only used in the plot title.
    """
    wide_df = pd.DataFrame(algo_tunability)
    column_labels = {'variable': 'Algorithm', 'value': 'Tunability'}
    # Long format: one row per (algorithm, tunability value) pair.
    long_df = wide_df.melt().rename(columns=column_labels)

    plot_title = f'Tunability of algorithms ({optimization_type})'
    figure = px.box(long_df, x='Algorithm', y='Tunability', title=plot_title)
    figure.show()

In [32]:
def plot_boxplots_param_tunability(model_name: str,
                                   params_tunability: Dict[str, npt.NDArray[float]],
                                   optimization_type: Literal["Bayes", "Random"]) -> None:
    """Show one box plot of tunability values per hyperparameter of a model.

    Args:
        model_name: Model name, used in the plot title.
        params_tunability: Tunability values keyed by hyperparameter name.
        optimization_type: Which optimisation produced the values (Bayes or
            Random); only used in the plot title.
    """
    wide_df = pd.DataFrame(params_tunability)
    column_labels = {'variable': 'Hyperparameter', 'value': 'Tunability'}
    # Long format: one row per (hyperparameter, tunability value) pair.
    long_df = wide_df.melt().rename(columns=column_labels)

    plot_title = f'{model_name} hyperparameters tunability ({optimization_type})'
    figure = px.box(long_df, x='Hyperparameter', y='Tunability', title=plot_title)
    figure.show()

In [33]:
def plot_history(scores_train: Dict[str, npt.NDArray[float]],
                 scores_test: Dict[str, npt.NDArray[float]],
                 optimization_name: Literal['Bayesian', 'Random']) -> None:
    """Plot train and test scores per search iteration, one subplot per dataset.

    Args:
        scores_train: Train scores per iteration, keyed by dataset name.
        scores_test: Test scores per iteration, keyed by dataset name.
        optimization_name: Name of the optimisation, used in the figure title.

    The figure is a fixed 2x2 grid filled row by row in the key order of
    ``scores_train``.
    """
    datasets_names = list(scores_train.keys())
    subplot_titles = [f'Dataset: {dataset}' for dataset in datasets_names]
    fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)

    # (legend label, line colour, scores) of the two curves drawn in each subplot.
    curves = (
        ('Train Scores', 'blue', scores_train),
        ('Test Scores', 'red', scores_test),
    )
    for position, dataset in enumerate(datasets_names):
        grid_row, grid_col = divmod(position, 2)
        for label, colour, scores in curves:
            values = scores[dataset]
            trace = go.Scatter(
                x=list(range(1, len(values) + 1)),
                y=values,
                mode='lines',
                name=label,
                # Legend entries are shown once, for the first subplot only.
                showlegend=(position == 0),
                line=dict(color=colour),
            )
            fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(title_text=f'Scores for {optimization_name} Optimization')
    for bottom_col in (1, 2):
        fig.update_xaxes(title_text='Epoch', row=2, col=bottom_col)
    for left_row in (1, 2):
        fig.update_yaxes(title_text='Scores', row=left_row, col=1)
    fig.update_yaxes(range=[0, 1])

    fig.show()

In [34]:
def plot_stability(scores_test: Dict[str, npt.NDArray[float]],
                   optimization_name: Literal['Bayesian', 'Random'],
                   interval: int = 10) -> None:
    """Plot the best test score reached so far, sampled every ``interval`` iterations.

    Args:
        scores_test: Test scores per iteration, keyed by dataset name.
        optimization_name: Name of the optimisation, used in the figure title.
        interval: Distance (in iterations) between two consecutive points.

    The figure is a fixed 2x2 grid filled row by row in the key order of
    ``scores_test``.
    """
    datasets_names = list(scores_test.keys())
    color = 'red'
    fig = make_subplots(rows=2, cols=2, subplot_titles=[f'Dataset: {dataset}' for dataset in datasets_names])

    for i, dataset in enumerate(datasets_names):
        row = i // 2 + 1
        col = i % 2 + 1

        scores = scores_test[dataset]
        n_scores = len(scores)
        # Number of iterations covered by each point. The last one is clamped to the
        # real number of iterations, so a history whose length is not a multiple of
        # `interval` is not plotted past its actual end.
        checkpoints = [min(start + interval, n_scores) for start in range(0, n_scores, interval)]
        max_values = [max(scores[:stop]) for stop in checkpoints]
        fig.add_trace(
            go.Scatter(x=checkpoints, y=max_values, mode="markers+lines", line=dict(color=color)),
            row=row, col=col,
        )

    fig.update_layout(title_text=f'Stability of {optimization_name} Optimization', showlegend=False)
    fig.update_xaxes(title_text='N_iter', row=2, col=1)
    fig.update_xaxes(title_text='N_iter', row=2, col=2)
    fig.update_yaxes(title_text='Scores', row=1, col=1)
    fig.update_yaxes(title_text='Scores', row=2, col=1)
    fig.update_yaxes(range=[0, 1])

    fig.show()

# Searching for optimal parameters

In [35]:
def _get_empty_results_dict() -> Dict[str, Any]:
    """
    results = {
        'full_params_history': [...],
        'scores_test': {
            'iris': [...],
            'diabetes': [...],
            ...
        }
        'scores_train': {
            'iris': [...],
            'diabetes': [...],
            ...
        },
        'best_params': {
            'iris': [...],
            'diabetes': [...],
            ...
        },
        'best_model': {
            'iris': [...],
            'diabetes': [...],
            ...
        },
    }
    """
    return {
        'full_params_history': [],
        'scores_test': {},
        'scores_train': {},
        'best_params': {},
        'best_model': {},
    }

# One independent, initially empty results record per model, kept separately for
# the random search and for the Bayesian search.
results_random = {
    model_name: _get_empty_results_dict()
    for model_name in ('KNN', 'XGboost', 'Random Forest')
}

results_bayes = {
    model_name: _get_empty_results_dict()
    for model_name in ('KNN', 'XGboost', 'Random Forest')
}

In [36]:
def run_random_search(model_name: str, model_class, random_state: Optional[int] = None,
                      n_iter: Optional[int] = None, verbose=True):
    """Run the random search for one model on every dataset and store the results.

    The results are written to ``results_random[model_name]``; the record of that
    model is reset first, so running the search again replaces the previous
    results instead of appending to them.

    Args:
        model_name: Key of the model in ``hyperparameters_random`` / ``results_random``.
        model_class: Estimator class; a new default instance is created per dataset.
        random_state: Seed passed on to the search.
        n_iter: Number of configurations to sample; ``None`` keeps the default of
            ``random_search_tuning``.
        verbose: Whether to print the name of the dataset being processed.
    """
    # Looked up first, so an unknown model name fails before any result is touched.
    search_space = hyperparameters_random[model_name]

    # Forward n_iter only when it was given: passing None through would replace
    # the tuner's own default with a value the search cannot use.
    optional_kwargs = {} if n_iter is None else {'n_iter': n_iter}

    # Fresh record: otherwise a re-run would append a second copy of the parameter
    # history while the per-dataset scores get overwritten.
    model_results = _get_empty_results_dict()
    results_random[model_name] = model_results

    for dataset_name, dataset in datasets.items():
        if verbose:
            print(f'Current dataset: {dataset_name}')
        params_history, scores_test, scores_train, best_params, best_model = random_search_tuning(
            model=model_class(),
            params=search_space,
            X_train=dataset['X_train'],
            y_train=dataset['y_train'],
            random_state=random_state,
            **optional_kwargs,
        )
        # The history is shared by all datasets; the other entries are per dataset.
        model_results['full_params_history'].extend(params_history)
        model_results['scores_test'][dataset_name] = scores_test
        model_results['scores_train'][dataset_name] = scores_train
        model_results['best_params'][dataset_name] = best_params
        model_results['best_model'][dataset_name] = best_model
    

In [37]:
def run_bayes_search(model_name: str, model_class, random_state: Optional[int] = None,
                     n_iter: Optional[int] = None, verbose=True):
    """
    Run `bayes_search_tuning` on every entry of the global `datasets` and
    record the outcome in the global `results_bayes[model_name]`.

    Parameters
    ----------
    model_name:
        Key used to look up the search space in `hyperparameters_bayes`
        and the result slot in `results_bayes`.
    model_class:
        Called without arguments to create a new estimator for each dataset.
    random_state, n_iter:
        Forwarded unchanged to `bayes_search_tuning`.
    verbose:
        When truthy, the name of the dataset being processed is printed.

    Returns
    -------
    None. Results are stored as a side effect. 'full_params_history' is
    extended on every call, so repeated calls for the same model accumulate.
    """
    for dataset_name, dataset in datasets.items():
        if verbose:
            print(f'Current dataset: {dataset_name}')

        tuning_output = bayes_search_tuning(
            model=model_class(),
            params=hyperparameters_bayes[model_name],
            X_train=dataset['X_train'],
            y_train=dataset['y_train'],
            random_state=random_state,
            n_iter=n_iter,
        )
        history, test_scores, train_scores, top_params, top_model = tuning_output

        store = results_bayes[model_name]
        # The parameter history is shared across datasets; everything else is per dataset.
        store['full_params_history'].extend(history)
        per_dataset_values = {
            'scores_test': test_scores,
            'scores_train': train_scores,
            'best_params': top_params,
            'best_model': top_model,
        }
        for result_key, value in per_dataset_values.items():
            store[result_key][dataset_name] = value

## KNN

In [56]:
# Random search for KNN on all datasets; results land in results_random['KNN'].
run_random_search(
    model_name='KNN',
    model_class=KNeighborsClassifier,
    random_state=28,
    n_iter=20,
)

In [59]:
# Bayes search for KNN on all datasets; results land in results_bayes['KNN'].
run_bayes_search(
    model_name='KNN',
    model_class=KNeighborsClassifier,
    random_state=28,
    n_iter=20,
)

## XGBoost

In [46]:
# Random search for XGBoost on all datasets; results land in results_random['XGboost'].
run_random_search(
    model_name='XGboost',
    model_class=XGBClassifier,
    random_state=28,
    n_iter=150,
)

In [47]:
# Bayes search for XGBoost on all datasets; results land in results_bayes['XGboost'].
run_bayes_search(
    model_name='XGboost',
    model_class=XGBClassifier,
    random_state=28,
    n_iter=150,
)

## Random Forest

In [44]:
# Random search for Random Forest on all datasets; results land in results_random['Random Forest'].
run_random_search(
    model_name='Random Forest',
    model_class=RandomForestClassifier,
    random_state=28,
    n_iter=150,
)

In [None]:
# Bayes search for Random Forest on all datasets; results land in results_bayes['Random Forest'].
run_bayes_search(
    model_name='Random Forest',
    model_class=RandomForestClassifier,
    random_state=28,
    n_iter=150,
)

## Loading results from files

To speed up the computations above, they were split and run separately on two different devices. Their results are loaded and merged below:

In [38]:
import json

# Each entry: (results file, dictionary to fill, model keys copied from that file).
# The loaded entries replace whatever results_bayes / results_random held for those models.
_result_files = (
    ('results_bayes_knn_and_xgboost.json', results_bayes, ('KNN', 'XGboost')),
    ('results_bayes_rf.json', results_bayes, ('Random Forest',)),
    ('results_random_KNN.json', results_random, ('KNN',)),
    ('results_random_xgb.json', results_random, ('XGboost',)),
    ('results_random_rf.json', results_random, ('Random Forest',)),
)

for _file_name, _destination, _model_names in _result_files:
    with open(f'{base_path}/{_file_name}') as file:
        file_contents = json.load(file)
    for _model_name in _model_names:
        _destination[_model_name] = file_contents[_model_name]

# Calculating tunabilities

## Algorithm tunabilities

### KNN: Random Search

In [48]:
# Only the second value returned by find_optimal_default_conf is kept (as theta*).
_, knn_random_theta_star = find_optimal_default_conf(
    model_class=KNeighborsClassifier,
    datasets_=datasets,
    params_hist=results_random['KNN']['full_params_history'],
)

In [49]:
# measure_tunability_algorithm for KNN, fed with the random-search theta* and stored best params.
knn_random_algo_tunability = measure_tunability_algorithm(
    knn_random_theta_star,
    results_random['KNN']['best_params'],
    datasets,
    KNeighborsClassifier,
)

### KNN: Bayes Search

In [51]:
# Only the second value returned by find_optimal_default_conf is kept (as theta*).
_, knn_bayes_theta_star = find_optimal_default_conf(
    model_class=KNeighborsClassifier,
    datasets_=datasets,
    params_hist=results_bayes['KNN']['full_params_history'],
)

In [52]:
# measure_tunability_algorithm for KNN, fed with the Bayes-search theta* and stored best params.
knn_bayes_algo_tunability = measure_tunability_algorithm(
    knn_bayes_theta_star,
    results_bayes['KNN']['best_params'],
    datasets,
    KNeighborsClassifier,
)

### XGBoost: Random Search

In [54]:
# Only the second value returned by find_optimal_default_conf is kept (as theta*).
_, xgboost_random_theta_star = find_optimal_default_conf(
    model_class=XGBClassifier,
    datasets_=datasets,
    params_hist=results_random['XGboost']['full_params_history'],
)

In [None]:
# measure_tunability_algorithm for XGBoost, fed with the random-search theta* and stored best params.
xgboost_random_algo_tunability = measure_tunability_algorithm(
    xgboost_random_theta_star, results_random['XGboost']['best_params'],
    datasets, XGBClassifier,
)

### XGBoost: Bayes Search

In [55]:
# Only the second value returned by find_optimal_default_conf is kept (as theta*).
_, xgboost_bayes_theta_star = find_optimal_default_conf(
    model_class=XGBClassifier,
    datasets_=datasets,
    params_hist=results_bayes['XGboost']['full_params_history'],
)

In [None]:
# measure_tunability_algorithm for XGBoost, fed with the Bayes-search theta* and stored best params.
xgboost_bayes_algo_tunability = measure_tunability_algorithm(
    xgboost_bayes_theta_star, results_bayes['XGboost']['best_params'],
    datasets, XGBClassifier,
)

### Random Forest: Random Search

In [55]:
# Only the second value returned by find_optimal_default_conf is kept (as theta*).
_, rf_random_theta_star = find_optimal_default_conf(
    model_class=RandomForestClassifier,
    datasets_=datasets,
    params_hist=results_random['Random Forest']['full_params_history'],
)

In [None]:
# measure_tunability_algorithm for Random Forest, fed with the random-search theta* and stored best params.
rf_random_algo_tunability = measure_tunability_algorithm(
    rf_random_theta_star,
    results_random['Random Forest']['best_params'],
    datasets,
    RandomForestClassifier,
)

### Random Forest: Bayes Search

In [None]:
# Only the second value returned by find_optimal_default_conf is kept (as theta*).
_, rf_bayes_theta_star = find_optimal_default_conf(
    model_class=RandomForestClassifier,
    datasets_=datasets,
    params_hist=results_bayes['Random Forest']['full_params_history'],
)

In [None]:
# measure_tunability_algorithm for Random Forest, fed with the Bayes-search theta* and stored best params.
rf_bayes_algo_tunability = measure_tunability_algorithm(
    rf_bayes_theta_star,
    results_bayes['Random Forest']['best_params'],
    datasets,
    RandomForestClassifier,
)

In [None]:
# Algorithm-level tunability results from the random-search runs, keyed by model name.
tunability_of_algo_random = dict([
    ('KNN', knn_random_algo_tunability),
    ('XGboost', xgboost_random_algo_tunability),
    ('Random Forest', rf_random_algo_tunability),
])

In [None]:
# Algorithm-level tunability results from the Bayes-search runs, keyed by model name.
tunability_of_algo_bayes = dict([
    ('KNN', knn_bayes_algo_tunability),
    ('XGboost', xgboost_bayes_algo_tunability),
    ('Random Forest', rf_bayes_algo_tunability),
])

In [None]:
# Box plot of the algorithm-level tunability values collected for random search.
plot_boxplot_algo_tunability(
    tunability_of_algo_random,
    'Random',
)

In [None]:
# Box plot of the algorithm-level tunability values collected for Bayes search.
plot_boxplot_algo_tunability(
    tunability_of_algo_bayes,
    'Bayes',
)

## Hyperparameter tunabilities

The computations above were conducted on another device.

Some of their results are loaded below, because they are needed for further computations:

In [39]:
# Load the stored random-search theta* values. These assignments replace any
# same-named variables that earlier cells may have computed.
with open(f'{base_path}/thetas_star_random.json') as file:
    data = json.load(file)

knn_random_theta_star = data['KNN']
xgboost_random_theta_star = data['XGboost']
rf_random_theta_star = data['Random Forest']

In [40]:
# Load the stored Bayes-search theta* values. These assignments replace any
# same-named variables that earlier cells may have computed.
with open(f'{base_path}/thetas_star_bayes.json') as file:
    data = json.load(file)

knn_bayes_theta_star = data['KNN']
xgboost_bayes_theta_star = data['XGboost']
rf_bayes_theta_star = data['Random Forest']

### KNN: Random Search

In [45]:
# One empty placeholder array per KNN hyperparameter in the random-search space.
knn_random_params_tun = dict(
    (name, np.array([])) for name in hyperparameters_random['KNN'].keys()
)
knn_random_params_tun

{'n_neighbors': array([], dtype=float64)}

In [46]:
# Loop over a snapshot of the keys because the dict entries are reassigned inside the loop.
for param_i in tuple(knn_random_params_tun):
    # Step 1: find_optimal_default_conf_param for this hyperparameter.
    theta_i_stars_per_df = find_optimal_default_conf_param(
        param_i_name=param_i,
        params_hist=results_random['KNN']['full_params_history'],
        theta_star=knn_random_theta_star,
        datasets_=datasets, model_class=KNeighborsClassifier,
    )
    # Step 2: feed that result to measure_param_tunability and store it under the same key.
    param_i_tunability = measure_param_tunability(
        theta_i_stars_per_df,
        theta_star=knn_random_theta_star,
        datasets_=datasets, model_class=KNeighborsClassifier,
    )
    knn_random_params_tun[param_i] = param_i_tunability

In [43]:
# Box plots of the per-hyperparameter tunability values for KNN / random search.
plot_boxplots_param_tunability(
    'KNN',
    knn_random_params_tun,
    'Random',
)

### KNN: Bayes Search

In [47]:
# One empty placeholder array per KNN hyperparameter in the Bayes-search space.
knn_bayes_params_tun = dict(
    (name, np.array([])) for name in hyperparameters_bayes['KNN'].keys()
)
knn_bayes_params_tun

{'n_neighbors': array([], dtype=float64)}

In [48]:
# Loop over a snapshot of the keys because the dict entries are reassigned inside the loop.
for param_i in tuple(knn_bayes_params_tun):
    # Step 1: find_optimal_default_conf_param for this hyperparameter.
    theta_i_stars_per_df = find_optimal_default_conf_param(
        param_i_name=param_i,
        params_hist=results_bayes['KNN']['full_params_history'],
        theta_star=knn_bayes_theta_star,
        datasets_=datasets, model_class=KNeighborsClassifier,
    )
    # Step 2: feed that result to measure_param_tunability and store it under the same key.
    param_i_tunability = measure_param_tunability(
        theta_i_stars_per_df,
        theta_star=knn_bayes_theta_star,
        datasets_=datasets, model_class=KNeighborsClassifier,
    )
    knn_bayes_params_tun[param_i] = param_i_tunability

In [47]:
# Box plots of the per-hyperparameter tunability values for KNN / Bayes search.
plot_boxplots_param_tunability(
    'KNN',
    knn_bayes_params_tun,
    'Bayes',
)

### XGBoost: Random Search

In [49]:
# One empty placeholder array per XGBoost hyperparameter in the random-search space.
xgboost_random_params_tun = dict(
    (name, np.array([])) for name in hyperparameters_random['XGboost'].keys()
)
xgboost_random_params_tun

{'n_estimators': array([], dtype=float64),
 'eta': array([], dtype=float64),
 'subsample': array([], dtype=float64),
 'booster': array([], dtype=float64),
 'max_depth': array([], dtype=float64),
 'min_child_weight': array([], dtype=float64),
 'colsample_bytree': array([], dtype=float64),
 'alpha': array([], dtype=float64)}

In [50]:
# Loop over a snapshot of the keys because the dict entries are reassigned inside the loop.
for param_i in tuple(xgboost_random_params_tun):
    print(f'Current param: {param_i}')
    # Step 1: find_optimal_default_conf_param for this hyperparameter.
    theta_i_stars_per_df = find_optimal_default_conf_param(
        param_i_name=param_i,
        params_hist=results_random['XGboost']['full_params_history'],
        theta_star=xgboost_random_theta_star,
        datasets_=datasets, model_class=XGBClassifier,
    )
    # Step 2: feed that result to measure_param_tunability and store it under the same key.
    param_i_tunability = measure_param_tunability(
        theta_i_stars_per_df,
        theta_star=xgboost_random_theta_star,
        datasets_=datasets, model_class=XGBClassifier,
    )
    xgboost_random_params_tun[param_i] = param_i_tunability

Current param: n_estimators
Current param: eta
Current param: subsample
Current param: booster
Current param: max_depth
Current param: min_child_weight
Current param: colsample_bytree
Current param: alpha


In [50]:
# Box plots of the per-hyperparameter tunability values for XGBoost / random search.
plot_boxplots_param_tunability(
    'XGboost',
    xgboost_random_params_tun,
    'Random',
)

### XGBoost: Bayes Search

In [51]:
# One empty placeholder array per XGBoost hyperparameter in the Bayes-search space.
xgboost_bayes_params_tun = dict(
    (name, np.array([])) for name in hyperparameters_bayes['XGboost'].keys()
)
xgboost_bayes_params_tun

{'n_estimators': array([], dtype=float64),
 'eta': array([], dtype=float64),
 'subsample': array([], dtype=float64),
 'booster': array([], dtype=float64),
 'max_depth': array([], dtype=float64),
 'min_child_weight': array([], dtype=float64),
 'colsample_bytree': array([], dtype=float64),
 'alpha': array([], dtype=float64)}

In [52]:
# Loop over a snapshot of the keys because the dict entries are reassigned inside the loop.
for param_i in tuple(xgboost_bayes_params_tun):
    print(f'Current param: {param_i}')
    # Step 1: find_optimal_default_conf_param for this hyperparameter.
    theta_i_stars_per_df = find_optimal_default_conf_param(
        param_i_name=param_i,
        params_hist=results_bayes['XGboost']['full_params_history'],
        theta_star=xgboost_bayes_theta_star,
        datasets_=datasets, model_class=XGBClassifier,
    )
    # Step 2: feed that result to measure_param_tunability and store it under the same key.
    param_i_tunability = measure_param_tunability(
        theta_i_stars_per_df,
        theta_star=xgboost_bayes_theta_star,
        datasets_=datasets, model_class=XGBClassifier,
    )
    xgboost_bayes_params_tun[param_i] = param_i_tunability

Current param: n_estimators
Current param: eta
Current param: subsample
Current param: booster
Current param: max_depth
Current param: min_child_weight
Current param: colsample_bytree
Current param: alpha


In [55]:
# Box plots of the per-hyperparameter tunability values for XGBoost / Bayes search.
plot_boxplots_param_tunability(
    'XGboost',
    xgboost_bayes_params_tun,
    'Bayes',
)

### Random Forest: Random Search

In [53]:
# One empty placeholder array per Random Forest hyperparameter in the random-search space.
rf_random_params_tun = dict(
    (name, np.array([])) for name in hyperparameters_random['Random Forest'].keys()
)
rf_random_params_tun

{'n_estimators': array([], dtype=float64),
 'max_depth': array([], dtype=float64),
 'max_features': array([], dtype=float64),
 'max_samples': array([], dtype=float64),
 'min_samples_leaf': array([], dtype=float64)}

In [54]:
# Loop over a snapshot of the keys because the dict entries are reassigned inside the loop.
for param_i in tuple(rf_random_params_tun):
    print(f'Current param: {param_i}')
    # Step 1: find_optimal_default_conf_param for this hyperparameter.
    theta_i_stars_per_df = find_optimal_default_conf_param(
        param_i_name=param_i,
        params_hist=results_random['Random Forest']['full_params_history'],
        theta_star=rf_random_theta_star,
        datasets_=datasets, model_class=RandomForestClassifier,
    )
    # Step 2: feed that result to measure_param_tunability and store it under the same key.
    param_i_tunability = measure_param_tunability(
        theta_i_stars_per_df,
        theta_star=rf_random_theta_star,
        datasets_=datasets, model_class=RandomForestClassifier,
    )
    rf_random_params_tun[param_i] = param_i_tunability

Current param: n_estimators
Current param: max_depth
Current param: max_features
Current param: max_samples
Current param: min_samples_leaf


In [56]:
# Box plots of the per-hyperparameter tunability values for Random Forest / random search.
plot_boxplots_param_tunability(
    'Random Forest',
    rf_random_params_tun,
    'Random',
)

### Random Forest: Bayes Search

In [41]:
# One empty placeholder array per Random Forest hyperparameter in the Bayes-search space.
rf_bayes_params_tun = dict(
    (name, np.array([])) for name in hyperparameters_bayes['Random Forest'].keys()
)
rf_bayes_params_tun

{'n_estimators': array([], dtype=float64),
 'max_depth': array([], dtype=float64),
 'max_features': array([], dtype=float64),
 'max_samples': array([], dtype=float64),
 'min_samples_leaf': array([], dtype=float64)}

In [42]:
# Loop over a snapshot of the keys because the dict entries are reassigned inside the loop.
for param_i in tuple(rf_bayes_params_tun):
    print(f'Current param: {param_i}')
    # Step 1: find_optimal_default_conf_param for this hyperparameter.
    theta_i_stars_per_df = find_optimal_default_conf_param(
        param_i_name=param_i,
        params_hist=results_bayes['Random Forest']['full_params_history'],
        theta_star=rf_bayes_theta_star,
        datasets_=datasets, model_class=RandomForestClassifier,
    )
    # Step 2: feed that result to measure_param_tunability and store it under the same key.
    param_i_tunability = measure_param_tunability(
        theta_i_stars_per_df,
        theta_star=rf_bayes_theta_star,
        datasets_=datasets, model_class=RandomForestClassifier,
    )
    rf_bayes_params_tun[param_i] = param_i_tunability

Current param: n_estimators
Current param: max_depth
Current param: max_features
Current param: max_samples
Current param: min_samples_leaf


In [43]:
# Box plots of the per-hyperparameter tunability values for Random Forest / Bayes search.
plot_boxplots_param_tunability(
    'Random Forest',
    rf_bayes_params_tun,
    'Bayes',
)

In [56]:
# Per-hyperparameter tunability results grouped by model name, one dict per search method.
tunability_of_params_random = dict([
    ('KNN', knn_random_params_tun),
    ('XGboost', xgboost_random_params_tun),
    ('Random Forest', rf_random_params_tun),
])
tunability_of_params_bayes = dict([
    ('KNN', knn_bayes_params_tun),
    ('XGboost', xgboost_bayes_params_tun),
    ('Random Forest', rf_bayes_params_tun),
])

In [60]:
class NumpyEncoder(json.JSONEncoder):
    """
    JSON encoder that converts NumPy values into built-in Python equivalents.

    Handles NumPy booleans, integers, floating-point scalars and arrays.
    Any other unsupported object is passed to the base class, which raises
    TypeError.

    Based on:
    https://stackoverflow.com/questions/26646362/numpy-array-is-not-json-serializable
    """
    def default(self, obj):
        """Return a JSON-serializable replacement for `obj`."""
        # np.bool_ is neither np.integer nor np.floating, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            # tolist() also converts the elements to built-in Python scalars.
            return obj.tolist()
        return super().default(obj)

# Write both per-parameter tunability dicts to JSON files under base_path,
# passing NumpyEncoder so the NumPy values they contain can be serialized.
for _file_stem, _payload in (
    ('tunability_of_params_random', tunability_of_params_random),
    ('tunability_of_params_bayes', tunability_of_params_bayes),
):
    with open(f'{base_path}/{_file_stem}.json', 'w') as file:
        json.dump(_payload, file, indent=4, cls=NumpyEncoder)

# Checking stability

## Random Search

### KNN

In [61]:
# Stability plot built from the KNN random-search 'scores_test' entries.
plot_stability(
    results_random['KNN']['scores_test'],
    'Random',
    interval=4,
)

### XGBoost

In [62]:
# Stability plot built from the XGBoost random-search 'scores_test' entries.
plot_stability(
    results_random['XGboost']['scores_test'],
    'Random',
    interval=10,
)

### Random Forest

In [63]:
# Stability plot built from the Random Forest random-search 'scores_test' entries.
plot_stability(
    results_random['Random Forest']['scores_test'],
    'Random',
    interval=10,
)

## Bayes Search

### KNN

In [64]:
# Stability plot built from the KNN Bayes-search 'scores_test' entries.
plot_stability(
    results_bayes['KNN']['scores_test'],
    'Bayesian',
    interval=10,
)

### XGBoost

In [65]:
# Stability plot built from the XGBoost Bayes-search 'scores_test' entries.
plot_stability(
    results_bayes['XGboost']['scores_test'],
    'Bayesian',
    interval=10,
)

### Random Forest

In [66]:
# Stability plot built from the Random Forest Bayes-search 'scores_test' entries.
plot_stability(
    results_bayes['Random Forest']['scores_test'],
    'Bayesian',
    interval=10,
)

# Tuning History

## Random Search

### KNN

In [67]:
# Tuning history for KNN / random search: 'scores_train' first, then 'scores_test'.
plot_history(
    results_random['KNN']['scores_train'],
    results_random['KNN']['scores_test'],
    'Random',
)

### XGBoost

In [68]:
# Tuning history for XGBoost / random search: 'scores_train' first, then 'scores_test'.
plot_history(
    results_random['XGboost']['scores_train'],
    results_random['XGboost']['scores_test'],
    'Random',
)

### Random Forest

In [69]:
# Tuning history for Random Forest / random search: 'scores_train' first, then 'scores_test'.
plot_history(
    results_random['Random Forest']['scores_train'],
    results_random['Random Forest']['scores_test'],
    'Random',
)

## Bayes Search

### KNN

In [70]:
# Tuning history for KNN / Bayes search: 'scores_train' first, then 'scores_test'.
plot_history(
    results_bayes['KNN']['scores_train'],
    results_bayes['KNN']['scores_test'],
    'Bayesian',
)

### XGBoost

In [71]:
# Tuning history for XGBoost / Bayes search: 'scores_train' first, then 'scores_test'.
plot_history(
    results_bayes['XGboost']['scores_train'],
    results_bayes['XGboost']['scores_test'],
    'Bayesian',
)

### Random Forest

In [72]:
# Tuning history for Random Forest / Bayes search: 'scores_train' first, then 'scores_test'.
plot_history(
    results_bayes['Random Forest']['scores_train'],
    results_bayes['Random Forest']['scores_test'],
    'Bayesian',
)