In [190]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation / future warnings
# raised by the libraries used below - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [191]:
import openml as oml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import ElasticNet
from skopt import BayesSearchCV
from skopt.space.space import Real
from scipy.stats import uniform
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

In [192]:
def read_data(file_id):
    """Download an OpenML dataset and return its contents as a DataFrame.

    Parameters
    ----------
    file_id
        Value forwarded to ``oml.datasets.get_dataset`` as ``dataset_id``.

    Returns
    -------
    The first element of the 4-tuple produced by
    ``get_data(dataset_format="dataframe")``; the other three elements are discarded.
    """
    dataset = oml.datasets.get_dataset(
        dataset_id=file_id,
        download_data=True,
        download_qualities=True,
        download_features_meta_data=True,
    )
    frame, _, _, _ = dataset.get_data(dataset_format="dataframe")
    return frame

### Wczytuję zbiory danych

In [193]:
# Fetch the four OpenML datasets used throughout the notebook, in a fixed order.
df1, df2, df3, df4 = (read_data(dataset_id) for dataset_id in (1063, 879, 15, 37))

In [194]:
# Binarise the target column: 'yes' -> 1, every other value -> 0.
df1['problems'] = (df1['problems'] == 'yes').astype('int64')

In [195]:
df1.isna().sum()

loc                 0
v(g)                0
ev(g)               0
iv(g)               0
n                   0
v                   0
l                   0
d                   0
i                   0
e                   0
b                   0
t                   0
lOCode              0
lOComment           0
lOBlank             0
lOCodeAndComment    0
uniq_Op             0
uniq_Opnd           0
total_Op            0
total_Opnd          0
branchCount         0
problems            0
dtype: int64

In [196]:
df1

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,lOCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,problems
0,1.1,1.4,1.4,1.4,1.3,1.30,1.30,1.30,1.30,1.30,...,2.0,2,2,2,1.2,1.2,1.2,1.2,1.4,0
1,1.0,1.0,1.0,1.0,1.0,1.00,1.00,1.00,1.00,1.00,...,1.0,1,1,1,1.0,1.0,1.0,1.0,1.0,1
2,415.0,59.0,50.0,51.0,1159.0,8411.31,0.01,103.53,81.24,870848.58,...,359.0,35,9,10,47.0,106.0,692.0,467.0,106.0,1
3,230.0,33.0,10.0,16.0,575.0,3732.82,0.03,39.82,93.74,148644.06,...,174.0,15,34,5,23.0,67.0,343.0,232.0,65.0,1
4,175.0,26.0,12.0,13.0,500.0,3123.96,0.03,29.48,105.96,92103.07,...,142.0,7,19,4,18.0,58.0,310.0,190.0,51.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,4.0,1.0,1.0,1.0,5.0,11.61,0.50,2.00,5.80,23.22,...,2.0,0,0,0,4.0,1.0,4.0,1.0,1.0,1
518,4.0,1.0,1.0,1.0,4.0,8.00,0.67,1.50,5.33,12.00,...,2.0,0,0,0,3.0,1.0,3.0,1.0,1.0,1
519,4.0,1.0,1.0,1.0,4.0,8.00,0.67,1.50,5.33,12.00,...,2.0,0,0,0,3.0,1.0,3.0,1.0,1.0,1
520,4.0,1.0,1.0,1.0,5.0,11.61,0.67,1.50,7.74,17.41,...,2.0,0,0,0,3.0,2.0,3.0,2.0,1.0,1


In [197]:
# Binarise the target column: 'P' -> 1, every other value -> 0.
df2['binaryClass'] = (df2['binaryClass'] == 'P').astype('int64')

In [198]:
df2.isna().sum()

oz1            0
oz2            0
oz3            0
oz4            0
oz5            0
oz6            0
oz7            0
oz8            0
oz9            0
oz10           0
oz11           0
oz12           0
oz13           0
oz14           0
oz15           0
oz16           0
oz17           0
oz18           0
oz19           0
oz20           0
oz21           0
oz22           0
oz23           0
oz24           0
oz25           0
binaryClass    0
dtype: int64

In [199]:
df2

Unnamed: 0,oz1,oz2,oz3,oz4,oz5,oz6,oz7,oz8,oz9,oz10,...,oz17,oz18,oz19,oz20,oz21,oz22,oz23,oz24,oz25,binaryClass
0,1.369332,1.348053,1.245546,0.255949,0.095249,1.344574,1.267686,0.301357,-0.939330,0.277674,...,0.466503,0.735694,1.147137,-0.252866,-1.709447,1.427959,1.817673,1.691976,-1.488347,1
1,0.699379,1.356297,0.464627,-0.518112,-0.100324,1.327687,0.016989,-0.111052,-0.230803,1.408724,...,-0.928591,-0.698394,0.123011,-1.643069,1.373670,-0.970713,-0.070211,-1.361266,0.443797,1
2,0.122215,0.390102,0.355066,0.528076,-0.465258,0.099297,1.588123,1.088600,-0.872816,-1.416401,...,0.948611,0.333686,-1.316199,1.276892,0.065366,1.638574,1.189680,-1.273613,-0.281817,1
3,0.569990,-0.012569,-0.159028,1.547892,-0.702307,-1.337401,-0.640130,0.470843,-1.349861,-1.541814,...,-1.661619,1.507752,1.469623,0.377710,1.148096,-1.291779,0.689598,0.782770,1.558731,0
4,-0.841718,-0.698022,-0.652951,0.021105,1.706582,0.778097,-0.688424,-1.693133,-0.787110,-1.397107,...,0.352606,0.912313,0.734768,-0.351515,0.540305,0.107928,1.334917,1.657940,1.154481,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-0.239268,0.132987,-0.734252,-0.853620,0.438660,0.658420,-1.049204,0.182437,1.251415,0.816688,...,-1.537533,1.051780,-1.793538,-0.156870,-0.442729,1.686997,1.304788,0.945624,-1.061607,0
496,1.132122,0.639799,-0.966933,-1.016170,1.507822,1.613090,1.652634,0.266832,0.756470,0.134773,...,0.676295,0.352450,0.204556,-0.447309,-1.444957,-0.798681,0.083117,-0.871992,-1.201687,1
497,0.774384,0.203377,0.779871,0.858155,0.993768,1.211152,-1.212718,-1.272663,1.235402,0.017784,...,1.422050,-1.559507,-0.377867,0.118937,1.458773,0.036788,-0.194141,-1.091008,-0.174352,1
498,-1.013707,-0.694703,-1.156980,-1.168977,0.184395,0.820507,1.381893,-0.028399,0.335243,0.492245,...,1.076988,0.855167,-0.227880,1.317175,0.470612,1.755487,0.073124,0.198448,1.404181,0


In [200]:
# Binarise the target column: 'malignant' -> 1, every other value -> 0.
df3['Class'] = (df3['Class'] == 'malignant').astype('int64')

In [201]:
df3.isna().sum()

Clump_Thickness           0
Cell_Size_Uniformity      0
Cell_Shape_Uniformity     0
Marginal_Adhesion         0
Single_Epi_Cell_Size      0
Bare_Nuclei              16
Bland_Chromatin           0
Normal_Nucleoli           0
Mitoses                   0
Class                     0
dtype: int64

In [202]:
# Impute the missing 'Bare_Nuclei' values (16 according to the isna() summary)
# with the column minimum.
# The filled Series is assigned back explicitly: calling fillna(inplace=True) on
# the df3['Bare_Nuclei'] selection is chained in-place assignment, which pandas
# has deprecated and which leaves df3 unchanged once Copy-on-Write is active.
min_value = df3['Bare_Nuclei'].min()
df3['Bare_Nuclei'] = df3['Bare_Nuclei'].fillna(min_value)

In [203]:
df3

Unnamed: 0,Clump_Thickness,Cell_Size_Uniformity,Cell_Shape_Uniformity,Marginal_Adhesion,Single_Epi_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0
1,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,0
2,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,0
3,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,0
4,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...
694,3.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,0
695,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0
696,5.0,10.0,10.0,3.0,7.0,3.0,8.0,10.0,2.0,1
697,4.0,8.0,6.0,4.0,3.0,4.0,10.0,6.0,1.0,1


In [204]:
# Binarise the target column: 'tested_positive' -> 1, every other value -> 0.
df4['class'] = (df4['class'] == 'tested_positive').astype('int64')

In [205]:
df4.isna().sum()

preg     0
plas     0
pres     0
skin     0
insu     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [206]:
df4

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,1


### Modelowanie

In [207]:
def _holdout_split(frame):
    """Split a frame 70/30 with a fixed seed; the last column is the target, the rest are features."""
    features = frame.iloc[:, :-1]
    target = frame.iloc[:, -1]
    return train_test_split(features, target, test_size=0.3, random_state=0)


X1_train, X1_test, y1_train, y1_test = _holdout_split(df1)
X2_train, X2_test, y2_train, y2_test = _holdout_split(df2)
X3_train, X3_test, y3_train, y3_test = _holdout_split(df3)
X4_train, X4_test, y4_train, y4_test = _holdout_split(df4)

#### Zbiór hiperparametrów do tuningowania

In [208]:
# Search space for RandomForestClassifier.
params_RF = {
    'n_estimators': np.arange(1, 2001),
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(3, 11),
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11),
    'bootstrap': [True, False],
    # A float max_samples must lie in (0, 1] for scikit-learn's forests, so the
    # grid starts at 0.1 instead of 0.0. linspace is used so that the end point
    # is exactly 1.0 rather than the result of accumulated float steps.
    'max_samples': np.linspace(0.1, 1.0, 10),
}

# Search space for ElasticNet.
params_EN = {
    "alpha": [2**x for x in range(-10, 11)],
    # scipy distribution for random search; compute_on_algorithm replaces it
    # with skopt's Real(0, 1) when algorithm == 'bayesian'.
    "l1_ratio": uniform(0,1),
}

# Search space for DecisionTreeClassifier.
params_DT = {
    'max_depth': np.arange(1, 31),
    'max_features': [None, 'log2', 'sqrt'],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': np.arange(2, 61),
    'min_samples_leaf': np.arange(1, 61),
}

In [209]:
# R^(j)(theta*): score obtained on dataset j by the configuration theta* that
# has the highest mean score across all datasets.

def get_overall_score(agg_results, results):
    """Return the overall best configuration and its score on one dataset.

    Parameters
    ----------
    agg_results : DataFrame
        Pooled results with a 'params' column and a 'mean_auc' column.
    results : DataFrame
        Results of a single dataset with 'params' and 'mean_test_score' columns.

    Returns
    -------
    (best_params, best_score)
        ``best_params`` is the 'params' value of the ``agg_results`` row with the
        highest 'mean_auc'; ``best_score`` is the 'mean_test_score' of that
        configuration in ``results``.

    Notes
    -----
    The score is looked up by configuration (string form of 'params'), not by row
    label, so the result does not depend on both frames listing configurations in
    the same order. If the configuration does not occur in ``results`` the
    function falls back to the previous behaviour and reads the row whose label
    equals the winning label of ``agg_results``.
    """
    max_index = agg_results['mean_auc'].idxmax()
    best_params = agg_results.loc[max_index, 'params']

    # 'params' may hold dicts in `results` but strings in `agg_results`;
    # compare both in string form.
    same_config = results['params'].astype(str) == str(best_params)
    if same_config.any():
        best_score = results.loc[same_config, 'mean_test_score'].iloc[0]
    else:
        # Backward-compatible fallback: positional coupling via the row label.
        best_score = results.loc[max_index, 'mean_test_score']
    return best_params, best_score

In [210]:
# R^(j)(theta^(j)*): best score reached on dataset j and the configuration
# that reached it.

def get_score(results):
    """Return the best configuration of a single dataset and its score.

    ``results`` needs 'params' and 'mean_test_score' columns. The returned pair is
    the 'params' value of the row with the highest 'mean_test_score' and that
    highest score.
    """
    scores = results['mean_test_score']
    winner_label = scores.idxmax()
    return results.loc[winner_label, 'params'], scores.max()

In [211]:
def compute_on_algorithm(clf,params,n_iter,X_train,y_train, algorithm):
    """Tune ``clf`` on every dataset and pool the cross-validation results.

    Parameters
    ----------
    clf
        Estimator handed unchanged to the search routine.
    params : dict
        Search space. A shallow copy is adjusted internally, so the dict passed
        by the caller is never modified.
    n_iter
        Forwarded to the search routine.
    X_train, y_train
        Iterables of equal length; element i of each forms one dataset.
    algorithm : {'random', 'bayesian'}
        'random' dispatches to ``randomized_search``, 'bayesian' to
        ``bayesian_optimization``.

    Returns
    -------
    results_df : DataFrame
        All per-dataset results stacked under a fresh index, with 'params'
        converted to strings and an added 'mean_auc' column holding the mean of
        'mean_test_score' over all rows that share the same 'params' string.
    individual_results : list of DataFrame
        The untouched per-dataset results, in input order.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names, if ``X_train`` and
        ``y_train`` differ in length, or if no dataset is given.
    """
    if algorithm not in ('bayesian', 'random'):
        raise ValueError(f"algorithm must be 'bayesian' or 'random', got {algorithm!r}")

    X_list, y_list = list(X_train), list(y_train)
    if len(X_list) != len(y_list):
        raise ValueError(
            f"X_train and y_train must have the same length, got {len(X_list)} and {len(y_list)}"
        )
    if not X_list:
        raise ValueError("at least one dataset is required")

    # Work on a copy so the adjustments below do not leak into the caller's
    # search space (which is reused for the other search algorithm).
    params = dict(params)

    # NOTE: params['bootstrap'] is the list of candidate values, not a sampled
    # value, so this branch runs whenever that list is non-empty. A static search
    # space cannot express "max_samples only when bootstrap=True", and
    # scikit-learn rejects a non-None max_samples together with bootstrap=False,
    # so max_samples is pinned to None.
    if all(key in params for key in ['bootstrap','max_samples']):
        if params['bootstrap']:
            params['max_samples'] = [None]

    # Random search and Bayesian optimisation need different objects to describe
    # a continuous range.
    if 'l1_ratio' in params:
        if algorithm == 'bayesian':
            params['l1_ratio'] = Real(0,1)
        else:
            params['l1_ratio'] = uniform(0,1)

    individual_results = []
    for x, y in zip(X_list, y_list):
        # The search helpers are looked up only when needed, so this function can
        # run before the helper for the other algorithm has been defined.
        if algorithm == 'bayesian':
            results = bayesian_optimization(clf=clf, params=params, n_iter=n_iter, X_train=x, y_train=y)
        else:
            results = randomized_search(clf=clf, params=params, n_iter=n_iter, X_train=x, y_train=y)
        individual_results.append(results)

    # Concatenate once instead of growing a frame inside the loop.
    results_df = pd.concat(individual_results, ignore_index=True)
    results_df['params'] = results_df['params'].astype(str)
    results_df['mean_auc'] = results_df.groupby('params')['mean_test_score'].transform('mean')

    return results_df, individual_results

In [212]:
def _scores_of_complete_rows(result):
    """Return 'mean_test_score' of the rows that contain no NaN cell (None cells do not count as NaN)."""
    has_nan = result.applymap(lambda x: pd.isna(x) if x is not None else False).any(axis=1)
    return result[~has_nan]['mean_test_score']


def boxplot(individual_result_list, plotTitle):
    """Show one box per dataset for the mean test scores of its search results.

    Parameters
    ----------
    individual_result_list : sequence of DataFrame
        One results frame per dataset, each with a 'mean_test_score' column.
        Any number of datasets is accepted; boxes are labelled '1', '2', ...
        in input order.
    plotTitle
        Converted with ``str`` and used as the figure title.

    Raises
    ------
    ValueError
        If ``individual_result_list`` is empty.
    """
    scores_per_dataset = [_scores_of_complete_rows(result) for result in individual_result_list]
    if not scores_per_dataset:
        raise ValueError("individual_result_list must contain at least one results frame")

    # Long-format frame: one row per evaluated configuration.
    dataset_labels = [
        str(number)
        for number, scores in enumerate(scores_per_dataset, start=1)
        for _ in range(len(scores))
    ]
    df = pd.DataFrame({
        'Dataset': dataset_labels,
        'Mean AUC': np.concatenate(scores_per_dataset)
    })

    fig = px.box(df, x='Dataset', y='Mean AUC', points="all", title=str(plotTitle), width=800, height=1000)
    fig.show()

In [213]:
def get_tunability(agg_results, results):
    """Return the absolute gap between two scores on one dataset.

    The two scores are the one returned by ``get_overall_score(agg_results, results)``
    and the one returned by ``get_score(results)``.
    """
    _, overall_best_score = get_overall_score(agg_results, results)
    _, dataset_best_score = get_score(results)
    return abs(overall_best_score - dataset_best_score)

In [214]:
def tunability_plot(data_list, title):
    """Draw one horizontal bar chart per entry of ``data_list``, stacked vertically.

    Each entry is indexed with 'algorithm' (subplot title), 'dataset' (bar labels)
    and 'value' (bar lengths). Bars with a value above zero are drawn in
    translucent green, all others in translucent red, and every subplot shares
    the same x-axis range.
    """
    row_count = len(data_list)
    fig = make_subplots(
        rows=row_count,
        cols=1,
        subplot_titles=[entry['algorithm'] for entry in data_list],
    )

    for row_no, entry in enumerate(data_list, start=1):
        # Borrow the bar trace from a throw-away plotly express figure.
        bar_trace = px.bar(entry, x='value', y='dataset', orientation='h').data[0]
        fig.add_trace(bar_trace, row=row_no, col=1)

        bar_colors = [
            'rgba(0, 128, 0, 0.5)' if value > 0 else 'rgba(255, 0, 0, 0.5)'
            for value in entry['value']
        ]
        fig.update_traces(marker_color=bar_colors, row=row_no, col=1)

    # Common x-range spanning the values of every subplot.
    pooled_values = [value for entry in data_list for value in entry['value']]
    lower, upper = min(pooled_values), max(pooled_values)
    for row_no in range(1, row_count + 1):
        fig.update_xaxes(range=[lower, upper], row=row_no, col=1)

    fig.update_layout(height=600, width=800, showlegend=False, title=str(title))
    fig.show()


In [80]:
def append_to_alg_tunability(tunability_list,overall_results, list_of_results, alg_name):
    """Store the per-dataset tunability of one algorithm in ``tunability_list``.

    Parameters
    ----------
    tunability_list : list of dict
        Modified in place. Each dict has the keys 'dataset', 'value' and 'algorithm'.
    overall_results
        Pooled results, passed to ``get_tunability`` as its first argument.
    list_of_results
        Per-dataset results; one tunability value is computed for each element.
    alg_name
        Converted with ``str`` and used as the entry's 'algorithm'.

    If an entry for the same algorithm already exists, the first such entry is
    replaced; otherwise the new entry is appended. Returns ``None``.
    """
    algorithm = str(alg_name)
    values = [get_tunability(overall_results, results) for results in list_of_results]
    new_entry = {
        # One label per computed value, so labels and values always line up.
        'dataset': [f'Dataset {number}' for number in range(1, len(values) + 1)],
        'value': values,
        'algorithm': algorithm,
    }

    for position, entry in enumerate(tunability_list):
        if entry['algorithm'] == algorithm:
            # Replace in the list that was passed in (not in the builtin `list`).
            tunability_list[position] = new_entry
            return

    tunability_list.append(new_entry)

In [28]:
# Training features and targets of the four datasets, kept in the same order so
# that position i of both lists belongs to the same dataset.
X_train = [X1_train, X2_train, X3_train, X4_train]
y_train = [y1_train, y2_train, y3_train, y4_train]

In [82]:
# Accumulators for the per-algorithm tunability entries: rs_* is filled by the
# random-search sections; bo_* is presumably meant for Bayesian optimisation
# (it is not filled in the cells visible here - confirm).
rs_alg_tunability = []
bo_alg_tunability =[]

### 1. Random Search

In [206]:
def randomized_search(clf, params, n_iter, X_train, y_train, scoring='roc_auc'):
    """Run a 5-fold ``RandomizedSearchCV`` and return its ``cv_results_`` as a DataFrame.

    ``params`` is used as ``param_distributions`` and ``n_iter`` as the number of
    sampled configurations. The search is seeded with ``random_state=0`` and
    scored with ``scoring``.
    """
    search_settings = dict(
        estimator=clf,
        param_distributions=params,
        n_iter=n_iter,
        cv=5,
        random_state=0,
        scoring=scoring,
        verbose=1,
    )
    searcher = RandomizedSearchCV(**search_settings)
    searcher.fit(X_train, y_train)
    return pd.DataFrame(searcher.cv_results_)

#### 1.1 Random Forest

In [62]:
rs_rf_results, rs_rf_individuals = compute_on_algorithm(clf=RandomForestClassifier(),params=params_RF,n_iter=100,X_train=X_train,y_train=y_train,algorithm='random')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


#### Wyniki

teta*

In [263]:
# Overall best configuration and the score it reaches on each dataset.
for dataset_no, individual in enumerate(rs_rf_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    overall_params, overall_auc = get_overall_score(rs_rf_results, individual)
    print("teta: ", overall_params)
    print("AUC: ", overall_auc)

---------------------------dataset 1---------------------------
teta:  {'n_estimators': 1493, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_samples': None, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.8491603350866939
---------------------------dataset 2---------------------------
teta:  {'n_estimators': 1493, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_samples': None, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.9117887609206846
---------------------------dataset 3---------------------------
teta:  {'n_estimators': 1493, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_samples': None, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.9949969148498561
---------------------------dataset 4---------------------------
teta:  {'n_estimators': 1493, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_samples': None, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.8289006584268396


teta dla poszczególnych zbiorów:

In [264]:
# Best configuration of each dataset and its score.
for dataset_no, individual in enumerate(rs_rf_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    own_params, own_auc = get_score(individual)
    print("teta: ", own_params)
    print("AUC: ", own_auc)

---------------------------dataset 1---------------------------
teta:  {'n_estimators': 1124, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_samples': None, 'max_depth': 3, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.8551509838301188
---------------------------dataset 2---------------------------
teta:  {'n_estimators': 259, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_samples': None, 'max_depth': 9, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.9278857211513705
---------------------------dataset 3---------------------------
teta:  {'n_estimators': 1398, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_samples': None, 'max_depth': 9, 'criterion': 'gini', 'bootstrap': False}
AUC:  0.9961131393116688
---------------------------dataset 4---------------------------
teta:  {'n_estimators': 1748, 'min_samples_split': 6, 'min_samples_leaf': 8, 'max_samples': None, 'max_depth': 4, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.8344860692619476


tunowalność:

In [45]:
# Tunability of Random Forest on each dataset.
for dataset_no, individual in enumerate(rs_rf_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    print("tunability: ", get_tunability(rs_rf_results, individual))

---------------------------dataset 1---------------------------
tunability:  0.00599064874342492
---------------------------dataset 2---------------------------
tunability:  0.016096960230685853
---------------------------dataset 3---------------------------
tunability:  0.0011162244618126982
---------------------------dataset 4---------------------------
tunability:  0.005585410835107996


In [83]:
# append the results to the list
append_to_alg_tunability(rs_alg_tunability,rs_rf_results,rs_rf_individuals,'Random Forest')

AUC w zależności od zbioru

In [267]:
boxplot(rs_rf_individuals,"RS: Average performance for Random Forest")

#### 1.2 Elastic Net

In [279]:
rs_en_results, rs_en_individuals = compute_on_algorithm(clf=ElasticNet(),params=params_EN,n_iter=100,X_train=X_train,y_train=y_train,algorithm='random')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


#### Wyniki

teta*

In [281]:
# Overall best configuration and the score it reaches on each dataset.
for dataset_no, individual in enumerate(rs_en_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    overall_params, overall_auc = get_overall_score(rs_en_results, individual)
    print("teta: ", overall_params)
    print("AUC: ", overall_auc)

---------------------------dataset 1---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.8522779198113678
---------------------------dataset 2---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.716706299803074
---------------------------dataset 3---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.9976823046054737
---------------------------dataset 4---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.8257429901496897


teta dla poszczególnych zbiorów:

In [282]:
# Best configuration of each dataset and its score.
for dataset_no, individual in enumerate(rs_en_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    own_params, own_auc = get_score(individual)
    print("teta: ", own_params)
    print("AUC: ", own_auc)

---------------------------dataset 1---------------------------
teta:  {'alpha': 512, 'l1_ratio': 0.6458941130666561}
AUC:  0.8630295203390433
---------------------------dataset 2---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.716706299803074
---------------------------dataset 3---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.9976823046054737
---------------------------dataset 4---------------------------
teta:  {'alpha': 0.0009765625, 'l1_ratio': 0.09956908911081108}
AUC:  0.828618328427312


tunowalność:

In [44]:
# Tunability of Elastic Net on each dataset.
for dataset_no, individual in enumerate(rs_en_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    print("tunability: ", get_tunability(rs_en_results, individual))

---------------------------dataset 1---------------------------
tunability:  0.00815227384921513
---------------------------dataset 2---------------------------
tunability:  0.0005523723588239937
---------------------------dataset 3---------------------------
tunability:  0.0003859843966016463
---------------------------dataset 4---------------------------
tunability:  0.003319665905048619


In [84]:
# append the results to the list
append_to_alg_tunability(rs_alg_tunability,rs_en_results,rs_en_individuals,'Elastic Net')

AUC w zależności od zbioru

In [285]:
boxplot(rs_en_individuals,"RS: Average performance for Elastic Net")

#### 1.3 Decision Tree

In [93]:
rs_dt_results, rs_dt_individuals = compute_on_algorithm(clf=DecisionTreeClassifier(),params=params_DT,n_iter=100,X_train=X_train,y_train=y_train,algorithm='random')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


#### Wyniki

teta*

In [286]:
# Overall best configuration and the score it reaches on each dataset.
for dataset_no, individual in enumerate(rs_dt_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    overall_params, overall_auc = get_overall_score(rs_dt_results, individual)
    print("teta: ", overall_params)
    print("AUC: ", overall_auc)

---------------------------dataset 1---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 48, 'max_features': None, 'max_depth': 7, 'criterion': 'entropy'}
AUC:  0.8287665247279508
---------------------------dataset 2---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 48, 'max_features': None, 'max_depth': 7, 'criterion': 'entropy'}
AUC:  0.8481230296996849
---------------------------dataset 3---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 48, 'max_features': None, 'max_depth': 7, 'criterion': 'entropy'}
AUC:  0.9640302987453723
---------------------------dataset 4---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 48, 'max_features': None, 'max_depth': 7, 'criterion': 'entropy'}
AUC:  0.7966926217767515


teta dla poszczególnych zbiorów:

In [287]:
# Best configuration of each dataset and its score.
for dataset_no, individual in enumerate(rs_dt_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    own_params, own_auc = get_score(individual)
    print("teta: ", own_params)
    print("AUC: ", own_auc)

---------------------------dataset 1---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 58, 'min_samples_leaf': 57, 'max_features': None, 'max_depth': 19, 'criterion': 'gini'}
AUC:  0.8487537224124015
---------------------------dataset 2---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 19, 'max_features': None, 'max_depth': 23, 'criterion': 'gini'}
AUC:  0.8784197404781928
---------------------------dataset 3---------------------------
teta:  {'splitter': 'random', 'min_samples_split': 56, 'min_samples_leaf': 12, 'max_features': None, 'max_depth': 5, 'criterion': 'entropy'}
AUC:  0.9865280491567257
---------------------------dataset 4---------------------------
teta:  {'splitter': 'random', 'min_samples_split': 13, 'min_samples_leaf': 17, 'max_features': None, 'max_depth': 29, 'criterion': 'gini'}
AUC:  0.8006798547618308


tunowalność:

In [43]:
# Tunability of Decision Tree on each dataset.
for dataset_no, individual in enumerate(rs_dt_individuals, start=1):
    print(f"---------------------------dataset {dataset_no}---------------------------")
    print("tunability: ", get_tunability(rs_dt_results, individual))

---------------------------dataset 1---------------------------
tunability:  0.019987197684450786
---------------------------dataset 2---------------------------
tunability:  0.03029671077850793
---------------------------dataset 3---------------------------
tunability:  0.02249775041135338
---------------------------dataset 4---------------------------
tunability:  0.003987232985079303


In [85]:
# append the results to the list
append_to_alg_tunability(rs_alg_tunability,rs_dt_results,rs_dt_individuals,'Decision Tree')

AUC w zależności od zbioru

In [290]:
boxplot(rs_dt_individuals,"RS: Average performance for Decision Tree")

## 2. Bayesian optimization

In [291]:
def bayesian_optimization(clf, params, n_iter, X_train, y_train, scoring='roc_auc',
                          cv=5, random_state=0, verbose=1):
    """Tune ``clf`` with ``BayesSearchCV`` and return the full search log.

    Parameters
    ----------
    clf
        Estimator handed to ``BayesSearchCV`` as ``estimator``.
    params
        Search space handed to ``BayesSearchCV`` as ``search_spaces``.
    n_iter
        Value forwarded as ``n_iter`` to ``BayesSearchCV``.
    X_train, y_train
        Training data passed to ``BayesSearchCV.fit``.
    scoring
        Scoring forwarded to ``BayesSearchCV``; defaults to ``'roc_auc'``.
    cv, random_state, verbose
        Forwarded unchanged to ``BayesSearchCV``. The defaults (5, 0, 1) are
        the values that used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        ``cv_results_`` of the fitted search, wrapped in a DataFrame.
    """
    bayes_search = BayesSearchCV(
        estimator=clf,
        search_spaces=params,
        n_iter=n_iter,
        cv=cv,
        random_state=random_state,
        scoring=scoring,
        verbose=verbose,
    )
    bayes_search.fit(X_train, y_train)

    return pd.DataFrame(bayes_search.cv_results_)

### 2.1 RandomForest

In [210]:
# Run Bayesian optimisation (n_iter=100) over params_RF for the RandomForest classifier.
bo_rf_results, bo_rf_individuals = compute_on_algorithm(
    clf=RandomForestClassifier(),
    params=params_RF,
    n_iter=100,
    X_train=X_train,
    y_train=y_train,
    algorithm='bayesian',
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

#### Wyniki

teta*

In [292]:
# Print theta* (first element returned by get_overall_score) and its AUC
# (second element) for every dataset.
# NOTE(review): the lookup is made against the random-search results
# (rs_rf_results), not bo_rf_results — confirm this is intentional.
for dataset_number, individual_results in enumerate(bo_rf_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    # Look the score up once; both printed fields come from the same result.
    overall_score = get_overall_score(rs_rf_results, individual_results)
    print("teta: ", overall_score[0])
    print("AUC: ", overall_score[1])

---------------------------dataset 1---------------------------
teta:  {'n_estimators': 1493, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_samples': None, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.8533350032005791
---------------------------dataset 2---------------------------
teta:  {'n_estimators': 1493, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_samples': None, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.9263420927664591
---------------------------dataset 3---------------------------
teta:  {'n_estimators': 1493, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_samples': None, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.9962050510763746
---------------------------dataset 4---------------------------
teta:  {'n_estimators': 1493, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_samples': None, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}
AUC:  0.8331482190951837


teta dla poszczególnych zbiorów:

In [293]:
# Print the configuration (first element returned by get_score) and its AUC
# (second element) for every dataset — Random Forest, Bayesian optimisation.
for dataset_number, individual_results in enumerate(bo_rf_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    # Look the score up once; both printed fields come from the same result.
    score = get_score(individual_results)
    print("teta: ", score[0])
    print("AUC: ", score[1])

---------------------------dataset 1---------------------------
teta:  OrderedDict([('bootstrap', True), ('criterion', 'entropy'), ('max_depth', 3), ('max_samples', None), ('min_samples_leaf', 9), ('min_samples_split', 8), ('n_estimators', 1779)])
AUC:  0.85679691631182
---------------------------dataset 2---------------------------
teta:  OrderedDict([('bootstrap', False), ('criterion', 'gini'), ('max_depth', 10), ('max_samples', None), ('min_samples_leaf', 1), ('min_samples_split', 3), ('n_estimators', 920)])
AUC:  0.9302313440094288
---------------------------dataset 3---------------------------
teta:  OrderedDict([('bootstrap', False), ('criterion', 'gini'), ('max_depth', 9), ('max_samples', None), ('min_samples_leaf', 1), ('min_samples_split', 5), ('n_estimators', 697)])
AUC:  0.9963915312628547
---------------------------dataset 4---------------------------
teta:  OrderedDict([('bootstrap', True), ('criterion', 'entropy'), ('max_depth', 3), ('max_samples', None), ('min_samples_le

tunowalność:

In [42]:
# Print the tunability returned by get_tunability for every dataset
# (Random Forest, Bayesian optimisation).
# NOTE(review): the first argument is the random-search result set
# (rs_rf_results), not bo_rf_results — confirm this is intentional.
for dataset_number, individual_results in enumerate(bo_rf_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    print("tunability: ", get_tunability(rs_rf_results, individual_results))

---------------------------dataset 1---------------------------
tunability:  0.0034619131112408885
---------------------------dataset 2---------------------------
tunability:  0.0038892512429696824
---------------------------dataset 3---------------------------
tunability:  0.00018648018648015352
---------------------------dataset 4---------------------------
tunability:  0.003102763137761144


In [86]:
# Record the Random Forest results in the Bayesian-optimisation tunability list.
# NOTE(review): re-running this cell presumably appends the same entry again — confirm.
append_to_alg_tunability(
    bo_alg_tunability,
    rs_rf_results,
    bo_rf_individuals,
    'Random Forest',
)

AUC w zależności od zbioru

In [296]:
# Box plot of the per-dataset Bayesian-optimisation results for the Random Forest.
boxplot(
    bo_rf_individuals,
    "BO: Average performance for Random Forest",
)

### 2.2 Elastic Net

In [None]:
# Run Bayesian optimisation (n_iter=100) over params_EN for the ElasticNet estimator.
bo_en_results, bo_en_individuals = compute_on_algorithm(
    clf=ElasticNet(),
    params=params_EN,
    n_iter=100,
    X_train=X_train,
    y_train=y_train,
    algorithm='bayesian',
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

#### Wyniki

teta*

In [297]:
# Print theta* (first element returned by get_overall_score) and its AUC
# (second element) for every dataset.
# NOTE(review): the lookup is made against the random-search results
# (rs_en_results), not bo_en_results — confirm this is intentional.
for dataset_number, individual_results in enumerate(bo_en_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    # Look the score up once; both printed fields come from the same result.
    overall_score = get_overall_score(rs_en_results, individual_results)
    print("teta: ", overall_score[0])
    print("AUC: ", overall_score[1])

---------------------------dataset 1---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.8630450603545835
---------------------------dataset 2---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.7103506822297145
---------------------------dataset 3---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.995293896308746
---------------------------dataset 4---------------------------
teta:  {'alpha': 2, 'l1_ratio': 0.009356704856532616}
AUC:  0.8289115131796236


teta dla poszczególnych zbiorów:

In [298]:
# Print the configuration (first element returned by get_score) and its AUC
# (second element) for every dataset — Elastic Net, Bayesian optimisation.
for dataset_number, individual_results in enumerate(bo_en_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    # Look the score up once; both printed fields come from the same result.
    score = get_score(individual_results)
    print("teta: ", score[0])
    print("AUC: ", score[1])

---------------------------dataset 1---------------------------
teta:  OrderedDict([('alpha', 64.0), ('l1_ratio', 0.9993364300217557)])
AUC:  0.8640517386857398
---------------------------dataset 2---------------------------
teta:  OrderedDict([('alpha', 0.03125), ('l1_ratio', 0.4893190882888728)])
AUC:  0.7140596254951094
---------------------------dataset 3---------------------------
teta:  OrderedDict([('alpha', 2.0), ('l1_ratio', 0.0)])
AUC:  0.9979620728837592
---------------------------dataset 4---------------------------
teta:  OrderedDict([('alpha', 0.015625), ('l1_ratio', 0.0014536475115524141)])
AUC:  0.8289115131796236


tunowalność:

In [41]:
# Print the tunability returned by get_tunability for every dataset
# (Elastic Net, Bayesian optimisation).
# NOTE(review): the first argument is the random-search result set
# (rs_en_results), not bo_en_results — confirm this is intentional.
for dataset_number, individual_results in enumerate(bo_en_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    print("tunability: ", get_tunability(rs_en_results, individual_results))

---------------------------dataset 1---------------------------
tunability:  0.0010222183466964685
---------------------------dataset 2---------------------------
tunability:  0.21405962549510937
---------------------------dataset 3---------------------------
tunability:  0.0021565847325669685
---------------------------dataset 4---------------------------
tunability:  0.0006115170417229088


In [87]:
# Record the Elastic Net results in the Bayesian-optimisation tunability list.
# NOTE(review): re-running this cell presumably appends the same entry again — confirm.
append_to_alg_tunability(
    bo_alg_tunability,
    rs_en_results,
    bo_en_individuals,
    'Elastic Net',
)

AUC w zależności od zbioru

In [301]:
# Box plot of the per-dataset Bayesian-optimisation results for Elastic Net.
boxplot(
    bo_en_individuals,
    "BO: Average performance for Elastic Net",
)

### 2.3 DecisionTree

In [177]:
# Run Bayesian optimisation (n_iter=100) over params_DT for the Decision Tree classifier.
bo_dt_results, bo_dt_individuals = compute_on_algorithm(
    clf=DecisionTreeClassifier(),
    params=params_DT,
    n_iter=100,
    X_train=X_train,
    y_train=y_train,
    algorithm='bayesian',
)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

#### Wyniki

teta*

In [302]:
# Print theta* (first element returned by get_overall_score) and its AUC
# (second element) for every dataset.
# NOTE(review): the lookup is made against the random-search results
# (rs_dt_results), not bo_dt_results — confirm this is intentional.
for dataset_number, individual_results in enumerate(bo_dt_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    # Look the score up once; both printed fields come from the same result.
    overall_score = get_overall_score(rs_dt_results, individual_results)
    print("teta: ", overall_score[0])
    print("AUC: ", overall_score[1])

---------------------------dataset 1---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 48, 'max_features': None, 'max_depth': 7, 'criterion': 'entropy'}
AUC:  0.8240327293980128
---------------------------dataset 2---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 48, 'max_features': None, 'max_depth': 7, 'criterion': 'entropy'}
AUC:  0.8533872716454118
---------------------------dataset 3---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 48, 'max_features': None, 'max_depth': 7, 'criterion': 'entropy'}
AUC:  0.9803703465651996
---------------------------dataset 4---------------------------
teta:  {'splitter': 'best', 'min_samples_split': 35, 'min_samples_leaf': 48, 'max_features': None, 'max_depth': 7, 'criterion': 'entropy'}
AUC:  0.6825067332571708


teta dla poszczególnych zbiorów:

In [303]:
# Print the configuration (first element returned by get_score) and its AUC
# (second element) for every dataset — Decision Tree, Bayesian optimisation.
for dataset_number, individual_results in enumerate(bo_dt_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    # Look the score up once; both printed fields come from the same result.
    score = get_score(individual_results)
    print("teta: ", score[0])
    print("AUC: ", score[1])

---------------------------dataset 1---------------------------
teta:  OrderedDict([('criterion', 'gini'), ('max_depth', 30), ('max_features', 'log2'), ('min_samples_leaf', 30), ('min_samples_split', 21), ('splitter', 'best')])
AUC:  0.8571091814867385
---------------------------dataset 2---------------------------
teta:  OrderedDict([('criterion', 'gini'), ('max_depth', 22), ('max_features', None), ('min_samples_leaf', 19), ('min_samples_split', 49), ('splitter', 'best')])
AUC:  0.8857730728961055
---------------------------dataset 3---------------------------
teta:  OrderedDict([('criterion', 'gini'), ('max_depth', 19), ('max_features', 'sqrt'), ('min_samples_leaf', 6), ('min_samples_split', 17), ('splitter', 'best')])
AUC:  0.9878399449814891
---------------------------dataset 4---------------------------
teta:  OrderedDict([('criterion', 'entropy'), ('max_depth', 4), ('max_features', None), ('min_samples_leaf', 43), ('min_samples_split', 29), ('splitter', 'best')])
AUC:  0.79741691

tunowalność:

In [40]:
# Print the tunability returned by get_tunability for every dataset
# (Decision Tree, Bayesian optimisation).
# NOTE(review): the first argument is the random-search result set
# (rs_dt_results), not bo_dt_results — confirm this is intentional.
for dataset_number, individual_results in enumerate(bo_dt_individuals, start=1):
    print(f"---------------------------dataset {dataset_number}---------------------------")
    print("tunability: ", get_tunability(rs_dt_results, individual_results))

---------------------------dataset 1---------------------------
tunability:  0.03307645208872578
---------------------------dataset 2---------------------------
tunability:  0.03238580125069368
---------------------------dataset 3---------------------------
tunability:  0.007469598416289469
---------------------------dataset 4---------------------------
tunability:  0.11491018105299966


In [88]:
# Record the Decision Tree results in the Bayesian-optimisation tunability list.
# NOTE(review): re-running this cell presumably appends the same entry again — confirm.
append_to_alg_tunability(
    bo_alg_tunability,
    rs_dt_results,
    bo_dt_individuals,
    'Decision Tree',
)

AUC w zależności od zbioru

In [306]:
# Box plot of the per-dataset Bayesian-optimisation results for the Decision Tree.
boxplot(
    bo_dt_individuals,
    "BO: Average performance for Decision Tree",
)

### Tunability

In [215]:
# Plot the collected tunabilities, one figure per tuning method,
# with a dashed rule printed between the two figures.
tunability_plot(
    rs_alg_tunability,
    "Tunability using Random search",
)
print('-------------------------------------------------------------------------------------------------------')
tunability_plot(
    bo_alg_tunability,
    "Tunability using Bayes optimisation",
)

-------------------------------------------------------------------------------------------------------


In [216]:
# Plot tunabilities for each algorithm: one grouped bar chart per algorithm,
# with one bar series per tuning method.

# Tuning-method labels, in the same order as the tunability lists zipped below.
methodnames = ['Random Search', 'Bayesian optimization']

# Flatten both tunability lists into one record per (method, algorithm, dataset).
merged_list = [
    {'dataset': dataset,
     'algorithm': data['algorithm'],
     'method': method_name,
     'value': value}
    for method_name, data_list in zip(methodnames, [rs_alg_tunability, bo_alg_tunability])
    for data in data_list
    for dataset, value in zip(data['dataset'], data['value'])
]

# Create a DataFrame from the merged list. Naming the columns explicitly keeps
# the column lookups below valid even when no tunability has been recorded.
df = pd.DataFrame(merged_list, columns=['dataset', 'algorithm', 'method', 'value'])

# Get unique algorithms in the DataFrame
algorithms = df['algorithm'].unique()

# Create grouped bar plots for each algorithm
for algorithm in algorithms:
    algorithm_df = df[df['algorithm'] == algorithm]

    # One bar trace per tuning method present for this algorithm
    traces = []
    for method_name in algorithm_df['method'].unique():
        method_df = algorithm_df[algorithm_df['method'] == method_name]
        # Light green for Random Search, dark green for any other method
        if method_name == 'Random Search':
            bar_color = 'rgba(144, 238, 144, 0.7)'
        else:
            bar_color = 'rgba(0, 128, 0, 0.7)'
        traces.append(
            go.Bar(
                x=method_df['dataset'],
                y=method_df['value'],
                name=f'{method_name}',
                marker=dict(color=bar_color),
            )
        )

    # Create layout (figure size included)
    layout = go.Layout(
        barmode='group',
        title=f'Tunability plot for {algorithm}',
        xaxis=dict(title='Dataset'),
        yaxis=dict(title='Value'),
        width=800,
        height=700,
    )

    # Create figure and show the plot
    fig = go.Figure(data=traces, layout=layout)
    fig.show()