In [42]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from skopt import BayesSearchCV
from skopt.space import Real

from time import time

import pandas as pd
import numpy as np
import ast
import openml

from scipy.stats import uniform, loguniform

In [43]:
rn_st = 8

In [44]:
# Setting preprocessing

num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))

])

col_trans = ColumnTransformer(transformers=[
        ('num_pipeline', num_pipeline, make_column_selector(dtype_include=np.number)),
        ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include='category'))
    ],
    remainder='drop',
    n_jobs=-1
)

def label_encode(y):
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(y)

target_transformer = FunctionTransformer(label_encode, validate=False)

In [45]:
# Loading dataset 1
d1 = openml.datasets.get_dataset(31)
X, y, _, _ = d1.get_data(target=d1.default_target_attribute)

# print(X.dtypes)
# print('Dane:', X.head())
# print('Target: ', y.head())

class_distribution = y.value_counts()
print("Class distribution:\n", class_distribution)
# class_distribution.plot(kind='bar', title="Class Distribution")

Class distribution:
 class
good    700
bad     300
Name: count, dtype: int64


In [46]:
# Loading dataset 2
d2 = openml.datasets.get_dataset(37)
X, y, _, _ = d2.get_data(target=d2.default_target_attribute)

# print(X.dtypes)
# print('Dane:', X.head())
# print('Target: ', y.head())

class_distribution = y.value_counts()
print("Class distribution:\n", class_distribution)
# class_distribution.plot(kind='bar', title="Class Distribution")

Class distribution:
 class
tested_negative    500
tested_positive    268
Name: count, dtype: int64


In [47]:
# Loading dataset 3
d3 = openml.datasets.get_dataset(44)
X, y, _, _ = d3.get_data(target=d3.default_target_attribute)

# print(X.dtypes)
# print('Dane:', X.head())
# print('Target: ', y.head())

class_distribution = y.value_counts()
print("Class distribution:\n", class_distribution)
# class_distribution.plot(kind='bar', title="Class Distribution")

Class distribution:
 class
0    2788
1    1813
Name: count, dtype: int64


In [48]:
# Loading dataset 4
d4 = openml.datasets.get_dataset(40597)
X, y, _, _ = d4.get_data(target="Class1")
X = X.drop(columns=['Class2', 'Class3', 'Class4', 'Class5', 'Class6', 'Class7', 'Class8', 'Class9'])

# print(X.dtypes)
# print('Dane:', X.head())
# print('Target: ', y.head())

class_distribution = y.value_counts()
print("Class distribution:\n", class_distribution)
# class_distribution.plot(kind='bar', title="Class Distribution")

Class distribution:
 Class1
False    1655
True      762
Name: count, dtype: int64


In [49]:
# Setting parameters spaces (or not) and test type

'''
# This was when I was testing SVM, ignore 

param_distribution = {
    #'model__C': loguniform(2**-10, 2**10),
    #'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #'model__degree': [2, 3, 4, 5],
    'model__gamma': loguniform(2**-10, 2**10)
}

param_space = {
    #'model__C': Real(2**-10, 2**10, prior='log-uniform', dtype=float),
    #'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    #'model__degree': [2, 3, 4, 5],
    'model__gamma': Real(2**-10, 2**10, prior='log-uniform', dtype=float),
}
'''
param_distribution = {
    'model__C': loguniform(2**-10, 2**10),
    'model__l1_ratio': uniform(0, 1)
}

param_space = {
    'model__C': Real(2**-10, 2**10, prior='log-uniform', dtype=float),
    'model__l1_ratio': Real(0, 1, dtype=float)
}

test_type = ""

In [50]:
# Retrieving defaults (only for hyperparameter tuning, the id comes from the bottom of the file)

id = 100

df = pd.read_csv("_random_iter_res_0.csv", index_col=0)
df['params'] = df['params'].apply(ast.literal_eval)
defaults = df.loc[id]['params']

print(defaults)

{'model__C': 0.20736406818487657, 'model__l1_ratio': 0.4971680515517577}


In [51]:
# Setting model (remeber to (not) set appropriate defaults)

my_model = LogisticRegression(penalty='elasticnet', solver='saga', class_weight='balanced', n_jobs=-1, max_iter=200, l1_ratio=defaults["model__l1_ratio"])

# This was when I was testing SVM, ignore 
# my_model = SVM = SVC(class_weight='balanced', max_iter=20000)

model_pipe = Pipeline([('preprocessing', col_trans), ('model', my_model)])
# model_pipe.get_params()

In [52]:
cv_train = 5
cv_test = 3

In [53]:
# Setting optimizers, data and test_type

random_search = RandomizedSearchCV(
    model_pipe,
    param_distribution,
    n_iter=180,
    scoring='roc_auc',
    cv=cv_train,
    n_jobs=-1, 
    random_state=rn_st,
    return_train_score=True
)

bayes_search = BayesSearchCV(
    model_pipe,
    param_space,
    n_iter=60,
    scoring='roc_auc',
    cv=cv_train,
    n_jobs=-1,
    random_state=rn_st,
    return_train_score=True
)

data = [(d1, "class"), (d2, "class"), (d3, "class"), (d4, "Class1")]

In [90]:
i = 0
for d, t in data:
    main_results_df = pd.DataFrame(columns=['method','elapsed_time', 'best_score', 'test_score', 'best_params', 'auc_score', 'cv_auc_score'])
    X, y, _, _ = d.get_data(target=t)
    y = target_transformer.fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rn_st)
    
    # Random Search
    start_time = time()
    random_search.fit(X_train, y_train)
    elapsed_time = time() - start_time
    
    best_score = random_search.best_score_
    test_score = random_search.score(X_test, y_test)
    best_params = str(random_search.best_params_)
    
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    auc_scr = roc_auc_score(y_test, y_pred)
    cv_auc_scr = np.mean(cross_val_score(best_model, X_test, y_test, scoring='roc_auc', cv=cv_test))
    
    random_results = pd.DataFrame(random_search.cv_results_)
    
    new_row = pd.DataFrame({
        'method': "RandomSearchCV",
        'elapsed_time': [elapsed_time],
        'best_score': [best_score],
        'test_score': [test_score],
        'best_params': [best_params],
        'auc_score': [auc_scr],
        'cv_auc_score': [cv_auc_scr]
    })
    main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)
    
    random_results.to_csv(f"{test_type}_random_iter_res_{i}.csv", index=True, index_label='index')
    main_results_df.to_csv(f"{test_type}_main_res_{i}.csv", index=False)
    
    # Bayes Search    
    start_time = time()
    bayes_search.fit(X_train, y_train)
    elapsed_time = time() - start_time
    
    best_score = bayes_search.best_score_
    test_score = bayes_search.score(X_test, y_test)
    best_params = str(bayes_search.best_params_)
    
    best_model = bayes_search.best_estimator_
    y_pred = best_model.predict(X_test)
    auc_scr = roc_auc_score(y_test, y_pred)
    cv_auc_scr = np.mean(cross_val_score(best_model, X_test, y_test, scoring='roc_auc', cv=cv_test))
    
    bayes_results = pd.DataFrame(bayes_search.cv_results_)
    
    new_row = pd.DataFrame({
        'method': "BayesSearchCV",
        'elapsed_time': [elapsed_time],
        'best_score': [best_score],
        'test_score': [test_score],
        'best_params': [best_params],
        'auc_score': [auc_scr],
        'cv_auc_score': [cv_auc_scr]
    })
    main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)
    
    bayes_results.to_csv(f"{test_type}_bayes_iter_res_{i}.csv", index=True, index_label='index')
    main_results_df.to_csv(f"{test_type}_main_res_{i}.csv", index=False)
    
    i += 1

  main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)
  main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)
  main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)
  main_results_df = pd.concat([main_results_df, new_row], ignore_index=True)


In [54]:
# Establish defaults

file_list = [f"_random_iter_res_{i}.csv" for i in range(4)] 

dfs = []
for file in file_list:
    df = pd.read_csv(file, index_col=0)
    df['mean_test_score'] = -df['mean_test_score']
    
    mean = df['mean_test_score'].mean()
    std = df['mean_test_score'].std()
    min_score = df['mean_test_score'].min()
    max_score = df['mean_test_score'].max()
    
    df['mean_test_score_01'] = (df['mean_test_score'] - min_score) / (max_score - min_score)
    df['mean_test_score_std'] = (df['mean_test_score'] - mean) / std
    
    dfs.append(df)

mean_test_score = [df['mean_test_score'] for df in dfs]
mean_test_score_01 = [df['mean_test_score_01'] for df in dfs]
mean_test_score_std = [df['mean_test_score_std'] for df in dfs]

scores_df = pd.concat(mean_test_score + mean_test_score_01 + mean_test_score_std, axis=1)

scores_df.columns = [f"mean_test_score_{i}" for i in range(len(file_list))] + \
                    [f"mean_test_score_01_{i}" for i in range(len(file_list))] + \
                    [f"mean_test_score_std_{i}" for i in range(len(file_list))]

scores_df['mean_of_mean_test_score'] = scores_df[[f"mean_test_score_{i}" for i in range(len(file_list))]].mean(axis=1)
scores_df['mean_of_mean_test_score_01'] = scores_df[[f"mean_test_score_01_{i}" for i in range(len(file_list))]].mean(axis=1)
scores_df['mean_of_mean_test_score_std'] = scores_df[[f"mean_test_score_std_{i}" for i in range(len(file_list))]].mean(axis=1)

min_index = scores_df['mean_of_mean_test_score'].idxmin()
min_index_01 = scores_df['mean_of_mean_test_score_01'].idxmin()
min_index_std = scores_df['mean_of_mean_test_score_std'].idxmin()

print(f"Index: {min_index}, smallest value: {scores_df.loc[min_index, 'mean_of_mean_test_score']}\n")
print(f"Index: {min_index_01}, smallest value: {scores_df.loc[min_index, 'mean_of_mean_test_score_01']}\n")
print(f"Index: {min_index_std}, smallest value: {scores_df.loc[min_index, 'mean_of_mean_test_score_std']}\n")

print(scores_df.head)

Index: 100, smallest value: -0.8408558276326872

Index: 100, smallest value: 0.00683278311908349

Index: 140, smallest value: -0.36484708470845584

<bound method NDFrame.head of        mean_test_score_0  mean_test_score_1  mean_test_score_2  \
index                                                            
0              -0.765192          -0.826719          -0.970711   
1              -0.765230          -0.826719          -0.970715   
2              -0.779753          -0.825642          -0.966506   
3              -0.781558          -0.827946          -0.970641   
4              -0.774049          -0.826950          -0.970779   
...                  ...                ...                ...   
175            -0.500000          -0.500000          -0.610859   
176            -0.765305          -0.826719          -0.970683   
177            -0.771397          -0.825227          -0.969749   
178            -0.766092          -0.826719          -0.970720   
179            -0.500000      

In [None]:
# Retrieving defaults (again, for convenience)
id = 100

df = pd.read_csv("_random_iter_res_0.csv", index_col=0)
df['params'] = df['params'].apply(ast.literal_eval)
defaults = df.loc[id]['params']

In [60]:
# Scoring with default hyperparameters
def_model = LogisticRegression(penalty='elasticnet', solver='saga', class_weight='balanced', n_jobs=-1, max_iter=200, C=defaults["model__C"], l1_ratio=defaults["model__l1_ratio"])
def_model_pipe = Pipeline([('preprocessing', col_trans), ('model', def_model)])

default_results =[]
for d, t in data:
    X, y, _, _ = d.get_data(target=t)
    y = target_transformer.fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rn_st)

    def_model_pipe.fit(X_train, y_train)
    y_pred = def_model_pipe.predict(X_test)
    auc_scr = roc_auc_score(y_test, y_pred)
    cv_auc_scr = np.mean(cross_val_score(def_model_pipe, X_test, y_test, scoring='roc_auc', cv=cv_test))
    default_results.append((auc_scr, cv_auc_scr))
    
print(default_results)



[(0.7246553122465531, 0.7139636530940878), (0.7332202111613877, 0.8291041906958861), (0.9267915828714377, 0.9558960660710003), (0.7076545751831891, 0.7109510049278877)]


In [61]:
# Calculating tunability

def parse_best_params(param_str):
    # Removes "OrderedDict(" and ")" to make it parsable by ast.literal_eval
    cleaned_str = param_str.replace("OrderedDict(", "").rstrip(")")
    return ast.literal_eval(cleaned_str)

random_diffs = []
bayes_diffs = []
time_diffs = []
total_diffs_df = pd.DataFrame()

test_types = [f"C_{id}", f"l1_{id}", ""]

for test_type in test_types:
    random_diffs.clear()
    bayes_diffs.clear()
    time_diffs.clear()
    
    i=0
    for d, t in data:
        X, y, _, _ = d.get_data(target=t)
        y = target_transformer.fit_transform(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rn_st)
        
        file_path = f'{test_type}_main_res_{i}.csv'
        df = pd.read_csv(file_path)

        random_search = df[df['method'] == 'RandomSearchCV'].iloc[0]
        bayes_search = df[df['method'] == 'BayesSearchCV'].iloc[0]
        
        time_diffs.append(3 * bayes_search['elapsed_time'] / random_search['elapsed_time']) # 3 times, because 180 = 3 * 60
        
        def_auc = -max(default_results[i][0], default_results[i][1])
        random_diffs.append(def_auc + max(random_search['auc_score'], random_search['cv_auc_score']))
        bayes_diffs.append(def_auc + max(bayes_search['auc_score'], bayes_search['cv_auc_score']))
        
        '''
        # This was just to check if the results will be the same (they are)
        # I don't like repeating the code so...
        for met in ['RandomSearchCV', 'BayesSearchCV']:
            best_params = (df[df['method'] == met]['best_params']).apply(parse_best_params)
            idx = 0 if met == 'RandomSearchCV' else 1
            C = best_params[idx].get('model__C', None)
            l1_ratio = best_params[idx].get('model__l1_ratio', None)
            
            # Unfortunately there are many combinations
            if C is not None and l1_ratio is not None:
                def_model = LogisticRegression(penalty='elasticnet', solver='saga', class_weight='balanced', n_jobs=-1, max_iter=200, C=C, l1_ratio=l1_ratio)
            elif C is not None:
                def_model = LogisticRegression(penalty='elasticnet', solver='saga', class_weight='balanced', n_jobs=-1, max_iter=200, C=C, l1_ratio=defaults["model__l1_ratio"])
            else:
                def_model = LogisticRegression(penalty='elasticnet', solver='saga', class_weight='balanced', n_jobs=-1, max_iter=200, C=defaults["model__C"], l1_ratio=l1_ratio)
            def_model_pipe = Pipeline([('preprocessing', col_trans), ('model', def_model)])
            
            def_model_pipe.fit(X_train, y_train)
            y_pred = def_model_pipe.predict(X_test)
            auc_scr = roc_auc_score(y_test, y_pred)
        
            if met == 'RandomSearchCV':
                random_diffs.append(default_results[i] - auc_scr)
            else:
                bayes_diffs.append(default_results[i] - auc_scr)
        # End loop
        '''
        i += 1

    diffs_df = pd.DataFrame({
        'random_score': random_diffs,
        'bayes_score': bayes_diffs,
        'time_ratio': time_diffs
    })
    
    print(diffs_df)
    
    summary = {
        'RandomSearchCV': {
            'mean': diffs_df['random_score'].mean(),
            'median': diffs_df['random_score'].median()
        },
        'BayesSearchCV': {
            'mean': diffs_df['bayes_score'].mean(),
            'median': diffs_df['bayes_score'].median()
        },
        'Time': {
            'mean': diffs_df['time_ratio'].mean(),
            'median': diffs_df['time_ratio'].median()
        }
    }
    
    summary_row = pd.DataFrame({
        'test_type': [test_type],
        'random_mean': [summary['RandomSearchCV']['mean']],
        'random_median': [summary['RandomSearchCV']['median']],
        'bayes_mean': [summary['BayesSearchCV']['mean']],
        'bayes_median': [summary['BayesSearchCV']['median']],
        'time_mean': [summary['Time']['mean']],
        'time_median': [summary['Time']['median']]
    })
    
    total_diffs_df = pd.concat([total_diffs_df, summary_row], ignore_index=True)

print()
print(total_diffs_df)

   random_score  bayes_score  time_ratio
0      0.006662     0.003448    9.787245
1      0.009163     0.011983   19.329395
2      0.002074     0.002221    3.971060
3     -0.001674    -0.001855    4.836018
   random_score  bayes_score  time_ratio
0      0.014599     0.014599    7.623124
1      0.012047     0.012079   18.126382
2      0.000221     0.000354    3.339638
3      0.000422     0.000543    7.649208
   random_score  bayes_score  time_ratio
0      0.009069     0.018248   11.824899
1      0.011983     0.010285   21.963220
2      0.002045     0.002472    4.868045
3      0.001361    -0.001639    5.561116

  test_type  random_mean  random_median  bayes_mean  bayes_median  time_mean  \
0     C_140     0.004056       0.004368    0.003949      0.002835   9.480929   
1    l1_140     0.006822       0.006235    0.006893      0.006311   9.184588   
2               0.006114       0.005557    0.007341      0.006378  11.054320   

   time_median  
0     7.311632  
1     7.636166  
2     8.6930

In [62]:
base_values = total_diffs_df[total_diffs_df['test_type'] == ''].iloc[0]
relative_df = pd.DataFrame()

for column in ['random_mean', 'bayes_mean', 'random_median', 'bayes_median']:
    relative_df[f'relative_{column}'] = total_diffs_df[column] / base_values[column]

relative_df = relative_df.iloc[:-1]

print(relative_df)

   relative_random_mean  relative_bayes_mean  relative_random_median  \
0              0.663420             0.537955                0.786059   
1              1.115743             0.939003                1.121902   

   relative_bayes_median  
0               0.444484  
1               0.989459  
