*this notebook was run in Google Colab*

# setup

In [None]:
# Mount Google Drive into the Colab runtime and make the project folder the working
# directory, so the relative file names used by later cells resolve inside that folder.
from google.colab import drive

mount='/content/gdrive'
print("Colab: mounting Google drive on ", mount)

drive.mount(mount)

# switch to the directory on the Google Drive that you want to use
import os
drive_root = mount + "/My Drive/Colab Notebooks/thesis_training_models"
  
# create drive_root if it doesn't exist
# (set create_drive_root to False to skip the directory creation)
create_drive_root = True
if create_drive_root:
    print("\nColab: making sure ", drive_root, " exists.")
    # exist_ok=True makes this a no-op when the folder is already there
    os.makedirs(drive_root, exist_ok=True)

# change to the directory
print("\nColab: Changing directory to ", drive_root)
# IPython magic: $drive_root is expanded to the Python variable's value
%cd $drive_root

Colab: mounting Google drive on  /content/gdrive
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).

Colab: making sure  /content/gdrive/My Drive/Colab Notebooks/thesis_training_models  exists.

Colab: Changing directory to  /content/gdrive/My Drive/Colab Notebooks/thesis_training_models
/content/gdrive/My Drive/Colab Notebooks/thesis_training_models


In [None]:
# check computational resources: GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Jul 28 10:04:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Report the runtime's total memory and whether it qualifies as a high-RAM runtime.
from psutil import virtual_memory

# total is reported in bytes; dividing by 1e9 gives decimal gigabytes
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses for "high-RAM"
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# installations
# ogb provides the DglGraphPropPredDataset loader imported further down
# NOTE(review): no version is pinned here or below, so a re-run may install different releases
! pip install ogb

# dgl
# NOTE(review): dgl-cu113 looks like a CUDA 11.3 wheel while the nvidia-smi output above
# reports CUDA 11.2 - confirm the two are compatible on the target runtime
!pip install dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/repo.html


In [None]:
# imports
import dgl

from dgl import backend as F

import pandas as pd

from ogb.graphproppred import DglGraphPropPredDataset

from sklearn.svm import SVC

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_curve, auc

import numpy as np

import pickle

import re

from tensorflow.python.client import device_lib

from os.path import exists as file_exists

from sklearn.feature_selection import SelectPercentile, f_classif, SelectFromModel

from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.neural_network import MLPClassifier

# helper functions

In [None]:
def get_split_data(input_path):
    """Load the preprocessed descriptor table and split it with the OGB split indices.

    Parameters
    ----------
    input_path : str
        Path of the preprocessed descriptor CSV; it must contain the label column "y".
        Earlier versions of this function ignored the argument and always read
        "./dataset/1D_2D_PubChemFP_SubFP_preprocessed.csv"; that location is still
        used as a fallback when nothing exists at `input_path`, so existing calls
        keep working.

    Returns
    -------
    tuple
        (train_x, train_y, valid_x, valid_y, test_x, test_y). Each *_x frame holds
        every column except "y"; each *_y frame holds only the "y" column.
    """
    legacy_path = "./dataset/1D_2D_PubChemFP_SubFP_preprocessed.csv"
    csv_path = input_path
    if not file_exists(csv_path):
        print(f"{csv_path} not found, falling back to {legacy_path}")
        csv_path = legacy_path

    # load data (only its split indices are used here)
    dataset = DglGraphPropPredDataset(name = "ogbg-molhiv")

    # load processed data
    df = pd.read_csv(csv_path)

    # load splitting indices with OGB scaffold splitting
    split_idx = dataset.get_idx_split()

    # boolean column mask selecting everything but the label
    feature_cols = df.columns != "y"

    # NOTE: features are selected by position (.iloc) and labels by index label (.loc);
    # the two agree only while df keeps the default 0..n-1 index that read_csv produces
    pieces = []
    for part in ("train", "valid", "test"):
        idx = split_idx[part]
        pieces.append(df.iloc[idx, feature_cols])
        pieces.append(df.loc[idx, ["y"]])
    return tuple(pieces)

In [None]:
def select_features(data_tr_x, data_va_x, data_te_x, data_tr_y):
    """Two-stage feature selection, fitted on the training split only.

    Stage one is a SelectPercentile(f_classif, percentile=70) filter. Stage two is a
    SelectFromModel wrapped around an XGBClassifier trained on the stage-one output.
    Both fitted selectors are then applied unchanged to validation and test data.

    Parameters
    ----------
    data_tr_x, data_va_x, data_te_x : feature tables for train / validation / test;
        data_tr_x must expose `.columns` so the surviving feature names can be reported.
    data_tr_y : training labels; flattened with `.values.ravel()` before fitting.

    Returns
    -------
    tuple
        (train, validation, test) feature matrices after both stages, followed by the
        names of the retained features.
    """
    labels = data_tr_y.values.ravel()
    raw_splits = (data_tr_x, data_va_x, data_te_x)

    # stage 1: univariate feature selection
    univariate = SelectPercentile(f_classif, percentile=70)
    univariate.fit(data_tr_x, labels)
    filtered_tr, filtered_va, filtered_te = [univariate.transform(split) for split in raw_splits]

    # stage 2: select from model
    booster = XGBClassifier(random_state=0)
    booster = booster.fit(filtered_tr, labels)
    model_based = SelectFromModel(booster, prefit=True)
    selected_tr, selected_va, selected_te = [
        model_based.transform(split) for split in (filtered_tr, filtered_va, filtered_te)
    ]

    # the second mask indexes into the columns that survived the first stage
    surviving_names = data_tr_x.columns[univariate.get_support()][model_based.get_support()]

    return selected_tr, selected_va, selected_te, surviving_names

In [None]:
# metrics
def statistical(y_true, y_pred, y_pro):
    """Compute binary-classification metrics from labels, hard predictions and scores.

    Parameters
    ----------
    y_true : true labels (the positive class is 1, see `pos_label` below).
    y_pred : predicted class labels, used for the confusion-matrix based metrics.
    y_pro : predicted scores for the positive class, used for the two AUC metrics.

    Returns
    -------
    tuple
        (tn, fp, fn, tp, se, sp, acc, mcc, auc_prc, auc_roc) where se is sensitivity,
        sp is specificity, acc is accuracy, mcc is the Matthews correlation coefficient
        and auc_prc / auc_roc are the areas under the precision-recall and ROC curves.
    """
    c_mat = confusion_matrix(y_true, y_pred)
    # flattening a 2x2 matrix row by row yields tn, fp, fn, tp
    tn, fp, fn, tp = list(c_mat.flatten())
    se = tp / (tp + fn)
    sp = tn / (tn + fp)
    acc = (tp + tn) / (tn + fp + fn + tp)

    # MCC is computed in floats: the denominator multiplies four counts and would
    # silently wrap around in int64 arithmetic once the dataset gets large enough
    tn_f, fp_f, fn_f, tp_f = (float(count) for count in (tn, fp, fn, tp))
    mcc_denominator = np.sqrt((tp_f + fp_f) * (tp_f + fn_f) * (tn_f + fp_f) * (tn_f + fn_f) + 1e-8)
    mcc = (tp_f * tn_f - fp_f * fn_f) / mcc_denominator

    # the curve is computed once (it used to be computed twice); recall is the x axis
    precision, recall, _ = precision_recall_curve(y_true, y_pro, pos_label=1)
    auc_prc = auc(recall, precision)
    auc_roc = roc_auc_score(y_true, y_pro)
    return tn, fp, fn, tp, se, sp, acc, mcc, auc_prc, auc_roc

In [None]:
# calculate positive weight
def get_pos_weight(data):
    """Return (number of labels - sum of labels) / sum of labels for `data.labels`.

    With 0/1 labels this is the negative-to-positive ratio - presumably what the
    name refers to; confirm the label encoding against the caller.
    """
    positives = F.sum(data.labels, dim=0)
    total = F.tensor(len(data.labels))
    negatives = total - positives
    return negatives / positives

# data

In [None]:
# Split the full descriptor table into train / validation / test features and labels.
(
    data_tr_x, data_tr_y,
    data_va_x, data_va_y,
    data_te_x, data_te_y,
) = get_split_data("./data/preprocessed data/1D_2D_PubChemFP_SubFP_preprocessed.csv")
print(f"full descriptor data contains {data_tr_x.shape[1]} features")

full descriptor data contains 233 features


In [None]:
# Reduce all three splits to the features chosen on the training split.
data_tr_x_fs, data_va_x_fs, data_te_x_fs, new_feats = select_features(
    data_tr_x, data_va_x, data_te_x, data_tr_y
)
print(f"feature selection yielded {data_tr_x_fs.shape[1]} from {data_tr_x.shape[1]} original features")
print(f"selected features are: {new_feats}")

feature selection yielded 67 from 233 original features
selected features are: Index(['naAromAtom', 'nN', 'nO', 'nS', 'nP', 'AATS1m', 'AATS2m', 'AATS5m',
       'AATS7m', 'AATS3p', 'AATS4p', 'AATS2i', 'ATSC0m', 'ATSC4v', 'ATSC5v',
       'ATSC6v', 'ATSC0p', 'ATSC1p', 'ATSC3p', 'ATSC8p', 'ATSC3i', 'ATSC4i',
       'ATSC7i', 'AATSC1m', 'AATSC1v', 'AATSC8i', 'MATS6e', 'GATS1c', 'GATS2c',
       'GATS4c', 'GATS6c', 'GATS7c', 'GATS2m', 'GATS2e', 'GATS3e', 'GATS7i',
       'GATS3s', 'GATS4s', 'GATS5s', 'nBondsS3', 'nBondsD', 'C3SP2', 'C3SP3',
       'fragC', 'nHBAcc2', 'nHBDon', 'IC2', 'IC3', 'CIC2', 'MIC0', 'ZMIC1',
       'nAtomP', 'nAtomLAC', 'MLogP', 'MDEC-23', 'MDEO-22', 'MDEN-12',
       'MDEN-23', 'nRotB', 'topoRadius', 'GGI1', 'GGI2', 'GGI3', 'SpMAD_D',
       'EE_D', 'VE3_D', 'SRW5'],
      dtype='object')


# experimentation overview

In [None]:
# Show the table of experiments recorded so far, if one has been written yet.
overview_df_filename = "experimentation_overview"
if file_exists(overview_df_filename):
    overview_df = pd.read_parquet(overview_df_filename)
    display(overview_df)
else:
    print("no experiments conducted yet, please run the models")

Unnamed: 0,model_type,filename,GPU_accelerator,RAM,data_features,hyperparameters,train_performance_ROC-AUC_avg/std/max,valid_performance_ROC-AUC_avg/std/max,test_performance_ROC-AUC_avg/std/max
0,SVM,svm_opt_full_descr_5k_50,Tesla P100-PCIE-16GB,27.33 GB,all features (233 feats),"{'C': 40.52181430939162, 'break_ties': False, ...","[0.99155, 1e-05, 0.99157]","[0.79398, 7e-05, 0.79412]","[0.65325, 4e-05, 0.65331]"
1,SVM,svm_opt_feat_select_15k_50,Tesla P100-PCIE-16GB,27.33 GB,feature selection (67 feats),"{'C': 0.9146720748918769, 'break_ties': False,...","[0.99813, 0.0, 0.99813]","[0.79447, 0.0002, 0.79488]","[0.75988, 0.00025, 0.76021]"
2,RF,rf_opt_full_descr_50,Tesla P100-PCIE-16GB,27.33 GB,all features (233 feats),"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[0.99116, 0.00023, 0.99144]","[0.78926, 0.00851, 0.80319]","[0.78592, 0.00485, 0.79239]"
3,RF,rf_opt_feat_select_50,Tesla P100-PCIE-16GB,27.33 GB,feature selection (67 feats),"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...","[0.95785, 0.00362, 0.96235]","[0.74416, 0.04167, 0.81029]","[0.73382, 0.01442, 0.76491]"
4,XGB,xgb_opt_full_descr_50,Tesla P100-PCIE-16GB,27.33 GB,all features (233 feats),"{'base_score': 0.5, 'booster': 'gbtree', 'cols...","[0.99891, 8e-05, 0.99908]","[0.79425, 0.00737, 0.80282]","[0.75037, 0.00928, 0.76298]"
5,XGB,xgb_opt_feat_select_50,Tesla P100-PCIE-16GB,27.33 GB,feature selection (67 feats),"{'base_score': 0.5, 'booster': 'gbtree', 'cols...","[0.89512, 0.0249, 0.92884]","[0.7885, 0.00918, 0.80097]","[0.76791, 0.01247, 0.77909]"
6,MLP,mlp_opt_full_descr_50,Tesla T4,27.33 GB,all features (233 feats),"{'activation': 'relu', 'alpha': 0.000274680092...","[0.9997, 0.00035, 0.99998]","[0.7626, 0.01725, 0.79007]","[0.70937, 0.01347, 0.73482]"


# SVM

In [None]:
# hyperparameter optimization setup
# number of hyperopt evaluations (passed to fmin as max_evals)
OPT_ITERS = 50
# how many times the best configuration is refit and scored with a different random_state
repetitions = 10
# solver iteration cap handed to every SVC
max_iter = 5000  
# kernel cache size handed to SVC (megabytes, per scikit-learn's SVC documentation)
cache_size = 1000

# search space sampled by hyperopt: C and gamma of the RBF-kernel SVC, both drawn uniformly
svm_hyper_space = {'C': hp.uniform('C', 0.1, 100),
                   'gamma': hp.uniform('gamma', 0, 0.3)}

## SVM (all features)

In [None]:
def svm_hyper_opt(args):
    """Hyperopt objective for the SVM on the full descriptor set.

    Fits an RBF-kernel SVC with the sampled hyperparameters in `args` on the training
    split and scores it on the validation split.

    Returns a dict with 'loss' = 1 - validation ROC-AUC (hyperopt minimises it) and
    'status' = STATUS_OK.
    """
    fixed_settings = dict(kernel='rbf', random_state=0, probability=True, class_weight='balanced',
                          cache_size=cache_size, max_iter=max_iter, verbose=True)
    classifier = SVC(**args, **fixed_settings)
    classifier.fit(data_tr_x, data_tr_y.values.ravel())

    # column 1 of predict_proba holds the positive-class probability
    positive_scores = classifier.predict_proba(data_va_x)[:, 1]
    return {'loss': 1 - roc_auc_score(data_va_y, positive_scores), 'status': STATUS_OK}

In [None]:
filename_svm_full_descr = "svm_opt_full_descr_5k_50" 

if file_exists(filename_svm_full_descr+".sav") and file_exists(filename_svm_full_descr+"_performance"):
    print("no training and optimization needed, everything can be loaded")

    # model
    print('\n')
    print("best SVM model on full descriptors is:")
    loaded_model = pickle.load(open(filename_svm_full_descr+".sav", 'rb'))
    print(loaded_model)

    # performance
    perf_df = pd.read_parquet(filename_svm_full_descr + "_performance")
    print('\n')
    print(f"mean ROC-AUC across {repetitions} different seeds")
    print(f"train: {round(np.average(perf_df['auc_roc'][0]), 5)} | validation: {round(np.average(perf_df['auc_roc'][1]), 5)}, test: {round(np.average(perf_df['auc_roc'][2]), 5)}")

    # overview
    print('\n')
    print("experimentation overview:")
    # load 
    overview_df = pd.read_parquet(overview_df_filename)
    print(overview_df.to_markdown())

else:
    print("performing training and optimization")

    # hyperparameter optimization
    print("starting hyperparameter optimization")
    trials = Trials()
    best_results_svm_full_descr = fmin(svm_hyper_opt, svm_hyper_space, algo=tpe.suggest, max_evals=OPT_ITERS, trials=trials)
    print(f"the best SVM hyperparameters are: {best_results_svm_full_descr}")
    best_model = SVC(C=best_results_svm_full_descr['C'], gamma=best_results_svm_full_descr['gamma'], kernel='rbf', random_state=0,
                            probability=True, class_weight='balanced', cache_size=cache_size, max_iter=max_iter, verbose=True)
    best_model.fit(data_tr_x, data_tr_y.values.ravel())

    # save hyperparameters
    with open(filename_svm_full_descr+"_hps", 'wb') as f:
        pickle.dump(best_results_svm_full_descr, f)
    # loadable via ...
    # with open(filename_svm_full_descr+"_hps", 'rb') as f:
    #    loaded_dict = pickle.load(f)

    # save best model
    pickle.dump(best_model, open(filename_svm_full_descr+".sav", 'wb'))
    # loadable via ...
    # best_model = pickle.load(open(filename_svm_full_descr+".sav", 'rb'))

    # repetitions for performance on different seeds
    tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc = [], [], [], [], [], [], [], [], [], []
    va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc = [], [], [], [], [], [], [], [], [], []
    te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc = [], [], [], [], [], [], [], [], [], []

    tr_lst = [tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc]
    va_lst = [va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc]
    te_lst = [te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]

    print("performing repetitions on different seeds")
    for i in range(repetitions):
        
        # first replicate model with initial seed
        if i == 0:
            seed = 0
        else:
            seed = np.random.randint(1, 999999) # all but initial random seed of 0
        
        best_model = SVC(C=best_results_svm_full_descr['C'], gamma=best_results_svm_full_descr['gamma'], kernel='rbf', random_state=seed,
                            probability=True, class_weight='balanced', cache_size=cache_size, max_iter=max_iter, verbose=True)
        
        best_model.fit(data_tr_x, data_tr_y.values.ravel())

        # training metrics calc
        tr_pred = best_model.predict_proba(data_tr_x)
        tr_metrics = list(statistical(data_tr_y, np.argmax(tr_pred, axis=1), tr_pred[:, 1]))

        # validation metric calc
        va_pred = best_model.predict_proba(data_va_x)
        va_metrics = list(statistical(data_va_y, np.argmax(va_pred, axis=1), va_pred[:, 1]))

        # test metric calc
        te_pred = best_model.predict_proba(data_te_x)
        te_metrics = list(statistical(data_te_y, np.argmax(te_pred, axis=1), te_pred[:, 1]))

        # creating dataframe
        for j in range(len(tr_lst)):               
            tr_lst[j].append(tr_metrics[j])
            va_lst[j].append(va_metrics[j])
            te_lst[j].append(te_metrics[j])

    metric_cls = ["tn", "fp", "fn", "tp", "se", "sp", "acc", "mcc", "auc_prc", "auc_roc"] 
    metrics_data = [["train", tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc],
                    ["validation", va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc],
                    ["test", te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]]
    svm_full_descr_perf = pd.DataFrame(metrics_data, columns = ["split"] + metric_cls)
    
    # save performance df
    svm_full_descr_perf.to_parquet(filename_svm_full_descr + "_performance", index=0)      
    # loadable via ...
    # svm_full_descr_perf = pd.read_parquet(filename_svm_full_descr + "_performance")

    # add model info 
    cols = ["avg_auc_roc", "std_auc_roc", "top_roc_auc"]
    tr_aggr = []
    va_aggr = []
    te_aggr = []
    results = [tr_aggr, va_aggr, te_aggr]

    for i in range(len(results)):
        # avg_auc_roc
        results[i].append(round(np.average(svm_full_descr_perf["auc_roc"][i]), 5))
        # std_auc_roc
        results[i].append(round(np.std(svm_full_descr_perf["auc_roc"][i]), 5))
        # top_roc_auc
        results[i].append(round(np.max(svm_full_descr_perf["auc_roc"][i]), 5))

    cls = ["model_type", "filename", "GPU_accelerator", "RAM", "data_features", "hyperparameters", "train_performance_ROC-AUC_avg/std/max", "valid_performance_ROC-AUC_avg/std/max", "test_performance_ROC-AUC_avg/std/max"]
    model_type = "SVM"
    data_features = f"all features ({data_tr_x.shape[1]} feats)"
    filename = filename_svm_full_descr
    GPU_info = !nvidia-smi -L
    GPU_accelerator = re.search(r"\: (.*?)\(", str(GPU_info)).group(1)
    RAM = f"{round(virtual_memory().total / 1e9, 2)} GB"
    hyperparameters = best_model.get_params() # all fixed and optimized hyperparameters
    tr_performance = tr_aggr
    va_performance = va_aggr
    te_performance = te_aggr

    # does overview table exist?
    if not file_exists(overview_df_filename):
        # create dataframe with model info
        info = [[model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]]
        df = pd.DataFrame(info, columns=cls)
        # save
        df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(df.to_markdown())
    else:
        # load 
        overview_df = pd.read_parquet(overview_df_filename)

        # add row for model
        new_row = {}
        keys = cls
        values = [model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]
        for key in keys:
            for value in values:
                new_row[key] = value
                values.remove(value)
                break 
        overview_df = overview_df.append(new_row, ignore_index=True)
        # save
        overview_df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(overview_df.to_markdown())

performing training and optimization
starting hyperparameter optimization
[LibSVM]
  0%|          | 0/50 [00:00<?, ?it/s, best loss: ?]




[LibSVM]
  2%|▏         | 1/50 [04:44<3:52:01, 284.11s/it, best loss: 0.3271038482265335]




[LibSVM]
  4%|▍         | 2/50 [09:23<3:44:52, 281.10s/it, best loss: 0.3042098398001176]




[LibSVM]
  6%|▌         | 3/50 [14:01<3:39:08, 279.76s/it, best loss: 0.3042098398001176]




[LibSVM]
  8%|▊         | 4/50 [18:40<3:34:22, 279.63s/it, best loss: 0.2985621203213795]




[LibSVM]
 10%|█         | 5/50 [23:29<3:32:18, 283.08s/it, best loss: 0.2985621203213795]




[LibSVM]
 12%|█▏        | 6/50 [28:16<3:28:22, 284.15s/it, best loss: 0.2590954463060945]




[LibSVM]
 14%|█▍        | 7/50 [32:53<3:22:07, 282.05s/it, best loss: 0.2590954463060945]




[LibSVM]
 16%|█▌        | 8/50 [37:33<3:16:54, 281.31s/it, best loss: 0.2590954463060945]




[LibSVM]
 18%|█▊        | 9/50 [42:01<3:09:15, 276.97s/it, best loss: 0.2590954463060945]




[LibSVM]
 20%|██        | 10/50 [47:06<3:10:35, 285.88s/it, best loss: 0.2590954463060945]




[LibSVM]
 22%|██▏       | 11/50 [51:46<3:04:35, 283.99s/it, best loss: 0.2590954463060945]




[LibSVM]
 24%|██▍       | 12/50 [56:26<2:58:59, 282.63s/it, best loss: 0.2590954463060945]




[LibSVM]
 26%|██▌       | 13/50 [1:01:07<2:54:05, 282.30s/it, best loss: 0.2590954463060945]




[LibSVM]
 28%|██▊       | 14/50 [1:05:45<2:48:36, 281.02s/it, best loss: 0.2590954463060945]




[LibSVM]
 30%|███       | 15/50 [1:09:41<2:36:00, 267.44s/it, best loss: 0.2590954463060945]




[LibSVM]
 32%|███▏      | 16/50 [1:14:08<2:31:26, 267.24s/it, best loss: 0.2590954463060945]




[LibSVM]
 34%|███▍      | 17/50 [1:18:36<2:27:03, 267.37s/it, best loss: 0.2590954463060945]




[LibSVM]
 36%|███▌      | 18/50 [1:21:22<2:06:23, 236.99s/it, best loss: 0.2300898368606703]




[LibSVM]
 38%|███▊      | 19/50 [1:26:17<2:11:26, 254.39s/it, best loss: 0.2300898368606703]




[LibSVM]
 40%|████      | 20/50 [1:30:45<2:09:13, 258.47s/it, best loss: 0.2300898368606703]




[LibSVM]
 42%|████▏     | 21/50 [1:33:38<1:52:30, 232.79s/it, best loss: 0.2300898368606703]




[LibSVM]
 44%|████▍     | 22/50 [1:36:34<1:40:47, 215.97s/it, best loss: 0.20610731432490692]




[LibSVM]
 46%|████▌     | 23/50 [1:40:37<1:40:44, 223.88s/it, best loss: 0.20610731432490692]




[LibSVM]
 48%|████▊     | 24/50 [1:44:31<1:38:23, 227.07s/it, best loss: 0.20610731432490692]




[LibSVM]
 50%|█████     | 25/50 [1:49:03<1:40:12, 240.51s/it, best loss: 0.20610731432490692]




[LibSVM]
 52%|█████▏    | 26/50 [1:52:55<1:35:10, 237.93s/it, best loss: 0.20610731432490692]




[LibSVM]
 54%|█████▍    | 27/50 [1:57:24<1:34:46, 247.23s/it, best loss: 0.20610731432490692]




[LibSVM]
 56%|█████▌    | 28/50 [2:02:15<1:35:29, 260.43s/it, best loss: 0.20610731432490692]




[LibSVM]
 58%|█████▊    | 29/50 [2:07:03<1:34:02, 268.71s/it, best loss: 0.20610731432490692]




[LibSVM]
 60%|██████    | 30/50 [2:10:34<1:23:45, 251.26s/it, best loss: 0.20610731432490692]




[LibSVM]
 62%|██████▏   | 31/50 [2:15:31<1:23:58, 265.20s/it, best loss: 0.20610731432490692]




[LibSVM]
 64%|██████▍   | 32/50 [2:19:32<1:17:19, 257.76s/it, best loss: 0.20610731432490692]




[LibSVM]
 66%|██████▌   | 33/50 [2:22:37<1:06:50, 235.93s/it, best loss: 0.20610731432490692]




[LibSVM]
 68%|██████▊   | 34/50 [2:27:01<1:05:07, 244.24s/it, best loss: 0.20610731432490692]




[LibSVM]
 70%|███████   | 35/50 [2:31:30<1:02:59, 251.96s/it, best loss: 0.20610731432490692]




[LibSVM]
 72%|███████▏  | 36/50 [2:36:03<1:00:11, 257.99s/it, best loss: 0.20610731432490692]




[LibSVM]
 74%|███████▍  | 37/50 [2:40:39<57:05, 263.51s/it, best loss: 0.20610731432490692]




[LibSVM]
 76%|███████▌  | 38/50 [2:45:15<53:27, 267.33s/it, best loss: 0.20610731432490692]




[LibSVM]
 78%|███████▊  | 39/50 [2:49:48<49:20, 269.12s/it, best loss: 0.20610731432490692]




[LibSVM]
 80%|████████  | 40/50 [2:53:25<42:14, 253.46s/it, best loss: 0.20610731432490692]




[LibSVM]
 82%|████████▏ | 41/50 [2:57:49<38:28, 256.54s/it, best loss: 0.20610731432490692]




[LibSVM]
 84%|████████▍ | 42/50 [3:02:16<34:37, 259.65s/it, best loss: 0.20610731432490692]




[LibSVM]
 86%|████████▌ | 43/50 [3:06:31<30:08, 258.39s/it, best loss: 0.20610731432490692]




[LibSVM]
 88%|████████▊ | 44/50 [3:11:05<26:17, 262.86s/it, best loss: 0.20610731432490692]




[LibSVM]
 90%|█████████ | 45/50 [3:15:29<21:56, 263.26s/it, best loss: 0.20610731432490692]




[LibSVM]
 92%|█████████▏| 46/50 [3:18:21<15:43, 235.79s/it, best loss: 0.20610731432490692]




[LibSVM]
 94%|█████████▍| 47/50 [3:22:51<12:18, 246.18s/it, best loss: 0.20610731432490692]




[LibSVM]
 96%|█████████▌| 48/50 [3:27:22<08:27, 253.74s/it, best loss: 0.20610731432490692]




[LibSVM]
 98%|█████████▊| 49/50 [3:31:55<04:19, 259.40s/it, best loss: 0.20610731432490692]




100%|██████████| 50/50 [3:36:28<00:00, 259.78s/it, best loss: 0.20610731432490692]
the best SVM hyperparameters are: {'C': 40.52181430939162, 'gamma': 0.003877592396436093}
[LibSVM]



performing repetitions on different seeds
[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



model information added to experimentation overview
|    | model_type   | filename                 | GPU_accelerator      | RAM      | data_features            | hyperparameters                                                                                                                                                                                                                                                                                                             | train_performance_ROC-AUC_avg/std/max   | valid_performance_ROC-AUC_avg/std/max   | test_performance_ROC-AUC_avg/std/max   |
|---:|:-------------|:-------------------------|:---------------------|:---------|:-------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## SVM (feature selection)

In [None]:
def svm_hyper_opt_fs(args):
    """Hyperopt objective for the SVM on the feature-selected descriptors.

    Fits an RBF-kernel SVC with the sampled hyperparameters in `args` on the
    feature-selected training split and scores it on the feature-selected validation split.

    Returns a dict with 'loss' = 1 - validation ROC-AUC (hyperopt minimises it) and
    'status' = STATUS_OK.
    """
    fixed_settings = dict(kernel='rbf', random_state=0, probability=True, class_weight='balanced',
                          cache_size=cache_size, max_iter=max_iter, verbose=True)
    classifier = SVC(**args, **fixed_settings)
    classifier.fit(data_tr_x_fs, data_tr_y.values.ravel())

    # column 1 of predict_proba holds the positive-class probability
    positive_scores = classifier.predict_proba(data_va_x_fs)[:, 1]
    return {'loss': 1 - roc_auc_score(data_va_y, positive_scores), 'status': STATUS_OK}

In [None]:
filename_svm_feat_select = "svm_opt_feat_select_5k_50"

if file_exists(filename_svm_feat_select+".sav") and file_exists(filename_svm_feat_select+"_performance"):
    print("no training and optimization needed, everything can be loaded")

    # model
    print('\n')
    print("best SVM model with feature selection is:")
    loaded_model = pickle.load(open(filename_svm_feat_select+".sav", 'rb'))
    print(loaded_model)

    # performance
    perf_df = pd.read_parquet(filename_svm_feat_select + "_performance")
    print('\n')
    print(f"mean ROC-AUC across {repetitions} different seeds")
    print(f"train: {round(np.average(perf_df['auc_roc'][0]), 5)} | validation: {round(np.average(perf_df['auc_roc'][1]), 5)}, test: {round(np.average(perf_df['auc_roc'][2]), 5)}")

    # overview
    print('\n')
    print("experimentation overview:")
    # load 
    overview_df = pd.read_parquet(overview_df_filename)
    print(overview_df.to_markdown())

else:
    print("performing training and optimization")

    # hyperparameter optimization
    print("starting hyperparameter optimization")
    trials = Trials()
    best_results_svm_feat_select = fmin(svm_hyper_opt_fs, svm_hyper_space, algo=tpe.suggest, max_evals=OPT_ITERS, trials=trials)
    print(f"the best SVM hyper-parameters are: {best_results_svm_feat_select}")
    best_model = SVC(C=best_results_svm_feat_select['C'], gamma=best_results_svm_feat_select['gamma'], kernel='rbf', random_state=0,
                            probability=True, class_weight='balanced', cache_size=2000, max_iter=max_iter, verbose=True)
    best_model.fit(data_tr_x_fs, data_tr_y.values.ravel())

    # save hyperparameters
    with open(filename_svm_feat_select+"_hps", 'wb') as f:
        pickle.dump(best_results_svm_feat_select, f)
    # loadable via ...
    # with open(filename_svm_feat_select+"_hps", 'rb') as f:
    #    loaded_dict = pickle.load(f)

    # save best model
    pickle.dump(best_model, open(filename_svm_feat_select+".sav", 'wb'))
    # loadable via ...
    # best_model = pickle.load(open(filename_svm_feat_select+".sav", 'rb'))

    # repetitions for performance on different seeds
    tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc = [], [], [], [], [], [], [], [], [], []
    va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc = [], [], [], [], [], [], [], [], [], []
    te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc = [], [], [], [], [], [], [], [], [], []

    tr_lst = [tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc]
    va_lst = [va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc]
    te_lst = [te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]

    print("performing repetitions on different seeds")
    for i in range(repetitions):
        
        # first replicate model with initial seed
        if i == 0:
            seed = 0
        else:
            seed = np.random.randint(1, 999999) # all but initial random seed of 0
        
        best_model = SVC(C=best_results_svm_feat_select['C'], gamma=best_results_svm_feat_select['gamma'], kernel='rbf', random_state=seed,
                            probability=True, class_weight='balanced', cache_size=cache_size, max_iter=max_iter, verbose=True)
        
        best_model.fit(data_tr_x_fs, data_tr_y.values.ravel())

        # training metrics calc
        tr_pred = best_model.predict_proba(data_tr_x_fs)
        tr_metrics = list(statistical(data_tr_y, np.argmax(tr_pred, axis=1), tr_pred[:, 1]))

        # validation metric calc
        va_pred = best_model.predict_proba(data_va_x_fs)
        va_metrics = list(statistical(data_va_y, np.argmax(va_pred, axis=1), va_pred[:, 1]))

        # test metric calc
        te_pred = best_model.predict_proba(data_te_x_fs)
        te_metrics = list(statistical(data_te_y, np.argmax(te_pred, axis=1), te_pred[:, 1]))

        # creating dataframe
        for j in range(len(tr_lst)):               
            tr_lst[j].append(tr_metrics[j])
            va_lst[j].append(va_metrics[j])
            te_lst[j].append(te_metrics[j])

    metric_cls = ["tn", "fp", "fn", "tp", "se", "sp", "acc", "mcc", "auc_prc", "auc_roc"] 
    metrics_data = [["train", tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc],
                    ["validation", va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc],
                    ["test", te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]]
    svm_feat_select_perf = pd.DataFrame(metrics_data, columns = ["split"] + metric_cls)
    
    # save performance df
    svm_feat_select_perf.to_parquet(filename_svm_feat_select + "_performance", index=0)      
    # loadable via ...
    # svm_feat_select_perf = pd.read_parquet(filename_svm_feat_select + "_performance")

    # add model info 
    cols = ["avg_auc_roc", "std_auc_roc", "top_roc_auc"]
    tr_aggr = []
    va_aggr = []
    te_aggr = []
    results = [tr_aggr, va_aggr, te_aggr]

    for i in range(len(results)):
        # avg_auc_roc
        results[i].append(round(np.average(svm_feat_select_perf["auc_roc"][i]), 5))
        # std_auc_roc
        results[i].append(round(np.std(svm_feat_select_perf["auc_roc"][i]), 5))
        # top_roc_auc
        results[i].append(round(np.max(svm_feat_select_perf["auc_roc"][i]), 5))

    cls = ["model_type", "filename", "GPU_accelerator", "RAM", "data_features", "hyperparameters", "train_performance_ROC-AUC_avg/std/max", "valid_performance_ROC-AUC_avg/std/max", "test_performance_ROC-AUC_avg/std/max"]
    model_type = "SVM"
    data_features = f"feature selection ({data_tr_x_fs.shape[1]} feats)"
    filename = filename_svm_feat_select
    GPU_info = !nvidia-smi -L
    GPU_accelerator = re.search(r"\: (.*?)\(", str(GPU_info)).group(1)
    RAM = f"{round(virtual_memory().total / 1e9, 2)} GB"
    hyperparameters = best_model.get_params() # all fixed and optimized hyperparameters
    tr_performance = tr_aggr
    va_performance = va_aggr
    te_performance = te_aggr

    # does overview table exist?
    if not file_exists(overview_df_filename):
        # create dataframe with model info
        info = [[model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]]
        df = pd.DataFrame(info, columns=cls)
        # save
        df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(df.to_markdown())
    else:
        # load 
        overview_df = pd.read_parquet(overview_df_filename)

        # add row for model
        new_row = {}
        keys = cls
        values = [model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]
        for key in keys:
            for value in values:
                new_row[key] = value
                values.remove(value)
                break 
        overview_df = overview_df.append(new_row, ignore_index=True)
        # save
        overview_df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(overview_df.to_markdown())

performing training and optimization
starting hyperparameter optimization
[LibSVM]
  0%|          | 0/50 [00:00<?, ?it/s, best loss: ?]




[LibSVM]
  2%|▏         | 1/50 [01:49<1:29:09, 109.18s/it, best loss: 0.24576535861258075]




[LibSVM]
  4%|▍         | 2/50 [03:02<1:10:28, 88.09s/it, best loss: 0.22471462864981384]




[LibSVM]
  6%|▌         | 3/50 [04:53<1:17:20, 98.72s/it, best loss: 0.22471462864981384]




[LibSVM]
  8%|▊         | 4/50 [06:50<1:21:00, 105.67s/it, best loss: 0.22471462864981384]




[LibSVM]
 10%|█         | 5/50 [08:09<1:12:05, 96.13s/it, best loss: 0.21772578630217532]




[LibSVM]
 12%|█▏        | 6/50 [09:43<1:09:57, 95.41s/it, best loss: 0.21772578630217532]




[LibSVM]
 14%|█▍        | 7/50 [10:53<1:02:32, 87.27s/it, best loss: 0.21772578630217532]




[LibSVM]
 16%|█▌        | 8/50 [12:30<1:03:12, 90.29s/it, best loss: 0.21772578630217532]




[LibSVM]
 18%|█▊        | 9/50 [13:39<57:11, 83.70s/it, best loss: 0.21772578630217532]




[LibSVM]
 20%|██        | 10/50 [15:26<1:00:26, 90.67s/it, best loss: 0.21772578630217532]




[LibSVM]
 22%|██▏       | 11/50 [17:11<1:01:50, 95.13s/it, best loss: 0.21772578630217532]




[LibSVM]
 24%|██▍       | 12/50 [19:08<1:04:28, 101.79s/it, best loss: 0.20573375955320405]




[LibSVM]
 26%|██▌       | 13/50 [21:00<1:04:41, 104.90s/it, best loss: 0.20573375955320405]




[LibSVM]
 28%|██▊       | 14/50 [22:46<1:03:04, 105.11s/it, best loss: 0.20573375955320405]




[LibSVM]
 30%|███       | 15/50 [24:40<1:02:59, 107.97s/it, best loss: 0.20573375955320405]




[LibSVM]
 32%|███▏      | 16/50 [26:33<1:02:05, 109.56s/it, best loss: 0.20573375955320405]




[LibSVM]
 34%|███▍      | 17/50 [27:47<54:22, 98.87s/it, best loss: 0.20573375955320405]




[LibSVM]
 36%|███▌      | 18/50 [29:38<54:31, 102.24s/it, best loss: 0.20573375955320405]




[LibSVM]
 38%|███▊      | 19/50 [31:29<54:12, 104.93s/it, best loss: 0.20573375955320405]




[LibSVM]
 40%|████      | 20/50 [33:08<51:36, 103.23s/it, best loss: 0.20573375955320405]




[LibSVM]
 42%|████▏     | 21/50 [35:03<51:39, 106.86s/it, best loss: 0.20573375955320405]




[LibSVM]
 44%|████▍     | 22/50 [36:45<49:08, 105.31s/it, best loss: 0.20573375955320405]




[LibSVM]
 46%|████▌     | 23/50 [38:26<46:51, 104.14s/it, best loss: 0.20573375955320405]




[LibSVM]
 48%|████▊     | 24/50 [39:56<43:10, 99.65s/it, best loss: 0.20573375955320405]




[LibSVM]
 50%|█████     | 25/50 [41:24<40:09, 96.36s/it, best loss: 0.20573375955320405]




[LibSVM]
 52%|█████▏    | 26/50 [43:03<38:47, 96.99s/it, best loss: 0.20573375955320405]




[LibSVM]
 54%|█████▍    | 27/50 [44:37<36:50, 96.11s/it, best loss: 0.20573375955320405]




[LibSVM]
 56%|█████▌    | 28/50 [46:32<37:23, 101.96s/it, best loss: 0.20573375955320405]




[LibSVM]
 58%|█████▊    | 29/50 [47:47<32:48, 93.72s/it, best loss: 0.20573375955320405]




[LibSVM]
 60%|██████    | 30/50 [49:20<31:07, 93.39s/it, best loss: 0.20573375955320405]




[LibSVM]
 62%|██████▏   | 31/50 [50:58<30:00, 94.79s/it, best loss: 0.20573375955320405]




[LibSVM]
 64%|██████▍   | 32/50 [52:41<29:13, 97.42s/it, best loss: 0.20573375955320405]




[LibSVM]
 66%|██████▌   | 33/50 [54:29<28:28, 100.52s/it, best loss: 0.20573375955320405]




[LibSVM]
 68%|██████▊   | 34/50 [56:18<27:26, 102.93s/it, best loss: 0.20573375955320405]




[LibSVM]
 70%|███████   | 35/50 [57:49<24:51, 99.44s/it, best loss: 0.20573375955320405]




[LibSVM]
 72%|███████▏  | 36/50 [59:33<23:33, 100.96s/it, best loss: 0.20573375955320405]




[LibSVM]
 74%|███████▍  | 37/50 [1:00:43<19:50, 91.61s/it, best loss: 0.20573375955320405]




[LibSVM]
 76%|███████▌  | 38/50 [1:01:47<16:39, 83.27s/it, best loss: 0.20573375955320405]




[LibSVM]
 78%|███████▊  | 39/50 [1:03:26<16:09, 88.12s/it, best loss: 0.20573375955320405]




[LibSVM]
 80%|████████  | 40/50 [1:04:52<14:34, 87.41s/it, best loss: 0.20573375955320405]




[LibSVM]
 82%|████████▏ | 41/50 [1:06:44<14:12, 94.68s/it, best loss: 0.20573375955320405]




[LibSVM]
 84%|████████▍ | 42/50 [1:08:27<12:57, 97.22s/it, best loss: 0.20573375955320405]




[LibSVM]
 86%|████████▌ | 43/50 [1:10:15<11:43, 100.50s/it, best loss: 0.20573375955320405]




[LibSVM]
 88%|████████▊ | 44/50 [1:11:57<10:04, 100.80s/it, best loss: 0.20573375955320405]




[LibSVM]
 90%|█████████ | 45/50 [1:13:40<08:28, 101.66s/it, best loss: 0.20573375955320405]




[LibSVM]
 92%|█████████▏| 46/50 [1:15:02<06:22, 95.63s/it, best loss: 0.20573375955320405]




[LibSVM]
 94%|█████████▍| 47/50 [1:16:12<04:23, 87.94s/it, best loss: 0.20573375955320405]




[LibSVM]
 96%|█████████▌| 48/50 [1:17:50<03:02, 91.06s/it, best loss: 0.20573375955320405]




[LibSVM]
 98%|█████████▊| 49/50 [1:19:03<01:25, 85.63s/it, best loss: 0.20573375955320405]




100%|██████████| 50/50 [1:20:51<00:00, 97.03s/it, best loss: 0.20573375955320405]
the best SVM hyper-parameters are: {'C': 0.9146720748918769, 'gamma': 0.07830765964249707}
[LibSVM]



performing repetitions on different seeds
[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



[LibSVM]



model information added to experimentation overview
|    | model_type   | filename                   | GPU_accelerator      | RAM      | data_features                | hyperparameters                                                                                                                                                                                                                                                                                                             | train_performance_ROC-AUC_avg/std/max   | valid_performance_ROC-AUC_avg/std/max   | test_performance_ROC-AUC_avg/std/max   |
|---:|:-------------|:---------------------------|:---------------------|:---------|:-----------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# RF

In [None]:
# hyperparameter optimization setup for RF
OPT_ITERS = 50    # number of hyperopt evaluations (max_evals of fmin)
repetitions = 10  # number of refits of the best model on different seeds

# candidate values of every hyperparameter sampled with hp.choice();
# the training cells index into these lists with the values returned by fmin()
# to rebuild the best model
n_estimators_ls = [10, 25, 50, 100, 200, 300, 400, 500]
max_depth_ls = range(3, 15)
min_samples_leaf_ls = [1, 3, 5, 10, 25, 50]
max_features_ls = ['sqrt', 'log2', 0.2, 0.4, 0.6, 0.8]

# search space handed to fmin(); built from the candidate lists above
rf_hyper_space = {
    'n_estimators': hp.choice('n_estimators', n_estimators_ls),
    'max_depth': hp.choice('max_depth', max_depth_ls),
    'min_samples_leaf': hp.choice('min_samples_leaf', min_samples_leaf_ls),
    'min_impurity_decrease': hp.uniform('min_impurity_decrease', 0, 0.02),
    'max_features': hp.choice('max_features', max_features_ls),
}

## RF (all features)

In [None]:
def rf_hyper_opt(args):
    """Hyperopt objective for the random forest trained on all features.

    Builds a ``RandomForestClassifier`` from ``args`` (plus fixed settings:
    all cores, seed 0, balanced class weights), fits it on the training split
    and scores it on the validation split.

    Parameters
    ----------
    args : dict
        Hyperparameters forwarded as keyword arguments to
        ``RandomForestClassifier``.

    Returns
    -------
    dict
        ``'loss'`` is ``1 - ROC-AUC`` on the validation split, ``'status'`` is
        ``STATUS_OK``.
    """
    forest = RandomForestClassifier(**args, n_jobs=-1, random_state=0, verbose=0, class_weight='balanced')
    forest.fit(data_tr_x, data_tr_y.values.ravel())

    # second column of predict_proba is used as the ranking score
    class1_proba = forest.predict_proba(data_va_x)[:, 1]
    validation_auc = roc_auc_score(data_va_y, class1_proba)

    return {'loss': 1 - validation_auc, 'status': STATUS_OK}

In [None]:
filename_rf_full_descr = "rf_opt_full_descr_50"

if file_exists(filename_rf_full_descr+".sav") and file_exists(filename_rf_full_descr+"_performance"):
    print("no training and optimization needed, everything can be loaded")

    # model
    print('\n')
    print("best RF model on full descriptors is:")
    loaded_model = pickle.load(open(filename_rf_full_descr+".sav", 'rb'))
    print(loaded_model)

    # performance
    perf_df = pd.read_parquet(filename_rf_full_descr + "_performance")
    print('\n')
    print(f"mean ROC-AUC across {repetitions} different seeds")
    print(f"train: {round(np.average(perf_df['auc_roc'][0]), 5)} | validation: {round(np.average(perf_df['auc_roc'][1]), 5)}, test: {round(np.average(perf_df['auc_roc'][2]), 5)}")

    # overview
    print('\n')
    print("experimentation overview:")
    # load 
    overview_df = pd.read_parquet(overview_df_filename)
    print(overview_df.to_markdown())

else:
    print("performing training and optimization")

    # hyperparameter optimization
    print("starting hyperparameter optimization")
    trials = Trials()
    best_results_rf_full_descr = fmin(rf_hyper_opt, rf_hyper_space, algo=tpe.suggest, max_evals=OPT_ITERS, trials=trials)
    
    text = (
            "the best RF hyperparameters are: "
            f"n_estimators {n_estimators_ls[best_results_rf_full_descr['n_estimators']]} | "
            f"max_depth {max_depth_ls[best_results_rf_full_descr['max_depth']]} | "
            f"min_samples_leaf {min_samples_leaf_ls[best_results_rf_full_descr['min_samples_leaf']]} | "
            f"max_features {max_features_ls[best_results_rf_full_descr['max_features']]} | "
            f"min_impurity_decrease {best_results_rf_full_descr['min_impurity_decrease']}"
            )
    print(text)
    best_model = RandomForestClassifier(n_estimators=n_estimators_ls[best_results_rf_full_descr['n_estimators']],
                                        max_depth=max_depth_ls[best_results_rf_full_descr['max_depth']],
                                        min_samples_leaf=min_samples_leaf_ls[best_results_rf_full_descr['min_samples_leaf']],
                                        max_features=max_features_ls[best_results_rf_full_descr['max_features']],
                                        min_impurity_decrease=best_results_rf_full_descr['min_impurity_decrease'],
                                        n_jobs=-1, random_state=0, verbose=0, class_weight='balanced')

    best_model.fit(data_tr_x, data_tr_y.values.ravel())

    # save hyperparameters
    with open(filename_rf_full_descr+"_hps", 'wb') as f:
        pickle.dump(best_results_rf_full_descr, f)
    # loadable via ...
    # with open(filename_rf_full_descr+"_hps", 'rb') as f:
    #    loaded_dict = pickle.load(f)

    # save best model
    pickle.dump(best_model, open(filename_rf_full_descr+".sav", 'wb'))
    # loadable via ...
    # best_model = pickle.load(open(filename_rf_full_descr+".sav", 'rb'))

    # repetitions for performance on different seeds
    tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc = [], [], [], [], [], [], [], [], [], []
    va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc = [], [], [], [], [], [], [], [], [], []
    te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc = [], [], [], [], [], [], [], [], [], []

    tr_lst = [tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc]
    va_lst = [va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc]
    te_lst = [te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]

    print("performing repetitions on different seeds")
    for i in range(repetitions):
        
        # first replicate model with initial seed
        if i == 0:
            seed = 0
        else:
            seed = np.random.randint(1, 999999) # all but initial random seed of 0
        
        best_model = RandomForestClassifier(n_estimators=n_estimators_ls[best_results_rf_full_descr['n_estimators']],
                                            max_depth=max_depth_ls[best_results_rf_full_descr['max_depth']],
                                            min_samples_leaf=min_samples_leaf_ls[best_results_rf_full_descr['min_samples_leaf']],
                                            max_features=max_features_ls[best_results_rf_full_descr['max_features']],
                                            min_impurity_decrease=best_results_rf_full_descr['min_impurity_decrease'],
                                            n_jobs=-1, random_state=seed, verbose=0, class_weight='balanced')
        
        best_model.fit(data_tr_x, data_tr_y.values.ravel())

        # training metrics calc
        tr_pred = best_model.predict_proba(data_tr_x)
        tr_metrics = list(statistical(data_tr_y, np.argmax(tr_pred, axis=1), tr_pred[:, 1]))

        # validation metric calc
        va_pred = best_model.predict_proba(data_va_x)
        va_metrics = list(statistical(data_va_y, np.argmax(va_pred, axis=1), va_pred[:, 1]))

        # test metric calc
        te_pred = best_model.predict_proba(data_te_x)
        te_metrics = list(statistical(data_te_y, np.argmax(te_pred, axis=1), te_pred[:, 1]))

        # creating dataframe
        for j in range(len(tr_lst)):               
            tr_lst[j].append(tr_metrics[j])
            va_lst[j].append(va_metrics[j])
            te_lst[j].append(te_metrics[j])

    metric_cls = ["tn", "fp", "fn", "tp", "se", "sp", "acc", "mcc", "auc_prc", "auc_roc"] 
    metrics_data = [["train", tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc],
                    ["validation", va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc],
                    ["test", te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]]
    rf_full_descr_perf = pd.DataFrame(metrics_data, columns = ["split"] + metric_cls)
    
    # save performance df
    rf_full_descr_perf.to_parquet(filename_rf_full_descr + "_performance", index=0)      
    # loadable via ...
    # rf_full_descr_perf = pd.read_parquet(filename_rf_full_descr + "_performance")

    # add model info 
    cols = ["avg_auc_roc", "std_auc_roc", "top_roc_auc"]
    tr_aggr = []
    va_aggr = []
    te_aggr = []
    results = [tr_aggr, va_aggr, te_aggr]

    for i in range(len(results)):
        # avg_auc_roc
        results[i].append(round(np.average(rf_full_descr_perf["auc_roc"][i]), 5))
        # std_auc_roc
        results[i].append(round(np.std(rf_full_descr_perf["auc_roc"][i]), 5))
        # top_roc_auc
        results[i].append(round(np.max(rf_full_descr_perf["auc_roc"][i]), 5))

    cls = ["model_type", "filename", "GPU_accelerator", "RAM", "data_features", "hyperparameters", "train_performance_ROC-AUC_avg/std/max", "valid_performance_ROC-AUC_avg/std/max", "test_performance_ROC-AUC_avg/std/max"]
    model_type = "RF"
    data_features = f"all features ({data_tr_x.shape[1]} feats)"
    filename = filename_rf_full_descr
    GPU_info = !nvidia-smi -L
    GPU_accelerator = re.search(r"\: (.*?)\(", str(GPU_info)).group(1)
    RAM = f"{round(virtual_memory().total / 1e9, 2)} GB"
    hp_dict = best_model.get_params() # all fixed and optimized hyperparameters
    hyperparameters = hp_dict
    tr_performance = tr_aggr
    va_performance = va_aggr
    te_performance = te_aggr

    # does overview table exist?
    if not file_exists(overview_df_filename):
        # create dataframe with model info
        info = [[model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]]
        df = pd.DataFrame(info, columns=cls)
        # save
        df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(df.to_markdown())
    else:
        # load 
        overview_df = pd.read_parquet(overview_df_filename)

        # add row for model
        new_row = {}
        keys = cls
        values = [model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]
        for key in keys:
            for value in values:
                new_row[key] = value
                values.remove(value)
                break 
        overview_df = overview_df.append(new_row, ignore_index=True)
        overview_df["hyperparameters"] = overview_df["hyperparameters"].astype(str)
        # save new df
        overview_df.to_parquet(overview_df_filename, index=0)   
        print("model information added to experimentation overview")
        print(overview_df.to_markdown())

performing training and optimization
starting hyperparameter optimization
100%|██████████| 50/50 [49:28<00:00, 59.38s/it, best loss: 0.20491928767391732]
the best RF hyperparameters are: n_estimators 100 | max_depth 13 | min_samples_leaf 3 | max_features 0.4 | min_impurity_decrease 0.0007866053743963098
performing repetitions on different seeds
model information added to experimentation overview
|    | model_type   | filename                   | GPU_accelerator      | RAM      | data_features                | hyperparameters                                                                                                                                                                                                                                                                                                                                                                                                             | train_performance_ROC-AUC_avg/std/max   | valid_performance_ROC-AUC_avg

## RF (feature selection)

In [None]:
def rf_hyper_opt_fs(args):
    """Hyperopt objective for the random forest trained on the selected features.

    Same procedure as the all-features objective, but fitted and scored on the
    ``*_fs`` feature matrices.

    Parameters
    ----------
    args : dict
        Hyperparameters forwarded as keyword arguments to
        ``RandomForestClassifier``.

    Returns
    -------
    dict
        ``'loss'`` is ``1 - ROC-AUC`` on the validation split, ``'status'`` is
        ``STATUS_OK``.
    """
    forest = RandomForestClassifier(**args, n_jobs=-1, random_state=0, verbose=0, class_weight='balanced')
    forest.fit(data_tr_x_fs, data_tr_y.values.ravel())

    # second column of predict_proba is used as the ranking score
    class1_proba = forest.predict_proba(data_va_x_fs)[:, 1]
    validation_auc = roc_auc_score(data_va_y, class1_proba)

    return {'loss': 1 - validation_auc, 'status': STATUS_OK}

In [None]:
filename_rf_feat_select = "rf_opt_feat_select_50"

if file_exists(filename_rf_feat_select+".sav") and file_exists(filename_rf_feat_select+"_performance"):
    print("no training and optimization needed, everything can be loaded")

    # model
    print('\n')
    print("best RF model on feature selection is:")
    loaded_model = pickle.load(open(filename_rf_feat_select+".sav", 'rb'))
    print(loaded_model)

    # performance
    perf_df = pd.read_parquet(filename_rf_feat_select + "_performance")
    print('\n')
    print(f"mean ROC-AUC across {repetitions} different seeds")
    print(f"train: {round(np.average(perf_df['auc_roc'][0]), 5)} | validation: {round(np.average(perf_df['auc_roc'][1]), 5)}, test: {round(np.average(perf_df['auc_roc'][2]), 5)}")

    # overview
    print('\n')
    print("experimentation overview:")
    # load 
    overview_df = pd.read_parquet(overview_df_filename)
    print(overview_df.to_markdown())

else:
    print("performing training and optimization")

    # hyperparameter optimization
    print("starting hyperparameter optimization")
    trials = Trials()
    best_results_rf_feat_select = fmin(rf_hyper_opt_fs, rf_hyper_space, algo=tpe.suggest, max_evals=OPT_ITERS, trials=trials)
    
    text = (
            "the best RF hyperparameters are: "
            f"n_estimators {n_estimators_ls[best_results_rf_feat_select['n_estimators']]} | "
            f"max_depth {max_depth_ls[best_results_rf_feat_select['max_depth']]} | "
            f"min_samples_leaf {min_samples_leaf_ls[best_results_rf_feat_select['min_samples_leaf']]} | "
            f"max_features {max_features_ls[best_results_rf_feat_select['max_features']]} | "
            f"min_impurity_decrease {best_results_rf_feat_select['min_impurity_decrease']}"
            )
    print(text)
    best_model = RandomForestClassifier(n_estimators=n_estimators_ls[best_results_rf_feat_select['n_estimators']],
                                        max_depth=max_depth_ls[best_results_rf_feat_select['max_depth']],
                                        min_samples_leaf=min_samples_leaf_ls[best_results_rf_feat_select['min_samples_leaf']],
                                        max_features=max_features_ls[best_results_rf_feat_select['max_features']],
                                        min_impurity_decrease=best_results_rf_feat_select['min_impurity_decrease'],
                                        n_jobs=-1, random_state=0, verbose=0, class_weight='balanced')

    best_model.fit(data_tr_x_fs, data_tr_y.values.ravel())

    # save hyperparameters
    with open(filename_rf_feat_select+"_hps", 'wb') as f:
        pickle.dump(best_results_rf_feat_select, f)
    # loadable via ...
    # with open(filename_rf_feat_select+"_hps", 'rb') as f:
    #    loaded_dict = pickle.load(f)

    # save best model
    pickle.dump(best_model, open(filename_rf_feat_select+".sav", 'wb'))
    # loadable via ...
    # best_model = pickle.load(open(filename_rf_feat_select+".sav", 'rb'))

    # repetitions for performance on different seeds
    tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc = [], [], [], [], [], [], [], [], [], []
    va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc = [], [], [], [], [], [], [], [], [], []
    te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc = [], [], [], [], [], [], [], [], [], []

    tr_lst = [tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc]
    va_lst = [va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc]
    te_lst = [te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]

    print("performing repetitions on different seeds")
    for i in range(repetitions):
        
        # first replicate model with initial seed
        if i == 0:
            seed = 0
        else:
            seed = np.random.randint(1, 999999) # all but initial random seed of 0
        
        best_model = RandomForestClassifier(n_estimators=n_estimators_ls[best_results_rf_feat_select['n_estimators']],
                                            max_depth=max_depth_ls[best_results_rf_feat_select['max_depth']],
                                            min_samples_leaf=min_samples_leaf_ls[best_results_rf_feat_select['min_samples_leaf']],
                                            max_features=max_features_ls[best_results_rf_feat_select['max_features']],
                                            min_impurity_decrease=best_results_rf_feat_select['min_impurity_decrease'],
                                            n_jobs=-1, random_state=seed, verbose=0, class_weight='balanced')
        
        best_model.fit(data_tr_x_fs, data_tr_y.values.ravel())

        # training metrics calc
        tr_pred = best_model.predict_proba(data_tr_x_fs)
        tr_metrics = list(statistical(data_tr_y, np.argmax(tr_pred, axis=1), tr_pred[:, 1]))

        # validation metric calc
        va_pred = best_model.predict_proba(data_va_x_fs)
        va_metrics = list(statistical(data_va_y, np.argmax(va_pred, axis=1), va_pred[:, 1]))

        # test metric calc
        te_pred = best_model.predict_proba(data_te_x_fs)
        te_metrics = list(statistical(data_te_y, np.argmax(te_pred, axis=1), te_pred[:, 1]))

        # creating dataframe
        for j in range(len(tr_lst)):               
            tr_lst[j].append(tr_metrics[j])
            va_lst[j].append(va_metrics[j])
            te_lst[j].append(te_metrics[j])

    metric_cls = ["tn", "fp", "fn", "tp", "se", "sp", "acc", "mcc", "auc_prc", "auc_roc"] 
    metrics_data = [["train", tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc],
                    ["validation", va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc],
                    ["test", te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]]
    rf_feat_select_perf = pd.DataFrame(metrics_data, columns = ["split"] + metric_cls)
    
    # save performance df
    rf_feat_select_perf.to_parquet(filename_rf_feat_select + "_performance", index=0)      
    # loadable via ...
    # rf_feat_select_perf = pd.read_parquet(filename_rf_feat_select + "_performance")

    # add model info 
    cols = ["avg_auc_roc", "std_auc_roc", "top_roc_auc"]
    tr_aggr = []
    va_aggr = []
    te_aggr = []
    results = [tr_aggr, va_aggr, te_aggr]

    for i in range(len(results)):
        # avg_auc_roc
        results[i].append(round(np.average(rf_feat_select_perf["auc_roc"][i]), 5))
        # std_auc_roc
        results[i].append(round(np.std(rf_feat_select_perf["auc_roc"][i]), 5))
        # top_roc_auc
        results[i].append(round(np.max(rf_feat_select_perf["auc_roc"][i]), 5))

    cls = ["model_type", "filename", "GPU_accelerator", "RAM", "data_features", "hyperparameters", "train_performance_ROC-AUC_avg/std/max", "valid_performance_ROC-AUC_avg/std/max", "test_performance_ROC-AUC_avg/std/max"]
    model_type = "RF"
    data_features = f"feature selection ({data_tr_x_fs.shape[1]} feats)"
    filename = filename_rf_feat_select
    GPU_info = !nvidia-smi -L
    GPU_accelerator = re.search(r"\: (.*?)\(", str(GPU_info)).group(1)
    RAM = f"{round(virtual_memory().total / 1e9, 2)} GB"
    hp_dict = best_model.get_params() # all fixed and optimized hyperparameters
    hyperparameters = hp_dict
    tr_performance = tr_aggr
    va_performance = va_aggr
    te_performance = te_aggr

    # does overview table exist?
    if not file_exists(overview_df_filename):
        # create dataframe with model info
        info = [[model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]]
        df = pd.DataFrame(info, columns=cls)
        # save
        df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(df.to_markdown())
    else:
        # load 
        overview_df = pd.read_parquet(overview_df_filename)

        # add row for model
        new_row = {}
        keys = cls
        values = [model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]
        for key in keys:
            for value in values:
                new_row[key] = value
                values.remove(value)
                break 
        overview_df = overview_df.append(new_row, ignore_index=True)
        overview_df["hyperparameters"] = overview_df["hyperparameters"].astype(str)
        # save new df
        overview_df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(overview_df.to_markdown())

performing training and optimization
starting hyperparameter optimization
100%|██████████| 50/50 [14:33<00:00, 17.46s/it, best loss: 0.18970917842445623]
the best RF hyperparameters are: n_estimators 10 | max_depth 10 | min_samples_leaf 3 | max_features log2 | min_impurity_decrease 0.0003677058219403621
performing repetitions on different seeds
model information added to experimentation overview
|    | model_type   | filename                   | GPU_accelerator      | RAM      | data_features                | hyperparameters                                                                                                                                                                                                                                                                                                                                                                                                               | train_performance_ROC-AUC_avg/std/max   | valid_performance_ROC-AUC_a

# XGBoost

In [None]:
# hyperparameter optimization setup for XGB
OPT_ITERS = 50    # number of hyperopt evaluations (max_evals of fmin)
repetitions = 10  # number of refits of the best model on different seeds
patience = 25     # passed as early_stopping_rounds when fitting XGBClassifier

# candidate values of every hyperparameter sampled with hp.choice();
# needed again later for building the best model
# NOTE: max_depth_ls and n_estimators_ls reuse the names from the RF setup cell
min_child_weight_ls = range(1, 5)
max_depth_ls = range(3, 12)
n_estimators_ls = [100, 200, 300, 400, 500, 1000, 1500, 2000]

# search space handed to fmin(); built from the candidate lists above
xgb_hyper_space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'gamma': hp.uniform('gamma', 0, 0.3),
    'min_child_weight': hp.choice('min_child_weight', min_child_weight_ls),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'max_depth': hp.choice('max_depth', max_depth_ls),
    'n_estimators': hp.choice('n_estimators', n_estimators_ls),
}

In [None]:
# calculate weight for positive data instances
# (pos_weight is passed to XGBClassifier as scale_pos_weight in xgb_hyper_opt)
# NOTE(review): get_pos_weight is defined elsewhere; presumably it derives the
# weight from the label distribution of the dataset -- confirm
dataset = DglGraphPropPredDataset(name = "ogbg-molhiv")
pos_weight = float(get_pos_weight(dataset))

## XGBoost (all features)

In [None]:
def xgb_hyper_opt(args):
    """Hyperopt objective for XGBoost on the full feature set.

    Builds an XGBClassifier from the sampled hyperparameters in ``args``,
    fits it on ``data_tr_x`` while monitoring AUC on the validation split
    (early stopping after ``patience`` rounds) and scores it on ``data_va_x``.

    Returns:
        dict with ``'loss'`` (1 - validation ROC-AUC) and ``'status'`` (STATUS_OK).
    """
    clf = XGBClassifier(**args, n_jobs=-1, random_state=0, scale_pos_weight=pos_weight)

    train_labels = data_tr_y.values.ravel()
    validation_set = [(data_va_x, data_va_y.values.ravel())]
    clf.fit(data_tr_x, train_labels,
            eval_metric='auc', eval_set=validation_set,
            early_stopping_rounds=patience, verbose=False)

    # column 1 of predict_proba is used as the score of the positive class
    positive_scores = clf.predict_proba(data_va_x)[:, 1]
    validation_auc = roc_auc_score(data_va_y, positive_scores)
    return {'loss': 1 - validation_auc, 'status': STATUS_OK}

In [None]:
filename_xgb_full_descr = "xgb_opt_full_descr_50"

if file_exists(filename_xgb_full_descr+".sav") and file_exists(filename_xgb_full_descr+"_performance"):
    print("no training and optimization needed, everything can be loaded")

    # model
    print('\n')
    print("best XGB model on full descriptors is:")
    loaded_model = pickle.load(open(filename_xgb_full_descr+".sav", 'rb'))
    print(loaded_model)

    # performance
    perf_df = pd.read_parquet(filename_xgb_full_descr + "_performance")
    print('\n')
    print(f"mean ROC-AUC across {repetitions} different seeds")
    print(f"train: {round(np.average(perf_df['auc_roc'][0]), 5)} | validation: {round(np.average(perf_df['auc_roc'][1]), 5)}, test: {round(np.average(perf_df['auc_roc'][2]), 5)}")

    # overview
    print('\n')
    print("experimentation overview:")
    # load 
    overview_df = pd.read_parquet(overview_df_filename)
    print(overview_df.to_markdown())

else:
    print("performing training and optimization")

    # hyperparameter optimization
    print("starting hyperparameter optimization")
    trials = Trials()
    best_results_xgb_full_descr = fmin(xgb_hyper_opt, xgb_hyper_space, algo=tpe.suggest, max_evals=OPT_ITERS, trials=trials)
    
    text = (
            "the best XGB hyperparameters are: "
            f"learning_rate {best_results_xgb_full_descr['learning_rate']} | "
            f"gamma {best_results_xgb_full_descr['gamma']} | "
            f"min_child_weight {min_child_weight_ls[best_results_xgb_full_descr['min_child_weight']]} | "
            f"subsample {best_results_xgb_full_descr['subsample']} | "
            f"colsample_bytree {best_results_xgb_full_descr['colsample_bytree']} | "
            f"max_depth {max_depth_ls[best_results_xgb_full_descr['max_depth']]} | "
            f"n_estimators {n_estimators_ls[best_results_xgb_full_descr['n_estimators']]}"
            )
    print(text)
    best_model = XGBClassifier(learning_rate = best_results_xgb_full_descr['learning_rate'],
                               gamma = best_results_xgb_full_descr['gamma'],
                               min_child_weight = min_child_weight_ls[best_results_xgb_full_descr['min_child_weight']],
                               subsample = best_results_xgb_full_descr['subsample'],
                               colsample_bytree = best_results_xgb_full_descr['colsample_bytree'],
                               max_depth = max_depth_ls[best_results_xgb_full_descr['max_depth']],
                               n_estimators = n_estimators_ls[best_results_xgb_full_descr['n_estimators']],
                               n_jobs=-1, random_state=0, scale_pos_weight=pos_weight)

    best_model.fit(data_tr_x, data_tr_y.values.ravel(),
              eval_metric='auc', eval_set=[(data_va_x, data_va_y.values.ravel())],
              early_stopping_rounds=patience, verbose=False)

    # save hyperparameters
    with open(filename_xgb_full_descr+"_hps", 'wb') as f:
        pickle.dump(best_results_xgb_full_descr, f)
    # loadable via ...
    # with open(filename_xgb_full_descr+"_hps", 'rb') as f:
    #    loaded_dict = pickle.load(f)

    # save best model
    pickle.dump(best_model, open(filename_xgb_full_descr+".sav", 'wb'))
    # loadable via ...
    # best_model = pickle.load(open(filename_xgb_full_descr+".sav", 'rb'))

    # repetitions for performance on different seeds
    tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc = [], [], [], [], [], [], [], [], [], []
    va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc = [], [], [], [], [], [], [], [], [], []
    te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc = [], [], [], [], [], [], [], [], [], []

    tr_lst = [tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc]
    va_lst = [va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc]
    te_lst = [te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]

    print("performing repetitions on different seeds")
    for i in range(repetitions):
        
        # first replicate model with initial seed
        if i == 0:
            seed = 0
        else:
            seed = np.random.randint(1, 999999) # all but initial random seed of 0
        
        best_model = XGBClassifier(learning_rate = best_results_xgb_full_descr['learning_rate'],
                               gamma = best_results_xgb_full_descr['gamma'],
                               min_child_weight = min_child_weight_ls[best_results_xgb_full_descr['min_child_weight']],
                               subsample = best_results_xgb_full_descr['subsample'],
                               colsample_bytree = best_results_xgb_full_descr['colsample_bytree'],
                               max_depth = max_depth_ls[best_results_xgb_full_descr['max_depth']],
                               n_estimators = n_estimators_ls[best_results_xgb_full_descr['n_estimators']],
                               n_jobs=-1, random_state=seed, scale_pos_weight=pos_weight)
        
        best_model.fit(data_tr_x, data_tr_y.values.ravel())

        # training metrics calc
        tr_pred = best_model.predict_proba(data_tr_x)
        tr_metrics = list(statistical(data_tr_y, np.argmax(tr_pred, axis=1), tr_pred[:, 1]))

        # validation metric calc
        va_pred = best_model.predict_proba(data_va_x)
        va_metrics = list(statistical(data_va_y, np.argmax(va_pred, axis=1), va_pred[:, 1]))

        # test metric calc
        te_pred = best_model.predict_proba(data_te_x)
        te_metrics = list(statistical(data_te_y, np.argmax(te_pred, axis=1), te_pred[:, 1]))

        # creating dataframe
        for j in range(len(tr_lst)):               
            tr_lst[j].append(tr_metrics[j])
            va_lst[j].append(va_metrics[j])
            te_lst[j].append(te_metrics[j])

    metric_cls = ["tn", "fp", "fn", "tp", "se", "sp", "acc", "mcc", "auc_prc", "auc_roc"] 
    metrics_data = [["train", tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc],
                    ["validation", va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc],
                    ["test", te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]]
    xgb_full_descr_perf = pd.DataFrame(metrics_data, columns = ["split"] + metric_cls)
    
    # save performance df
    xgb_full_descr_perf.to_parquet(filename_xgb_full_descr + "_performance", index=0)      
    # loadable via ...
    # xgb_full_descr_perf = pd.read_parquet(filename_xgb_full_descr + "_performance")

    # add model info 
    cols = ["avg_auc_roc", "std_auc_roc", "top_roc_auc"]
    tr_aggr = []
    va_aggr = []
    te_aggr = []
    results = [tr_aggr, va_aggr, te_aggr]

    for i in range(len(results)):
        # avg_auc_roc
        results[i].append(round(np.average(xgb_full_descr_perf["auc_roc"][i]), 5))
        # std_auc_roc
        results[i].append(round(np.std(xgb_full_descr_perf["auc_roc"][i]), 5))
        # top_roc_auc
        results[i].append(round(np.max(xgb_full_descr_perf["auc_roc"][i]), 5))

    cls = ["model_type", "filename", "GPU_accelerator", "RAM", "data_features", "hyperparameters", "train_performance_ROC-AUC_avg/std/max", "valid_performance_ROC-AUC_avg/std/max", "test_performance_ROC-AUC_avg/std/max"]
    model_type = "XGB"
    data_features = f"all features ({data_tr_x.shape[1]} feats)"
    filename = filename_xgb_full_descr
    GPU_info = !nvidia-smi -L
    GPU_accelerator = re.search(r"\: (.*?)\(", str(GPU_info)).group(1)
    RAM = f"{round(virtual_memory().total / 1e9, 2)} GB"
    hp_dict = best_model.get_params() # all fixed and optimized hyperparameters
    hyperparameters = hp_dict
    tr_performance = tr_aggr
    va_performance = va_aggr
    te_performance = te_aggr

    # does overview table exist?
    if not file_exists(overview_df_filename):
        # create dataframe with model info
        info = [[model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]]
        df = pd.DataFrame(info, columns=cls)
        # save
        df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(df.to_markdown())
    else:
        # load 
        overview_df = pd.read_parquet(overview_df_filename)

        # add row for model
        new_row = {}
        keys = cls
        values = [model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]
        for key in keys:
            for value in values:
                new_row[key] = value
                values.remove(value)
                break 
        overview_df = overview_df.append(new_row, ignore_index=True)
        overview_df["hyperparameters"] = overview_df["hyperparameters"].astype(str)
        # save new df
        overview_df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(overview_df.to_markdown())

performing training and optimization
starting hyperparameter optimization
100%|██████████| 50/50 [42:03<00:00, 50.46s/it, best loss: 0.18452074759945136]
the best XGB hyperparameters are: learning_rate 0.20941648886207803 | gamma 0.22369342452726626 | min_child_weight 1 | subsample 0.910169049816008 | colsample_bytree 0.9436828391115852 | max_depth 3 | n_estimators 500
performing repetitions on different seeds
model information added to experimentation overview
|    | model_type   | filename                   | GPU_accelerator      | RAM      | data_features                | hyperparameters                                                                                                                                                                                                                                                                                                                                                                                                                   

## XGBoost (feature selection)

In [None]:
def xgb_hyper_opt_fs(args):
    """Hyperopt objective for XGBoost on the feature-selected data.

    Same procedure as the full-feature objective, but trained on
    ``data_tr_x_fs`` and validated on ``data_va_x_fs``.

    Returns:
        dict with ``'loss'`` (1 - validation ROC-AUC) and ``'status'`` (STATUS_OK).
    """
    clf = XGBClassifier(**args, n_jobs=-1, random_state=0, scale_pos_weight=pos_weight)

    train_labels = data_tr_y.values.ravel()
    validation_set = [(data_va_x_fs, data_va_y.values.ravel())]
    clf.fit(data_tr_x_fs, train_labels,
            eval_metric='auc', eval_set=validation_set,
            early_stopping_rounds=patience, verbose=False)

    # column 1 of predict_proba is used as the score of the positive class
    positive_scores = clf.predict_proba(data_va_x_fs)[:, 1]
    validation_auc = roc_auc_score(data_va_y, positive_scores)
    return {'loss': 1 - validation_auc, 'status': STATUS_OK}

In [None]:
filename_xgb_feat_select = "xgb_opt_feat_select_50"

if file_exists(filename_xgb_feat_select+".sav") and file_exists(filename_xgb_feat_select+"_performance"):
    print("no training and optimization needed, everything can be loaded")

    # model
    print('\n')
    print("best XGB model on feature selection is:")
    loaded_model = pickle.load(open(filename_xgb_feat_select+".sav", 'rb'))
    print(loaded_model)

    # performance
    perf_df = pd.read_parquet(filename_xgb_feat_select + "_performance")
    print('\n')
    print(f"mean ROC-AUC across {repetitions} different seeds")
    print(f"train: {round(np.average(perf_df['auc_roc'][0]), 5)} | validation: {round(np.average(perf_df['auc_roc'][1]), 5)}, test: {round(np.average(perf_df['auc_roc'][2]), 5)}")

    # overview
    print('\n')
    print("experimentation overview:")
    # load 
    overview_df = pd.read_parquet(overview_df_filename)
    print(overview_df.to_markdown())

else:
    print("performing training and optimization")

    # hyperparameter optimization
    print("starting hyperparameter optimization")
    trials = Trials()
    best_results_xgb_feat_select = fmin(xgb_hyper_opt_fs, xgb_hyper_space, algo=tpe.suggest, max_evals=OPT_ITERS, trials=trials)
    
    text = (
            "the best XGB hyperparameters are: "
            f"learning_rate {best_results_xgb_feat_select['learning_rate']} | "
            f"gamma {best_results_xgb_feat_select['gamma']} | "
            f"min_child_weight {min_child_weight_ls[best_results_xgb_feat_select['min_child_weight']]} | "
            f"subsample {best_results_xgb_feat_select['subsample']} | "
            f"colsample_bytree {best_results_xgb_feat_select['colsample_bytree']} | "
            f"max_depth {max_depth_ls[best_results_xgb_feat_select['max_depth']]} | "
            f"n_estimators {n_estimators_ls[best_results_xgb_feat_select['n_estimators']]}"
            )
    print(text)
    best_model = XGBClassifier(learning_rate = best_results_xgb_feat_select['learning_rate'],
                               gamma = best_results_xgb_feat_select['gamma'],
                               min_child_weight = min_child_weight_ls[best_results_xgb_feat_select['min_child_weight']],
                               subsample = best_results_xgb_full_descr['subsample'],
                               colsample_bytree = best_results_xgb_feat_select['colsample_bytree'],
                               max_depth = max_depth_ls[best_results_xgb_feat_select['max_depth']],
                               n_estimators = n_estimators_ls[best_results_xgb_feat_select['n_estimators']],
                               n_jobs=-1, random_state=0, scale_pos_weight=pos_weight)

    best_model.fit(data_tr_x_fs, data_tr_y.values.ravel(),
              eval_metric='auc', eval_set=[(data_va_x_fs, data_va_y.values.ravel())],
              early_stopping_rounds=patience, verbose=False)
    
    # save hyperparameters
    with open(filename_xgb_feat_select+"_hps", 'wb') as f:
        pickle.dump(best_results_xgb_feat_select, f)
    # loadable via ...
    # with open(filename_xgb_feat_select+"_hps", 'rb') as f:
    #    loaded_dict = pickle.load(f)

    # save best model
    pickle.dump(best_model, open(filename_xgb_feat_select+".sav", 'wb'))
    # loadable via ...
    # best_model = pickle.load(open(filename_xgb_feat_select+".sav", 'rb'))

    # repetitions for performance on different seeds
    tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc = [], [], [], [], [], [], [], [], [], []
    va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc = [], [], [], [], [], [], [], [], [], []
    te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc = [], [], [], [], [], [], [], [], [], []

    tr_lst = [tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc]
    va_lst = [va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc]
    te_lst = [te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]

    print("performing repetitions on different seeds")
    for i in range(repetitions):
        
        # first replicate model with initial seed
        if i == 0:
            seed = 0
        else:
            seed = np.random.randint(1, 999999) # all but initial random seed of 0
        
        best_model = XGBClassifier(learning_rate = best_results_xgb_feat_select['learning_rate'],
                               gamma = best_results_xgb_feat_select['gamma'],
                               min_child_weight = min_child_weight_ls[best_results_xgb_feat_select['min_child_weight']],
                               subsample = best_results_xgb_feat_select['subsample'],
                               colsample_bytree = best_results_xgb_feat_select['colsample_bytree'],
                               max_depth = max_depth_ls[best_results_xgb_feat_select['max_depth']],
                               n_estimators = n_estimators_ls[best_results_xgb_feat_select['n_estimators']],
                               n_jobs=-1, random_state=seed, scale_pos_weight=pos_weight)
            
        best_model.fit(data_tr_x_fs, data_tr_y.values.ravel(),
                eval_metric='auc', eval_set=[(data_va_x_fs, data_va_y.values.ravel())],
                early_stopping_rounds=patience, verbose=False)
        
        # training metrics calc
        tr_pred = best_model.predict_proba(data_tr_x_fs)
        tr_metrics = list(statistical(data_tr_y, np.argmax(tr_pred, axis=1), tr_pred[:, 1]))

        # validation metric calc
        va_pred = best_model.predict_proba(data_va_x_fs)
        va_metrics = list(statistical(data_va_y, np.argmax(va_pred, axis=1), va_pred[:, 1]))

        # test metric calc
        te_pred = best_model.predict_proba(data_te_x_fs)
        te_metrics = list(statistical(data_te_y, np.argmax(te_pred, axis=1), te_pred[:, 1]))

        # creating dataframe
        for j in range(len(tr_lst)):               
            tr_lst[j].append(tr_metrics[j])
            va_lst[j].append(va_metrics[j])
            te_lst[j].append(te_metrics[j])

    metric_cls = ["tn", "fp", "fn", "tp", "se", "sp", "acc", "mcc", "auc_prc", "auc_roc"] 
    metrics_data = [["train", tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc],
                    ["validation", va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc],
                    ["test", te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]]
    xgb_feat_select_perf = pd.DataFrame(metrics_data, columns = ["split"] + metric_cls)
    
    # save performance df
    xgb_feat_select_perf.to_parquet(filename_xgb_feat_select + "_performance", index=0)      
    # loadable via ...
    # xgb_feat_select_perf = pd.read_parquet(filename_xgb_feat_select + "_performance")

    # add model info 
    cols = ["avg_auc_roc", "std_auc_roc", "top_roc_auc"]
    tr_aggr = []
    va_aggr = []
    te_aggr = []
    results = [tr_aggr, va_aggr, te_aggr]

    for i in range(len(results)):
        # avg_auc_roc
        results[i].append(round(np.average(xgb_feat_select_perf["auc_roc"][i]), 5))
        # std_auc_roc
        results[i].append(round(np.std(xgb_feat_select_perf["auc_roc"][i]), 5))
        # top_roc_auc
        results[i].append(round(np.max(xgb_feat_select_perf["auc_roc"][i]), 5))

    cls = ["model_type", "filename", "GPU_accelerator", "RAM", "data_features", "hyperparameters", "train_performance_ROC-AUC_avg/std/max", "valid_performance_ROC-AUC_avg/std/max", "test_performance_ROC-AUC_avg/std/max"]
    model_type = "XGB"
    data_features = f"feature selection ({data_tr_x_fs.shape[1]} feats)"
    filename = filename_xgb_feat_select
    GPU_info = !nvidia-smi -L
    GPU_accelerator = re.search(r"\: (.*?)\(", str(GPU_info)).group(1)
    RAM = f"{round(virtual_memory().total / 1e9, 2)} GB"
    hp_dict = best_model.get_params() # all fixed and optimized hyperparameters
    hyperparameters = hp_dict
    tr_performance = tr_aggr
    va_performance = va_aggr
    te_performance = te_aggr

    # does overview table exist?
    if not file_exists(overview_df_filename):
        # create dataframe with model info
        info = [[model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]]
        df = pd.DataFrame(info, columns=cls)
        # save
        df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(df.to_markdown())
    else:
        # load 
        overview_df = pd.read_parquet(overview_df_filename)

        # add row for model
        new_row = {}
        keys = cls
        values = [model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]
        for key in keys:
            for value in values:
                new_row[key] = value
                values.remove(value)
                break 
        overview_df = overview_df.append(new_row, ignore_index=True)
        overview_df["hyperparameters"] = overview_df["hyperparameters"].astype(str)
        # save new df
        overview_df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(overview_df.to_markdown())

performing training and optimization
starting hyperparameter optimization
100%|██████████| 50/50 [13:08<00:00, 15.76s/it, best loss: 0.19903273809523814]
the best XGB hyperparameters are: learning_rate 0.1246187700498615 | gamma 0.1982639800431708 | min_child_weight 4 | subsample 0.7215172738554442 | colsample_bytree 0.7144328026415806 | max_depth 3 | n_estimators 300
performing repetitions on different seeds
model information added to experimentation overview
|    | model_type   | filename                   | GPU_accelerator      | RAM      | data_features                | hyperparameters                                                                                                                                                                                                                                                                                                                                                                                                                    

# MLP

In [None]:
# hyperparameter optimization setup for MLP
OPT_ITERS = 50      # number of evaluations handed to fmin() as max_evals
repetitions = 10    # number of re-trainings of the best configuration on different seeds
patience_mlp = 50   # passed to MLPClassifier as n_iter_no_change
epochs = 300
max_iter_mlp = epochs  # passed to MLPClassifier as max_iter
batch_size = 128    # NOTE(review): not passed to MLPClassifier in the cells visible here - confirm whether it is used

# option lists for all hyperparameters sampled with hp.choice();
# the cells below index into these lists with the values returned by fmin(),
# so they are defined exactly once and shared between the search space and the decoding step
learning_rate_init_ls = [10 ** -2.5, 10 ** -3.5, 10 ** -1.5]
hidden_layer_sizes_ls = [(50, 50), (100, 100), (100, 100, 100), (250, 250, 250), (500, 500, 500)]

mlp_hyper_space = {'learning_rate_init': hp.choice('learning_rate_init', learning_rate_init_ls),
                   'alpha': hp.uniform('alpha', 0, 0.01), # l2 regularization
                   'hidden_layer_sizes': hp.choice('hidden_layer_sizes', hidden_layer_sizes_ls)}
# keep activation default (relu)
# keep solver default at adam

## MLP (all features)

In [None]:
def mlp_hyper_opt(args):
    """Hyperopt objective for the MLP on the full feature set.

    Builds an MLPClassifier from the sampled hyperparameters in ``args``
    (with ``max_iter_mlp`` iterations at most and ``patience_mlp`` as
    ``n_iter_no_change``), fits it on ``data_tr_x`` and scores it on ``data_va_x``.

    Returns:
        dict with ``'loss'`` (1 - validation ROC-AUC) and ``'status'`` (STATUS_OK).
    """
    clf = MLPClassifier(**args, max_iter=max_iter_mlp, random_state=0, n_iter_no_change=patience_mlp)
    clf.fit(data_tr_x, data_tr_y.values.ravel())

    # column 1 of predict_proba is used as the score of the positive class
    positive_scores = clf.predict_proba(data_va_x)[:, 1]
    validation_auc = roc_auc_score(data_va_y, positive_scores)
    return {'loss': 1 - validation_auc, 'status': STATUS_OK}

In [None]:
filename_mlp_full_descr = "mlp_opt_full_descr_50"

if file_exists(filename_mlp_full_descr+".sav") and file_exists(filename_mlp_full_descr+"_performance"):
    print("no training and optimization needed, everything can be loaded")

    # model
    print('\n')
    print("best MLP model on full descriptors is:")
    loaded_model = pickle.load(open(filename_mlp_full_descr+".sav", 'rb'))
    print(loaded_model)

    # performance
    perf_df = pd.read_parquet(filename_mlp_full_descr + "_performance")
    print('\n')
    print(f"mean ROC-AUC across {repetitions} different seeds")
    print(f"train: {round(np.average(perf_df['auc_roc'][0]), 5)} | validation: {round(np.average(perf_df['auc_roc'][1]), 5)}, test: {round(np.average(perf_df['auc_roc'][2]), 5)}")

    # overview
    print('\n')
    print("experimentation overview:")
    # load 
    overview_df = pd.read_parquet(overview_df_filename)
    print(overview_df.to_markdown())

else:
    print("performing training and optimization")

    # hyperparameter optimization
    print("starting hyperparameter optimization")
    trials = Trials()
    best_results_mlp_full_descr = fmin(mlp_hyper_opt, mlp_hyper_space, algo=tpe.suggest, max_evals=OPT_ITERS, trials=trials)
    
    text = (
            "the best MLP hyperparameters are: "
            f"learning_rate_init {learning_rate_init_ls[best_results_mlp_full_descr['learning_rate_init']]} | "
            f"l2 {best_results_mlp_full_descr['alpha']} | "
            f"hidden_layer_sizes {hidden_layer_sizes_ls[best_results_mlp_full_descr['hidden_layer_sizes']]}"
            )
    print(text)
    best_model = MLPClassifier(learning_rate_init= learning_rate_init_ls[best_results_mlp_full_descr['learning_rate_init']],
                               alpha=best_results_mlp_full_descr['alpha'],
                               hidden_layer_sizes = hidden_layer_sizes_ls[best_results_mlp_full_descr['hidden_layer_sizes']],
                               max_iter=max_iter_mlp, random_state=0, n_iter_no_change=patience_mlp)

    # best_model.fit(data_tr_x.to_numpy().astype(float), data_tr_y.values.ravel().astype(int))
    best_model.fit(data_tr_x, data_tr_y.values.ravel())

    # save hyperparameters
    with open(filename_mlp_full_descr+"_hps", 'wb') as f:
        pickle.dump(best_results_mlp_full_descr, f)
    # loadable via ...
    # with open(filename_mlp_full_descr+"_hps", 'rb') as f:
    #    loaded_dict = pickle.load(f)

    # save best model
    pickle.dump(best_model, open(filename_mlp_full_descr+".sav", 'wb'))
    # loadable via ...
    # best_model = pickle.load(open(filename_mlp_full_descr+".sav", 'rb'))

    # repetitions for performance on different seeds
    tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc = [], [], [], [], [], [], [], [], [], []
    va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc = [], [], [], [], [], [], [], [], [], []
    te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc = [], [], [], [], [], [], [], [], [], []

    tr_lst = [tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc]
    va_lst = [va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc]
    te_lst = [te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]

    print("performing repetitions on different seeds")
    for i in range(repetitions):
        
        # first replicate model with initial seed
        if i == 0:
            seed = 0
        else:
            seed = np.random.randint(1, 999999) # all but initial random seed of 0
        
        best_model = MLPClassifier(learning_rate_init= learning_rate_init_ls[best_results_mlp_full_descr['learning_rate_init']],
                               alpha=best_results_mlp_full_descr['alpha'],
                               hidden_layer_sizes = hidden_layer_sizes_ls[best_results_mlp_full_descr['hidden_layer_sizes']],
                               max_iter=max_iter_mlp, random_state=seed, n_iter_no_change=patience_mlp)
        
        best_model.fit(data_tr_x, data_tr_y.values.ravel())

        # training metrics calc
        tr_pred = best_model.predict_proba(data_tr_x)
        tr_metrics = list(statistical(data_tr_y, np.argmax(tr_pred, axis=1), tr_pred[:, 1]))

        # validation metric calc
        va_pred = best_model.predict_proba(data_va_x)
        va_metrics = list(statistical(data_va_y, np.argmax(va_pred, axis=1), va_pred[:, 1]))

        # test metric calc
        te_pred = best_model.predict_proba(data_te_x)
        te_metrics = list(statistical(data_te_y, np.argmax(te_pred, axis=1), te_pred[:, 1]))

        # creating dataframe
        for j in range(len(tr_lst)):               
            tr_lst[j].append(tr_metrics[j])
            va_lst[j].append(va_metrics[j])
            te_lst[j].append(te_metrics[j])

    metric_cls = ["tn", "fp", "fn", "tp", "se", "sp", "acc", "mcc", "auc_prc", "auc_roc"] 
    metrics_data = [["train", tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc],
                    ["validation", va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc],
                    ["test", te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]]
    mlp_full_descr_perf = pd.DataFrame(metrics_data, columns = ["split"] + metric_cls)
    
    # save performance df
    mlp_full_descr_perf.to_parquet(filename_mlp_full_descr + "_performance", index=0)      
    # loadable via ...
    # mlp_full_descr_perf = pd.read_parquet(filename_mlp_full_descr + "_performance")

    # add model info 
    cols = ["avg_auc_roc", "std_auc_roc", "top_roc_auc"]
    tr_aggr = []
    va_aggr = []
    te_aggr = []
    results = [tr_aggr, va_aggr, te_aggr]

    for i in range(len(results)):
        # avg_auc_roc
        results[i].append(round(np.average(mlp_full_descr_perf["auc_roc"][i]), 5))
        # std_auc_roc
        results[i].append(round(np.std(mlp_full_descr_perf["auc_roc"][i]), 5))
        # top_roc_auc
        results[i].append(round(np.max(mlp_full_descr_perf["auc_roc"][i]), 5))

    cls = ["model_type", "filename", "GPU_accelerator", "RAM", "data_features", "hyperparameters", "train_performance_ROC-AUC_avg/std/max", "valid_performance_ROC-AUC_avg/std/max", "test_performance_ROC-AUC_avg/std/max"]
    model_type = "MLP"
    data_features = f"all features ({data_tr_x.shape[1]} feats)"
    filename = filename_mlp_full_descr
    GPU_info = !nvidia-smi -L
    GPU_accelerator = re.search(r"\: (.*?)\(", str(GPU_info)).group(1)
    RAM = f"{round(virtual_memory().total / 1e9, 2)} GB"
    hp_dict = best_model.get_params() # all fixed and optimized hyperparameters
    hyperparameters = hp_dict
    tr_performance = tr_aggr
    va_performance = va_aggr
    te_performance = te_aggr

    # does overview table exist?
    if not file_exists(overview_df_filename):
        # create dataframe with model info
        info = [[model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]]
        df = pd.DataFrame(info, columns=cls)
        # save
        df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(df.to_markdown())
    else:
        # load 
        overview_df = pd.read_parquet(overview_df_filename)

        # add row for model
        new_row = {}
        keys = cls
        values = [model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]
        for key in keys:
            for value in values:
                new_row[key] = value
                values.remove(value)
                break 
        overview_df = overview_df.append(new_row, ignore_index=True)
        overview_df["hyperparameters"] = overview_df["hyperparameters"].astype(str)
        # save new df
        overview_df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(overview_df.to_markdown())

performing training and optimization
starting hyperparameter optimization
  2%|▏         | 1/50 [03:05<2:31:22, 185.36s/it, best loss: 0.2796639231824417]




  4%|▍         | 2/50 [04:27<1:39:50, 124.80s/it, best loss: 0.2749393738977074]




 18%|█▊        | 9/50 [57:41<5:33:41, 488.32s/it, best loss: 0.23312573486184596]




 24%|██▍       | 12/50 [1:11:29<3:50:36, 364.12s/it, best loss: 0.23295273613560652]




 26%|██▌       | 13/50 [1:15:38<3:23:08, 329.42s/it, best loss: 0.2188326719576721] 




 38%|███▊      | 19/50 [1:44:41<2:05:49, 243.52s/it, best loss: 0.20994390554575737]




 68%|██████▊   | 34/50 [4:02:21<1:21:00, 303.75s/it, best loss: 0.20994390554575737]




 74%|███████▍  | 37/50 [4:08:44<42:56, 198.21s/it, best loss: 0.20994390554575737]





 92%|█████████▏| 46/50 [6:55:17<1:05:44, 986.09s/it, best loss: 0.20994390554575737] 




 96%|█████████▌| 48/50 [7:29:22<30:47, 923.93s/it, best loss: 0.20994390554575737]   




100%|██████████| 50/50 [7:32:16<00:00, 542.74s/it, best loss: 0.20994390554575737]
the best MLP hyperparameters are: learning_rate_init 0.0031622776601683794 | l2 0.0002746800923919801 | hidden_layer_sizes (100, 100, 100)
performing repetitions on different seeds
model information added to experimentation overview
|    | model_type   | filename                   | GPU_accelerator      | RAM      | data_features                | hyperparameters                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     | train_performance_ROC-AUC_avg/std/

## MLP (feature selection)

In [None]:
def mlp_hyper_opt_fs(args):
    """Hyperopt objective for the MLP trained on the feature-selected inputs.

    Builds an ``MLPClassifier`` from the hyperparameter dict ``args`` (plus the
    notebook-level ``max_iter_mlp`` / ``patience_mlp`` settings and a fixed
    ``random_state`` of 0), fits it on ``data_tr_x_fs`` / ``data_tr_y`` and
    scores it on ``data_va_x_fs`` / ``data_va_y``.

    Returns:
        dict with ``'loss'`` set to ``1 - ROC-AUC`` on the validation split and
        ``'status'`` set to ``STATUS_OK``.
    """
    clf = MLPClassifier(
        **args,
        max_iter=max_iter_mlp,
        random_state=0,
        n_iter_no_change=patience_mlp,
    )
    clf.fit(data_tr_x_fs, data_tr_y.values.ravel())

    # the second column of predict_proba is used as the ranking score for ROC-AUC
    va_scores = clf.predict_proba(data_va_x_fs)[:, 1]
    va_auc = roc_auc_score(data_va_y, va_scores)

    # the optimizer minimizes, so the AUC is turned into a loss
    return {'loss': 1 - va_auc, 'status': STATUS_OK}

In [None]:
filename_mlp_full_descr = "mlp_opt_feat_select_50"

if file_exists(filename_mlp_full_descr+".sav") and file_exists(filename_mlp_full_descr+"_performance"):
    print("no training and optimization needed, everything can be loaded")

    # model
    print('\n')
    print("best MLP model on feature selection is:")
    loaded_model = pickle.load(open(filename_mlp_full_descr+".sav", 'rb'))
    print(loaded_model)

    # performance
    perf_df = pd.read_parquet(filename_mlp_full_descr + "_performance")
    print('\n')
    print(f"mean ROC-AUC across {repetitions} different seeds")
    print(f"train: {round(np.average(perf_df['auc_roc'][0]), 5)} | validation: {round(np.average(perf_df['auc_roc'][1]), 5)}, test: {round(np.average(perf_df['auc_roc'][2]), 5)}")

    # overview
    print('\n')
    print("experimentation overview:")
    # load 
    overview_df = pd.read_parquet(overview_df_filename)
    print(overview_df.to_markdown())

else:
    print("performing training and optimization")

    # hyperparameter optimization
    print("starting hyperparameter optimization")
    trials = Trials()
    best_results_mlp_feat_select = fmin(mlp_hyper_opt_fs, mlp_hyper_space, algo=tpe.suggest, max_evals=OPT_ITERS, trials=trials)
    
    text = (
            "the best MLP hyperparameters are: "
            f"learning_rate_init {learning_rate_init_ls[best_results_mlp_feat_select['learning_rate_init']]} | "
            f"l2 {best_results_mlp_feat_select['alpha']} | "
            f"hidden_layer_sizes {hidden_layer_sizes_ls[best_results_mlp_feat_select['hidden_layer_sizes']]}"
            )
    print(text)
    best_model = MLPClassifier(learning_rate_init= learning_rate_init_ls[best_results_mlp_feat_select['learning_rate_init']],
                               alpha=best_results_mlp_feat_select['alpha'],
                               hidden_layer_sizes = hidden_layer_sizes_ls[best_results_mlp_feat_select['hidden_layer_sizes']],
                               max_iter=max_iter_mlp, random_state=0, n_iter_no_change=patience_mlp)

    # best_model.fit(data_tr_x.to_numpy().astype(float), data_tr_y.values.ravel().astype(int))
    best_model.fit(data_tr_x_fs, data_tr_y.values.ravel())

    # save hyperparameters
    with open(filename_mlp_full_descr+"_hps", 'wb') as f:
        pickle.dump(best_results_mlp_feat_select, f)
    # loadable via ...
    # with open(filename_mlp_full_descr+"_hps", 'rb') as f:
    #    loaded_dict = pickle.load(f)

    # save best model
    pickle.dump(best_model, open(filename_mlp_full_descr+".sav", 'wb'))
    # loadable via ...
    # best_model = pickle.load(open(filename_mlp_full_descr+".sav", 'rb'))

    # repetitions for performance on different seeds
    tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc = [], [], [], [], [], [], [], [], [], []
    va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc = [], [], [], [], [], [], [], [], [], []
    te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc = [], [], [], [], [], [], [], [], [], []

    tr_lst = [tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc]
    va_lst = [va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc]
    te_lst = [te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]

    print("performing repetitions on different seeds")
    for i in range(repetitions):
        
        # first replicate model with initial seed
        if i == 0:
            seed = 0
        else:
            seed = np.random.randint(1, 999999) # all but initial random seed of 0
        
        best_model = MLPClassifier(learning_rate_init= learning_rate_init_ls[best_results_mlp_feat_select['learning_rate_init']],
                               alpha=best_results_mlp_feat_select['alpha'],
                               hidden_layer_sizes = hidden_layer_sizes_ls[best_results_mlp_feat_select['hidden_layer_sizes']],
                               max_iter=max_iter_mlp, random_state=seed, n_iter_no_change=patience_mlp)
        
        best_model.fit(data_tr_x_fs, data_tr_y.values.ravel())

        # training metrics calc
        tr_pred = best_model.predict_proba(data_tr_x_fs)
        tr_metrics = list(statistical(data_tr_y, np.argmax(tr_pred, axis=1), tr_pred[:, 1]))

        # validation metric calc
        va_pred = best_model.predict_proba(data_va_x_fs)
        va_metrics = list(statistical(data_va_y, np.argmax(va_pred, axis=1), va_pred[:, 1]))

        # test metric calc
        te_pred = best_model.predict_proba(data_te_x_fs)
        te_metrics = list(statistical(data_te_y, np.argmax(te_pred, axis=1), te_pred[:, 1]))

        # creating dataframe
        for j in range(len(tr_lst)):               
            tr_lst[j].append(tr_metrics[j])
            va_lst[j].append(va_metrics[j])
            te_lst[j].append(te_metrics[j])

    metric_cls = ["tn", "fp", "fn", "tp", "se", "sp", "acc", "mcc", "auc_prc", "auc_roc"] 
    metrics_data = [["train", tr_tns, tr_fps, tr_fns, tr_tp, tr_se, tr_sp, tr_acc, tr_mcc, tr_auc_prc, tr_auc_roc],
                    ["validation", va_tns, va_fps, va_fns, va_tp, va_se, va_sp, va_acc, va_mcc, va_auc_prc, va_auc_roc],
                    ["test", te_tns, te_fps, te_fns, te_tp, te_se, te_sp, te_acc, te_mcc, te_auc_prc, te_auc_roc]]
    mlp_feat_select_perf = pd.DataFrame(metrics_data, columns = ["split"] + metric_cls)
    
    # save performance df
    mlp_feat_select_perf.to_parquet(filename_mlp_full_descr + "_performance", index=0)      
    # loadable via ...
    # mlp_feat_select_perf = pd.read_parquet(filename_mlp_full_descr + "_performance")

    # add model info 
    cols = ["avg_auc_roc", "std_auc_roc", "top_roc_auc"]
    tr_aggr = []
    va_aggr = []
    te_aggr = []
    results = [tr_aggr, va_aggr, te_aggr]

    for i in range(len(results)):
        # avg_auc_roc
        results[i].append(round(np.average(mlp_feat_select_perf["auc_roc"][i]), 5))
        # std_auc_roc
        results[i].append(round(np.std(mlp_feat_select_perf["auc_roc"][i]), 5))
        # top_roc_auc
        results[i].append(round(np.max(mlp_feat_select_perf["auc_roc"][i]), 5))

    cls = ["model_type", "filename", "GPU_accelerator", "RAM", "data_features", "hyperparameters", "train_performance_ROC-AUC_avg/std/max", "valid_performance_ROC-AUC_avg/std/max", "test_performance_ROC-AUC_avg/std/max"]
    model_type = "MLP"
    data_features = f"feature selection ({data_tr_x_fs.shape[1]} feats)"
    filename = filename_mlp_full_descr
    GPU_info = !nvidia-smi -L
    GPU_accelerator = re.search(r"\: (.*?)\(", str(GPU_info)).group(1)
    RAM = f"{round(virtual_memory().total / 1e9, 2)} GB"
    hp_dict = best_model.get_params() # all fixed and optimized hyperparameters
    hyperparameters = hp_dict
    tr_performance = tr_aggr
    va_performance = va_aggr
    te_performance = te_aggr

    # does overview table exist?
    if not file_exists(overview_df_filename):
        # create dataframe with model info
        info = [[model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]]
        df = pd.DataFrame(info, columns=cls)
        # save
        df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(df.to_markdown())
    else:
        # load 
        overview_df = pd.read_parquet(overview_df_filename)

        # add row for model
        new_row = {}
        keys = cls
        values = [model_type, filename, GPU_accelerator, RAM, data_features, hyperparameters, tr_performance, va_performance, te_performance]
        for key in keys:
            for value in values:
                new_row[key] = value
                values.remove(value)
                break 
        overview_df = overview_df.append(new_row, ignore_index=True)
        overview_df["hyperparameters"] = overview_df["hyperparameters"].astype(str)
        # save new df
        overview_df.to_parquet(overview_df_filename, index=0)
        print("model information added to experimentation overview")
        print(overview_df.to_markdown())

performing training and optimization
starting hyperparameter optimization
  2%|▏         | 1/50 [01:32<1:15:47, 92.80s/it, best loss: 0.2931210807368215]




  4%|▍         | 2/50 [03:32<1:26:39, 108.33s/it, best loss: 0.2931210807368215]




 10%|█         | 5/50 [10:51<1:37:53, 130.52s/it, best loss: 0.25475210660395853]




 12%|█▏        | 6/50 [24:18<4:24:31, 360.72s/it, best loss: 0.25475210660395853]




 16%|█▌        | 8/50 [54:10<6:45:31, 579.32s/it, best loss: 0.23420965608465616]




 28%|██▊       | 14/50 [3:37:35<9:12:14, 920.41s/it, best loss: 0.23420965608465616]  





 58%|█████▊    | 29/50 [4:22:45<31:26, 89.81s/it, best loss: 0.21357075494806976]




 64%|██████▍   | 32/50 [4:26:23<24:56, 83.11s/it, best loss: 0.18814759700176364]




 70%|███████   | 35/50 [4:32:47<27:11, 108.78s/it, best loss: 0.18814759700176364]




 80%|████████  | 40/50 [5:53:34<2:46:25, 998.56s/it, best loss: 0.18814759700176364] 




 84%|████████▍ | 42/50 [6:24:19<2:17:51, 1033.91s/it, best loss: 0.18814759700176364]




 90%|█████████ | 45/50 [6:30:12<36:22, 436.54s/it, best loss: 0.18814759700176364]





 98%|█████████▊| 49/50 [7:57:47<11:37, 697.71s/it, best loss: 0.18814759700176364]




100%|██████████| 50/50 [8:15:56<00:00, 595.14s/it, best loss: 0.18814759700176364]
the best MLP hyperparameters are: learning_rate_init 0.03162277660168379 | l2 0.004791443737261613 | hidden_layer_sizes (100, 100)
performing repetitions on different seeds
model information added to experimentation overview
|    | model_type   | filename                   | GPU_accelerator      | RAM      | data_features                | hyperparameters                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     | train_performance_ROC-AUC_avg/std/max   | 