# Combined Imports

In [2]:
import sys
# Drop any cached copies of the project-local modules so that the imports
# below execute them again and pick up edits made since the kernel started.
# TODO: make this reload optional (e.g. behind a flag, or use %autoreload).
for _stale_module in ('utils', 'const'):
    # pop() with a default is a no-op when the module was never imported.
    sys.modules.pop(_stale_module, None)

import const
from utils import Utility
from utils import OptimizerModel
from utils import CNN_FINAL
from utils import MLP_FINAL

import maldi_learn.utilities as ml_utilities
import maldi_learn.driams as ml_driams
import maldi_learn.filters as ml_filters
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
from os.path import exists
import optuna

from sklearn import metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

import seaborn as sns

import logging
from optuna.samplers import RandomSampler

import json

# Preprocessing

## Parameters

In [100]:
# Experiment configuration shared by all later cells.
optim_algo = 'aucroc'

# Species / antibiotic under study. '*' is handed to the loader for the
# combined dataset (presumably a wildcard for "all species" - confirm in utils).
bacterial_species = const.SPECIES_ECOLI
bacterial_species_all = '*'
predicted_antibiotics_loaded = [const.ANTIBIOTIC_CEFTRIAXONE]
predicted_antibiotic = const.ANTIBIOTIC_CEFTRIAXONE

# Binning mode and the matching number of bins per spectrum.
binning = const.BINNING_6K
n_bins = 6000 if binning == const.BINNING_6K else 18000

driams_dataset_label = const.DATASET_ALL

## Loading & Binning & Train Test Splits

In [101]:
# DRIAMS A: configured species and antibiotic, years 2015-2018.
driams_a_years = ['2015', '2016', '2017', '2018']
driams_dataset = Utility.load_data(
    bacterial_species, predicted_antibiotics_loaded, 'DRIAMS_A', driams_a_years, binning)

In [32]:
# DRIAMS A again, this time with the species selector `bacterial_species_all`.
driams_all_years = ['2015', '2016', '2017', '2018']
driams_dataset_all = Utility.load_data(
    bacterial_species_all, predicted_antibiotics_loaded, 'DRIAMS_A', driams_all_years, binning)

In [33]:
# Remove S. aureus species from the combined dataset, for training on multiple species.
saureus_name = 'Staphylococcus aureus'

# Positions of the S. aureus rows, highest first, so that popping one spectrum
# never shifts the position of a spectrum that still has to be removed.
index_saureus = [
    position
    for position, species in enumerate(driams_dataset_all.y['species'])
    if species == saureus_name
][::-1]

# Drop the metadata rows first, then the spectra at the recorded positions.
driams_dataset_all.y = driams_dataset_all.y.loc[
    driams_dataset_all.y['species'] != saureus_name]
for position in index_saureus:
    driams_dataset_all.X.pop(position)

In [5]:
# DRIAMS B (2018 only) for the configured species and antibiotic.
driams_dataset_b = Utility.load_data(
    bacterial_species,
    predicted_antibiotics_loaded,
    'DRIAMS_B',
    ['2018'],
    binning,
)

In [6]:
# DRIAMS C (2018 only) for the configured species and antibiotic.
driams_dataset_c = Utility.load_data(
    bacterial_species,
    predicted_antibiotics_loaded,
    'DRIAMS_C',
    ['2018'],
    binning,
)

In [36]:
# Combine Multiple DRIAMS Datasets
def _spectra_to_array(dataset, binning_mode):
    """Stack the spectra in ``dataset.X`` into one numpy array (first axis = spectrum)."""
    if binning_mode == const.BINNING_18K:
        # In this mode the entries of dataset.X are used as they are.
        return np.asarray([spectrum for spectrum in dataset.X])
    # Otherwise each entry exposes its values through ``.intensities``.
    return np.asarray([spectrum.intensities for spectrum in dataset.X])


def _add_samples(X_target, y_target, X_source, y_source, indices, suffix):
    """Copy the rows listed in ``indices`` into the target dicts under ``'<index><suffix>'`` keys."""
    for sample_index in indices:
        key = str(sample_index) + suffix
        X_target[key] = X_source[sample_index]
        y_target[key] = y_source[sample_index]


def combined_split_train_test(seed, driams_a, driams_b, driams_c, antibiotic, binning_mode=None):
    """Build one pooled training set from three datasets and a test set from the first.

    ``driams_a`` is split with ``case_based_stratification``; ``driams_b`` and
    ``driams_c`` are split with ``stratify_by_species_and_label``. The training
    partitions of all three are pooled. Only the test partition of ``driams_a``
    is returned; the test partitions of the other two are discarded.

    Parameters
    ----------
    seed
        ``random_state`` passed to all three stratification calls.
    driams_a, driams_b, driams_c
        Dataset objects providing ``.X``, ``.y`` and ``.to_numpy(antibiotic)``.
    antibiotic
        Antibiotic whose labels are extracted and stratified on.
    binning_mode
        Binning constant deciding how spectra are converted to arrays.
        Defaults to the notebook-level ``binning`` variable (original behaviour).

    Returns
    -------
    tuple of four dicts
        ``(X_train, y_train, X_test, y_test)``. Keys are the sample index plus a
        dataset suffix (``'a'``, ``'b'`` or ``'c'``). The training keys are
        shuffled with a generator seeded by ``const.RANDOM_STATE`` (not by
        ``seed``); the test keys keep their original order.
    """
    if binning_mode is None:
        binning_mode = binning

    train_index_a, test_index_a = ml_utilities.case_based_stratification(
        driams_a.y, antibiotic=antibiotic, random_state=seed)
    # Only the training partitions of the second and third dataset are used.
    train_index_b, _ = ml_utilities.stratify_by_species_and_label(
        driams_b.y, antibiotic=antibiotic, random_state=seed)
    train_index_c, _ = ml_utilities.stratify_by_species_and_label(
        driams_c.y, antibiotic=antibiotic, random_state=seed)

    y_driamsa = driams_a.to_numpy(antibiotic)
    X_driamsa = _spectra_to_array(driams_a, binning_mode)
    y_driamsb = driams_b.to_numpy(antibiotic)
    X_driamsb = _spectra_to_array(driams_b, binning_mode)
    y_driamsc = driams_c.to_numpy(antibiotic)
    X_driamsc = _spectra_to_array(driams_c, binning_mode)

    X_train_pool = {}
    y_train_pool = {}
    X_test_combined = {}
    y_test_combined = {}

    # Insertion order (a-train, b-train, c-train) matters: it is the order the
    # seeded shuffle below starts from.
    _add_samples(X_train_pool, y_train_pool, X_driamsa, y_driamsa, train_index_a, 'a')
    _add_samples(X_test_combined, y_test_combined, X_driamsa, y_driamsa, test_index_a, 'a')
    _add_samples(X_train_pool, y_train_pool, X_driamsb, y_driamsb, train_index_b, 'b')
    _add_samples(X_train_pool, y_train_pool, X_driamsc, y_driamsc, train_index_c, 'c')

    # Reshuffle the pooled training samples so the datasets are interleaved.
    train_keys = list(X_train_pool.keys())
    random.Random(const.RANDOM_STATE).shuffle(train_keys)
    X_train_combined = {key: X_train_pool[key] for key in train_keys}
    y_train_combined = {key: y_train_pool[key] for key in train_keys}

    return X_train_combined, y_train_combined, X_test_combined, y_test_combined

In [None]:
# One train/test split per seed; same seeds as Weis et al.
randseeds = [164, 172, 188, 270, 344, 35, 409, 480, 545, 89]

# False: split `driams_dataset` alone via Utility.split_train_test.
# True:  pool several datasets via combined_split_train_test.
combinesoxa_flag = False

X_train_arr = []
y_train_arr = []
X_test_arr = []
y_test_arr = []

for i in randseeds:
    if combinesoxa_flag:
        # NOTE(review): `driams_dataset_all` is passed in the second dataset slot
        # and `driams_dataset_b` is not used anywhere in this cell - confirm intent.
        X_train_c, y_train_c, X_test_c, y_test_c = combined_split_train_test(
            i, driams_dataset, driams_dataset_all, driams_dataset_c, predicted_antibiotic)
        # The combined split returns dicts keyed by sample id; keep the values only.
        X_train_arr.append(list(X_train_c.values()))
        y_train_arr.append(list(y_train_c.values()))
        X_test_arr.append(list(X_test_c.values()))
        y_test_arr.append(list(y_test_c.values()))
    else:
        # The single-dataset split returns containers indexed by antibiotic.
        X_train_c, y_train_c, X_test_c, y_test_c = Utility.split_train_test(
            i, driams_dataset_label, binning, predicted_antibiotics_loaded, driams_dataset)
        X_train_arr.append(X_train_c[predicted_antibiotic])
        y_train_arr.append(y_train_c[predicted_antibiotic])
        X_test_arr.append(X_test_c[predicted_antibiotic])
        y_test_arr.append(y_test_c[predicted_antibiotic])

## Initialization for ML

In [None]:
# Reference split (seed = const.RANDOM_STATE) used by the hyperparameter
# searches and by the single-split baseline evaluation.
X_train_multiple, y_train_multiple, X_test_multiple, y_test_multiple = Utility.split_train_test(
    const.RANDOM_STATE, driams_dataset_label, binning, predicted_antibiotics_loaded, driams_dataset)
X_train = X_train_multiple[predicted_antibiotic]
y_train = y_train_multiple[predicted_antibiotic]
X_test = X_test_multiple[predicted_antibiotic]
y_test = y_test_multiple[predicted_antibiotic]

# Baseline Models, keyed by undersampling label and then by method.
baseline_models = {}

# Tensors, keyed by undersampling label. Only const.US_NO is prepared here.
X_train_tensor = {}
y_train_tensor = {}
train_dataset = {}
dataset_label = const.US_NO
# Do Undersampling depending on dataset_label
X, y = Utility.select_undersampling_dataset(
    X_train, y_train, dataset_label)
baseline_models[dataset_label] = {}

# MLP and CNN are implemented in PyTorch, so they consume tensors.
X_train_tensor[dataset_label] = torch.FloatTensor(X)
y_train_tensor[dataset_label] = torch.LongTensor(y)
train_dataset[dataset_label] = data.TensorDataset(
    X_train_tensor[dataset_label], y_train_tensor[dataset_label])

# Initialize Baseline OptimizerModels
for method in const.METHOD_DUMMY, const.METHOD_LR, const.METHOD_RFO, const.METHOD_TREE:
    baseline_models[dataset_label][method] = OptimizerModel(
        dataset_label, method, X, y)

# Initialize target Tensors
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)
# Fix: the test dataset used to wrap the *training* tensors (copy-paste slip).
test_dataset = data.TensorDataset(X_test_tensor, y_test_tensor)

# Classification/Hyperparameter Search

## Baseline Hyperparameter Optimization with CV

In [None]:
# Dummy classifier: hyperparameter search for every prepared undersampling variant.
for undersampling in baseline_models:
    Utility.train_hyperparams(
        predicted_antibiotic, bacterial_species, undersampling,
        driams_dataset_label, binning, baseline_models, const.METHOD_DUMMY)
    tuned_model = baseline_models[undersampling][const.METHOD_DUMMY]
    print('best_params:', tuned_model.best_params)

In [None]:
# Decision-tree classifier: hyperparameter search for every prepared undersampling variant.
# Recorded runtime: 76 minutes.
for undersampling in baseline_models:
    Utility.train_hyperparams(
        predicted_antibiotic, bacterial_species, undersampling,
        driams_dataset_label, binning, baseline_models, const.METHOD_TREE)
    tuned_model = baseline_models[undersampling][const.METHOD_TREE]
    print('best_params:', tuned_model.best_params)

In [None]:
# Logistic-regression classifier: hyperparameter search for every prepared undersampling variant.
# Recorded runtime: 186 minutes.
for undersampling in baseline_models:
    Utility.train_hyperparams(
        predicted_antibiotic, bacterial_species, undersampling,
        driams_dataset_label, binning, baseline_models, const.METHOD_LR)
    tuned_model = baseline_models[undersampling][const.METHOD_LR]
    print('best_params:', tuned_model.best_params)

In [None]:
# Random-forest classifier: hyperparameter search for every prepared undersampling variant.
for undersampling in baseline_models:
    Utility.train_hyperparams(
        predicted_antibiotic, bacterial_species, undersampling,
        driams_dataset_label, binning, baseline_models, const.METHOD_RFO)
    tuned_model = baseline_models[undersampling][const.METHOD_RFO]
    print('best_params:', tuned_model.best_params)

## Baseline Training/Evaluation

In [None]:
# Evaluate every baseline model on the reference split.
# Hyperparameters are loaded under the S. aureus / oxacillin / DRIAMS_A identifier,
# while results are saved under the currently configured species and antibiotic.
undersampling = const.US_NO
seed_suffix = str(const.RANDOM_STATE)
for method, current_model in baseline_models[undersampling].items():
    load_name = Utility.create_file_identifier(
        const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, method, undersampling, binning, 'DRIAMS_A')
    save_name = Utility.create_file_identifier(
        predicted_antibiotic, bacterial_species, method, undersampling, binning, driams_dataset_label)

    # Apply the stored hyperparameters, then predict the held-out test data.
    best_params = Utility.load_best_params(load_name)
    current_model.set_best_params(best_params)
    y_test_pred, y_test_proba = Utility.predict_with_best(current_model, X_test)

    # Keep labels and column 1 of the probability matrix on the model for later comparison.
    current_model.set_predicted_labels(y_test_pred, y_test_proba[:, 1])

    # Persist both under the save identifier plus the reference seed.
    result_name = save_name + seed_suffix
    Utility.save_predictions(result_name, current_model.predicted_labels)
    Utility.save_probas(result_name, current_model.predicted_probas)

# Ground-truth labels of the reference split, stored once for all methods.
actual_name = 'actual_test' + seed_suffix + predicted_antibiotic + bacterial_species + driams_dataset_label
Utility.save_predictions(actual_name, y_test)

In [None]:
# AUROC / average precision of the tree-based baselines on every seeded split.
# Keys are '<undersampling label><method><seed>'.
auroc_array_baseline = {}
auprc_array_baseline = {}

# Iterate over the seeds themselves instead of a hard-coded range(10), so this
# loop cannot drift out of sync with `randseeds`.
for seed_num, split_seed in enumerate(randseeds):
    print("Seed:", split_seed)
    # NOTE: this overwrites the notebook-level X_train / y_train / X_test / y_test
    # and `baseline_models` that earlier cells created for the reference split.
    X_train = X_train_arr[seed_num]
    y_train = y_train_arr[seed_num]
    X_test = X_test_arr[seed_num]
    y_test = y_test_arr[seed_num]
    # Baseline Models
    baseline_models = {}

    # for dataset_label in const.US_NO,const.US_RANDOM:
    dataset_label = const.US_NO
    # Do Undersampling depending on dataset_label
    X, y = Utility.select_undersampling_dataset(
        X_train, y_train, dataset_label)
    baseline_models[dataset_label] = {}

    # Initialize Baseline OptimizerModels
    for method in const.METHOD_TREE, const.METHOD_RFO:
        current_model = OptimizerModel(dataset_label, method, X, y)
        baseline_models[dataset_label][method] = current_model

        # Hyperparameters are loaded under the S. aureus / oxacillin / DRIAMS_A
        # identifier; results are saved under the configured species/antibiotic.
        file_load = Utility.create_file_identifier(
            const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, method, dataset_label, binning, 'DRIAMS_A')
        file_save = Utility.create_file_identifier(
            predicted_antibiotic, bacterial_species, method, dataset_label, binning, driams_dataset_label)

        best_params = Utility.load_best_params(file_load)
        current_model.set_best_params(best_params)

        # Predict actual Test Data with Best Params
        y_test_pred, y_test_proba = Utility.predict_with_best(
            current_model, X_test)
        positive_proba = y_test_proba[:, 1]
        Utility.save_predictions(file_save + str(split_seed), y_test_pred)
        Utility.save_probas(file_save + str(split_seed), positive_proba)
        # NOTE(review): saving the per-seed ground truth is disabled below, but
        # load_from_saved_data reads 'actual_test<seed>...' for every seed, so
        # those files must already exist from an earlier run - confirm.
        #Utility.save_predictions(
        #    'actual_test'+str(split_seed)+predicted_antibiotic+bacterial_species+driams_dataset_label, y_test)

        result_key = dataset_label + method + str(split_seed)
        auroc_array_baseline[result_key] = metrics.roc_auc_score(
            y_test, positive_proba)
        auprc_array_baseline[result_key] = metrics.average_precision_score(
            y_test, positive_proba)

In [None]:
# Print the stored best hyperparameters (50-trial files) of both networks.
# `dataset_label` is whatever an earlier cell left behind.
for method in (const.METHOD_MLP, const.METHOD_CNN):
    file_load = Utility.create_file_identifier(
        const.ANTIBIOTIC_OXACILLIN,
        const.SPECIES_SAUREUS,
        method,
        dataset_label,
        binning,
        driams_dataset_label,
    )
    best_params = Utility.load_best_params(file_load + '_50_trials')
    print(best_params)

## Models
- Final Models: See utils.py

## Optuna optimization

In [104]:
def do_study(file_id, n_trials_or_timeout, load_study_flag, X_train, y_train, method, n_bins, model_class,
             storage='sqlite:///example.db'):
    """Create or resume an Optuna study and run ``Utility.optimize_py`` on it.

    Parameters
    ----------
    file_id
        Study name inside the storage.
    n_trials_or_timeout
        An ``int`` is the total number of trials the study should contain after
        this call; only the missing trials are run. Any other value is passed to
        Optuna as ``timeout``.
    load_study_flag
        Falsy: delete a stored study with this name (if any) and start fresh.
        Truthy: resume the stored study, creating it when it does not exist.
    X_train, y_train, method, n_bins, model_class
        Forwarded unchanged to ``Utility.optimize_py`` for every trial.
    storage
        Optuna storage URL. Defaults to the SQLite file used so far.

    Returns
    -------
    The study object (direction ``'maximize'``).
    """
    def objective(trial):
        return Utility.optimize_py(trial, method, X_train, y_train, n_bins, model_class)

    if load_study_flag:
        # load_if_exists replaces the former try-load / except-create dance.
        study = optuna.create_study(
            direction='maximize', study_name=file_id, storage=storage, load_if_exists=True)
    else:
        try:
            optuna.delete_study(study_name=file_id, storage=storage)
        except KeyError:
            # No stored study with this name yet (e.g. first run): nothing to delete.
            # Only this case is ignored; other errors (and Ctrl-C) now propagate
            # instead of being swallowed by a bare except.
            pass
        study = optuna.create_study(
            direction='maximize', study_name=file_id, storage=storage)

    if isinstance(n_trials_or_timeout, int):
        # A fresh study has no trials, so this runs all requested trials; a
        # resumed study only runs the ones still missing.
        remaining_trials = n_trials_or_timeout - len(study.trials)
        if remaining_trials > 0:
            study.optimize(objective, n_trials=remaining_trials)
    else:
        study.optimize(objective, timeout=n_trials_or_timeout)
    return study



## Optimize MLP

In [None]:
loadmlp = False  # False: start a fresh study; True: resume the stored one
n_trials_mlp = 50  # Optuna trials per study

# NOTE(review): the initialisation cell fills X_train_tensor / y_train_tensor for
# const.US_NO only; the const.US_RANDOM pass needs those tensors from elsewhere - confirm.
for dataset_label in (const.US_NO, const.US_RANDOM):
    name = Utility.create_file_identifier(
        predicted_antibiotic, bacterial_species, const.METHOD_MLP,
        dataset_label, binning, driams_dataset_label)
    # Create/load the study, run it, then store its best parameters.
    study_mlp = do_study(
        name, n_trials_mlp, loadmlp,
        X_train_tensor[dataset_label], y_train_tensor[dataset_label],
        const.METHOD_MLP, n_bins, MLP_FINAL)
    params_file = name + '_' + str(len(study_mlp.trials)) + '_trials'
    Utility.save_best_params(params_file, study_mlp.best_params)

## Optimize CNN

In [None]:
# CNN hyperparameter search. The original banner read "NO US", but the loop
# covers both undersampling labels.
loadcnn = False  # False: start a fresh study; True: resume the stored one
n_trials_cnn = int(50)  # Optuna trials per study

# NOTE(review): the initialisation cell fills X_train_tensor / y_train_tensor for
# const.US_NO only; the const.US_RANDOM pass needs those tensors from elsewhere - confirm.
for dataset_label in (const.US_NO, const.US_RANDOM):
    name = Utility.create_file_identifier(
        predicted_antibiotic, bacterial_species, const.METHOD_CNN,
        dataset_label, binning, driams_dataset_label)
    # Create/load the study, run it, then store its best parameters.
    study_cnn = do_study(
        name, n_trials_cnn, loadcnn,
        X_train_tensor[dataset_label], y_train_tensor[dataset_label],
        const.METHOD_CNN, n_bins, CNN_FINAL)
    params_file = name + '_' + str(len(study_cnn.trials)) + '_trials'
    Utility.save_best_params(params_file, study_cnn.best_params)

In [None]:
# CNN hyperparameter search with random undersampling only.
# NOTE(review): the previous cell's loop already covers const.US_RANDOM, and with
# loadcnn = False this deletes and reruns that study - confirm this is wanted.
loadcnn = False  # False: start a fresh study; True: resume the stored one
n_trials_cnn = int(50)  # Optuna trials
dataset_label = const.US_RANDOM  # const.US_NO

name = Utility.create_file_identifier(
    predicted_antibiotic, bacterial_species, const.METHOD_CNN,
    dataset_label, binning, driams_dataset_label)
# Create/load the study, run it, then store its best parameters.
study_cnn_random = do_study(
    name, n_trials_cnn, loadcnn,
    X_train_tensor[dataset_label], y_train_tensor[dataset_label],
    const.METHOD_CNN, n_bins, CNN_FINAL)
params_file = name + '_' + str(len(study_cnn_random.trials)) + '_trials'
Utility.save_best_params(params_file, study_cnn_random.best_params)

## Train MLP

In [None]:
# Train the MLP on the reference split with stored hyperparameters and keep
# predictions, probabilities and the per-epoch curves.
no_trials = 50
predictions_mlp = {}
dataset_label = const.US_NO
# Hyperparameters are loaded under the S. aureus / oxacillin identifier;
# results are saved under the configured species and antibiotic.
load_name = Utility.create_file_identifier(
    const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.METHOD_MLP, dataset_label, binning, driams_dataset_label)
save_name = Utility.create_file_identifier(
    predicted_antibiotic, bacterial_species, const.METHOD_MLP, dataset_label, binning, driams_dataset_label)
best_params = Utility.load_best_params(load_name+'_'+str(no_trials)+'_trials')
# Create Train and Test Iterators with determined Batch Size
train_iterator = Utility.create_iterator(
    X_train_tensor[dataset_label], y_train_tensor[dataset_label], True, best_params['batch_size'])
test_iterator = Utility.create_iterator(
    X_test_tensor, y_test_tensor, False, best_params['batch_size'])
# Fix: predict_with_best_py lives on Utility (as in every other cell); the bare
# name is never imported and raised NameError.
pred, proba, scores, preds, probas, training_scores, training_losses, test_losses = Utility.predict_with_best_py(
    best_params, train_iterator, test_iterator, n_bins, const.METHOD_MLP, MLP_FINAL)

predictions_mlp[const.BEST_PARAMS] = best_params
# Move tensors to the CPU and convert them to numpy before storing.
predictions_mlp[const.PREDICTIONS] = pred.cpu().detach().numpy()
predictions_mlp[const.PROBABILITIES] = proba.cpu().detach().numpy()
predictions_mlp['test_aucroc'] = scores
predictions_mlp['training_aucroc'] = training_scores
predictions_mlp['test_losses'] = test_losses
predictions_mlp['training_losses'] = training_losses
# Entry with the highest *training* score.
best_entry = np.argmax(training_scores)
predictions_mlp[const.BEST_PREDS] = preds[best_entry].cpu().detach().numpy()
predictions_mlp[const.BEST_PROBAS] = probas[best_entry].cpu().detach().numpy()

Utility.save_predictions(save_name, predictions_mlp[const.PREDICTIONS])
# Note: the test AUROC scores are what gets written through save_losses here.
Utility.save_losses(save_name, predictions_mlp['test_aucroc'])
Utility.save_probas(save_name, predictions_mlp[const.PROBABILITIES])

## Train CNN and MLP for 10 Train-Test Splits

In [None]:
# Train the CNN and MLP with the best hyperparams previously selected,
# once per seeded train/test split, and store curves and probabilities per seed.
# Despite their names, the auroc_array_* dicts collect the predicted
# probabilities of every split, keyed by undersampling label.
auroc_array_cnn = {
    const.US_NO: [],
    const.US_RANDOM: []
}
auroc_array_mlp = {
    const.US_NO: [],
    const.US_RANDOM: []
}

no_trials = 50  # selects the '<name>_50_trials' hyperparameter files
dataset_label = const.US_NO  # const.US_RANDOM is not trained in this cell

# (printed name, method constant, model class, per-label result store)
network_runs = (
    ("CNN", const.METHOD_CNN, CNN_FINAL, auroc_array_cnn),
    ("MLP", const.METHOD_MLP, MLP_FINAL, auroc_array_mlp),
)

for seed_num, split_seed in enumerate(randseeds):
    print("Seed:", split_seed)
    X_train = X_train_arr[seed_num]
    y_train = y_train_arr[seed_num]
    X_test = X_test_arr[seed_num]
    y_test = y_test_arr[seed_num]

    # Tensors
    X_train_tensor = {}
    y_train_tensor = {}
    train_dataset = {}
    # Do Undersampling depending on dataset_label
    X, y = Utility.select_undersampling_dataset(
        X_train, y_train, dataset_label)

    # MLP and CNN are implemented in PyTorch, so they consume tensors.
    X_train_tensor[dataset_label] = torch.FloatTensor(X)
    y_train_tensor[dataset_label] = torch.LongTensor(y)
    train_dataset[dataset_label] = data.TensorDataset(
        X_train_tensor[dataset_label], y_train_tensor[dataset_label])

    # Initialize target Tensors
    X_test_tensor = torch.FloatTensor(X_test)
    y_test_tensor = torch.LongTensor(y_test)
    # Fix: the test dataset used to wrap the *training* tensors (copy-paste slip).
    test_dataset = data.TensorDataset(X_test_tensor, y_test_tensor)

    predictions_by_method = {}
    for display_name, method, model_class, proba_store in network_runs:
        print(display_name)
        # Hyperparameters are loaded under the S. aureus / oxacillin / DRIAMS_A
        # identifier; results are saved under the configured species/antibiotic.
        load_name = Utility.create_file_identifier(
            const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, method, dataset_label, binning, 'DRIAMS_A')
        save_name = Utility.create_file_identifier(
            predicted_antibiotic, bacterial_species, method, dataset_label, binning, driams_dataset_label)
        best_params = Utility.load_best_params(
            load_name + '_' + str(no_trials) + '_trials')

        train_iterator = Utility.create_iterator(
            X_train_tensor[dataset_label], y_train_tensor[dataset_label], True, best_params['batch_size'])
        test_iterator = Utility.create_iterator(
            X_test_tensor, y_test_tensor, False, best_params['batch_size'])
        pred, proba, scores, preds, probas, training_scores, training_losses, test_losses = Utility.predict_with_best_py(
            best_params, train_iterator, test_iterator, n_bins, method, model_class)

        predictions = {}
        predictions[const.PROBABILITIES] = proba.cpu().detach().numpy()
        predictions['test_aucroc'] = scores

        # Save Losses and Probas with Seed Number for reproducing / reloading
        seeded_name = save_name + str(split_seed)
        Utility.save_losses('test_aucroc' + seeded_name, scores)
        Utility.save_losses('training_aucroc' + seeded_name, training_scores)
        Utility.save_losses('test_losses' + seeded_name, test_losses)
        Utility.save_losses('training_losses' + seeded_name, training_losses)
        Utility.save_probas(seeded_name, predictions[const.PROBABILITIES])

        proba_store[dataset_label].append(predictions[const.PROBABILITIES])
        predictions_by_method[method] = predictions

    # Keep the notebook-level names that the plotting cell reads; after the
    # loop they hold the results of the last seed.
    predictions_cnn = predictions_by_method[const.METHOD_CNN]
    predictions_mlp = predictions_by_method[const.METHOD_MLP]

In [None]:
# Load the stored 50-trial MLP hyperparameters of the 18k-binning search.
# Overwrites the notebook-level `load_name` and `best_params`.
load_name = Utility.create_file_identifier(
    const.ANTIBIOTIC_OXACILLIN,
    const.SPECIES_SAUREUS,
    const.METHOD_MLP,
    dataset_label,
    const.BINNING_18K,
    driams_dataset_label,
)
best_params = Utility.load_best_params(load_name + '_50_trials')

In [None]:
# Reload the stored probabilities of the 10 train/test splits.
# Requires `randseeds` and the configuration variables to match the run that saved them.
auroc_array_cnn = {label: [] for label in (const.US_NO, const.US_RANDOM)}
auroc_array_mlp = {label: [] for label in (const.US_NO, const.US_RANDOM)}

for seed_num in range(10):
    seed_suffix = str(randseeds[seed_num])
    for dataset_label in (const.US_NO, const.US_RANDOM):
        cnn_name = Utility.create_file_identifier(
            predicted_antibiotic, bacterial_species, const.METHOD_CNN,
            dataset_label, binning, driams_dataset_label)
        mlp_name = Utility.create_file_identifier(
            predicted_antibiotic, bacterial_species, const.METHOD_MLP,
            dataset_label, binning, driams_dataset_label)
        cnn_probas = Utility.load_probas(cnn_name + seed_suffix)
        auroc_array_cnn[dataset_label].append(cnn_probas)
        mlp_probas = Utility.load_probas(mlp_name + seed_suffix)
        auroc_array_mlp[dataset_label].append(mlp_probas)

In [None]:
# Rebuild the tensors for every seeded split. Each iteration overwrites the
# previous one, so after the loop the variables hold the data of the last seed.
for seed_num, split_seed in enumerate(randseeds):
    print("Seed:", split_seed)
    X_train = X_train_arr[seed_num]
    y_train = y_train_arr[seed_num]
    X_test = X_test_arr[seed_num]
    y_test = y_test_arr[seed_num]

    # Tensors
    X_train_tensor = {}
    y_train_tensor = {}
    train_dataset = {}

    dataset_label = const.US_NO  # , const.US_RANDOM:
    # Do Undersampling depending on dataset_label
    X, y = Utility.select_undersampling_dataset(
        X_train, y_train, dataset_label)

    # MLP and CNN are implemented in PyTorch, so they consume tensors.
    X_train_tensor[dataset_label] = torch.FloatTensor(X)
    y_train_tensor[dataset_label] = torch.LongTensor(y)
    train_dataset[dataset_label] = data.TensorDataset(
        X_train_tensor[dataset_label], y_train_tensor[dataset_label])

    # Initialize target Tensors
    X_test_tensor = torch.FloatTensor(X_test)
    y_test_tensor = torch.LongTensor(y_test)
    # Fix: the test dataset used to wrap the *training* tensors (copy-paste slip).
    test_dataset = data.TensorDataset(X_test_tensor, y_test_tensor)

    no_trials = 50
    predictions_cnn = {}

In [None]:
# Train the CNN with random undersampling, using stored hyperparameters.
# NOTE(review): the preceding cells only fill X_train_tensor / y_train_tensor for
# const.US_NO; the const.US_RANDOM entries must come from elsewhere - confirm.
no_trials = 50
predictions_cnn = {}
dataset_label = const.US_RANDOM

# Hyperparameters are loaded under the S. aureus / oxacillin identifier;
# results are saved under the configured species and antibiotic.
load_name = Utility.create_file_identifier(
    const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.METHOD_CNN,
    dataset_label, binning, driams_dataset_label)
save_name = Utility.create_file_identifier(
    predicted_antibiotic, bacterial_species, const.METHOD_CNN,
    dataset_label, binning, driams_dataset_label)
best_params = Utility.load_best_params(load_name + '_' + str(50) + '_trials')

# Train and test iterators share the stored batch size.
batch_size = best_params['batch_size']
train_iterator = Utility.create_iterator(
    X_train_tensor[dataset_label], y_train_tensor[dataset_label], True, batch_size)
test_iterator = Utility.create_iterator(
    X_test_tensor, y_test_tensor, False, batch_size)

(pred, proba, scores, preds, probas,
 training_scores, training_losses, test_losses) = Utility.predict_with_best_py(
    best_params, train_iterator, test_iterator, n_bins, const.METHOD_CNN, CNN_FINAL)

# Collect everything in one dict; tensors are moved to the CPU and converted to numpy.
predictions_cnn[const.BEST_PARAMS] = best_params
predictions_cnn[const.PREDICTIONS] = pred.cpu().detach().numpy()
predictions_cnn[const.PROBABILITIES] = proba.cpu().detach().numpy()
predictions_cnn['test_aucroc'] = scores
predictions_cnn['training_aucroc'] = training_scores
predictions_cnn['test_losses'] = test_losses
predictions_cnn['training_losses'] = training_losses
# Entry with the highest *training* score.
best_entry = np.argmax(training_scores)
predictions_cnn[const.BEST_PREDS] = preds[best_entry].cpu().detach().numpy()
predictions_cnn[const.BEST_PROBAS] = probas[best_entry].cpu().detach().numpy()

Utility.save_predictions(save_name, predictions_cnn[const.PREDICTIONS])
# Note: the test AUROC scores are what gets written through save_losses here.
Utility.save_losses(save_name, predictions_cnn['test_aucroc'])
Utility.save_probas(save_name, predictions_cnn[const.PROBABILITIES])

# Visualization

## Curves and Stripplots

In [None]:
# Compare all models on one split: network probabilities first, then the baselines.
undersample_label = const.US_NO
y_probas = {
    const.METHOD_CNN: predictions_cnn[const.PROBABILITIES],
    const.METHOD_MLP: predictions_mlp[const.PROBABILITIES],
}
# NOTE(review): the baseline cell passes `y_test_proba[:, 1]` (already 1-D) to
# set_predicted_labels; confirm predicted_probas is still 2-D when sliced here.
for baseline_method in (const.METHOD_DUMMY, const.METHOD_LR, const.METHOD_RFO, const.METHOD_TREE):
    baseline_model = baseline_models[undersample_label][baseline_method]
    y_probas[baseline_method] = baseline_model.predicted_probas[:, 1]

Utility.plot_auc_prc(
    y_test, y_probas, undersample_label, bacterial_species, predicted_antibiotic, False)

In [None]:
# Reload the per-seed CNN and MLP probabilities for both undersampling labels.
us_label = const.US_NO
auroc_array_cnn = {label: [] for label in (const.US_NO, const.US_RANDOM)}
auroc_array_mlp = {label: [] for label in (const.US_NO, const.US_RANDOM)}

for dataset_label in (const.US_NO, const.US_RANDOM):
    for i in range(10):
        seed_suffix = str(randseeds[i])
        # CNN first, then MLP; `name` ends up holding the MLP identifier.
        name = Utility.create_file_identifier(
            predicted_antibiotic, bacterial_species, const.METHOD_CNN,
            dataset_label, binning, driams_dataset_label)
        cnn_probas = Utility.load_probas(name + seed_suffix)
        auroc_array_cnn[dataset_label].append(cnn_probas)
        name = Utility.create_file_identifier(
            predicted_antibiotic, bacterial_species, const.METHOD_MLP,
            dataset_label, binning, driams_dataset_label)
        mlp_probas = Utility.load_probas(name + seed_suffix)
        auroc_array_mlp[dataset_label].append(mlp_probas)

In [None]:
# ROC curves of the CNN on every seeded split, one curve per seed.
# (The stray `pd.DataFrame()` statement was removed: it built an empty frame
# and discarded it.)
y_probas = {}
us_label = const.US_NO
for i, split_seed in enumerate(randseeds):
    # Keys are '<seed> cnn'; insertion order matches the order of y_test_arr.
    y_probas[str(split_seed)+' cnn'] = auroc_array_cnn[us_label][i]
Utility.plot_auc_roc_multi(
    y_test_arr, y_probas, us_label, bacterial_species, predicted_antibiotic, False)

In [None]:
# Compute one ROC curve per entry of y_probas and plot the first one.
# NOTE: the loop variables `id` (shadows the builtin) and `element` stay in the
# notebook namespace and are read again by the following cell.
fprs = []
tprs = []
for id, element in enumerate(y_probas):
    # NOTE(review): fpr/tpr are only assigned when const.OUTPUT_DIM == 1; otherwise
    # the appends below use undefined or stale values - confirm OUTPUT_DIM is always 1.
    if (const.OUTPUT_DIM == 1):
        fpr, tpr, thresh = metrics.roc_curve(y_test_arr[id], y_probas[element])
        auc = metrics.roc_auc_score(y_test_arr[id], y_probas[element])
    fprs.append(fpr)
    tprs.append(tpr)
    # Number of points on this ROC curve.
    print(len(fpr))
    # df.loc[id]=tpr
    # df.loc[0]
    # plt.plot(fpr,tpr,label=f"{i}, AUCROC={auc:.2f}")
# Only the first curve is turned into a DataFrame and drawn.
df = pd.DataFrame({'fpr': fprs[0], 'tpr': tprs[0]}, columns=['fpr', 'tpr'])
# df=df.transpose()
sns.lineplot(x=df['fpr'], y=df['tpr'])

In [None]:
# ROC curve and AUROC for a single entry.
# NOTE: `id` and `element` are the loop variables left over from the previous
# cell, so this cell only works after that cell has run.
fpr, tpr, thresh = metrics.roc_curve(y_test_arr[id], y_probas[element])
auc = metrics.roc_auc_score(y_test_arr[id], y_probas[element])

In [75]:
# Load Data
def load_from_saved_data(a_undersampled, a_antibiotic, a_species, a_label, a_binning):
    """Load stored per-seed probabilities and compute AUROC / average precision.

    For each of LR, RFO, TREE, CNN and MLP, and for each seed in the
    notebook-level ``randseeds``, the stored probabilities and the stored
    ground-truth labels (``'actual_test<seed><antibiotic><species><label>'``)
    are loaded and scored.

    Parameters
    ----------
    a_undersampled, a_antibiotic, a_species, a_label, a_binning
        Parts of the file identifier passed to ``Utility.create_file_identifier``.

    Returns
    -------
    tuple
        ``(probas, aurocs, auprcs, actual_values)``. The three dicts map every
        method constant to a list with one entry per seed. The
        ``const.METHOD_DUMMY`` key is present but stays empty, because the dummy
        model is not loaded. ``actual_values`` gets one entry per (method, seed)
        pair, so the same labels appear once per method.
    """
    all_methods = (
        const.METHOD_DUMMY,
        const.METHOD_TREE,
        const.METHOD_LR,
        const.METHOD_RFO,
        const.METHOD_MLP,
        const.METHOD_CNN,
    )
    probas = {method: [] for method in all_methods}
    aurocs = {method: [] for method in all_methods}
    auprcs = {method: [] for method in all_methods}
    actual_values = []

    for method in const.METHOD_LR, const.METHOD_RFO, const.METHOD_TREE, const.METHOD_CNN, const.METHOD_MLP:
        # The identifier does not depend on the seed, so build it once per method.
        name = Utility.create_file_identifier(
            a_antibiotic, a_species, method, a_undersampled, a_binning, a_label)
        # Iterate over the seeds themselves rather than a hard-coded range(10).
        for split_seed in randseeds:
            current_y = Utility.load_predictions(
                'actual_test'+str(split_seed)+a_antibiotic+a_species+a_label)
            current_probas = Utility.load_probas(name+str(split_seed))
            actual_values.append(current_y)
            probas[method].append(current_probas)
            aurocs[method].append(
                metrics.roc_auc_score(current_y, current_probas))
            auprcs[method].append(
                metrics.average_precision_score(current_y, current_probas))
    return probas, aurocs, auprcs, actual_values

In [17]:
def draw_auroc_boxplot(aurocs):
    """Print the mean AUC ROC per model and draw a horizontal strip plot with mean lines.

    Args:
        aurocs: Mapping of model name to a list of AUC ROC values; it is converted to
            a DataFrame with one column per model.
    """
    scores = pd.DataFrame(aurocs)
    sns.set_theme(style="whitegrid")
    palette = sns.color_palette(palette=None, n_colors=6)

    # Average each column by plain sequential accumulation.
    summary = []
    for column_name in scores:
        column = scores[column_name]
        running_total = 0
        for _, score in column.items():
            running_total = running_total + score
        column_mean = running_total/len(column)
        summary.append(f"Avg AUC ROC {column.name}:{column_mean}")
    print(summary)

    # The box plot is reduced to its mean line and overlaid on the strip plot's axes.
    mean_line = {'color': 'k', 'ls': '-', 'lw': 1}
    hidden = {'visible': False}
    strip_axes = sns.stripplot(data=scores, palette=palette, orient="h")
    ax = sns.boxplot(
        data=scores,
        palette=palette,
        orient="h",
        showmeans=True,
        meanline=True,
        meanprops=mean_line,
        medianprops=hidden,
        whiskerprops=hidden,
        zorder=10,
        showbox=False,
        showcaps=False,
        showfliers=False,
        ax=strip_axes,
    )
    ax.set(xlabel='AUC ROC', ylabel='Model')
    sns.despine()
    plt.show()

In [18]:
def draw_auprc_boxplot(aurocs):
    """Draw a horizontal strip plot of average-precision values with a mean line per model.

    Args:
        aurocs: Mapping of model name to a list of average-precision values (the
            parameter name is kept for compatibility with existing callers).
    """
    scores = pd.DataFrame(aurocs)
    sns.set_theme(style="whitegrid")
    palette = sns.color_palette(palette=None, n_colors=6)

    # The box plot is reduced to its mean line and overlaid on the strip plot's axes.
    mean_line = {'color': 'k', 'ls': '-', 'lw': 1}
    hidden = {'visible': False}
    strip_axes = sns.stripplot(data=scores, palette=palette, orient="h")
    ax = sns.boxplot(
        data=scores,
        palette=palette,
        orient="h",
        showmeans=True,
        meanline=True,
        meanprops=mean_line,
        medianprops=hidden,
        whiskerprops=hidden,
        zorder=10,
        showbox=False,
        showcaps=False,
        showfliers=False,
        ax=strip_axes,
    )
    ax.set(xlabel='Average Precision', ylabel='Model')
    sns.despine()
    plt.show()

In [52]:
# S. Oxa
def load_maldi_amr_soxa():
    """Load the stored S. aureus / oxacillin reference results and score them.

    Reads one JSON result file per model ('lr', 'mlp', 'lightgbm') and seed from the
    `json_s_oxa` directory (relative to the working directory). Each file must hold
    `y_score` (a sequence of per-sample score sequences, of which element 1 is used)
    and `y_test`.

    Returns:
        Tuple `(scores_amr, aurocs_amr, auprcs_amr)` of dicts keyed by
        `'weis_' + model`, each holding one list entry per seed file.

    Raises:
        OSError: If a result file cannot be opened.
        KeyError: If a result file lacks `y_score` or `y_test`.
    """
    models = ['lr', 'mlp', 'lightgbm']
    # Seeds in the order the result files are processed; the order is kept stable
    # because the per-seed lists are positional.
    seeds = [164, 172, 188, 270, 344, 35, 409, 480, 545, 89]
    scores_amr = {}
    aurocs_amr = {}
    auprcs_amr = {}

    for model in models:
        key = 'weis_'+model
        scores_amr[key] = []
        aurocs_amr[key] = []
        auprcs_amr[key] = []
        for seed in seeds:
            filename = ('Site_DRIAMS-A_Model_'+model +
                        '_Species_Staphylococcus_aureus_Antibiotic_Oxacillin_Seed_' +
                        str(seed)+'_workstation_no_HospitalHygiene.json')
            # A forward slash works on Windows and POSIX alike; the context manager
            # closes the handle even if parsing fails.
            with open('json_s_oxa/'+filename) as result_file:
                result = json.load(result_file)
            # Keep only element 1 of every per-sample score entry.
            selected_scores = [sample_scores[1] for sample_scores in result['y_score']]
            y_test = result['y_test']
            scores_amr[key].append(selected_scores)
            aurocs_amr[key].append(
                metrics.roc_auc_score(y_test, selected_scores))
            auprcs_amr[key].append(
                metrics.average_precision_score(y_test, selected_scores))
    return scores_amr, aurocs_amr, auprcs_amr

In [53]:
# E. Ceftri
def load_maldi_amr_e_cef():
    """Load the stored E. coli / ceftriaxone reference results and score them.

    Reads one JSON result file per model ('lr', 'mlp', 'lightgbm') and seed from the
    `json_ecoli` directory (relative to the working directory). Each file must hold
    `y_score` (a sequence of per-sample score sequences, of which element 1 is used)
    and `y_test`.

    Returns:
        Tuple `(scores_amr, aurocs_amr, auprcs_amr)` of dicts keyed by
        `'weis_' + model`, each holding one list entry per seed file.

    Raises:
        OSError: If a result file cannot be opened.
        KeyError: If a result file lacks `y_score` or `y_test`.
    """
    models = ['lr', 'mlp', 'lightgbm']
    # Seeds in the order the result files are processed; the order is kept stable
    # because the per-seed lists are positional.
    seeds = [164, 172, 188, 270, 344, 35, 409, 480, 545, 89]
    scores_amr = {}
    aurocs_amr = {}
    auprcs_amr = {}

    for model in models:
        key = 'weis_'+model
        scores_amr[key] = []
        aurocs_amr[key] = []
        auprcs_amr[key] = []
        for seed in seeds:
            filename = ('Site_DRIAMS-A_Model_'+model +
                        '_Species_Escherichia_coli_Antibiotic_Ceftriaxone_Seed_' +
                        str(seed)+'_workstation_no_HospitalHygiene.json')
            # A forward slash works on Windows and POSIX alike; the context manager
            # closes the handle even if parsing fails.
            with open('json_ecoli/'+filename) as result_file:
                result = json.load(result_file)
            # Keep only element 1 of every per-sample score entry.
            selected_scores = [sample_scores[1] for sample_scores in result['y_score']]
            y_test = result['y_test']
            scores_amr[key].append(selected_scores)
            aurocs_amr[key].append(
                metrics.roc_auc_score(y_test, selected_scores))
            auprcs_amr[key].append(
                metrics.average_precision_score(y_test, selected_scores))
    return scores_amr, aurocs_amr, auprcs_amr

In [None]:
# Scenario 1: oxacillin / S. aureus results stored under the DRIAMS-A dataset label
# (US_NO undersampling label, 6k binning). Loads the per-seed results and draws both
# summary plots.
probas, aurocs, auprcs, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.DATASET_DRIAMSA, const.BINNING_6K)
draw_auroc_boxplot(aurocs)
draw_auprc_boxplot(auprcs)

In [None]:
# Scenario 2: ceftriaxone / E. coli results stored under the DRIAMS-A dataset label
# (US_NO undersampling label, 6k binning). Loads the per-seed results and draws both
# summary plots.
probas, aurocs, auprcs, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_CEFTRIAXONE, const.SPECIES_ECOLI, const.DATASET_DRIAMSA, const.BINNING_6K)
draw_auroc_boxplot(aurocs)
draw_auprc_boxplot(auprcs)

In [None]:
# Scenario 3: oxacillin / S. aureus results stored under the DATASET_ALL label
# (US_NO undersampling label, 6k binning). Loads the per-seed results and draws both
# summary plots.
probas, aurocs, auprcs, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.DATASET_ALL, const.BINNING_6K)
draw_auroc_boxplot(aurocs)
draw_auprc_boxplot(auprcs)

In [None]:
# Compare Scenario 1: own oxacillin / S. aureus results next to the reference results
# loaded from the JSON files.
probas, aurocs, auprcs, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.DATASET_DRIAMSA, const.BINNING_6K)
probas_amr, aurocs_amr, auprcs_amr = load_maldi_amr_soxa()

# Merge the reference ('weis_*') AUC ROC and average-precision lists into the own
# result dicts so that both appear in the same plots.
for method in aurocs_amr:
    aurocs[method] = aurocs_amr[method]
    auprcs[method] = auprcs_amr[method]

# Draw the merged results.
draw_auroc_boxplot(aurocs)
draw_auprc_boxplot(auprcs)

In [55]:
# Create a combined AUROC curve for multiple seeds, similar to the one in "Direct Antimicrobial Resistance Prediction"
def transform_probas_to_curve(probas, probas_amr, actual_values):
    """Pool per-seed probabilities (and the matching labels) into flat lists.

    Args:
        probas: Mapping of method to a list of per-seed probability sequences.
        probas_amr: Mapping of reference method to a list of per-seed probability
            sequences; each key is printed while it is processed.
        actual_values: Indexable as `actual_values[seed_index][sample_index]`.

    Returns:
        Tuple `(aggregated_probas, aggregated_values)`. `aggregated_probas` maps every
        key of `probas` and `probas_amr` to its flattened values. `aggregated_values`
        holds the labels gathered while flattening the last key of `probas`, or an
        empty list if `probas` is empty.
    """
    aggregated_probas = {}
    aggregated_values = []

    for method in probas:
        # Walk each seed once, pairing every probability with its label.
        pairs = [
            (proba, actual_values[seed_idx][sample_idx])
            for seed_idx, seed_probas in enumerate(probas[method])
            for sample_idx, proba in enumerate(seed_probas)
        ]
        aggregated_probas[method] = [proba for proba, _ in pairs]
        # Rebuilt for every method, so only the last method's labels are returned.
        aggregated_values = [label for _, label in pairs]

    for method in probas_amr:
        print(method)
        aggregated_probas[method] = [
            proba for seed_probas in probas_amr[method] for proba in seed_probas
        ]

    return aggregated_probas, aggregated_values

In [None]:
# Compare Scenario 1 and 3: the same antibiotic/species pair, loaded once with the
# DRIAMS-A dataset label ('_S1') and once with the DATASET_ALL label ('_S3').
aurocs={}
auprcs={}
probas_a, aurocs_a, auprcs_a, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.DATASET_DRIAMSA, const.BINNING_6K)
# Note: `actual_values` is overwritten by this second call.
probas_all, aurocs_all, auprcs_all, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.DATASET_ALL, const.BINNING_6K)

# Only LR, CNN and MLP are compared; each gets one column per scenario.
for method in aurocs_a:
    if method ==const.METHOD_LR or method == const.METHOD_CNN or method == const.METHOD_MLP:
        aurocs[method+'_S1'] = aurocs_a[method]
        aurocs[method+'_S3'] = aurocs_all[method]
        
# Same selection for the average-precision values.
for method in auprcs_a:
    if method ==const.METHOD_LR or method == const.METHOD_CNN or method == const.METHOD_MLP:
        auprcs[method+'_S1'] = auprcs_a[method]    
        auprcs[method+'_S3'] = auprcs_all[method]

draw_auroc_boxplot(aurocs)
draw_auprc_boxplot(auprcs)

In [None]:
# Compare own and reference models for oxacillin / S. aureus using curves built from
# the probabilities of all seeds pooled together.
probas_amr, aurocs_amr, auprcs_amr = load_maldi_amr_soxa()
probas, aurocs, auprcs, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.DATASET_DRIAMSA, const.BINNING_6K)
# Flatten the per-seed lists into one probability list per model plus one label list.
aggregated_probas, aggregated_values = transform_probas_to_curve(
    probas, probas_amr, actual_values)

Utility.plot_auc_roc(aggregated_values, aggregated_probas, const.US_NO,
                     const.SPECIES_SAUREUS, const.ANTIBIOTIC_OXACILLIN, False)
# NOTE(review): plt.plot() without arguments adds no lines; presumably plt.show() was
# intended here to separate the two figures — confirm.
plt.plot()
Utility.plot_auc_prc(aggregated_values, aggregated_probas, const.US_NO,
                     const.SPECIES_SAUREUS, const.ANTIBIOTIC_OXACILLIN, False)

In [None]:
# Compare own and reference models for ceftriaxone / E. coli using curves built from
# the probabilities of all seeds pooled together.
probas_amr, aurocs_amr, auprcs_amr = load_maldi_amr_e_cef()
probas, aurocs, auprcs, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_CEFTRIAXONE, const.SPECIES_ECOLI, const.DATASET_DRIAMSA, const.BINNING_6K)
# Flatten the per-seed lists into one probability list per model plus one label list.
aggregated_probas, aggregated_values = transform_probas_to_curve(
    probas, probas_amr, actual_values)

Utility.plot_auc_roc(aggregated_values, aggregated_probas, const.US_NO,
                     const.SPECIES_ECOLI, const.ANTIBIOTIC_CEFTRIAXONE, False)
Utility.plot_auc_prc(aggregated_values, aggregated_probas, const.US_NO,
                     const.SPECIES_ECOLI, const.ANTIBIOTIC_CEFTRIAXONE, False)

In [None]:
# Scenario 2: compare selected own models with the reference models for
# ceftriaxone / E. coli.
probas_amr, aurocs_amr, auprcs_amr = load_maldi_amr_e_cef()
# load_from_saved_data returns four values; the test labels are not used in this cell
# but still have to be unpacked.
probas, aurocs, auprcs, actual_values = load_from_saved_data(
    const.US_NO, const.ANTIBIOTIC_CEFTRIAXONE, const.SPECIES_ECOLI, const.DATASET_DRIAMSA, const.BINNING_6K)
# pd.DataFrame(aurocs).describe()
# Merge the reference ('weis_*') results into the own result dicts.
for i in probas_amr:
    probas[i] = probas_amr[i]
    aurocs[i] = aurocs_amr[i]
    auprcs[i] = auprcs_amr[i]

# Models shown in the comparison, in plot order.
compared_models = ('cnn', 'weis_lightgbm', 'mlp', 'weis_mlp', 'rfo')
draw_auroc_boxplot({k: aurocs[k] for k in compared_models})
draw_auprc_boxplot({k: auprcs[k] for k in compared_models})

pd.DataFrame({k: aurocs[k] for k in compared_models}).describe()

In [5]:
# Load Seed 42 results (Split for Hyperparameter Tuning)
def load_default_seed_results():
    """Load and score the stored oxacillin / S. aureus results for `const.RANDOM_STATE`.

    The stored test labels are loaded once; for each of the six methods the stored
    probabilities are loaded, their length is printed, and ROC AUC and average
    precision are computed against those labels.

    Returns:
        Tuple `(probas, aurocs, auprcs, current_y)`: three dicts keyed by method
        (probabilities, ROC AUC, average precision) and the loaded test labels.
    """
    seed_suffix = str(const.RANDOM_STATE)
    current_y = Utility.load_predictions(
        'actual_test'+seed_suffix+const.ANTIBIOTIC_OXACILLIN+const.SPECIES_SAUREUS+const.DATASET_DRIAMSA)

    probas, aurocs, auprcs = {}, {}, {}
    methods = (const.METHOD_DUMMY, const.METHOD_TREE, const.METHOD_LR,
               const.METHOD_RFO, const.METHOD_MLP, const.METHOD_CNN)
    for method in methods:
        identifier = Utility.create_file_identifier(
            const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, method, const.US_NO, const.BINNING_6K, const.DATASET_DRIAMSA)
        method_probas = Utility.load_probas(identifier+seed_suffix)
        print(len(method_probas))
        method_auroc = metrics.roc_auc_score(current_y, method_probas)
        method_auprc = metrics.average_precision_score(current_y, method_probas)
        probas[method] = method_probas
        aurocs[method] = method_auroc
        auprcs[method] = method_auprc
    return probas, aurocs, auprcs, current_y

In [None]:
# Load the results stored for the default seed and show the ROC and
# precision-recall curves for scenario 1 (oxacillin / S. aureus).
probas,aurocs,auprcs,actual_y = load_default_seed_results()
sns.set_style("whitegrid")
Utility.plot_auc_roc(actual_y, probas, const.US_NO,
                   const.SPECIES_SAUREUS, const.ANTIBIOTIC_OXACILLIN, False)
Utility.plot_auc_prc(actual_y, probas, const.US_NO,
                   const.SPECIES_SAUREUS, const.ANTIBIOTIC_OXACILLIN, False)

In [None]:
# Baseline models, keyed first by undersampling label and then by method.
baseline_models = {}

# Training tensors and datasets, keyed by undersampling label.
X_train_tensor = {}
y_train_tensor = {}
train_dataset = {}

for dataset_label in const.US_NO, const.US_RANDOM:
    # Select the training data for this undersampling label.
    X, y = Utility.select_undersampling_dataset(
        X_train, y_train, dataset_label)
    baseline_models[dataset_label] = {}

    # MLP and CNN are implemented in PyTorch and therefore need tensors.
    X_train_tensor[dataset_label] = torch.FloatTensor(X)
    y_train_tensor[dataset_label] = torch.LongTensor(y)
    train_dataset[dataset_label] = data.TensorDataset(
        X_train_tensor[dataset_label], y_train_tensor[dataset_label])
    # Initialize one OptimizerModel per baseline method.
    for method in const.METHOD_DUMMY, const.METHOD_LR, const.METHOD_RFO, const.METHOD_TREE:
        baseline_models[dataset_label][method] = OptimizerModel(
            dataset_label, method, X, y)

# Test tensors. The test dataset wraps the test tensors, not the training tensors of
# whichever undersampling label the loop above happened to process last.
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.LongTensor(y_test)
test_dataset = data.TensorDataset(X_test_tensor, y_test_tensor)

## Training / Learning Curves

In [103]:
# Stored results of the tuned CNN / MLP models, keyed by undersampling label.
predictions_cnn = {const.US_NO: {}, const.US_RANDOM: {}}
predictions_mlp = {const.US_NO: {}, const.US_RANDOM: {}}
# Number of tuning trials; part of the file name of the stored best parameters.
no_trials = 50
for undersampling in const.US_NO, const.US_RANDOM:
    # CNN first, then MLP: both are loaded with the same sequence of steps.
    for method, nn_predictions in ((const.METHOD_CNN, predictions_cnn),
                                   (const.METHOD_MLP, predictions_mlp)):
        load_name = Utility.create_file_identifier(
            const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, method, undersampling, const.BINNING_18K, const.DATASET_DRIAMSA)
        best_params = Utility.load_best_params(
            load_name+'_'+str(no_trials)+'_trials')
        nn_predictions[undersampling][const.BEST_PARAMS] = best_params
        nn_predictions[undersampling][const.PREDICTIONS] = Utility.load_predictions(
            load_name)
        nn_predictions[undersampling]['test_aucroc'] = Utility.load_losses(
            load_name)
        nn_predictions[undersampling][const.PROBABILITIES] = Utility.load_probas(
            load_name)

    # Restore parameters and predictions of every baseline model for this label.
    for method in baseline_models[undersampling]:
        load_name = Utility.create_file_identifier(
            const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, method, undersampling, const.BINNING_18K, const.DATASET_DRIAMSA)
        current_model = baseline_models[undersampling][method]

        best_params = Utility.load_best_params(load_name)
        # Set loaded Params
        current_model.set_best_params(best_params)

        current_model.set_predicted_labels(Utility.load_predictions(
            load_name), Utility.load_probas(load_name))

In [None]:
# Plot the stored per-epoch test and training AUC ROC of the CNN for one seed.
# NOTE(review): `seed_num` is not assigned in this cell and is used as an index into
# `randseeds`; presumably it leaks from another cell — confirm that it is defined
# before this cell runs on a fresh kernel.
load_name = Utility.create_file_identifier(
        const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.METHOD_CNN, const.US_NO, const.BINNING_6K, const.DATASET_DRIAMSB)
test_aucroc = Utility.load_losses('test_aucroc'+load_name +
                        str(randseeds[seed_num]))
training_aucroc = Utility.load_losses('training_aucroc'+load_name +
                    str(randseeds[seed_num]))

# One point per stored entry; both curves share the same axes.
plot = sns.lineplot(x=range(len(
    test_aucroc)), y=test_aucroc, label='cnn_test')
plot = sns.lineplot(x=range(len(
    training_aucroc)), y=training_aucroc, label='cnn_train')
plot.set(xlabel="Epoch", ylabel="AUC ROC")
plt.ylim([0.45,1.05])

In [None]:
# Plot the stored per-epoch test and training AUC ROC of the MLP, one figure per seed.
# `seed_num` is a position in `randseeds`; the seed itself is `randseeds[seed_num]`.
for seed_num in range(len(randseeds)):
    load_name = Utility.create_file_identifier(
            const.ANTIBIOTIC_OXACILLIN, const.SPECIES_SAUREUS, const.METHOD_MLP, const.US_NO, const.BINNING_6K, const.DATASET_DRIAMSB)
    test_aucroc = Utility.load_losses('test_aucroc'+load_name +
                            str(randseeds[seed_num]))
    training_aucroc = Utility.load_losses('training_aucroc'+load_name +
                        str(randseeds[seed_num]))
    # str() is required because the seed may not be a string.
    plot = sns.lineplot(x=range(len(
        test_aucroc)), y=test_aucroc, label='mlp_test'+'_'+str(randseeds[seed_num]))
    plot = sns.lineplot(x=range(len(
        training_aucroc)), y=training_aucroc, label='mlp_train'+str(randseeds[seed_num]))
    plot.set(xlabel="Epoch", ylabel="AUC ROC")
    plt.ylim([0.45,1.05])
    plt.show()