In [1]:
import gc
import glob
import numpy as np
import os
import pandas as pd
import pickle
import random
import time
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import log_loss, balanced_accuracy_score, roc_auc_score, roc_curve
from sklearn.linear_model import Ridge

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import xgboost as xgb

import shap

### Input

Input info

In [2]:
# root folder for all results: per-dataset sub-folders, metric CSVs and pickles are written below it,
# and the pre-computed single-dataset results are read from '<output_folder>/_individual/'
output_folder = 'compareclassifiers_nohyperopt'
# each inner list is one combination of datasets to stack; its names are joined with '+' to label the
# combination, and each name is loaded from '_datasets/<name>.pickle'
datasets = [['clinical','gene_all','mutation_onehot_all']]

Calculate SHAP values and interactions

In [3]:
# one flag per entry of `datasets`: the pipeline indexes both lists with the dataset-combination index
compute_shap_values = [False]
compute_shap_interactions = [False]

Data splits

In [4]:
# number of training+validation/test splits iterated by the pipeline (one 'iter_<n>.pickle' is read per split)
n_splits_trainvalidation_test = 20
# number of stratified folds used when evaluating the stacker
k_train_validation = 5
# fraction held out for early stopping (passed as test_size to StratifiedShuffleSplit)
early_stopping_size = 0.125

XGBoost stacking

In [5]:
# number of HyperOpt evaluations (passed as max_evals to fmin)
n_hyperopt_iterations = 1

Seed

In [6]:
# single seed reused for Python's random module, NumPy, the scikit-learn splitters, XGBoost and HyperOpt
seed_ = 1

# implement seed
random.seed(seed_)
np.random.seed(seed_)

### HyperOpt Functions

In [7]:
def hyperopt_function(parameters):
    """HyperOpt objective: score one parameter sample on the cached cross-validation folds.

    The folds are read back from '_files/data__.pickle' (written by the pipeline cell) on every call.

    parameters: the sampled HyperOpt search-space point, forwarded to hyperopt_performance
    returns: dict with the 'loss' to minimise and 'status' set to STATUS_OK
    """

    # read the six per-fold lists cached on disk
    with open('_files/data__.pickle', 'rb') as handle:
        cached_folds = pickle.load(handle, encoding='latin1')
    (X_training_train, y_training_train,
     X_earlystopping_train, y_earlystopping_train,
     X_validation, y_validation) = cached_folds

    # score this parameter sample
    loss = hyperopt_performance(
        X_training_train, y_training_train,
        X_earlystopping_train, y_earlystopping_train,
        X_validation, y_validation,
        parameters,
    )

    # result in the format expected by fmin
    return {'loss': loss, 'status': STATUS_OK}

In [8]:
def hyperopt_performance(X_training_train, y_training_train, X_earlystopping_train, y_earlystopping_train, X_validation, y_validation, parameters):
    """Cross-validated loss of the XGBoost stacker.

    For each of the k_train_validation folds a multi-class booster is trained on the training part,
    early-stopped on the early-stopping part and scored with log_loss on the validation part.

    Every X_*/y_* argument is a list indexed by fold number.
    parameters: accepted for the HyperOpt interface; not used by the current implementation.
    returns: mean fold loss plus its standard error (std / sqrt(number of folds)).

    Reads the module-level names `datasets`, `a` (current dataset-combination index),
    `k_train_validation` and `seed_`.
    """

    fold_losses = []

    for fold in range(k_train_validation):

        # wrap the three parts of this fold for xgboost
        dtrain = xgb.DMatrix(X_training_train[fold], label=y_training_train[fold])
        dstop = xgb.DMatrix(X_earlystopping_train[fold], label=y_earlystopping_train[fold])
        dvalid = xgb.DMatrix(X_validation[fold], label=y_validation[fold])

        # one class per stacked dataset
        n_classes = len(datasets[a])
        booster_params = {
            'objective': 'multi:softprob',
            'num_class': n_classes,
            'eval_metric': 'mlogloss',
            'seed': seed_,
        }
        watchlist = [(dtrain, 'train'), (dstop, 'eval')]

        # fit with early stopping on the last entry of the watchlist
        booster = xgb.train(booster_params, dtrain, num_boost_round=10000, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)

        # score the held-out validation part
        fold_probabilities = booster.predict(dvalid, ntree_limit=booster.best_ntree_limit)
        fold_losses.append(log_loss(y_validation[fold], fold_probabilities, labels=list(range(n_classes))))

    # mean loss penalised by its standard error across folds
    standard_error = np.std(fold_losses)/np.sqrt(len(fold_losses))
    return np.mean(fold_losses) + standard_error

### SHAP functions

In [9]:
def calculate_shap_values(explainer, X_test, tree_limit):
    """SHAP values for X_test, with the columns of each categorical feature summed into one column.

    explainer: object exposing shap_values(X, tree_limit=...)
    X_test: samples to explain (needs .shape)
    tree_limit: forwarded to the explainer
    returns: the raw explainer output when no categorical conversion is registered, otherwise an
             array of shape (n_samples, len(merged_features[c]))

    Reads the module-level names `categorical_conversion`, `features`, `merged_features` and `c`
    (index of the dataset currently being processed).
    """

    raw_values = explainer.shap_values(X_test, tree_limit=tree_limit)

    # nothing to merge
    if len(categorical_conversion) == 0:
        return raw_values

    expanded_names = features[c]
    merged_names = merged_features[c]
    merged_values = np.zeros((X_test.shape[0], len(merged_names)))

    for column, name in enumerate(merged_names):
        if name in categorical_conversion:
            # expanded columns are named '<feature> | <level>': sum all levels of this feature
            members = [j for j, expanded in enumerate(expanded_names) if expanded.split(' | ')[0] == name]
            merged_values[:, column] = raw_values[:, members].sum(axis=1)
        else:
            # plain feature: copy its single column
            merged_values[:, column] = raw_values[:, expanded_names.index(name)]

    return merged_values

In [10]:
def calculate_shap_interactions(explainer, X_test, tree_limit):
    """SHAP interaction matrix of the first sample in X_test, merged over categorical features.

    explainer: object exposing shap_interaction_values(X, tree_limit=...)
    X_test: samples to explain; only element [0] of the explainer output is used
    tree_limit: forwarded to the explainer
    returns: the raw matrix when no categorical conversion is registered, otherwise a
             (len(merged_features[c]), len(merged_features[c])) array

    Reads the module-level names `categorical_conversion`, `features`, `merged_features` and `c`
    (index of the dataset currently being processed).
    """

    raw = explainer.shap_interaction_values(X_test, tree_limit=tree_limit)[0]

    # nothing to merge
    if len(categorical_conversion) == 0:
        return raw

    expanded_names = features[c]
    merged_names = merged_features[c]

    # for every merged feature: a list of expanded positions (categorical) or one position (plain)
    groups = []
    for name in merged_names:
        if name in categorical_conversion:
            groups.append([j for j, expanded in enumerate(expanded_names) if expanded.split(' | ')[0] == name])
        else:
            groups.append(expanded_names.index(name))

    # first pass: collapse rows
    rows_merged = np.zeros((len(merged_names), len(expanded_names)))
    for position, members in enumerate(groups):
        if isinstance(members, list):
            rows_merged[position, :] = raw[members, :].sum(axis=0)
        else:
            rows_merged[position, :] = raw[members, :]

    # second pass: collapse columns
    fully_merged = np.zeros((len(merged_names), len(merged_names)))
    for position, members in enumerate(groups):
        if isinstance(members, list):
            fully_merged[:, position] = rows_merged[:, members].sum(axis=1)
        else:
            fully_merged[:, position] = rows_merged[:, members]

    return fully_merged

### Custom functions

In [11]:
def dummy_y(y):
    """One-hot encode a binary label vector.

    y: positionally indexable sequence of labels
    returns: integer array of shape (len(y), 2); column 0 is 1 where the label equals 0,
             column 1 is 1 for every other label
    """

    is_negative = [y[position] == 0 for position in range(len(y))]
    negative_column = [1 if flag else 0 for flag in is_negative]
    positive_column = [0 if flag else 1 for flag in is_negative]
    return np.array([negative_column, positive_column]).T

### Create output folders and files

In [12]:
# dataset names: one label and one output sub-folder per dataset combination
dataset_names = []
for a in range(len(datasets)):
    dataset_names.append('+'.join(datasets[a]))
    # exist_ok=True keeps the cell re-runnable (os.mkdir raised FileExistsError on the second run)
    os.makedirs('%s/%s' % (output_folder, dataset_names[a]), exist_ok=True)


def _new_performance_file(metric_name):
    """Create an empty performance table and write it to '<output_folder>/<metric_name>.csv'.

    metric_name: file stem of the CSV
    returns: DataFrame with one row per split ('split_1'..'split_N') plus 'MEAN' and 'STERR',
             and one column per dataset combination
    """
    split_rows = ['split_%d' % x for x in range(1,n_splits_trainvalidation_test+1)]
    frame = pd.DataFrame(index=split_rows+['MEAN','STERR'], columns=dataset_names)
    frame.to_csv('%s/%s.csv' % (output_folder, metric_name))
    return frame


# performance files (same names and same creation order as before)
performance_files_weightedlogloss = _new_performance_file('weightedlogloss')
performance_files_balancedaccuracy = _new_performance_file('balancedaccuracy')
performance_files_auroc = _new_performance_file('auroc')

# metrics at the fixed 0.5 threshold
performance_files_sensitivity_50 = _new_performance_file('sensitivity_50')
performance_files_specificity_50 = _new_performance_file('specificity_50')
performance_files_ppv_50 = _new_performance_file('ppv_50')
performance_files_npv_50 = _new_performance_file('npv_50')

# metrics at the per-split optimal threshold
performance_files_optimal_threshold = _new_performance_file('optimal_threshold')
performance_files_sensitivity_optimal = _new_performance_file('sensitivity_optimal')
performance_files_specificity_optimal = _new_performance_file('specificity_optimal')
performance_files_ppv_optimal = _new_performance_file('ppv_optimal')
performance_files_npv_optimal = _new_performance_file('npv_optimal')

### Pipeline

In [13]:
# NOTE: this cell has to run at module level (not inside a function): hyperopt_performance reads the loop
# variable `a`, and the SHAP helpers read `c`, `features`, `merged_features` and `categorical_conversion`
# as globals.
# NOTE(security): pickle.load can execute arbitrary code - only load files produced by this project.


def _record_performance(frame, metric_name, split_number, column, value):
    """Store one split's metric, refresh MEAN/STERR and rewrite '<output_folder>/<metric_name>.csv'.

    frame: performance table with rows 'split_1'..'split_N', 'MEAN' and 'STERR'
    metric_name: file stem of the CSV belonging to `frame`
    split_number: 1-based number of the split that produced `value`
    column: dataset-combination column to update
    value: metric value (NaN when the metric is undefined for this split)
    """
    split_rows = ['split_%d' % x for x in range(1, n_splits_trainvalidation_test+1)]
    frame.at['split_%d' % split_number, column] = value
    observed = np.array(frame.loc[split_rows, column].values.tolist(), dtype=float)
    # only splits that already have a defined value contribute to the summary rows
    n_observed = int(np.count_nonzero(~np.isnan(observed)))
    if n_observed > 0:
        frame.at['MEAN', column] = np.nanmean(observed)
        frame.at['STERR', column] = np.nanstd(observed)/np.sqrt(n_observed)
    else:
        frame.at['MEAN', column] = np.nan
        frame.at['STERR', column] = np.nan
    frame.to_csv('%s/%s.csv' % (output_folder, metric_name))


def _confusion_counts(y_true, y_score, threshold):
    """Return (tp, fp, tn, fn) for binary labels, calling a sample positive when score >= threshold.

    The >= rule matches roc_curve and the balanced-accuracy rule used below, so a sample whose score
    equals the threshold is always counted (strict > / < dropped it from all four counts).
    """
    y_true = np.asarray(y_true)
    predicted_positive = np.asarray(y_score) >= threshold
    tp = int(np.sum((y_true == 1) & predicted_positive))
    fp = int(np.sum((y_true == 0) & predicted_positive))
    tn = int(np.sum((y_true == 0) & ~predicted_positive))
    fn = int(np.sum((y_true == 1) & ~predicted_positive))
    return tp, fp, tn, fn


def _safe_ratio(numerator, denominator):
    """numerator/denominator, or NaN when the denominator is zero (metric undefined)."""
    return numerator/denominator if denominator > 0 else np.nan


def _best_classifier(y_true, predictions):
    """Per sample, index of the base classifier whose prediction is closest to the true label.

    y_true: binary labels; samples whose label is neither 0 nor 1 are skipped
    predictions: array of shape (n_samples, n_classifiers)
    """
    best = []
    for i in range(len(y_true)):
        if y_true[i] == 0:
            best.append(np.argmin(predictions[i,:]))
        elif y_true[i] == 1:
            best.append(np.argmax(predictions[i,:]))
    return np.array(best)


# iterate over datasets
for a in range(len(datasets)):
    print('-------------------------')
    print('DATASET: %s' % dataset_names[a])
    print('-------------------------')

    # load feature names and categorical conversions of each dataset
    # (depends only on the dataset combination, so it is done once instead of once per split)
    features = []
    merged_features = []
    categorical_conversion = {}
    for c in range(len(datasets[a])):
        with open('_datasets/%s.pickle' % datasets[a][c], 'rb') as f:
            X_matrix_, y_vector_, categorical_conversion_old = pickle.load(f, encoding='latin1')
        # prefix every name with its dataset so names stay unique after concatenation
        features.append(['%s # %s' % (datasets[a][c], x) for x in X_matrix_.columns.tolist()])
        categorical_conversion_ = {}
        for key in categorical_conversion_old:
            categorical_conversion_['%s # %s' % (datasets[a][c], key)] = categorical_conversion_old[key]
        categorical_conversion = {**categorical_conversion, **categorical_conversion_}
        if len(categorical_conversion_) > 0:
            # collapse '<feature> | <level>' columns of categorical features into one '<feature>' entry
            merged_features.append([])
            for feature in features[c]:
                base_feature = feature.split(' | ')[0]
                if base_feature not in categorical_conversion_:
                    merged_features[-1].append(feature)
                elif base_feature not in merged_features[-1]:
                    merged_features[-1].append(base_feature)
        else:
            merged_features.append(features[c].copy())

    # iterate over number of training+validation/testing splits
    for b in range(n_splits_trainvalidation_test):

        print('Split %d' % (b+1))

        # load results from individual datasets
        validation_X = []
        validation_predictions = []
        X_test = []
        y_pred = []
        explainers_independent = []
        explainers_dependent = []
        tree_limit = []
        for c in range(len(datasets[a])):
            with open('%s/_individual/%s/iter_%d.pickle' % (output_folder, datasets[a][c], b+1), 'rb') as f:
                validation_X_, validation_y, validation_predictions_, X_test_, y_test, y_pred_, explainer_independent, explainer_dependent, tree_limit_ = pickle.load(f)
            validation_X.append(validation_X_)
            validation_predictions.append(validation_predictions_)
            X_test.append(X_test_)
            y_pred.append(y_pred_)
            explainers_independent.append(explainer_independent)
            explainers_dependent.append(explainer_dependent)
            tree_limit.append(tree_limit_)

        # combine predictions: one column per base classifier
        validation_predictions = np.concatenate([x.reshape(-1,1) for x in validation_predictions], axis=1)
        test_predictions = np.concatenate([x.reshape(-1,1) for x in y_pred], axis=1)

        # if more than one dataset
        if len(datasets[a]) > 1:

            # validation best classifier (target of the stacker)
            validation_best_classifier = _best_classifier(validation_y, validation_predictions)

            # subset features that are in any of the models (non-zero mean SHAP value on validation)
            features_in_models = []
            for c in range(len(explainers_independent)):
                shapval = explainers_independent[c].shap_values(validation_X[c], tree_limit=tree_limit[c]).mean(axis=0)
                used_features = [features[c][i] for i in range(len(features[c])) if shapval[i] != 0]
                features_in_models.extend(used_features)
                n_best = len([x for x in validation_best_classifier if x==c])
                print('%s: %d/%d - %0.2f%% - %d features' % (datasets[a][c], n_best, len(validation_best_classifier), n_best/len(validation_best_classifier)*100, len(used_features)))

            # get combined dataset with features in models
            validation_X_all = pd.concat(validation_X, axis=1)[features_in_models]
            X_test_all = pd.concat(X_test, axis=1)[features_in_models]

            # separate training_full from earlystopping_full
            sss = StratifiedShuffleSplit(n_splits=1, test_size=early_stopping_size, random_state=seed_)
            training_index, earlystopping_index = next(sss.split(validation_X_all, validation_best_classifier))
            training_full_X = validation_X_all.iloc[training_index]
            earlystopping_full_X = validation_X_all.iloc[earlystopping_index]
            training_full_y = validation_best_classifier[training_index]
            earlystopping_full_y = validation_best_classifier[earlystopping_index]

            # separate full into k folds (sep1 = training side, sep2 = validation side)
            sep1_X = []
            sep2_X = []
            sep1_y = []
            sep2_y = []
            skf = StratifiedKFold(n_splits=k_train_validation, shuffle=True, random_state=seed_)
            for sep1_, sep2_ in skf.split(validation_X_all, validation_best_classifier):
                sep1_X.append(validation_X_all.iloc[sep1_])
                sep2_X.append(validation_X_all.iloc[sep2_])
                sep1_y.append(validation_best_classifier[sep1_])
                sep2_y.append(validation_best_classifier[sep2_])

            # separate training_sep1 from earlystopping_sep1
            training_sep1_X = []
            earlystopping_sep1_X = []
            training_sep1_y = []
            earlystopping_sep1_y = []
            for fold in range(k_train_validation):
                sss = StratifiedShuffleSplit(n_splits=1, test_size=early_stopping_size, random_state=seed_)
                training_index, earlystopping_index = next(sss.split(sep1_X[fold], sep1_y[fold]))
                training_sep1_X.append(sep1_X[fold].iloc[training_index])
                earlystopping_sep1_X.append(sep1_X[fold].iloc[earlystopping_index])
                training_sep1_y.append(sep1_y[fold][training_index])
                earlystopping_sep1_y.append(sep1_y[fold][earlystopping_index])

            # xgb parameter values (placeholder search space: no real hyperparameter is tuned)
            parameters = {
                'dummy': hp.uniform('dummy', 0, 1),
                         }

            # save info for hyperopt (read back by hyperopt_function)
            with open('_files/data__.pickle','wb') as f:
                pickle.dump([training_sep1_X, training_sep1_y, earlystopping_sep1_X, earlystopping_sep1_y, sep2_X, sep2_y], f)

            # hyperopt to find best parameters
            trials = Trials()
            best = fmin(hyperopt_function, parameters, algo=tpe.suggest, max_evals=n_hyperopt_iterations, trials=trials, rstate=np.random.RandomState(seed_), verbose=0, show_progressbar=True)

            # create classifier using best parameters
            xgb_training = xgb.DMatrix(training_full_X, label=training_full_y)
            xgb_earlystopping = xgb.DMatrix(earlystopping_full_X, label=earlystopping_full_y)
            xgb_test = xgb.DMatrix(X_test_all)

            param = {'objective':'multi:softprob', 'num_class':len(datasets[a]), 'eval_metric':'mlogloss', 'seed':seed_}
            evallist = [(xgb_training, 'train'), (xgb_earlystopping, 'eval')]

            # train stacker
            bst = xgb.train(param, xgb_training, num_boost_round=10000, evals=evallist, early_stopping_rounds=10, verbose_eval=False)

            # get weights on testing set (one weight per base classifier and sample)
            weights = bst.predict(xgb_test, ntree_limit=bst.best_ntree_limit)

            # best classifier on the test set (stored for later analysis of the stacker)
            test_best_classifier = _best_classifier(y_test, test_predictions)

            # save stacker predictions
            with open('%s/%s/stacker_%d.pickle' % (output_folder,dataset_names[a],b+1) ,'wb') as f:
                pickle.dump([X_test, X_test_all, y_test, y_pred, test_predictions, weights, test_best_classifier, bst], f)

            # get predictions on test set: weighted average of the base classifiers
            y_pred = []
            for i in range(len(y_test)):
                y_pred.append(np.average(test_predictions[i,:], weights=weights[i,:]))
            y_pred = np.array(y_pred)

        # if only one dataset
        else:
            y_pred = y_pred[0]
            weights = np.array([1 for x in y_test]).reshape(-1,1)

        # save predictions
        with open('%s/%s/predictions_%d.pickle' % (output_folder,dataset_names[a],b+1) ,'wb') as f:
            pickle.dump([X_test[0].index.tolist(), y_test, y_pred], f)

        split_number = b+1
        column = dataset_names[a]

        # calculate test performance - weighted log loss (positives re-weighted to balance the classes)
        pos_weight = len([x for x in y_test if x==0])/len([x for x in y_test if x==1])
        sample_weights = [pos_weight if x==1 else 1 for x in y_test]
        performance = log_loss(y_test, y_pred, sample_weight=sample_weights)
        _record_performance(performance_files_weightedlogloss, 'weightedlogloss', split_number, column, performance)

        # calculate test performance - balanced accuracy
        y_pred_ = [1 if x>=0.5 else 0 for x in y_pred]
        performance = balanced_accuracy_score(y_test, y_pred_)
        _record_performance(performance_files_balancedaccuracy, 'balancedaccuracy', split_number, column, performance)

        # calculate test performance - auroc
        y_pred_ = np.concatenate((np.array([1-x for x in y_pred]).reshape(-1,1), y_pred.reshape(-1,1)), axis=1)
        performance = roc_auc_score(dummy_y(y_test), y_pred_)
        _record_performance(performance_files_auroc, 'auroc', split_number, column, performance)

        # tp, tn, fp, fn at the fixed 0.5 threshold
        tp, fp, tn, fn = _confusion_counts(y_test, y_pred, 0.5)

        # calculate test performance - sensitivity, specificity, ppv, npv - 50
        _record_performance(performance_files_sensitivity_50, 'sensitivity_50', split_number, column, _safe_ratio(tp, tp+fn))
        _record_performance(performance_files_specificity_50, 'specificity_50', split_number, column, _safe_ratio(tn, tn+fp))
        _record_performance(performance_files_ppv_50, 'ppv_50', split_number, column, _safe_ratio(tp, tp+fp))
        _record_performance(performance_files_npv_50, 'npv_50', split_number, column, _safe_ratio(tn, tn+fn))

        # optimal threshold: maximise specificity + sensitivity, ties broken by closeness to 0.5
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        youden = (1-fpr)+tpr
        top_threshold = thresholds[youden == np.max(youden)]
        optimal_threshold = top_threshold[np.argmin(np.abs(top_threshold-0.5))]
        _record_performance(performance_files_optimal_threshold, 'optimal_threshold', split_number, column, optimal_threshold)

        # tp, tn, fp, fn at the optimal threshold
        tp, fp, tn, fn = _confusion_counts(y_test, y_pred, optimal_threshold)

        # calculate test performance - sensitivity, specificity, ppv, npv - optimal
        _record_performance(performance_files_sensitivity_optimal, 'sensitivity_optimal', split_number, column, _safe_ratio(tp, tp+fn))
        _record_performance(performance_files_specificity_optimal, 'specificity_optimal', split_number, column, _safe_ratio(tn, tn+fp))
        _record_performance(performance_files_ppv_optimal, 'ppv_optimal', split_number, column, _safe_ratio(tp, tp+fp))
        _record_performance(performance_files_npv_optimal, 'npv_optimal', split_number, column, _safe_ratio(tn, tn+fn))

        # calculate shap values
        if compute_shap_values[a]:

            # initialize shap values
            shap_values = []

            # iterate over datasets (`c` must stay the module-level loop variable: calculate_shap_values reads it)
            for c in range(len(datasets[a])):

                # compute shap values
                shapval = calculate_shap_values(explainers_independent[c], X_test[c], tree_limit[c])

                # weight classifier shap values for each sample
                for i in range(len(y_test)):
                    shapval[i,:] *= weights[i,c]
                shap_values.append(pd.DataFrame(data=shapval, index=X_test[c].index.tolist(), columns=merged_features[c]))

            # merge multi-model shap values
            if len(shap_values) > 1:
                shap_values = pd.concat(shap_values, axis=1, sort=False)
            else:
                shap_values = shap_values[0]

            # expected values
            expected = []
            for c in range(len(datasets[a])):
                expected.append(explainers_independent[c].expected_value)
            expected = np.array(expected)

            # individual expected values: same per-sample weighting as the predictions
            shap_expected = []
            for i in range(len(y_test)):
                shap_expected.append(np.average(expected, weights=weights[i,:]))

            # save results
            with open('%s/%s/shap_values_%d.pickle' % (output_folder,dataset_names[a],b+1) ,'wb') as f:
                pickle.dump([shap_expected, shap_values], f)

        # calculate shap interactions
        if compute_shap_interactions[a]:

            # iterate over samples
            for i in tqdm(range(len(y_test))):

                # initialize sample results
                sample_shap = []

                # iterate over datasets (`c` must stay the module-level loop variable: calculate_shap_interactions reads it)
                for c in range(len(datasets[a])):

                    # calculate shap interaction values
                    sample_shap.append(pd.DataFrame(data=weights[i,c] * calculate_shap_interactions(explainers_dependent[c], X_test[c].iloc[i:i+1,:], tree_limit[c]), index=merged_features[c], columns=merged_features[c]))

                # merge multi-class shap values (cross-dataset entries have no value and become 0)
                if len(sample_shap) > 1:
                    sample_shap_ = pd.concat(sample_shap, axis=1, sort=False).fillna(0)
                else:
                    sample_shap_ = sample_shap[0]

                # add to overall array
                if i==0:
                    shap_interaction_values = sample_shap_.copy()
                else:
                    shap_interaction_values += sample_shap_

            # divide by total number of samples
            shap_interaction_values /= len(y_test)

            # save results
            with open('%s/%s/shap_interactions_%d.pickle' % (output_folder,dataset_names[a],b+1) ,'wb') as f:
                pickle.dump(shap_interaction_values, f)

-------------------------
DATASET: clinical+gene_all+mutation_onehot_all
-------------------------
Split 1
clinical: 531/732 - 72.54% - 102 features




gene_all: 125/732 - 17.08% - 113 features




mutation_onehot_all: 76/732 - 10.38% - 144 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.25s/it, best loss: 0.8035304097419645]
Split 2
clinical: 404/732 - 55.19% - 114 features




gene_all: 260/732 - 35.52% - 194 features




mutation_onehot_all: 68/732 - 9.29% - 157 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.53s/it, best loss: 0.9490950869450593]
Split 3
clinical: 408/732 - 55.74% - 102 features




gene_all: 245/732 - 33.47% - 218 features




mutation_onehot_all: 79/732 - 10.79% - 185 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.08s/it, best loss: 0.8835333376975378]
Split 4
clinical: 426/732 - 58.20% - 57 features




gene_all: 225/732 - 30.74% - 152 features




mutation_onehot_all: 81/732 - 11.07% - 167 features
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.97s/it, best loss: 0.934064617839539]
Split 5
clinical: 388/732 - 53.01% - 93 features




gene_all: 266/732 - 36.34% - 226 features




mutation_onehot_all: 78/732 - 10.66% - 202 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.98s/it, best loss: 0.9817525144870795]
Split 6
clinical: 416/732 - 56.83% - 44 features




gene_all: 229/732 - 31.28% - 62 features




mutation_onehot_all: 87/732 - 11.89% - 207 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.62s/it, best loss: 0.9193076449849766]
Split 7
clinical: 366/732 - 50.00% - 96 features




gene_all: 272/732 - 37.16% - 78 features




mutation_onehot_all: 94/732 - 12.84% - 157 features
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.64s/it, best loss: 0.952499090121614]
Split 8
clinical: 445/732 - 60.79% - 102 features




gene_all: 199/732 - 27.19% - 192 features




mutation_onehot_all: 88/732 - 12.02% - 193 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.97s/it, best loss: 0.8992315690825557]
Split 9
clinical: 418/732 - 57.10% - 90 features




gene_all: 227/732 - 31.01% - 144 features




mutation_onehot_all: 87/732 - 11.89% - 161 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.86s/it, best loss: 0.9427611185602542]
Split 10
clinical: 418/732 - 57.10% - 90 features




gene_all: 226/732 - 30.87% - 121 features




mutation_onehot_all: 88/732 - 12.02% - 66 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.25s/it, best loss: 0.9356425442699213]
Split 11
clinical: 527/732 - 71.99% - 75 features




gene_all: 127/732 - 17.35% - 101 features




mutation_onehot_all: 78/732 - 10.66% - 181 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.58s/it, best loss: 0.7684505244893272]
Split 12
clinical: 482/732 - 65.85% - 56 features




gene_all: 192/732 - 26.23% - 152 features




mutation_onehot_all: 58/732 - 7.92% - 203 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.66s/it, best loss: 0.8421103823644976]
Split 13
clinical: 507/732 - 69.26% - 75 features




gene_all: 154/732 - 21.04% - 121 features




mutation_onehot_all: 71/732 - 9.70% - 156 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.86s/it, best loss: 0.8462376425946561]
Split 14
clinical: 426/732 - 58.20% - 101 features




gene_all: 212/732 - 28.96% - 177 features




mutation_onehot_all: 94/732 - 12.84% - 61 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.94s/it, best loss: 0.9339344447072885]
Split 15
clinical: 459/732 - 62.70% - 82 features




gene_all: 193/732 - 26.37% - 178 features




mutation_onehot_all: 80/732 - 10.93% - 194 features
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.47s/it, best loss: 0.869283418098456]
Split 16
clinical: 453/732 - 61.89% - 79 features




gene_all: 168/732 - 22.95% - 75 features




mutation_onehot_all: 111/732 - 15.16% - 147 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.61s/it, best loss: 0.9236980299661167]
Split 17
clinical: 435/732 - 59.43% - 85 features




gene_all: 210/732 - 28.69% - 217 features




mutation_onehot_all: 87/732 - 11.89% - 169 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.09s/it, best loss: 0.9395864983414468]
Split 18
clinical: 392/732 - 53.55% - 71 features




gene_all: 255/732 - 34.84% - 106 features




mutation_onehot_all: 85/732 - 11.61% - 145 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.14s/it, best loss: 0.9533723914124774]
Split 19
clinical: 416/732 - 56.83% - 70 features




gene_all: 197/732 - 26.91% - 119 features




mutation_onehot_all: 119/732 - 16.26% - 102 features
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.67s/it, best loss: 0.998454784801841]
Split 20
clinical: 451/732 - 61.61% - 62 features




gene_all: 194/732 - 26.50% - 133 features




mutation_onehot_all: 87/732 - 11.89% - 100 features
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.84s/it, best loss: 0.8377058419848645]
