In [1]:
import gc
import glob
import numpy as np
import os
import pandas as pd
import pickle
import random
import time
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import log_loss, balanced_accuracy_score, roc_auc_score, roc_curve

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

import shap

### Input

Output folder name

In [2]:
# root folder that receives every result of this notebook
output_folder = 'compareclassifiers_randomforest'

# refuse to overwrite the results of a previous run; otherwise create the
# root folder together with the sub-folder holding per-dataset results
if not os.path.isdir(output_folder):
    os.mkdir(output_folder)
    os.mkdir('%s/_individual' % output_folder)
else:
    raise Exception('Already run!')

Datasets

In [3]:
# names of the datasets to evaluate; each one is read from '_datasets/<name>.pickle'
datasets = [
    'clinical',
    'gene_all',
    'mutation_onehot_all',
]

Hyperopt parameters

In [4]:
# number of hyperopt evaluations (passed to fmin as max_evals) per outer split
n_hyperopt_iterations = 2 ** 8

Data splits

In [5]:
# outer loop: number of stratified shuffle splits into train+validation / test
n_splits_trainvalidation_test = 20
# fraction of samples held out as test set in each outer split
test_size = 0.2
# inner loop: number of stratified folds used to split train from validation
k_train_validation = 5

Seed

In [6]:
# single seed shared by the splitters, the classifier and hyperopt
seed_ = 1

# seed the global NumPy and standard-library generators
np.random.seed(seed_)
random.seed(seed_)

# fixed identifier used as suffix of the scratch pickles written to '_files'
timestamp = 1912131135

### HyperOpt Functions

In [7]:
def hyperopt_function(parameters):
    """Hyperopt objective: score one candidate parameter set.

    The training/validation folds are read from the scratch pickle
    '_files/data_<timestamp>.pickle' and scored with
    ``hyperopt_performance``. When the resulting loss is lower than the
    value stored in '_files/validation_<timestamp>.pickle', that file is
    overwritten with the new loss and the validation predictions are written
    to '_files/validation_xgb_<timestamp>.pickle'.

    Parameters
    ----------
    parameters : dict
        Candidate classifier parameters, forwarded to ``hyperopt_performance``.

    Returns
    -------
    dict
        ``{'loss': <validation loss>, 'status': STATUS_OK}``.
    """
    data_path = '_files/data_%s.pickle' % timestamp
    best_loss_path = '_files/validation_%s.pickle' % timestamp
    best_pred_path = '_files/validation_xgb_%s.pickle' % timestamp

    # fetch the cached folds
    with open(data_path, 'rb') as handle:
        X_train, y_train, X_validation, y_validation = pickle.load(handle, encoding='latin1')

    # score the candidate, then release memory held by the fitted forests
    candidate_loss, candidate_pred = hyperopt_performance(X_train, y_train, X_validation, y_validation, parameters)
    gc.collect()

    # loss of the best candidate seen so far
    with open(best_loss_path, 'rb') as handle:
        loss_to_beat = pickle.load(handle)

    # a strictly better candidate replaces the stored loss and predictions
    if candidate_loss < loss_to_beat:
        with open(best_loss_path, 'wb') as handle:
            pickle.dump(candidate_loss, handle)
        with open(best_pred_path, 'wb') as handle:
            pickle.dump(candidate_pred, handle)

    return {'loss': candidate_loss, 'status': STATUS_OK}

In [8]:
def hyperopt_performance(X_train, y_train, X_validation, y_validation, parameters):
    """Fit and score a random forest on each training/validation fold.

    Parameters
    ----------
    X_train, y_train, X_validation, y_validation : sequences
        Indexed by fold number; the first ``k_train_validation`` entries of
        each are used.
    parameters : dict
        Keyword arguments for ``RandomForestClassifier``. ``class_weight``
        and ``random_state`` are set here and ``n_estimators`` is cast to int.

    Returns
    -------
    tuple
        ``(score, predictions)``: ``score`` is the mean class-weighted log
        loss over the folds plus its standard error, ``predictions`` is the
        concatenation of the per-fold ``predict_proba`` outputs.
    """
    fold_losses = []
    fold_predictions = []

    for fold in range(k_train_validation):

        # negatives per positive in the training fold -> weight of class 1
        train_negatives = sum(1 for label in y_train[fold] if label == 0)
        train_positives = sum(1 for label in y_train[fold] if label == 1)
        train_pos_weight = train_negatives / train_positives

        # candidate parameters plus the fixed settings of this run
        forest_kwargs = dict(parameters)
        forest_kwargs['class_weight'] = {0: 1, 1: train_pos_weight}
        forest_kwargs['random_state'] = seed_
        forest_kwargs['n_estimators'] = int(forest_kwargs['n_estimators'])

        # fit on the training fold
        forest = RandomForestClassifier(**forest_kwargs)
        forest = forest.fit(X_train[fold], y_train[fold])

        # predict the validation fold
        fold_proba = forest.predict_proba(X_validation[fold])

        # weight positives by the negatives-per-positive ratio of the validation fold
        val_negatives = sum(1 for label in y_validation[fold] if label == 0)
        val_positives = sum(1 for label in y_validation[fold] if label == 1)
        val_pos_weight = val_negatives / val_positives
        val_weights = []
        for label in y_validation[fold]:
            val_weights.append(val_pos_weight if label == 1 else 1)

        fold_losses.append(log_loss(y_validation[fold], fold_proba, sample_weight=val_weights))
        fold_predictions.append(fold_proba)

    # mean loss plus its standard error across folds
    standard_error = np.std(fold_losses) / np.sqrt(len(fold_losses))
    score = np.mean(fold_losses) + standard_error
    return score, np.concatenate(fold_predictions)

### Custom functions

In [9]:
def dummy_y(y):
    """One-hot encode a binary label vector.

    Parameters
    ----------
    y : sequence
        Labels, read by position ``0 .. len(y) - 1``.

    Returns
    -------
    numpy.ndarray
        Array with one row per label and two columns: ``[1, 0]`` where the
        label equals 0 and ``[0, 1]`` for any other label.
    """
    # first column flags labels equal to 0, second column is its complement
    zero_flags = [1 if y[idx] == 0 else 0 for idx in range(len(y))]
    other_flags = [1 - flag for flag in zero_flags]
    return np.column_stack((zero_flags, other_flags))

### Output folders and files

In [10]:
# one result sub-folder per dataset
for dataset_name in datasets:
    os.mkdir('%s/_individual/%s' % (output_folder, dataset_name))

# all performance tables share one layout: a row per outer split followed by
# the summary rows, and a column per dataset
performance_rows = ['split_%d' % x for x in range(1,n_splits_trainvalidation_test+1)]
performance_rows = performance_rows + ['MEAN','STERR']

# weighted log loss
performance_files_weightedlogloss = pd.DataFrame(index=performance_rows, columns=datasets)
performance_files_weightedlogloss.to_csv('%s/_individual/weightedlogloss.csv' % output_folder)

# balanced accuracy
performance_files_balancedaccuracy = pd.DataFrame(index=performance_rows, columns=datasets)
performance_files_balancedaccuracy.to_csv('%s/_individual/balancedaccuracy.csv' % output_folder)

# area under the ROC curve
performance_files_auroc = pd.DataFrame(index=performance_rows, columns=datasets)
performance_files_auroc.to_csv('%s/_individual/auroc.csv' % output_folder)

In [11]:
#import warnings
#warnings.filterwarnings("ignore")

### Pipeline

In [12]:
# The hyperopt objective exchanges data with this loop through pickles in
# '_files'; create the folder so a fresh checkout can run the notebook.
os.makedirs('_files', exist_ok=True)


def _restore_empty_columns(matrix, empty_columns):
    """Re-insert all-zero columns at the given original column positions.

    Positions are processed in ascending order: once every smaller position
    has been filled, inserting at the original index puts the column back
    where it was. (Descending order misplaces columns as soon as more than
    one column has to be restored.)
    """
    for position in sorted(empty_columns):
        matrix = np.insert(matrix, position, 0., axis=1)
    return matrix


def _impute_and_scale(fit_frame, apply_frame):
    """Impute and standardise two feature frames with statistics of the first.

    Columns of ``fit_frame`` whose mean is NaN are assumed to be removed by
    ``SimpleImputer`` (its default behaviour for columns without any observed
    value) and are restored as zero columns so the column order is kept.

    Returns the transformed ``fit_frame`` and ``apply_frame`` as arrays.
    """
    empty_columns = [i for i, x in enumerate(fit_frame.mean()) if pd.isna(x)]
    imputer = SimpleImputer()
    fit_array = _restore_empty_columns(imputer.fit_transform(fit_frame), empty_columns)
    apply_array = _restore_empty_columns(imputer.transform(apply_frame), empty_columns)
    scaler = StandardScaler()
    fit_array = scaler.fit_transform(fit_array)
    apply_array = scaler.transform(apply_array)
    return fit_array, apply_array


def _record_performance(table, csv_name, dataset, split_number, value):
    """Store one test score, refresh the MEAN/STERR rows and rewrite the CSV.

    STERR is the (NaN-ignoring) standard deviation of the split scores
    divided by the square root of ``split_number``.
    """
    split_rows = ['split_%d' % x for x in range(1, n_splits_trainvalidation_test+1)]
    table.at['split_%d' % split_number, dataset] = value
    scores = table.loc[split_rows, dataset].values.tolist()
    table.at['MEAN', dataset] = np.nanmean(scores)
    table.at['STERR', dataset] = np.nanstd(scores)/np.sqrt(split_number)
    table.to_csv('%s/_individual/%s.csv' % (output_folder, csv_name))


# iterate over datasets
for a in range(len(datasets)):
    print('-------------------------')
    print('DATASET: %s' % datasets[a])
    print('-------------------------')
    
    # load dataset
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use dataset pickles from a trusted source.
    with open('_datasets/%s.pickle' % datasets[a], 'rb') as f:
        X_matrix, y_vector, categorical_conversion = pickle.load(f, encoding='latin1')
    # prefix every feature name with the dataset it comes from
    X_matrix.columns = ['%s # %s' % (datasets[a], feature) for feature in X_matrix.columns.tolist()]

    # divide train+validation from testing
    trainvalidation_index = []
    test_index = []
    sss = StratifiedShuffleSplit(n_splits=n_splits_trainvalidation_test, test_size=test_size, random_state=seed_)
    for trainvalidation_, test_ in sss.split(X_matrix, y_vector):
        trainvalidation_index.append(list(trainvalidation_))
        test_index.append(list(test_))

    # iterate over number of training+validation/testing splits
    for b in range(n_splits_trainvalidation_test):
        print('Split %d' % (b+1))
        
        # separate train+validation and testing
        X_trainvalidation = X_matrix.iloc[trainvalidation_index[b],]
        X_test = X_matrix.iloc[test_index[b],]
        y_trainvalidation = y_vector[trainvalidation_index[b]]
        y_test = y_vector[test_index[b]]
        
        # divide train from validation
        train_index = []
        validation_index = []
        skf = StratifiedKFold(n_splits=k_train_validation, shuffle=True, random_state=seed_)
        for train_, validation_ in skf.split(X_trainvalidation, y_trainvalidation):
            train_index.append(list(train_))
            validation_index.append(list(validation_))

        # separate train and validation
        X_train = []
        X_validation = []
        y_train = []
        y_validation = []
        for c in range(k_train_validation):
            X_train.append(X_trainvalidation.iloc[train_index[c],])
            X_validation.append(X_trainvalidation.iloc[validation_index[c],])
            y_train.append(y_trainvalidation[train_index[c]])
            y_validation.append(y_trainvalidation[validation_index[c]])
            
        # impute + scale train+validation/testing (statistics from train+validation only);
        # the untransformed frames are kept under the trailing-underscore names
        X_trainvalidation_ = X_trainvalidation.copy()
        X_test_ = X_test.copy()
        X_trainvalidation, X_test = _impute_and_scale(X_trainvalidation, X_test)
        
        # impute + scale train/validation per fold (statistics from the training fold only);
        # the list copies keep references to the untransformed frames
        X_train_ = X_train.copy()
        X_validation_ = X_validation.copy()
        for c in range(k_train_validation):
            X_train[c], X_validation[c] = _impute_and_scale(X_train[c], X_validation[c])
            
        # initialize test predictions
        classifier_test_predictions = []

        # random forest search space
        criterion_options = ['gini','entropy']
        max_features_options = ['sqrt','log2']
        parameters = {
            'n_estimators': scope.int(hp.qloguniform('n_estimators', np.log(1e0), np.log(1e3), 1)),
            'criterion': hp.choice('criterion', criterion_options), 
            'max_depth': scope.int(hp.uniform('max_depth', 1, 11)),
            'min_samples_split': hp.uniform('min_samples_split', 0., 1.),
            'min_samples_leaf': hp.uniform('min_samples_leaf', 0., 0.5),
            'max_features': hp.choice('max_features', max_features_options)
                     }

        # save info for hyperopt: reset the best loss to a large sentinel and
        # hand the folds of this split to the objective function
        with open('_files/validation_%s.pickle' % timestamp,'wb') as f:
            pickle.dump(1000., f)
        with open('_files/data_%s.pickle' % timestamp,'wb') as f:
            pickle.dump([X_train, y_train, X_validation, y_validation], f)

        # hyperopt to find best parameters
        # NOTE(review): newer hyperopt releases may expect a numpy Generator
        # for rstate instead of RandomState -- confirm against the installed version.
        trials = Trials()
        best = fmin(hyperopt_function, parameters, algo=tpe.suggest, max_evals=n_hyperopt_iterations, trials=trials, rstate=np.random.RandomState(seed_), verbose=0, show_progressbar=True)      
            
        # create classifier using best parameters
        pos_weight = len([x for x in y_trainvalidation if x==0])/len([x for x in y_trainvalidation if x==1])
            
        # parameters: `best` holds the raw sampled values (choice entries are
        # indices into the option lists, scope.int has not been applied), so
        # the integer parameters are cast here exactly as during the search
        param = {
            'class_weight': {0:1, 1:pos_weight},
            'random_state': seed_,
            'n_estimators': int(best['n_estimators']),
            'criterion': criterion_options[best['criterion']],
            'max_depth': int(best['max_depth']),
            'min_samples_split': best['min_samples_split'],
            'min_samples_leaf': best['min_samples_leaf'],
            'max_features': max_features_options[best['max_features']],
        }

        # train on train+validation
        clf = RandomForestClassifier(**param).fit(X_trainvalidation, y_trainvalidation)

        # evaluate on test
        y_pred = clf.predict_proba(X_test)

        # calculate test performance - weighted log loss
        pos_weight = len([x for x in y_test if x==0])/len([x for x in y_test if x==1])
        sample_weights = [pos_weight if x==1 else 1 for x in y_test]
        performance = log_loss(y_test, y_pred, sample_weight=sample_weights)
        _record_performance(performance_files_weightedlogloss, 'weightedlogloss', datasets[a], b+1, performance)

        # calculate test performance - balanced accuracy (threshold 0.5 on the class-1 probability)
        y_pred_ = [1 if x>=0.5 else 0 for x in y_pred[:,1]]
        performance = balanced_accuracy_score(y_test, y_pred_)
        _record_performance(performance_files_balancedaccuracy, 'balancedaccuracy', datasets[a], b+1, performance)

        # calculate test performance - auroc
        performance = roc_auc_score(dummy_y(y_test), y_pred)
        _record_performance(performance_files_auroc, 'auroc', datasets[a], b+1, performance)

        # load validation predictions of the best hyperopt candidate
        with open('_files/validation_xgb_%s.pickle' % timestamp,'rb') as f:
            validation_predictions = pickle.load(f)
        validation_y = np.concatenate(y_validation)
        validation_X = pd.concat(X_validation_)

        # save results
        with open('%s/_individual/%s/iter_%d.pickle' % (output_folder, datasets[a], b+1), 'wb') as f:
            pickle.dump([validation_X, validation_y, validation_predictions, X_test_, y_test, y_pred, clf], f)

-------------------------
DATASET: clinical
-------------------------
Split 1
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [05:25<00:00,  2.32it/s, best loss: 0.4616142253801125]
Split 2
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [03:31<00:00,  1.98it/s, best loss: 0.4825529504465158]
Split 3
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [02:52<00:00,  1.43it/s, best loss: 0.4441096460459606]
Split 4
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [02:48<