In [1]:
import gc
import glob
import os
import pickle
import random
import time

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll import scope
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import log_loss, balanced_accuracy_score, roc_auc_score, roc_curve
from tqdm import tqdm

### Input

Output folder name

In [2]:
# folder that collects every result produced by this run
output_folder = 'withhyperopt_all'

# never overwrite the results of an earlier run
if os.path.isdir(output_folder):
    raise Exception('Already run!')

# fresh run: create the result folder, then its '_individual' subfolder
for new_folder in (output_folder, '%s/_individual' % output_folder):
    os.mkdir(new_folder)

Datasets

In [3]:
# names of the datasets to process: each one is loaded from '_datasets/<name>.pickle',
# gets its own subfolder under '<output_folder>/_individual' and is one column of
# the performance tables
datasets = ['objscreen_kegg']

Hyperopt parameters

In [4]:
# number of hyperopt evaluations (passed to fmin as max_evals) for every
# train+validation/test split
n_hyperopt_iterations = 2**8
# True: 'eta' is searched log-uniformly between 0.01 and 0.5
# False: 'eta' is fixed to default_learning_rate
optimize_learning_rate = True
default_learning_rate = 0.1

Data splits

In [5]:
# number of stratified shuffle splits into train+validation and test
n_splits_trainvalidation_test = 20
# fraction of the samples held out as test set in each of those splits
test_size = 0.2
# number of stratified folds that divide train+validation into train and validation
k_train_validation = 5
# fraction of each training portion set aside as the xgboost early-stopping set
early_stopping_size = 0.125

Seed

In [6]:
# single seed reused for the data splitters, xgboost and hyperopt
seed_ = 1

# seed the global random number generators of Python and numpy
random.seed(seed_)
np.random.seed(seed_)

### HyperOpt Functions

In [7]:
def hyperopt_function(parameters):
    """Hyperopt objective: score one set of xgboost parameters.

    Reads the fold data from '_files/data.pickle', evaluates `parameters`
    with `hyperopt_performance` and, when the resulting loss is lower than
    the one stored in '_files/validation.pickle', overwrites that file with
    the new loss and stores the matching validation predictions in
    '_files/validation_xgb.pickle'.

    Parameters
    ----------
    parameters : dict
        xgboost parameters sampled by hyperopt for this trial.

    Returns
    -------
    dict
        {'loss': <validation loss>, 'status': STATUS_OK}.
    """
    # fold data written to disk before the search started
    with open('_files/data.pickle', 'rb') as data_file:
        fold_data = pickle.load(data_file, encoding='latin1')
    X_training_train, y_training_train, X_earlystopping_train, y_earlystopping_train, X_validation, y_validation = fold_data

    # score this parameter set on all folds
    trial_loss, trial_predictions = hyperopt_performance(
        X_training_train, y_training_train,
        X_earlystopping_train, y_earlystopping_train,
        X_validation, y_validation,
        parameters,
    )
    gc.collect()

    # best loss recorded by the earlier trials
    with open('_files/validation.pickle','rb') as score_file:
        loss_to_beat = pickle.load(score_file)

    # a new best trial replaces both the stored loss and the stored predictions
    is_new_best = trial_loss < loss_to_beat
    if is_new_best:
        with open('_files/validation.pickle','wb') as score_file:
            pickle.dump(trial_loss, score_file)
        with open('_files/validation_xgb.pickle','wb') as prediction_file:
            pickle.dump(trial_predictions, prediction_file)

    return {'loss':trial_loss, 'status':STATUS_OK}

In [8]:
def hyperopt_performance(X_training_train, y_training_train, X_earlystopping_train, y_earlystopping_train, X_validation, y_validation, parameters):
    """Train one xgboost model per fold and score it on that fold's validation set.

    Every argument except `parameters` is a sequence indexed by fold number
    (0 .. k_train_validation-1).

    Parameters
    ----------
    X_training_train, y_training_train
        Per-fold features and labels used to fit the booster.
    X_earlystopping_train, y_earlystopping_train
        Per-fold features and labels placed in the evaluation list for early stopping.
    X_validation, y_validation
        Per-fold features and labels used to score the fitted booster.
    parameters : dict
        xgboost parameters of the trial; copied, never modified.

    Returns
    -------
    tuple
        (loss, predictions): `loss` is the mean of the per-fold weighted log
        losses plus their standard error (np.std / sqrt(number of folds));
        `predictions` is the concatenation of the per-fold validation predictions.
    """
    fold_losses = []
    fold_predictions = []

    for fold in range(k_train_validation):
        fit_labels = y_training_train[fold]
        validation_labels = y_validation[fold]

        # ratio negatives/positives of the fitting labels, used as scale_pos_weight
        n_negative = sum(1 for label in fit_labels if label==0)
        n_positive = sum(1 for label in fit_labels if label==1)
        fit_pos_weight = n_negative/n_positive

        # xgb datasets
        dmatrix_fit = xgb.DMatrix(X_training_train[fold], label=fit_labels)
        dmatrix_stop = xgb.DMatrix(X_earlystopping_train[fold], label=y_earlystopping_train[fold])
        dmatrix_validation = xgb.DMatrix(X_validation[fold], label=validation_labels)

        # trial parameters plus the fixed settings
        booster_parameters = parameters.copy()
        booster_parameters.update({
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'scale_pos_weight': fit_pos_weight,
            'seed': seed_,
        })
        watchlist = [(dmatrix_fit, 'train'), (dmatrix_stop, 'eval')]

        # fit with early stopping
        booster = xgb.train(booster_parameters, dmatrix_fit, num_boost_round=10000, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)

        # predict the validation fold with the best number of trees
        fold_pred = booster.predict(dmatrix_validation, ntree_limit=booster.best_ntree_limit)

        # weight the positives by the negatives/positives ratio of the validation labels
        n_negative = sum(1 for label in validation_labels if label==0)
        n_positive = sum(1 for label in validation_labels if label==1)
        validation_pos_weight = n_negative/n_positive
        weights = [validation_pos_weight if label==1 else 1 for label in validation_labels]

        fold_losses.append(log_loss(validation_labels, fold_pred, sample_weight=weights))
        fold_predictions.append(fold_pred)

    # mean loss penalised by its standard error over the folds
    standard_error = np.std(fold_losses)/np.sqrt(len(fold_losses))
    penalised_loss = np.mean(fold_losses) + standard_error
    return penalised_loss, np.concatenate(fold_predictions)

### Custom functions

In [9]:
def dummy_y(y):
    """Expand a label vector into a two-column indicator matrix.

    Column 0 is 1 where the label equals 0 and 0 elsewhere; column 1 is its
    complement, so every label different from 0 counts as the second class.

    Parameters
    ----------
    y : sequence
        Labels, read by position via y[i].

    Returns
    -------
    numpy.ndarray
        Array with one row per label and two columns.
    """
    # one flag per label: does it equal 0?
    zero_flags = [y[position] == 0 for position in range(len(y))]

    first_column = [1 if flag else 0 for flag in zero_flags]
    second_column = [0 if flag else 1 for flag in zero_flags]

    # stack as rows, then transpose to get one row per label
    return np.array([first_column, second_column]).T

### Output folders and files

In [10]:
# one result subfolder per dataset
for dataset_name in datasets:
    os.mkdir('%s/_individual/%s' % (output_folder, dataset_name))

# row labels shared by all performance tables: one row per split plus two summary rows
split_labels = ['split_%d' % split_number for split_number in range(1, n_splits_trainvalidation_test+1)]
row_labels = split_labels + ['MEAN', 'STERR']

# empty performance tables (rows: splits and summaries, columns: datasets), written to disk right away
performance_files_weightedlogloss = pd.DataFrame(index=row_labels, columns=datasets)
performance_files_weightedlogloss.to_csv('%s/_individual/weightedlogloss.csv' % output_folder)

performance_files_balancedaccuracy = pd.DataFrame(index=row_labels, columns=datasets)
performance_files_balancedaccuracy.to_csv('%s/_individual/balancedaccuracy.csv' % output_folder)

performance_files_auroc = pd.DataFrame(index=row_labels, columns=datasets)
performance_files_auroc.to_csv('%s/_individual/auroc.csv' % output_folder)

### Pipeline

In [11]:
# scratch folder through which this cell and hyperopt_function exchange data
os.makedirs('_files', exist_ok=True)


def _record_performance(performance_table, metric_name, dataset, split_number, value):
    """Store one split's test score, refresh the running MEAN/STERR rows and rewrite the CSV.

    Parameters
    ----------
    performance_table : pandas.DataFrame
        Table with rows 'split_1' .. 'split_N', 'MEAN', 'STERR' and one column per dataset.
    metric_name : str
        Base name of the CSV file inside '<output_folder>/_individual'.
    dataset : str
        Column to update.
    split_number : int
        1-based number of the split that has just been evaluated.
    value : float
        Test score of that split.
    """
    split_rows = ['split_%d' % x for x in range(1, n_splits_trainvalidation_test+1)]
    performance_table.at['split_%d' % split_number, dataset] = value
    # splits that have not been evaluated yet are still NaN, hence the nan-aware statistics
    scores = performance_table.loc[split_rows, dataset].values.tolist()
    performance_table.at['MEAN', dataset] = np.nanmean(scores)
    performance_table.at['STERR', dataset] = np.nanstd(scores)/np.sqrt(split_number)
    performance_table.to_csv('%s/_individual/%s.csv' % (output_folder, metric_name))


# iterate over datasets
for a in range(len(datasets)):
    print('-------------------------')
    print('DATASET: %s' % datasets[a])
    print('-------------------------')
    
    # load dataset
    # NOTE: unpickling can execute arbitrary code - only load dataset files from a trusted source
    with open('_datasets/%s.pickle' % datasets[a], 'rb') as f:
        X_matrix, y_vector, categorical_conversion = pickle.load(f, encoding='latin1')
    # prefix every feature name with the dataset name
    X_matrix.columns = ['%s # %s' % (datasets[a], feature) for feature in X_matrix.columns.tolist()]

    # divide train+validation from testing
    trainvalidation_index = []
    test_index = []
    sss = StratifiedShuffleSplit(n_splits=n_splits_trainvalidation_test, test_size=test_size, random_state=seed_)
    for trainvalidation_, test_ in sss.split(X_matrix, y_vector):
        trainvalidation_index.append(list(trainvalidation_))
        test_index.append(list(test_))

    # iterate over number of training+validation/testing splits
    for b in range(n_splits_trainvalidation_test):
        print('Split %d' % (b+1))
        
        # separate train+validation and testing
        X_trainvalidation = X_matrix.iloc[trainvalidation_index[b],]
        X_test = X_matrix.iloc[test_index[b],]
        y_trainvalidation = y_vector[trainvalidation_index[b]]
        y_test = y_vector[test_index[b]]

        # separate training_trainvalidation from earlystopping_trainvalidation
        # (used to fit the final model of this split once hyperopt has finished)
        training_index = []
        earlystopping_index = []
        sss = StratifiedShuffleSplit(n_splits=1, test_size=early_stopping_size, random_state=seed_)
        for training_, earlystopping_ in sss.split(X_trainvalidation, y_trainvalidation):
            training_index.append(list(training_))
            earlystopping_index.append(list(earlystopping_))
        X_training_trainvalidation = X_trainvalidation.iloc[training_index[0],]
        X_earlystopping_trainvalidation = X_trainvalidation.iloc[earlystopping_index[0],]
        y_training_trainvalidation = y_trainvalidation[training_index[0]]
        y_earlystopping_trainvalidation = y_trainvalidation[earlystopping_index[0]]
        
        # divide train from validation
        train_index = []
        validation_index = []
        skf = StratifiedKFold(n_splits=k_train_validation, shuffle=True, random_state=seed_)
        for train_, validation_ in skf.split(X_trainvalidation, y_trainvalidation):
            train_index.append(list(train_))
            validation_index.append(list(validation_))

        # separate train and validation
        X_train = []
        X_validation = []
        y_train = []
        y_validation = []
        for c in range(k_train_validation):
            X_train.append(X_trainvalidation.iloc[train_index[c],])
            X_validation.append(X_trainvalidation.iloc[validation_index[c],])
            y_train.append(y_trainvalidation[train_index[c]])
            y_validation.append(y_trainvalidation[validation_index[c]])
        
        # separate training_train from earlystopping_train
        X_training_train = []
        X_earlystopping_train = []
        y_training_train = []
        y_earlystopping_train = []
        for c in range(k_train_validation):
            training_index = []
            earlystopping_index = []
            sss = StratifiedShuffleSplit(n_splits=1, test_size=early_stopping_size, random_state=seed_)
            for training_, earlystopping_ in sss.split(X_train[c], y_train[c]):
                training_index.append(list(training_))
                earlystopping_index.append(list(earlystopping_))
            X_training_train.append(X_train[c].iloc[training_index[0],])
            X_earlystopping_train.append(X_train[c].iloc[earlystopping_index[0],])
            y_training_train.append(y_train[c][training_index[0]])
            y_earlystopping_train.append(y_train[c][earlystopping_index[0]])
            
        # initialize test predictions
        classifier_test_predictions = []
        
        # initialize shap explainers
        explainers = []
        
        # xgb parameters (search space); the '- 0.0001' offsets let gamma and reg_alpha get close to 0
        parameters = {
            'gamma': hp.loguniform('gamma', np.log(0.0001), np.log(5)) - 0.0001,
            'max_depth': scope.int(hp.uniform('max_depth', 1, 11)),
            'subsample': hp.uniform('subsample', 0.5, 1),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
            'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1),
            'reg_lambda': hp.loguniform('reg_lambda', np.log(1), np.log(4)),
            'reg_alpha': hp.loguniform('reg_alpha', np.log(0.0001), np.log(1)) - 0.0001
                     }
        if optimize_learning_rate:
            parameters['eta'] = hp.loguniform('eta', np.log(0.01), np.log(0.5))
        else:
            parameters['eta'] = hp.choice('eta', [default_learning_rate])

        # save info for hyperopt: a large initial "best loss" and the fold data
        with open('_files/validation.pickle','wb') as f:
            pickle.dump(1000., f)
        with open('_files/data.pickle','wb') as f:
            pickle.dump([X_training_train, y_training_train, X_earlystopping_train, y_earlystopping_train, X_validation, y_validation], f)

        # hyperopt to find best parameters
        trials = Trials()
        best = fmin(hyperopt_function, parameters, algo=tpe.suggest, max_evals=n_hyperopt_iterations, trials=trials, rstate=np.random.RandomState(seed_), verbose=0, show_progressbar=True)

        # fmin returns the raw sampled values (and the index for hp.choice), not the
        # values the objective was called with; space_eval maps them back through the
        # search space so the final model uses exactly the parameters that were validated
        best_parameters = space_eval(parameters, best)
            
        # create classifier using best parameters
        pos_weight = len([x for x in y_training_trainvalidation if x==0])/len([x for x in y_training_trainvalidation if x==1])
            
        # xgb datasets
        xgb_training = xgb.DMatrix(X_training_trainvalidation, label=y_training_trainvalidation)
        xgb_earlystopping = xgb.DMatrix(X_earlystopping_trainvalidation, label=y_earlystopping_trainvalidation)
        xgb_test = xgb.DMatrix(X_test, label=y_test)

        # parameters: best trial plus the same fixed settings used in hyperopt_performance
        param = dict(best_parameters)
        param['objective'] = 'binary:logistic'
        param['eval_metric'] = 'logloss'
        param['scale_pos_weight'] = pos_weight
        param['seed'] = seed_
        evallist = [(xgb_training, 'train'), (xgb_earlystopping, 'eval')]

        # train on training+validation
        bst = xgb.train(param, xgb_training, num_boost_round=10000, evals=evallist, early_stopping_rounds=10, verbose_eval=False)
                
        # predicted probabilities on test set
        y_pred = bst.predict(xgb_test, ntree_limit=bst.best_ntree_limit)

        # calculate test performance - weighted log loss
        pos_weight = len([x for x in y_test if x==0])/len([x for x in y_test if x==1])
        sample_weights = [pos_weight if x==1 else 1 for x in y_test]
        performance = log_loss(y_test, y_pred, sample_weight=sample_weights)
        _record_performance(performance_files_weightedlogloss, 'weightedlogloss', datasets[a], b+1, performance)

        # calculate test performance - balanced accuracy (probabilities thresholded at 0.5)
        y_pred_ = [1 if x>=0.5 else 0 for x in y_pred]
        performance = balanced_accuracy_score(y_test, y_pred_)
        _record_performance(performance_files_balancedaccuracy, 'balancedaccuracy', datasets[a], b+1, performance)

        # calculate test performance - auroc (two-column indicator labels and probabilities)
        y_pred_ = np.concatenate((np.array([1-x for x in y_pred]).reshape(-1,1), y_pred.reshape(-1,1)), axis=1)
        performance = roc_auc_score(dummy_y(y_test), y_pred_)
        _record_performance(performance_files_auroc, 'auroc', datasets[a], b+1, performance)
        
        # shap tree explainer
        explainer_independent = shap.TreeExplainer(bst, data=X_trainvalidation, feature_dependence='independent', model_output='probability')
        explainer_dependent = shap.TreeExplainer(bst, data=X_trainvalidation, feature_dependence='tree_path_dependent', model_output='margin')
        tree_limit = bst.best_ntree_limit

        # load validation predictions of the best hyperopt trial (written by hyperopt_function)
        with open('_files/validation_xgb.pickle','rb') as f:
            validation_predictions = pickle.load(f)
        validation_y = np.concatenate(y_validation)
        validation_X = pd.concat(X_validation)

        # save results
        with open('%s/_individual/%s/iter_%d.pickle' % (output_folder, datasets[a], b+1), 'wb') as f:
            pickle.dump([validation_X, validation_y, validation_predictions, X_test, y_test, y_pred, explainer_independent, explainer_dependent, tree_limit], f)

-------------------------
DATASET: objscreen_kegg
-------------------------
Split 1
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [07:58<00:00,  2.62s/it, best loss: 0.5385721912261707]
Split 2
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [13:57<00:00,  3.15s/it, best loss: 0.5588604841463368]
Split 3
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [10:40<00:00,  3.32s/it, best loss: 0.5645374503127029]
Split 4
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 