# Bayesian Optimization of XGBoost hyperparameters for the ISO and QCD rejection BDT

In [None]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import xgboost as xgb
import matplotlib
import pickle
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.lines as mlines
from scipy.optimize import curve_fit
from scipy.special import btdtri # beta quantile function
from bayes_opt import BayesianOptimization

In [None]:
# Directory containing the HDF5 dataframes used as input.
indir = '/home/llr/cms/motta/HGCAL/CMSSW_11_1_0/src/GRAPHAnalysis/L1BDT/hdf5dataframes/isolated'

# Front-end option to run on; it selects the entry of inFileTraining_dict
# and is also the key read from the HDF5 store.
FE = 'threshold'

# Training file for each front-end option.
inFileTraining_dict = {
    'threshold': '{0}/Training_PU200_th_isoCalculated.hdf5'.format(indir),
    'mixed': '{0}/'.format(indir),  # no file name is filled in for this option
}

# features for BDT training
features = [
    # cl3d_* variables
    'cl3d_pt_c1', 'cl3d_pt_c2', 'cl3d_pt_c3',
    'cl3d_abseta',
    'cl3d_showerlength', 'cl3d_coreshowerlength',
    'cl3d_firstlayer', 'cl3d_maxlayer',
    'cl3d_szz', 'cl3d_seetot', 'cl3d_spptot', 'cl3d_srrtot', 'cl3d_srrmean',
    'cl3d_hoe', 'cl3d_meanz',
    'cl3d_layer10', 'cl3d_layer50', 'cl3d_layer90',
    'cl3d_ntc67', 'cl3d_ntc90',
    'cl3d_NclIso_dR4', 'cl3d_etIso_dR4',
    # tower_* variables
    'tower_etSgn_dRsgn1', 'tower_eSgn_dRsgn1',
    'tower_etSgn_dRsgn2', 'tower_eSgn_dRsgn2',
    'tower_etIso_dRsgn1_dRiso3', 'tower_eIso_dRsgn1_dRiso3',
    'tower_etEmIso_dRsgn1_dRiso3', 'tower_etHadIso_dRsgn1_dRiso7',
    'tower_etIso_dRsgn2_dRiso4', 'tower_eIso_dRsgn2_dRiso4',
    'tower_etEmIso_dRsgn2_dRiso4', 'tower_etHadIso_dRsgn2_dRiso7',
]

# Name of the target column the BDT is trained on.
output = 'iso_pid'

In [None]:
# Read the training dataframe stored under the key FE. The context manager
# closes the HDF5 store even if the lookup raises.
with pd.HDFStore(inFileTraining_dict[FE], mode='r') as store_tr:
    dfTraining = store_tr[FE]

# Working point inserted into the 'cl3d_pubdt_passWP<PUWP>' column name below.
# NOTE(review): this cell previously read an undefined `args.PUWP` (there is no
# argument parser in the notebook); '99' is an assumed value -- confirm it.
PUWP = '99'

# Build the binary target from 'gentau_decayMode':
#   0, 1, 10, 11 -> 1
#   -2, -1       -> 0
# Any other value is left unchanged. The two replacements are applied in this
# order and assigned back explicitly instead of relying on an in-place
# replace on a column selection.
iso_pid = dfTraining['gentau_decayMode'].replace([0, 1, 10, 11], 1)
iso_pid = iso_pid.replace([-2, -1], 0)
dfTraining['iso_pid'] = iso_pid

# Keep only the rows flagged as passing the selected working point; the deep
# copy makes dfTr independent of dfTraining, which is released right after.
dfTr = dfTraining.query('cl3d_pubdt_passWP{0}==True'.format(PUWP)).copy(deep=True)

del dfTraining

In [None]:
# Split the selected rows into training and test samples (test fraction 0.3).
X_train, X_test, y_train, y_test = train_test_split(
    dfTr[features],
    dfTr[output],
    test_size=0.3,
)

# Wrap both samples in DMatrix objects, labelled with the target column.
dtrain = xgb.DMatrix(
    data=X_train,
    label=y_train,
    feature_names=features,
)
dtest = xgb.DMatrix(
    data=X_test,
    label=y_test,
    feature_names=features,
)

In [None]:
def xgb4bo(eta, max_depth, subsample, colsample_bytree, num_trees):
    """Objective function maximised by the Bayesian optimisation.

    Trains a booster on the module-level ``dtrain`` with the given
    hyperparameters, evaluates the AUROC on ``dtrain`` and ``dtest``, and
    combines both into a single score.

    Parameters
    ----------
    eta : float
        Learning rate.
    max_depth : float
        Maximum depth of a tree; rounded to the nearest integer before use.
    subsample : float
        Fraction of events to train each tree on.
    colsample_bytree : float
        Fraction of features to train each tree on.
    num_trees : float
        Number of boosting rounds; rounded to the nearest integer before use.

    Returns
    -------
    float
        ``1/10**abs(auroc_train - auroc_test) - 1/100**auroc_train``.
    """
    hyperparams = {
        'eval_metric'      : 'logloss',
        'objective'        : 'binary:logistic',      # objective function
        'nthread'          : 10,                     # limit number of threads
        'eta'              : eta,                    # learning rate
        'max_depth'        : int(round(max_depth)),  # maximum depth of a tree
        'subsample'        : subsample,              # fraction of events to train tree on
        'colsample_bytree' : colsample_bytree,       # fraction of features to train tree on
    }

    booster = xgb.train(hyperparams, dtrain, num_boost_round=int(round(num_trees)))

    # Keep the predictions in local variables: writing them into X_train /
    # X_test would add a 'bdt_output' column to the shared dataframes on
    # every call of the objective.
    pred_train = booster.predict(dtrain)
    pred_test = booster.predict(dtest)

    auroc_test = metrics.roc_auc_score(y_test, pred_test)
    auroc_train = metrics.roc_auc_score(y_train, pred_train)

    # This function has its maximum at abs(auroc_train - auroc_test) = 0 and
    # auroc_train = 1, which is our ideal goal.
    # Its shape gives more control over the overtraining, as the first term
    # plummets as soon as the train/test difference moves away from 0.
    # It gives a little less control over the train AUROC, as the second term
    # does not plummet as much.
    return 1/10**(abs(auroc_train-auroc_test)) - 1/100**(auroc_train)

# Value range for each xgb4bo argument; handed to the optimiser as `pbounds`.
hypar_bounds = dict(
    eta=(0.1, 0.3),
    max_depth=(3, 7),
    subsample=(0.6, 0.8),
    colsample_bytree=(0.3, 0.9),
    num_trees=(50, 150),
)

In [None]:
# Optimiser that maximises xgb4bo within the ranges given in hypar_bounds.
xgb_bo = BayesianOptimization(
    f=xgb4bo,
    pbounds=hypar_bounds,
)

In [None]:
# `time` is not among the notebook's top-level imports, so import it here.
import time

start = time.time()

# Run the optimisation: 10 initial points followed by 20 iterations.
# NOTE(review): `acq` and `alpha` are passed straight to maximize(); whether
# they are still accepted depends on the installed bayes_opt version -- confirm.
xgb_bo.maximize(init_points=10, n_iter=20, acq='ei', alpha=1e-3)

end = time.time()

# Split the elapsed wall-clock time into hours / minutes / seconds.
# '%02d' truncates the float values to integers when printing.
hours, remainder = divmod(end - start, 3600)
minutes, seconds = divmod(remainder, 60)
print('\nRunning time = %02dh %02dm %02ds' % (hours, minutes, seconds))

In [None]:
# Show the 'params' entry of the optimiser's `max` result (echoed as the
# cell output). xgb4bo rounds max_depth and num_trees to integers before
# training, so the values shown here need the same rounding before reuse.
xgb_bo.max['params']