# Bayesian Optimization of XGBoost hyperparameters for the PU rejection BDT

In [1]:
import os
import time
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import xgboost as xgb
import matplotlib
import pickle
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.lines as mlines
from scipy.optimize import curve_fit
from scipy.special import btdtri # beta quantile function
from bayes_opt import BayesianOptimization

In [2]:
def prepareCat(row):
    if row['cl3d_isbestmatch'] == True and row['gentau_decayMode']>=0:
        return 1
    else:
        return 0 

In [3]:
indir = '/home/llr/cms/motta/HGCAL/CMSSW_11_1_0/src/GRAPHAnalysis/L1BDT/hdf5dataframes/calibrated'

FE = 'threshold'

inFileTraining_dict = {
    'threshold'    : indir+'/Training_PU200_th_calibrated.hdf5',
    'mixed'        : indir+'/'
}

# features for BDT training
features = ['cl3d_abseta', 'cl3d_showerlength','cl3d_coreshowerlength', 'cl3d_firstlayer', 'cl3d_maxlayer', 'cl3d_szz', 'cl3d_seetot', 'cl3d_spptot', 'cl3d_srrtot', 'cl3d_srrmean', 'cl3d_hoe', 'cl3d_meanz', 'cl3d_layer10', 'cl3d_layer50', 'cl3d_layer90', 'cl3d_ntc67', 'cl3d_ntc90']
output = 'gentau_pid'

In [4]:
store_tr = pd.HDFStore(inFileTraining_dict[FE], mode='r')
dfTraining = store_tr[FE]
store_tr.close()

dfTraining['gentau_pid'] = dfTraining.apply(lambda row: prepareCat(row), axis=1)
dfTr = dfTraining.query('gentau_pid==1 or (gentau_pid==0 and gentau_decayMode!=-2)').copy(deep=True) # take all the taus and all the PU not coming from QCD sample

del dfTraining

In [5]:
X_train, X_test, y_train, y_test = train_test_split(dfTr[features], dfTr[output], stratufy=dfTr[output] test_size=0.3)
dtrain = xgb.DMatrix(data=X_train,label=y_train, feature_names=features)
dtest = xgb.DMatrix(data=X_test,label=y_test,feature_names=features)

In [6]:
def xgb4bo(eta, max_depth, subsample, colsample_bytree, num_trees):
    hyperparams = {'eval_metric'      : 'logloss',
                   'objective'        : 'binary:logistic', # objective function
                   'nthread'          : 10, # limit number of threads
                   'eta'              : eta, # learning rate
                   'max_depth'        : int(round(max_depth,0)), # maximum depth of a tree
                   'subsample'        : subsample, # fraction of events to train tree on
                   'colsample_bytree' : colsample_bytree,# fraction of features to train tree on
    }

    booster = xgb.train(hyperparams, dtrain, num_boost_round=int(round(num_trees,0)))
    X_train['bdt_output'] = booster.predict(dtrain)
    X_test['bdt_output'] = booster.predict(dtest)
    auroc_test = metrics.roc_auc_score(y_test,X_test['bdt_output'])
    auroc_train = metrics.roc_auc_score(y_train,X_train['bdt_output'])

    # this function has a maximum for abs(auroc_train-auroc_test)=0 and auroc_train=1 which is our ideal goal
    # its shape allows to have more control on the overtraining as the function plummets as soon as x moves from 0
    # it give a little less control on the train auroc as there the function does not plummet as much
    return 1/10**(abs(auroc_train-auroc_test)) - 1/100**(auroc_train)

hypar_bounds = {'eta'              : (0.1, 0.3), 
                'max_depth'        : (3, 7),
                'subsample'        : (0.6, 0.8),
                'colsample_bytree' : (0.3, 0.9),
                'num_trees'        : (50,150)}

In [7]:
xgb_bo = BayesianOptimization(f = xgb4bo, pbounds = hypar_bounds)

In [8]:
start = time.time()

xgb_bo.maximize(init_points=10, n_iter=20, acq='ei', alpha=1e-3)

end = time.time()
print('\nRunning time = %02dh %02dm %02ds'%((end-start)/3600, ((end-start)%3600)/60, (end-start)% 60))

|   iter    |  target   | colsam... |    eta    | max_depth | num_trees | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9462  [0m | [0m 0.3966  [0m | [0m 0.2382  [0m | [0m 6.38    [0m | [0m 131.9   [0m | [0m 0.7759  [0m |
| [95m 2       [0m | [95m 0.9828  [0m | [95m 0.857   [0m | [95m 0.2072  [0m | [95m 3.681   [0m | [95m 65.47   [0m | [95m 0.6827  [0m |
| [0m 3       [0m | [0m 0.9758  [0m | [0m 0.7874  [0m | [0m 0.2482  [0m | [0m 4.781   [0m | [0m 51.17   [0m | [0m 0.6727  [0m |
| [0m 4       [0m | [0m 0.978   [0m | [0m 0.5686  [0m | [0m 0.1783  [0m | [0m 5.395   [0m | [0m 67.25   [0m | [0m 0.7535  [0m |
| [0m 5       [0m | [0m 0.9788  [0m | [0m 0.781   [0m | [0m 0.1186  [0m | [0m 6.468   [0m | [0m 53.71   [0m | [0m 0.6332  [0m |
| [95m 6       [0m | [95m 0.9829  [0m | [95m 0.4368  [0m | [95m 0.2405  [0m | [95m 3.421   [0m | [95

In [9]:
xgb_bo.max['params']

{'colsample_bytree': 0.30196448730679565,
 'eta': 0.14117133696445328,
 'max_depth': 3.209643695436438,
 'num_trees': 95.7124023637034,
 'subsample': 0.7298113379939855}