# XGBoost for b-tagging

In [1]:
%load_ext autoreload
%autoreload 2
import xgboost as xgb
from bob import *
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

Welcome to JupyROOT 6.12/06


Load dataframe

In [2]:
# When True, later cells work on a small slice of the dataset for fast execution
subsampleFlag = True

# Pickle file that holds the input dataframe
inputFileName = 'Hybrid_25_July_bugfixed_fullStat.pkl'

In [8]:
# Read the pickled dataframe and pick the feature columns (nothing excluded)
tree = pd.read_pickle(inputFileName)
features = select_features(tree, to_remove=[])

# Fast mode uses 100 boosting rounds on the first 5% of rows; full mode uses 1000 rounds
num_boost_round = 100 if subsampleFlag else 1000
if subsampleFlag:
    tree = tree.head(int(tree.shape[0] * 0.05))

# Replace the placeholder codes used for missing values with NaNs
missing_value_codes = (-100, -1, -99, -1000)
tree = tree.replace({code: np.nan for code in missing_value_codes})

# Split into training and test sets using the helper's default settings
train, test = train_test_splitting(tree)

In [9]:
# Wrap data into DMatrix, optimized for XGBoost
def _as_dmatrix(frame):
    """Build an xgb.DMatrix from the `features` columns of `frame`.

    The label is the boolean mask ``jet_LabDr_HadF == 5`` (presumably the
    b-flavour code -- confirm against the dataset definition) and the
    per-row weights are read from the 'weights' column.
    """
    return xgb.DMatrix(
        data=frame[features],
        label=frame['jet_LabDr_HadF'] == 5,
        nthread=-1,
        weight=frame['weights'],
    )


train_dmatrix = _as_dmatrix(train)
test_dmatrix = _as_dmatrix(test)

Define the classifier

In [10]:
# Booster hyper-parameters, later handed to xgb.train.
# NOTE(review): recent XGBoost releases replace 'silent' with 'verbosity' --
# confirm against the installed version before changing it.
params = dict(
    objective='binary:logistic',
    max_depth=20,
    gamma=0.01,
    min_child_weight=1,
    colsample_bytree=0.8,
    eta=0.3,
    subsample=0.8,
    silent=True,
)

Training

In [7]:
%%time
# Train the booster on the training DMatrix using the `params` dictionary and
# the `num_boost_round` chosen in the data-loading cell; %%time reports the cost.
xg_clf = xgb.train(params, train_dmatrix, num_boost_round=num_boost_round)

KeyboardInterrupt: 

Testing

In [None]:
# Score the test DMatrix with the trained booster.
# (The original line had a duplicated '=' and did not parse.)
predicted_probabilities = xg_clf.predict(test_dmatrix)

In [None]:
# Inputs shared by every ROC computation below
true_flavour = test['jet_LabDr_HadF'].values
mv2_scores = test['jet_mv2c10'].values

# XGBoost predictions, evaluated with the 'c' and 'l' selectors of compute_roc
fpr_c_xgb, tpr_c_xgb = compute_roc(true_flavour, predicted_probabilities, 'c')
fpr_l_xgb, tpr_l_xgb = compute_roc(true_flavour, predicted_probabilities, 'l')

# Reference jet_mv2c10 discriminant, same two selectors
fpr_c_mv, tpr_c_mv = compute_roc(true_flavour, mv2_scores, 'c')
fpr_l_mv, tpr_l_mv = compute_roc(true_flavour, mv2_scores, 'l')

Compute the ratio of XGBoost to MV2 rejection at matching b-efficiency, for comparison in the ROC plot

In [None]:
def _rejection_ratio(fpr_xgb, tpr_xgb, fpr_mv, tpr_mv):
    """Return XGBoost rejection divided by MV2 rejection at the XGBoost efficiencies.

    Rejection is 1/fpr. The MV2 rejection is linearly interpolated onto the
    XGBoost tpr values with np.interp. This stands in for the order-1
    ``scipy.interpolate.spline`` call used previously, which no longer exists
    in SciPy >= 1.3.

    The first point of every curve is skipped ([1:]), as in the original code
    (presumably because fpr is 0 there -- confirm against compute_roc).
    Infinite ratios are replaced by NaN.

    NOTE(review): np.interp expects `tpr_mv` to be non-decreasing; this
    assumes compute_roc returns the curve in that order -- confirm.
    """
    mv_rejection_at_xgb_eff = np.interp(tpr_xgb[1:], tpr_mv[1:], 1 / fpr_mv[1:])
    ratio = 1 / fpr_xgb[1:] / mv_rejection_at_xgb_eff
    ratio[ratio == np.inf] = np.nan
    return ratio


rate_light = _rejection_ratio(fpr_l_xgb, tpr_l_xgb, fpr_l_mv, tpr_l_mv)
rate_c = _rejection_ratio(fpr_c_xgb, tpr_c_xgb, fpr_c_mv, tpr_c_mv)

Plot ROCs (the figure is saved as `figures/<figureName>.eps`, relative to the notebook's working directory)

In [None]:
figureName = 'ROC__xgboost_vs_mv2'
fig = plt.figure(figsize=(10, 7))

# Five-row grid: rows 0-3 hold the rejection curves, row 4 the comparison panel
gs = GridSpec(5, 1)
ax1 = fig.add_subplot(gs[0:4, 0])
ax2 = fig.add_subplot(gs[4, 0])

# --- Upper panel: 1/fpr against tpr on a logarithmic y axis ---
ax1.set_ylabel("light / c rejection")
for curve_tpr, curve_fpr, curve_label, curve_colour in (
    (tpr_l_mv, fpr_l_mv, 'light MV2', 'orangered'),
    (tpr_c_mv, fpr_c_mv, 'c MV2', 'dodgerblue'),
    (tpr_l_xgb, fpr_l_xgb, 'light xgb', 'brown'),
    (tpr_c_xgb, fpr_c_xgb, 'c xgb', 'navy'),
):
    ax1.semilogy(curve_tpr, 1/curve_fpr, label=curve_label, c=curve_colour)

# Hide the x tick labels of the upper panel; the x axis label sits on the lower one
plt.setp(ax1.get_xticklabels(), visible=False)
ax1.set_xlim([0.55, 1])
ax1.set_ylim([1, 1e3])
ax1.grid()
ax1.legend()

# --- Lower panel: inverse of the precomputed rates, with a reference line at 1 ---
for curve_tpr, curve_rate, curve_colour, curve_label in (
    (tpr_l_xgb, rate_light, 'r', 'l-jets'),
    (tpr_c_xgb, rate_c, 'b', 'c-jets'),
):
    # The rates were built from the [1:] slices, so the x values are sliced the same way
    ax2.plot(curve_tpr[1:], 1/curve_rate, c=curve_colour, lw=.8, label=curve_label)

ax2.axhline(y=1, color='black', linestyle='-.', lw=.5)
ax2.grid()
ax2.set_xlabel("b-efficiency")
ax2.set_ylabel("rate")
ax2.set_xlim([0.55, 1])
ax2.set_ylim([0.5, 1.5])
ax2.legend(fontsize='x-small')

# Write the figure as EPS into the figures/ directory, then display it
plt.savefig('figures/{}.eps'.format(figureName), format='eps')
plt.show()

Rejection vs pt for flat b-efficiencies

In [None]:
# Compare the jet_mv2c10 discriminant with the XGBoost predictions via
# rejection_pt, using 20 cuts and a b-efficiency of 0.77.
# `probs` was never defined in this notebook; the XGBoost scores live in
# `predicted_probabilities`.
rejection_pt(test, test['jet_mv2c10'].values, predicted_probabilities, num_cuts=20, b_eff=.77)

# NOTE(review): the file name previously said "random_forest", which looks like
# a copy-paste leftover and would overwrite that figure -- confirm the new name.
plt.savefig('figures/_xgboost_vs_mv2_pt_flat_eff.eps', format='eps')
plt.show()