# XGBoost for b-hadron pT regression

In [1]:
%load_ext autoreload
%autoreload 2
import xgboost as xgb
from bob import *
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

Welcome to JupyROOT 6.12/06


Load dataframe

In [2]:
inputFileName = 'MC16d_newTrain_Zprime.pkl'

# Subsample the dataset for fast execution
subsampleFlag = True

In [3]:
tree = pd.read_pickle(inputFileName)
features = select_features(tree, to_remove=[])

# jet_bH_pt is in lists of one element, this takes it out
tree['jet_bH_pt'] = tree['jet_bH_pt'].apply(lambda x: x[0])

# Select only b_jets
tree = tree[tree['jet_LabDr_HadF'] == 5]

if subsampleFlag:
    tree = tree.head(int(tree.shape[0]*0.1))
    
# Replace missing values with NaNs
d = dict.fromkeys([-100, -1, -99, -1000], np.nan)
tree.replace(d, inplace=True)

train, test = train_test_splitting(tree, )

In [4]:
# Save pt_std to rescale pT in regression
bH_pt_std = np.std(tree['jet_bH_pt'].values)

In [6]:
# Wrap data into DMatrix, optimized for XGBoost, log is used to force positive predictions
train_dmatrix = xgb.DMatrix(data=train[features], label=np.log(train['jet_bH_pt']/bH_pt_std), nthread=-1, weight=train['weights'])
test_dmatrix = xgb.DMatrix(data=test[features], label=np.log(test['jet_bH_pt']/bH_pt_std), nthread=-1, weight=test['weights'])

Define the classifier

In [9]:
params = {
    'objective':'reg:linear',
    'max_depth':20,
    'gamma':.01,
    'min_child_weight':1,
    'colsample_bytree': .8,
    'eta':.3,
    'subsample':.8,
    'silent':True
}

Training

In [10]:
%%time
xg_clf = xgb.train(params, train_dmatrix, num_boost_round=1000)

CPU times: user 1h 9min 52s, sys: 18.9 s, total: 1h 10min 11s
Wall time: 17min 34s


Testing

In [11]:
predicted_probabilities = = xg_clf.predict(test_dmatrix)
predicted_probabilities = np.exp(predicted_probabilities)*bH_pt_std

SyntaxError: invalid syntax (<ipython-input-11-f49dbe5b66fd>, line 1)

Plot b-hadron pT (True and Predicted) / jet_pt distribution

In [None]:
plt.figure(figsize=(15,7))
plt.hist((pt_pred/tree['jet_pt'], tree['jet_bH_pt']/tree['jet_pt']), \
         log=True, density=False, label=('prediction','true'), bins=300, histtype = 'step');
plt.grid()
plt.xlim([0,6])
plt.legend()
plt.show()

Plot 2D histograms with fit

In [None]:
import matplotlib

x = np.linspace(0,1.4,50)

plt.figure(1,) #figsize=(10,6))
degrees = [1]       # list of degrees of x to use
matrix = np.stack([tree['jet_bH_pt']**d for d in degrees], axis=-1)   # stack them like columns
#slope, r, _, _ = np.linalg.lstsq(matrix, pt_pred)
slope = 1
plt.plot(x, x*slope, 'r')
print(slope, 1-sum((tree['jet_bH_pt'] - pt_pred)**2)/sum((pt_pred - pt_pred.mean())**2) )

h = np.histogram2d(tree['jet_bH_pt'], pt_pred, bins=(np.linspace(0,1.4e6,112),np.linspace(0,1.4e6,112)))
plt.imshow(h[0].T, norm=matplotlib.colors.LogNorm(), extent=[0,1.4,0,1.4], origin='lower')
plt.xlabel('jet_bH_pt [TeV]')
plt.ylabel('regression_pt [TeV]')
plt.colorbar()
plt.grid()
plt.show()

plt.figure(2,) #figsize=(10,6))

slope, r, _, _ = np.linalg.lstsq(matrix, tree['jet_pt'])
plt.plot(x, x*slope, 'r')
print(slope, 1-r/sum((tree['jet_pt'] - tree['jet_pt'].mean())**2) )

h = np.histogram2d(tree['jet_bH_pt'], tree['jet_pt'], bins=(np.linspace(0,1.4e6,112),np.linspace(0,1.4e6,112)))
plt.imshow(h[0].T, norm=matplotlib.colors.LogNorm(), extent=[0,1.4,0,1.4], origin='lower')
plt.xlabel('jet_bH_pt [TeV]')
plt.ylabel('jet_pt [TeV]')
plt.colorbar()
plt.grid()
plt.show()

Features importance

In [None]:
fig, ax = plt.subplots(figsize=(5.5,5))
xgb.plot_importance(xg_clf, max_num_features=10, height=0.3, ax=ax)
plt.show()