# What happens in this notebook :-)

* We read in MC samples for background and different signal models. We also read in the actual data from the L3 detector.

* We train a BDT for a binary classification problem, aiming to separate signal from background.

* Different checks such as generalization of the BDT on the test sample as well as the confusion matrix are shown.

* We use the BDT response to place our cut, so we analyze efficiencies and purities for different cut positions.

* The BDT is saved in a pickle file.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

import plotting as pl
import helpers

from sklearn.metrics import confusion_matrix
import itertools

### Read in the data

In [None]:
from load_data import data, mc_higgs_models, mc_no_higgs_frames

# Pick the Signal Model and produce data frames

In [None]:

# Background: stack all no-Higgs MC frames into one frame.
df_MC_noHiggs = pd.concat(mc_no_higgs_frames)

# Signal: the MC sample stored under the key "85".
df_MC_mH85 = mc_higgs_models["85"]

# Merge background and signal MC; ignore_index gives the result a fresh 0..N-1 index.
mc_frames = [df_MC_noHiggs, df_MC_mH85]
df_mH85 = pd.concat(mc_frames, ignore_index=True)

## Perform selection cut analysis

In [None]:
# Keep only the kinematic variables that can be used for training, plus the
# two bookkeeping columns "class" (the label) and "weight".
training_columns = helpers.kinematical_vars + ["class", "weight"]
df_MVA_mH85 = df_mH85[training_columns]

In [None]:
# Show the selected columns, then the number of training features
# (every column except the two bookkeeping ones, "class" and "weight").
selected_columns = df_MVA_mH85.columns
print(selected_columns)

n_features = len(selected_columns) - 2
print(n_features)

In [None]:
# Split the label column off the frame.
# Encoding of the label:
#   0 <-> bkg
#   1 <-> sig
# `pop` returns the column and removes it from the frame in a single step.
target = df_MVA_mH85.pop('class')

In [None]:
# Stratified split with a fixed seed: 65 % of the events go to the training
# sample, the remainder to the test sample.
X_train, X_test, y_train, y_test = train_test_split(
    df_MVA_mH85,
    target,
    train_size=0.65,
    stratify=target,
    random_state=42,
)

# Sample composition (labels are 0 for bkg, so non-zero entries are signal).
n_train = len(y_train)
n_train_sig = np.count_nonzero(y_train)
print('No of train events', n_train)
print('No of signal events', n_train_sig)
print('fraction of bkg in training set', 1 - 1.*n_train_sig / n_train)

print('-----------------------------')

n_test = len(y_test)
n_test_sig = np.count_nonzero(y_test)
print('No of test events', n_test)
print('No of signal events', n_test_sig)
print('fraction of bkg in test set', 1 - 1.*n_test_sig / n_test)

# The event weights are not a training feature: move them out of the
# feature frames and keep them as separate series.
X_train_w = X_train.pop('weight')
X_test_w = X_test.pop('weight')

# Initialize the Gradient Boosting Classifier

In [None]:
# Gradient-boosted trees: 300 depth-3 trees with learning rate 0.01 and a fixed seed.
grbcl = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=3,
    random_state=0,
)
# The "weight" column split off earlier enters the fit as per-event sample weights.
grbcl.fit(X_train, y_train, sample_weight=X_train_w)

In [None]:
# Weighted mean accuracy on the training and on the held-out test sample.
train_score = grbcl.score(X_train, y_train, sample_weight=X_train_w)
test_score = grbcl.score(X_test, y_test, sample_weight=X_test_w)

print("Training set score: {:.3f}".format(train_score))
print("Test set score: {:.3f}".format(test_score))

In [None]:
# Relative importance of each input variable as reported by the fitted BDT.
importance = grbcl.feature_importances_
feature = np.arange(len(X_train.columns))

print(importance)
print(importance.sum())

# One marker per feature, in percent, with the column names as tick labels.
fig, ax = plt.subplots()
ax.set_title(r"feature importance ($m_\mathrm{H} = 85$ GeV)")
ax.plot(feature, importance*100, 'b*')
ax.set_xticks(feature)
ax.set_xticklabels(X_train.columns, rotation=90)
ax.set_xlabel('features')
ax.set_ylabel('feature importance [%]')
#plt.savefig("./plots/feature_importance")
plt.show()

In [None]:
# Hard class predictions on the held-out test sample.
y_pred = grbcl.predict(X_test)

print(len(y_pred))
print(len(y_test))

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true class, columns the predicted class. Passing the arguments in this
# order gives the intended orientation directly, without a transpose.
cm = confusion_matrix(y_test, y_pred)
print(cm)

cmap = plt.cm.Blues
# Normalise each row, so the entries of every true class sum to 1.
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.colorbar()
tick_marks = np.arange(2)
classes = ['bkg', 'sig']
plt.xticks(tick_marks, classes)
plt.yticks(tick_marks, classes)

# Write the percentage into every cell; switch to white text on dark cells.
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, str(np.round(cm[i, j]*100, 0))+' %',
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.title(r'Confusion Matrix ($m_\mathrm{H}$ = 85 GeV)')
plt.ylabel('true category')
plt.xlabel('predicted category')
# Adjust the layout only after all labels exist, so they are taken into account.
plt.tight_layout()
#plt.savefig("./plots/confusion_matrix")

plt.show()

In [None]:
# Re-attach the labels on copies of the feature frames; `assign` returns a
# new frame, so X_train / X_test themselves stay free of the "class" column.
X_train_df = X_train.assign(**{'class': y_train})
X_test_df = X_test.assign(**{'class': y_test})

In [None]:
def _features_of_class(labelled, label):
    """Return the rows of `labelled` whose 'class' equals `label`, without the 'class' column."""
    return labelled[labelled['class'] == label].drop(columns='class')


# Feature frames per sample and per class (0 <-> bkg, 1 <-> sig).
X_train_bkg = _features_of_class(X_train_df, 0)
X_train_sig = _features_of_class(X_train_df, 1)

X_test_bkg = _features_of_class(X_test_df, 0)
X_test_sig = _features_of_class(X_test_df, 1)



In [None]:
# evaluate test statistics on training set
a_bkg = grbcl.decision_function(X_train_bkg)
a_sig = grbcl.decision_function(X_train_sig)

# evaluate test statistics on test set
b_bkg = grbcl.decision_function(X_test_bkg)
b_sig = grbcl.decision_function(X_test_sig)



In [None]:
print(len(a_bkg))

# Common bin edges for all BDT-response histograms: 30 edges (29 bins) on the
# fixed range [-8, 5]. A data-driven range used to be computed here as well,
# but it was overwritten straight away by this fixed one, so only this
# assignment is kept.
binning = np.linspace(-8, 5, 30)


In [None]:
# Compare the normalised BDT-response distributions of the training sample
# (bars) with those of the test sample (points with error bars).
binw = binning[1] - binning[0]

# np.histogram counts events in [edge_i, edge_i+1). plt.bar centres a bar on
# its x value by default and the error bars extend x +/- binw/2, so both have
# to be drawn at the bin centres to cover the interval the counts belong to.
bin_centers = 0.5 * (binning[:-1] + binning[1:])

# training set part
N_a_bkg = np.histogram(a_bkg, bins=binning)[0]
norm_a_bkg = 1.*len(a_bkg)
N_a_sig = np.histogram(a_sig, bins=binning)[0]
norm_a_sig = 1.*len(a_sig)

plt.bar(bin_centers, N_a_bkg/norm_a_bkg, width=binw, label='train bkg', alpha=0.5)
plt.bar(bin_centers, N_a_sig/norm_a_sig, width=binw, label='train sig', alpha=0.5)


# test set part
N_bkg = np.histogram(b_bkg, bins=binning)[0]
norm_bkg = 1.*len(b_bkg)

N_sig = np.histogram(b_sig, bins=binning)[0]
norm_sig = 1.*len(b_sig)

# Vertical errors are the Poisson uncertainty sqrt(N), scaled like the counts.
plt.errorbar(bin_centers, N_bkg/norm_bkg, xerr=binw/2., label='test bkg',
             yerr=np.sqrt(N_bkg)/norm_bkg, fmt='.')
plt.errorbar(bin_centers, N_sig/norm_sig, xerr=binw/2., label='test sig',
             yerr=np.sqrt(N_sig)/norm_sig, fmt='.')

plt.ylabel(r'normalized event count: $\frac{1}{N} \, \frac{\mathrm{d} N}{\mathrm{d}t}$ ')
plt.xlabel(r'BDT response $t$ (decision function)')

plt.legend()
#plt.savefig("./plots/BDT_generalization_performance")

plt.show()

In [None]:
# calculate the number of events up to certain bin in t
# of the BDT response
sum_sig_cuts = []
sum_bkg_cuts = []

count = 0
for s in N_sig :
    count += s
    sum_sig_cuts.append(count)

count = 0
for s in N_bkg :
    count += s
    sum_bkg_cuts.append(count)


In [None]:
# Efficiencies and purity as a function of the cut position t_cut, where an
# event is selected if its BDT response is >= t_cut. The lower edge of every
# bin is used as a cut position.
cut_positions = binning[:-1]

# sum_*_cuts[i] counts the events up to and including bin i. Subtracting the
# content of bin i itself leaves the events below the lower edge of bin i,
# i.e. exactly the events rejected by a cut at cut_positions[i].
sig_rejected = np.array(sum_sig_cuts) - N_sig
bkg_rejected = np.array(sum_bkg_cuts) - N_bkg

sig_selected = norm_sig - sig_rejected
bkg_selected = norm_bkg - bkg_rejected

sig_eff = sig_selected / norm_sig
bkg_eff = bkg_selected / norm_bkg

# Purity of the sample that survives the cut; NaN (a gap in the curve) where
# no event survives, instead of a 0/0 warning.
# NOTE(review): these are raw test-event counts; confirm whether the per-event
# weights should enter the purity.
all_selected = sig_selected + bkg_selected
purity = np.divide(sig_selected, all_selected,
                   out=np.full_like(all_selected, np.nan),
                   where=all_selected > 0)

plt.plot(cut_positions, sig_eff, 'co-', label='signal efficiency')
plt.plot(cut_positions, bkg_eff, 'bo-', label='background efficiency')
plt.plot(cut_positions, purity, 'go-', label='signal purity')
plt.plot(cut_positions, purity*sig_eff, 'ro-', label='signal efficiency * purity')
plt.legend()
plt.ylabel('efficiency (purity)')
plt.xlabel(r'BDT response $t$ (decision function)')
plt.xlim(-6.655,2.31)

#plt.savefig("./plots/BDT_eff_purity")
plt.show()

# Save Classifier

In [None]:
import pickle
# Serialise the fitted BDT to disk.
model_path = 'BDT_85higgs1.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(grbcl, model_file)

# Visualize Decision Tree

In [None]:
from sklearn.tree import export_graphviz
import graphviz

In [None]:
# A single depth-2 decision tree fitted on the same training inputs and
# weights as the BDT; `fit` returns the estimator itself.
tree = DecisionTreeClassifier(random_state=0, max_depth=2).fit(
    X_train, y_train, sample_weight=X_train_w.values
)
tree

In [None]:
# Write the fitted tree to 'tree.dot' in Graphviz DOT format.
# feature_names is passed as a plain list rather than a pandas Index.
# NOTE(review): some scikit-learn releases only accept a list here - confirm
# against the installed version.
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=list(X_train.columns),
    class_names=['bkg', 'sig'],
    filled=True,
    impurity=False,
)



In [None]:
# Read the DOT description back in and render it inline
# (the Source object is the cell's last expression, so it is displayed).
with open('tree.dot') as dot_file:
    dot_graph = dot_file.read()

graphviz.Source(dot_graph)


To convert the `.dot` file to PostScript, run:

    dot -Tps tree.dot -o myTree.ps