# 1. Higgs@L3_BDT

## Prerequisites
This notebook does not need any data from other code executions.
It can be run directly after downloading the data sets.

## BDT training
A BDT is trained for each of the Higgs mass hypotheses.

## BDT evaluation
Later on, some plots describing a BDT are produced.

## Tree visualization
Independently of the previous calculations, a simple tree visualization is performed.

In [None]:
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import helpers
from load_data import data, mc_higgs_models, mc_no_higgs_frames

## Obtain a classifier for each mass hypothesis

In [None]:
def getTrainAndTest(higgs_mass):
    """Build the stratified train/test split for one Higgs mass hypothesis.

    All frames in ``mc_no_higgs_frames`` are concatenated with the frame
    stored under ``mc_higgs_models[higgs_mass]``. Only the columns listed in
    ``helpers.kinematical_vars`` plus ``"class"`` and ``"weight"`` are kept.

    Args:
        higgs_mass: Key into ``mc_higgs_models`` selecting the signal sample.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, X_train_w, X_test_w)``:
        the feature frames (without the ``"class"`` and ``"weight"`` columns),
        the ``"class"`` labels, and the per-event ``"weight"`` columns of the
        training and test partitions.
    """
    selected_columns = helpers.kinematical_vars + ["class", "weight"]
    background = pd.concat(mc_no_higgs_frames)
    signal = mc_higgs_models[higgs_mass]
    events = pd.concat([background, signal])[selected_columns]

    # The label column is removed from the features but drives stratification.
    labels = events.pop("class")

    # Fixed seed so repeated calls reproduce the same 65 % / 35 % partition.
    X_train, X_test, y_train, y_test = train_test_split(
        events,
        labels,
        train_size=0.65,
        random_state=42,
        stratify=labels,
    )

    # Weights travel through the split with the features and are detached
    # afterwards so they are not used as an input variable.
    train_weights = X_train.pop("weight")
    test_weights = X_test.pop("weight")

    return X_train, X_test, y_train, y_test, train_weights, test_weights
     

def fitClassifier(higgs_mass):
    """Fit a gradient boosting classifier for one Higgs mass hypothesis.

    The classifier's parameters are printed before the fit. Only the training
    partition returned by ``getTrainAndTest`` is used, with its per-event
    weights passed as ``sample_weight``.

    Args:
        higgs_mass: Mass hypothesis key forwarded to ``getTrainAndTest``.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    hyperparameters = dict(
        max_depth=3,
        random_state=0,
        learning_rate=0.01,
        n_estimators=300,
    )
    grbcl = GradientBoostingClassifier(**hyperparameters)
    print(
        "A classifier with the following parameters will be fitted "
        f"for the mass hypothesis {higgs_mass}:\n\n",
        grbcl.get_params(),
    )

    # The test partition is not needed for fitting.
    features, _, labels, _, weights, _ = getTrainAndTest(higgs_mass)
    grbcl.fit(features, labels, sample_weight=weights)
    return grbcl

def getClassifier(higgs_mass):
    """Return the classifier for ``higgs_mass``, using a pickle cache on disk.

    If ``tmp/BDT_{higgs_mass}.pkl`` exists it is loaded and returned.
    Otherwise a classifier is fitted with ``fitClassifier``, written to that
    path (creating the ``tmp`` directory if needed) and returned.

    Args:
        higgs_mass: Mass hypothesis key; also used in the cache file name.

    Returns:
        The cached or newly fitted classifier.
    """
    bdt_path = f"tmp/BDT_{higgs_mass}.pkl"

    if os.path.exists(bdt_path):
        # NOTE: unpickling can execute arbitrary code. This is only acceptable
        # because the cache files are written by this notebook itself; do not
        # point this at files from an untrusted source.
        with open(bdt_path, "rb") as fid:
            return pickle.load(fid)

    grbcl = fitClassifier(higgs_mass)
    # Without this, the first run fails if the cache directory is missing.
    os.makedirs(os.path.dirname(bdt_path), exist_ok=True)
    with open(bdt_path, "wb") as fid:
        pickle.dump(grbcl, fid)
    return grbcl

In [None]:
# Obtain (load or fit) one BDT per mass hypothesis and report its weighted
# score on the training and test partitions.
bdts = {}
# For a quick check, iterate over a single hypothesis such as ["higgs_85"].
for higgs_mass in mc_higgs_models:
    classifier = getClassifier(higgs_mass)
    bdts[higgs_mass] = classifier

    # The split is seeded, so this reproduces the partition used for fitting.
    X_train, X_test, y_train, y_test, X_train_w, X_test_w = getTrainAndTest(higgs_mass)

    train_score = classifier.score(X_train, y_train, sample_weight=X_train_w)
    test_score = classifier.score(X_test, y_test, sample_weight=X_test_w)
    print(f"{higgs_mass} training score: {100*train_score:.2f} %.")
    print(f"{higgs_mass} test set score: {100*test_score:.2f} %.\n")

In [None]:
# Plot the feature importances of every trained BDT in one figure.
for higgs_mass, bdt in bdts.items():
    # Raw f-string: "\m" is not a valid escape sequence in a normal string.
    plt.plot(100*bdt.feature_importances_, helpers.symbol[higgs_mass],
             label=rf"$m_\mathrm{{H}}$ = {higgs_mass[-2:]} GeV")
plt.title("BDT feature importance")
plt.xlabel("features")
plt.ylabel("feature importance [%]")
# The training features are exactly helpers.kinematical_vars (the "class" and
# "weight" columns are popped in getTrainAndTest), so the tick labels do not
# need to rely on an X_train left over from another cell.
feature_names = helpers.kinematical_vars
plt.xticks(np.arange(len(feature_names)), feature_names, rotation=90)
plt.legend()
plt.savefig("plots/feature_importance.png")

## Look further into one of the Gradient Boosting Classifiers

In [None]:
# Pick one mass hypothesis for the detailed studies in the following cells.
higgs_mass = "higgs_85"

# Its classifier and the matching (seeded) train/test partition.
bdt = bdts[higgs_mass]
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_train_w,
    X_test_w,
) = getTrainAndTest(higgs_mass)

In [None]:
# Confusion matrix of the selected BDT on the test partition.
y_pred = bdt.predict(X_test)

# scikit-learn's argument order is (y_true, y_pred): rows are true classes,
# columns are predicted classes.
cm_total_counts = confusion_matrix(y_test, y_pred)
# Normalize each row so entries are the fraction of events of a true class.
cm = cm_total_counts / cm_total_counts.sum(axis=1, keepdims=True)

plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
classes = ["bkg", "sig"]
plt.xticks(np.arange(len(classes)), classes)
plt.yticks(np.arange(len(classes)), classes)

# Use white text on dark cells and black text on light cells.
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, f"{np.round(cm[i, j]*100, 0)} %",
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

# Raw f-string: "\m" is not a valid escape sequence in a normal string.
plt.title(rf"Confusion Matrix ($m_\mathrm{{H}}$ = {higgs_mass[-2:]} GeV)")
plt.ylabel("true category")
plt.xlabel("predicted category")
plt.savefig(f"plots/confusion_matrix_{higgs_mass}.png")

### The BDT response

In [None]:
# Decision-function output of the BDT, split by label (0 is named "bkg" and
# 1 is named "sig" here) for the training and the test partition.
train_bkg, train_sig = (
    bdt.decision_function(X_train[y_train == label]) for label in (0, 1)
)
test_bkg, test_sig = (
    bdt.decision_function(X_test[y_test == label]) for label in (0, 1)
)

In [None]:
# Common bin edges spanning the full range of all four response samples.
responses = [train_bkg, train_sig, test_bkg, test_sig]
lowest = min(r.min() for r in responses)
highest = max(r.max() for r in responses)
binning = np.linspace(lowest, highest, 30)

# Bin centres and the (uniform) bin width.
x = 0.5 * (binning[:-1] + binning[1:])
w = binning[1] - binning[0]

In [None]:
# Compare the BDT response on the training sample (bars) with the response on
# the test sample (points with error bars), each normalized to its sample size.
h_train_bkg = np.histogram(train_bkg, bins=binning)[0]
h_train_sig = np.histogram(train_sig, bins=binning)[0]

# The bin width is stored in `w`; the previously used `binw` is never defined.
plt.bar(x, h_train_bkg/len(train_bkg), width=w, label="train bkg", alpha=0.5)
plt.bar(x, h_train_sig/len(train_sig), width=w, label="train sig", alpha=0.5)


h_test_bkg = np.histogram(test_bkg, bins=binning)[0]
h_test_sig = np.histogram(test_sig, bins=binning)[0]

# Vertical errors are sqrt(count) scaled like the bin contents; horizontal
# errors span half a bin on each side.
plt.errorbar(x, h_test_bkg/len(test_bkg), xerr=w/2., label="test bkg", fmt=".",
            yerr=np.sqrt(h_test_bkg)/len(test_bkg))
plt.errorbar(x, h_test_sig/len(test_sig), xerr=w/2., label="test sig", fmt='.',
            yerr=np.sqrt(h_test_sig)/len(test_sig))

plt.xlabel("BDT response")
plt.ylabel("Normalized event count")
plt.title("BDT response for signal and background")

plt.legend()
plt.savefig("plots/BDT_generalization_performance.png")

In [None]:
# Efficiencies: fraction of test events with a response above each bin.
sig_eff = 1 - np.cumsum(h_test_sig)/len(test_sig)
bkg_eff = 1 - np.cumsum(h_test_bkg)/len(test_bkg)

# Per-bin signal fraction. Bins without any test event would be 0/0; mark
# them as NaN explicitly (matplotlib leaves a gap) instead of triggering a
# RuntimeWarning.
h_test_total = h_test_sig + h_test_bkg
sig_purity = np.divide(
    h_test_sig,
    h_test_total,
    out=np.full(h_test_total.shape, np.nan),
    where=h_test_total > 0,
)

plt.plot(x, sig_eff, "co-", label="signal efficiency")
plt.plot(x, bkg_eff, "bo-", label="background efficiency")
plt.plot(x, sig_purity, "go-", label="signal purity")
plt.plot(x, sig_eff * sig_purity,
                        "ro-", label="signal efficiency * purity")
plt.legend()
plt.xlabel("BDT response")
plt.xlim(min(x)-w/2, max(test_sig)+w)
plt.title("BDT test efficiency and purity")
plt.savefig("plots/BDT_eff_purity.png")

## Visualization of a Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz

In [None]:
# Fit a single shallow decision tree on the "higgs_85" training partition,
# independent of the BDTs above, so that it can be drawn.
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_train_w,
    X_test_w,
) = getTrainAndTest("higgs_85")
tree = DecisionTreeClassifier(random_state=0, max_depth=2)
tree.fit(X_train, y_train, sample_weight=X_train_w.to_numpy())

In [None]:
# Write the fitted tree as a Graphviz dot file and render it in the notebook.
export_graphviz(
    tree,
    out_file="plots/tree.dot",
    class_names=["bkg", "sig"],
    # A plain list instead of a pandas Index: some scikit-learn releases are
    # believed to reject an Index for this parameter, and a list is accepted
    # by all of them.
    feature_names=list(X_train.columns),
    impurity=False,
    filled=True,
)

with open("plots/tree.dot") as f:
    dot_graph = f.read()
# Last expression of the cell, so the notebook displays the rendered graph.
graphviz.Source(dot_graph)
