# Tabular Data and BDTs: Classifying Collider Events
Authors: Javier Duarte, Raghav Kansal

## Download the data

In [None]:
import os
import requests

# Local path and remote URL of each sample, keyed by sample name.
files = {
    "bkg": (os.path.join("data", "ntuple_4mu_bkg.root"), "https://zenodo.org/record/3901869/files/ntuple_4mu_bkg.root"),
    "VV": (os.path.join("data", "ntuple_4mu_VV.root"), "https://zenodo.org/record/3901869/files/ntuple_4mu_VV.root"),
}

# Fetch every sample that is not already present on disk.
for name, url in files.values():
    if not os.path.exists(name):
        print(f"Downloading file {name}.")

        os.makedirs(os.path.dirname(name), exist_ok=True)

        # Write to a temporary name first: an interrupted or failed download
        # must not leave a file behind that the existence check above would
        # accept as complete on the next run.
        partial_name = f"{name}.part"

        # Stream the body in chunks instead of holding the whole file in memory.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail loudly on HTTP errors rather than saving an error page
            # under the name of the ROOT file.
            response.raise_for_status()
            with open(partial_name, "wb") as file:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    file.write(chunk)

        os.replace(partial_name, name)
    print(f"File {name} is downloaded")

## Loading `NumPy` arrays
Now we load two different `NumPy` arrays:
one corresponding to the VV signal and one corresponding to the background.

In [None]:
import uproot
import numpy as np
import h5py

# seed NumPy's global random number generator so reruns are repeatable
seed = 42
np.random.seed(seed)

# name of the tree to read from each ROOT file
treename = "HZZ4LeptonsAnalysisReduced"

# input variables, and the selection applied while reading them
VARS = ["f_mass4l", "f_massjj"]
cut = "(f_mass4l > -999) & (f_massjj > -999)"

# one feature matrix and one label vector per sample
Xs = []
Ys = []
for key, (path, _url) in files.items():
    # events from the "VV" sample are labelled 1, all others 0
    is_signal = key == "VV"
    with uproot.open(f"{path}:{treename}") as tree:
        arrays = tree.arrays(VARS, cut=cut, library="np")
        columns = [arrays[var] for var in VARS]
        # stack the variables along the last axis: one row per event
        Xs.append(np.stack(columns, axis=-1))
        Ys.append(np.full_like(columns[0], is_signal, dtype=float))

# merge all samples into a single dataset
X = np.concatenate(Xs)
Y = np.concatenate(Ys)

## Preprocessing the data

We will split the data into two parts (one for training+validation and one for testing). 
We will also apply "standard scaling" preprocessing: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html i.e. making the mean = 0 and the RMS = 1 for all input variables (based **only** on the training/validation dataset).

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the events for testing; the rest is for training+validation
X_train_val, X_test, Y_train_val, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=7,
)

# preprocessing: standard scaler
from sklearn.preprocessing import StandardScaler

# fit the scaling on the training+validation part only,
# then apply the same transformation to both parts
scaler = StandardScaler()
scaler.fit(X_train_val)
X_train_val, X_test = (scaler.transform(part) for part in (X_train_val, X_test))

## Define the model
We'll start with a dense (fully-connected) NN layer.
Our model will have a single fully-connected hidden layer with the same number of neurons as input variables. 
The weights are initialized using a small Gaussian random number. 
We will switch between linear and tanh activation functions for the hidden layer.
The output layer contains a single neuron in order to make predictions. 
It uses the sigmoid activation function in order to produce a probability output in the range of 0 to 1.

We are using the `binary_crossentropy` loss function during training, a standard loss function for binary classification problems. 
We will optimize the model with the Adam algorithm for stochastic gradient descent and we will collect accuracy metrics while the model is trained.
We will also define our early stopping criteria to prevent over-fitting and we will save the model based on the best `val_loss`.

In [None]:
# baseline keras model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Input, Activation, Dense, Convolution2D, MaxPooling2D, Dropout, Flatten

# one input per variable in VARS, connected directly to a single sigmoid output
inputs = Input(shape=(len(VARS),), name="input")
outputs = Dense(1, name="output", kernel_initializer="normal", activation="sigmoid")(inputs)

# create the model
model = Model(inputs=inputs, outputs=outputs)
# compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# print the model summary
model.summary()

# early stopping callback
from tensorflow.keras.callbacks import EarlyStopping

# stop training once val_loss has not improved for 10 consecutive epochs
early_stopping = EarlyStopping(monitor="val_loss", patience=10)

# model checkpoint callback
# this saves our model architecture + parameters into dense_model.h5
from tensorflow.keras.callbacks import ModelCheckpoint

# save_freq must be "epoch" here: an integer is interpreted as a number of
# batches, and val_loss is only computed at the end of an epoch, so
# save_best_only would have no monitored value to compare against.
model_checkpoint = ModelCheckpoint(
    "dense_model.h5",
    monitor="val_loss",
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode="auto",
    save_freq="epoch",
)

## Run training 
Here, we run the training.

In [None]:
# Fit the classifier on the training+validation sample.
# The last 25% of that sample is set aside by Keras for validation,
# which is what the val_loss-based callbacks monitor.
history = model.fit(
    x=X_train_val,
    y=Y_train_val,
    batch_size=1024,
    epochs=100,
    validation_split=0.25,
    callbacks=[early_stopping, model_checkpoint],
    verbose=0,  # switch to 1 for more verbosity
)

## Plot performance
Here, we plot the history of the training and the performance in a ROC curve

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
from sklearn.metrics import roc_curve, auc

plt.figure(figsize=(15, 10))

# training history: one panel per metric, with the training and
# validation curves overlaid as a function of the epoch
# (subplot index, history key, short label, legend location)
history_panels = (
    (1, "loss", "loss", "upper right"),
    (2, "accuracy", "acc", "upper left"),
)
for panel_index, metric, short_name, legend_loc in history_panels:
    ax = plt.subplot(2, 2, panel_index)
    ax.plot(history.history[metric], label=short_name)
    ax.plot(history.history["val_" + metric], label="val_" + short_name)
    ax.legend(loc=legend_loc)
    ax.set_xlabel("epoch")
    ax.set_ylabel(short_name)

# ROC curve evaluated on the held-out test sample
Y_predict = model.predict(X_test)
fpr, tpr, thresholds = roc_curve(Y_test, Y_predict)
roc_auc = auc(fpr, tpr)

ax = plt.subplot(2, 2, 3)
ax.plot(fpr, tpr, lw=2, color="cyan", label="auc = %.3f" % (roc_auc))
# the diagonal is drawn as the "random chance" reference line
ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="k", label="random chance")
ax.set(
    xlim=[0, 1.0],
    ylim=[0, 1.0],
    xlabel="false positive rate",
    ylabel="true positive rate",
    title="receiver operating curve",
)
ax.legend(loc="lower right")
plt.show()

# Plot NN output vs input variables
Here, we will plot the NN output and decision boundary as a function of the input variables.

In [None]:
# regular 200 x 200 grid spanning [-2, 2] in both input variables
grid_axis = np.linspace(-2, 2, 200)
myXI, myYI = np.meshgrid(grid_axis, grid_axis)
# print shape
print(myXI.shape)

# flatten the grid into one (x, y) row per point, evaluate the model on
# every point, and fold the predictions back into the grid shape
grid_points = np.column_stack((myXI.ravel(), myYI.ravel()))
myZI = model.predict(grid_points).reshape(myXI.shape)

The code below shows how to plot the NN output and decision boundary. Does it look optimal?

In [None]:
from matplotlib.colors import ListedColormap

plt.figure(figsize=(20, 7))

# colour maps: one for the filled contours, a two-colour one for the labels
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF"])

# left panel: contour map of the NN output
# right panel: decision boundary (NN output thresholded at 0.5)
# both are overlaid with the test data points
for panel_index, surface in enumerate((myZI, myZI > 0.5), start=1):
    ax = plt.subplot(1, 2, panel_index)
    cont_plot = ax.contourf(myXI, myYI, surface, cmap=cm, alpha=0.8)
    ax.scatter(X_test[:, 0], X_test[:, 1], c=Y_test, cmap=cm_bright, edgecolors="k")
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)
    ax.set_xlabel(VARS[0])
    ax.set_ylabel(VARS[1])
    plt.colorbar(cont_plot, ax=ax, boundaries=[0, 1], label="NN output")

**Question 1:** What happens if you increase/decrease the number of hidden layers?

**Question 2:** What happens if you increase/decrease the number of nodes per hidden layer?

**Question 3:** What happens if you add/remove dropout?

**Question 4:** What happens if you add/remove early stopping?