**MSc Computational Physics AUTH**<br>
**Computational Quantum Physics**<br>
**Academic Year: 2023-2024**<br>
**Semester 2**<br>
**Implemented by: Ioannis Stergakis**<br>
**AEM: 4439**<br>

# **FINAL PROJECT 2:**<br>
# ***Part 2: Applying artificial neural network (TensorFlow) classification models to supersymmetric Higgs boson detection***

**Jupyter Notebook 2**<br> 

**Contents:**<br>
*->1. Data importing and preprocessing*<br>
*->2. TensorFlow classification algorithm*<br>

## **1. Data Importing and Preprocessing**

In [1]:
# Numpy module import
import numpy as np

# Module for data import and presentation
import pandas as pd
from pandas import DataFrame

# Module for detecting and imputing missing data
import missingno as msno

# ML package for splitting the dataframe into train and test set
from sklearn.model_selection import train_test_split

# ML package for data scaling
from sklearn.preprocessing import StandardScaler

### **1.1 Data importation, imputation and presentation**

#### **A. Importation**

In [None]:
# Reading the Higgs detection data; the file has no header row,
# so the columns get integer labels
df_Higgs = pd.read_csv(filepath_or_buffer="HIGGS_8K.csv", header=None)

# Displaying the loaded dataframe
df_Higgs


#### **B. Imputation**

In [None]:
# Showing the first element of the 18th column (label 17 in Python enumeration).
# The elements of this column are stored as str, and this first one
# cannot be converted to float.
df_Higgs.at[0, 17]

In [None]:
# To avoid errors in future calculations, the non-numeric first element of
# column 17 is replaced with 0.
# A single .loc call is used instead of chained indexing (df[17][0] = ...):
# chained assignment raises a warning in current pandas and does not modify
# the dataframe at all when Copy-on-Write is enabled.
df_Higgs.loc[0, 17] = 0

# Then the whole column is cast to float in one vectorised step. Unlike an
# element-by-element loop, this also changes the column dtype from object
# to float64, so later numeric operations treat it like the other columns.
df_Higgs[17] = df_Higgs[17].astype(float)

# Number of rows (kept because the original cell defined this name)
n = len(df_Higgs[17])

In [None]:
# Visualizing the missing data of the raw dataframe (before any imputation)
# with the bar chart of the missingno package
msno.bar(df_Higgs)

In [None]:
# Mean imputation: every missing value is replaced by the mean of its column.
# NOTE(review): the means are computed on the whole dataframe, i.e. before the
# train/test split done later in the notebook, and the class column (0) is
# passed through fillna as well.
column_means = df_Higgs.mean()
df_Higgs_imputed = df_Higgs.fillna(value=column_means)

# Confirming the imputation with the same missing-data bar chart
msno.bar(df_Higgs_imputed)

#### **C. Presentation**

In [71]:
# Class column: the first column, all rows
Higgs_class = df_Higgs_imputed.iloc[:, 0]

# All explanatory variables: every remaining column, all rows
Higgs = df_Higgs_imputed.iloc[:, 1:]

# Low level quantities: columns 2 to 22 (1 to 21 in Python enumeration),
# which are the first 21 explanatory columns
Higgs_low = Higgs.iloc[:, :21]

# High level quantities: columns 23 to 29 (22 to 28 in Python enumeration),
# which are the explanatory columns after the first 21
Higgs_high = Higgs.iloc[:, 21:]

In [None]:
# Showing the first rows of all explanatory variables
# (low and high level quantities together)
Higgs.head()

In [None]:
# Showing the first rows of the low level quantities
Higgs_low.head()

In [None]:
# Showing the first rows of the high level quantities
Higgs_high.head()

In [None]:
# Showing the first rows of the class column
Higgs_class.head()

### **1.2 Data splitting into train and test sets**

In [13]:
# Options shared by the three splits below:
#   test_size    -> fraction of the complete dataset kept for testing (25%)
#   random_state -> seed of the pseudorandom shuffling, fixed so that the
#                   same split is produced every time the script runs
split_options = {"test_size": 0.25, "random_state": 50}

# All explanatory quantities
Higgs_train, Higgs_test, Higgs_class_train, Higgs_class_test = train_test_split(
    Higgs, Higgs_class, **split_options
)

# Low level quantities only
Higgs_low_train, Higgs_low_test, Higgs_low_class_train, Higgs_low_class_test = train_test_split(
    Higgs_low, Higgs_class, **split_options
)

# High level quantities only
Higgs_high_train, Higgs_high_test, Higgs_high_class_train, Higgs_high_class_test = train_test_split(
    Higgs_high, Higgs_class, **split_options
)

### **1.3 Feature Scaling**

In [14]:
# A single StandardScaler object is re-fitted for each feature group.
# Each test set is transformed right after the fit on its own training set,
# so it is scaled with that training set's statistics. After this cell the
# scaler holds the statistics of the high level training set.
scaler = StandardScaler()

# All explanatory quantities: fit on the train set, scale train and test
Higgs_train_scaled = scaler.fit(Higgs_train).transform(Higgs_train)
Higgs_test_scaled = scaler.transform(Higgs_test)

# Low level quantities: fit on the train set, scale train and test
Higgs_low_train_scaled = scaler.fit(Higgs_low_train).transform(Higgs_low_train)
Higgs_low_test_scaled = scaler.transform(Higgs_low_test)

# High level quantities: fit on the train set, scale train and test
Higgs_high_train_scaled = scaler.fit(Higgs_high_train).transform(Higgs_high_train)
Higgs_high_test_scaled = scaler.transform(Higgs_high_test)

## **2. TensorFlow classification algorithm**

In [15]:
# The plotting module matplotlib
import matplotlib.pyplot as plt

# Calculating the confusion matrix and the accuracy 
from sklearn.metrics import confusion_matrix, accuracy_score

# Display the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Summarize the results of the classification
from sklearn.metrics import classification_report

# Creating the ROC curve and calculating the AUC score
from sklearn.metrics import roc_curve,roc_auc_score

# Importing the ANN module we will use
import tensorflow as tf

# Packages for the build of the ANN
from keras.models import Sequential
from keras.layers import Dense, Input


### **2.1 Classification using all the explanatory quantities**

#### A. Building the ANNs

In [16]:
# NN for all explanatory quantities, declared as one list of layers:
# 28 inputs -> one hidden layer of 20 ReLU units -> 1 sigmoid output unit
Higgs_NN = tf.keras.models.Sequential([
    Input(shape=(28,)),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [17]:
# DN for all explanatory quantities, declared as one list of layers:
# 28 inputs -> two hidden layers of 20 ReLU units each -> 1 sigmoid output unit
Higgs_DN = tf.keras.models.Sequential([
    Input(shape=(28,)),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(20, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [18]:
# Compiling the NN first and then the DN with identical settings:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
for model in (Higgs_NN, Higgs_DN):
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

In [None]:
# Printing the summary of the NN built for all explanatory quantities
Higgs_NN.summary()

In [None]:
# Printing the summary of the DN built for all explanatory quantities
Higgs_DN.summary()

#### B. Training the ANNs

In [None]:
# Fitting the NN on the scaled training set:
# 150 passes over the data, in mini-batches of 100 samples
Higgs_NN.fit(
    x=Higgs_train_scaled,
    y=Higgs_class_train,
    batch_size=100,
    epochs=150,
)

In [None]:
# Fitting the DN on the scaled training set:
# 150 passes over the data, in mini-batches of 100 samples
Higgs_DN.fit(
    x=Higgs_train_scaled,
    y=Higgs_class_train,
    batch_size=100,
    epochs=150,
)

#### C. Making the predictions and evaluating the model

In [None]:
# Sigmoid outputs of the NN for the scaled test set
Higgs_NN_pred = Higgs_NN.predict(x=Higgs_test_scaled)

# Displaying the predictions
Higgs_NN_pred

In [None]:
# Sigmoid outputs of the DN for the scaled test set
Higgs_DN_pred = Higgs_DN.predict(x=Higgs_test_scaled)

# Displaying the predictions
Higgs_DN_pred

In [None]:
# NOTE: this cell uses the raw sigmoid outputs, so it has to run before the
# cells below that overwrite the *_pred arrays with True/False values.

# ROC curve points of both models (the returned thresholds are not kept)
fpr_Higgs_NN, tpr_Higgs_NN = roc_curve(Higgs_class_test, Higgs_NN_pred)[:2]
fpr_Higgs_DN, tpr_Higgs_DN = roc_curve(Higgs_class_test, Higgs_DN_pred)[:2]

# Area under each ROC curve
AUC_Higgs_NN = roc_auc_score(Higgs_class_test, Higgs_NN_pred)
AUC_Higgs_DN = roc_auc_score(Higgs_class_test, Higgs_DN_pred)

# One line per model, NN first; the AUC is written in the legend entry
for fpr, tpr, curve_label in (
    (fpr_Higgs_NN, tpr_Higgs_NN, "NN\nAUC: %.4f"%AUC_Higgs_NN),
    (fpr_Higgs_DN, tpr_Higgs_DN, "DN\nAUC: %.4f"%AUC_Higgs_DN),
):
    plt.plot(fpr, tpr, label=curve_label)

# Axes, title and legend of the figure
plt.title("ANNs models for all explanatory quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=8)
plt.show()


# Printing the AUC scores with full precision
print(f'NN model AUC score: {AUC_Higgs_NN}')
print(f'DN model AUC score: {AUC_Higgs_DN}')

In [None]:
# Transforming the NN predictions into True/False with a 0.5 threshold.
# The sigmoid outputs stored under this name are overwritten.
Higgs_NN_pred = np.greater_equal(Higgs_NN_pred, 0.5)

# Displaying the boolean predictions
Higgs_NN_pred

In [None]:
# Transforming the DN predictions into True/False with a 0.5 threshold.
# The sigmoid outputs stored under this name are overwritten.
Higgs_DN_pred = np.greater_equal(Higgs_DN_pred, 0.5)

# Displaying the boolean predictions
Higgs_DN_pred

In [None]:
# Confusion matrix of the NN on the test set
cm_Higgs_NN = confusion_matrix(Higgs_class_test, Higgs_NN_pred)

# Printing the matrix and, on the next line, the accuracy score
print(cm_Higgs_NN, accuracy_score(Higgs_class_test, Higgs_NN_pred), sep="\n")

# Plotting the confusion matrix with the class labels 0 and 1
disp_cm_Higgs_NN = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_NN,
    display_labels=["0","1"],
)
disp_cm_Higgs_NN.plot()

In [None]:
# Confusion matrix of the DN on the test set
cm_Higgs_DN = confusion_matrix(Higgs_class_test, Higgs_DN_pred)

# Printing the matrix and, on the next line, the accuracy score
print(cm_Higgs_DN, accuracy_score(Higgs_class_test, Higgs_DN_pred), sep="\n")

# Plotting the confusion matrix with the class labels 0 and 1
disp_cm_Higgs_DN = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_DN,
    display_labels=["0","1"],
)
disp_cm_Higgs_DN.plot()

In [None]:
# Evaluate the NN model on test data: evaluate returns the loss followed by
# the compiled metric (accuracy)
loss, accuracy = Higgs_NN.evaluate(Higgs_test_scaled, Higgs_class_test)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

# Evaluate the DN model on test data as well, as is done for the low level
# and high level models in sections 2.2 and 2.3
loss, accuracy = Higgs_DN.evaluate(Higgs_test_scaled, Higgs_class_test)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

### **2.2 Classification using only the low level explanatory quantities**

#### A. Building the ANNs

In [32]:
# NN for the low level quantities, declared as one list of layers:
# 21 inputs -> one hidden layer of 30 ReLU units -> 1 sigmoid output unit
Higgs_low_NN = tf.keras.models.Sequential([
    Input(shape=(21,)),
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [33]:
# DN for the low level quantities, declared as one list of layers:
# 21 inputs -> two hidden layers of 30 ReLU units each -> 1 sigmoid output unit
Higgs_low_DN = tf.keras.models.Sequential([
    Input(shape=(21,)),
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [34]:
# Compiling the NN first and then the DN with identical settings:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
for model in (Higgs_low_NN, Higgs_low_DN):
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

In [None]:
# Summary of NN (low level quantities)
Higgs_low_NN.summary()

# Summary of DN (low level quantities); sections 2.1 and 2.3 print the
# summaries of both models, so the DN is summarized here as well
Higgs_low_DN.summary()

#### B. Training the ANNs

In [None]:
# Fitting the NN on the scaled low level training set:
# 150 passes over the data, in mini-batches of 100 samples
Higgs_low_NN.fit(
    x=Higgs_low_train_scaled,
    y=Higgs_low_class_train,
    batch_size=100,
    epochs=150,
)

In [None]:
# Fitting the DN on the scaled low level training set:
# 150 passes over the data, in mini-batches of 100 samples
Higgs_low_DN.fit(
    x=Higgs_low_train_scaled,
    y=Higgs_low_class_train,
    batch_size=100,
    epochs=150,
)

#### C. Making the predictions and evaluating the model

In [None]:
# Sigmoid outputs of the NN for the scaled low level test set
Higgs_low_NN_pred = Higgs_low_NN.predict(x=Higgs_low_test_scaled)

# Displaying the predictions
Higgs_low_NN_pred

In [None]:
# Sigmoid outputs of the DN for the scaled low level test set
Higgs_low_DN_pred = Higgs_low_DN.predict(x=Higgs_low_test_scaled)

# Displaying the predictions
Higgs_low_DN_pred

In [None]:
# NOTE: this cell uses the raw sigmoid outputs, so it has to run before the
# cells below that overwrite the *_pred arrays with True/False values.

# ROC curve points of both models (the returned thresholds are not kept)
fpr_Higgs_low_NN, tpr_Higgs_low_NN = roc_curve(Higgs_low_class_test, Higgs_low_NN_pred)[:2]
fpr_Higgs_low_DN, tpr_Higgs_low_DN = roc_curve(Higgs_low_class_test, Higgs_low_DN_pred)[:2]

# Area under each ROC curve
AUC_Higgs_low_NN = roc_auc_score(Higgs_low_class_test, Higgs_low_NN_pred)
AUC_Higgs_low_DN = roc_auc_score(Higgs_low_class_test, Higgs_low_DN_pred)

# One line per model, NN first; the AUC is written in the legend entry
for fpr, tpr, curve_label in (
    (fpr_Higgs_low_NN, tpr_Higgs_low_NN, "NN\nAUC: %.4f"%AUC_Higgs_low_NN),
    (fpr_Higgs_low_DN, tpr_Higgs_low_DN, "DN\nAUC: %.4f"%AUC_Higgs_low_DN),
):
    plt.plot(fpr, tpr, label=curve_label)

# Axes, title and legend of the figure
plt.title("ANNs models for low level quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=8)
plt.show()


# Printing the AUC scores with full precision
print(f'NN model AUC score: {AUC_Higgs_low_NN}')
print(f'DN model AUC score: {AUC_Higgs_low_DN}')

In [None]:
# Transforming the NN predictions into True/False with a 0.5 threshold.
# The sigmoid outputs stored under this name are overwritten.
Higgs_low_NN_pred = np.greater_equal(Higgs_low_NN_pred, 0.5)

# Displaying the boolean predictions
Higgs_low_NN_pred

In [None]:
# Transforming the DN predictions into True/False with a 0.5 threshold.
# The sigmoid outputs stored under this name are overwritten.
Higgs_low_DN_pred = np.greater_equal(Higgs_low_DN_pred, 0.5)

# Displaying the boolean predictions
Higgs_low_DN_pred

In [None]:
# Confusion matrix of the low level NN on the test set
cm_Higgs_low_NN = confusion_matrix(Higgs_low_class_test, Higgs_low_NN_pred)

# Printing the matrix and, on the next line, the accuracy score
print(cm_Higgs_low_NN, accuracy_score(Higgs_low_class_test, Higgs_low_NN_pred), sep="\n")

# Plotting the confusion matrix with the class labels 0 and 1
disp_cm_Higgs_low_NN = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_low_NN,
    display_labels=["0","1"],
)
disp_cm_Higgs_low_NN.plot()

In [None]:
# Confusion matrix of the low level DN on the test set
cm_Higgs_low_DN = confusion_matrix(Higgs_low_class_test, Higgs_low_DN_pred)

# Printing the matrix and, on the next line, the accuracy score
print(cm_Higgs_low_DN, accuracy_score(Higgs_low_class_test, Higgs_low_DN_pred), sep="\n")

# Plotting the confusion matrix with the class labels 0 and 1
disp_cm_Higgs_low_DN = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_low_DN,
    display_labels=["0","1"],
)
disp_cm_Higgs_low_DN.plot()

In [None]:
# Loss and accuracy of the low level NN on the scaled test set
loss, accuracy = Higgs_low_NN.evaluate(
    x=Higgs_low_test_scaled,
    y=Higgs_low_class_test,
)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

In [None]:
# Loss and accuracy of the low level DN on the scaled test set
loss, accuracy = Higgs_low_DN.evaluate(
    x=Higgs_low_test_scaled,
    y=Higgs_low_class_test,
)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

### **2.3 Classification using only the high level explanatory quantities**

#### A. Building the ANNs

In [48]:
# NN for the high level quantities, declared as one list of layers:
# 7 inputs -> one hidden layer of 12 ReLU units -> 1 sigmoid output unit
Higgs_high_NN = tf.keras.models.Sequential([
    Input(shape=(7,)),
    tf.keras.layers.Dense(12, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [49]:
# DN for the high level quantities, declared as one list of layers:
# 7 inputs -> two hidden layers of 12 ReLU units each -> 1 sigmoid output unit
Higgs_high_DN = tf.keras.models.Sequential([
    Input(shape=(7,)),
    tf.keras.layers.Dense(12, activation='relu'),
    tf.keras.layers.Dense(12, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

In [50]:
# Compiling the NN first and then the DN with identical settings:
# Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
for model in (Higgs_high_NN, Higgs_high_DN):
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])

In [None]:
# Printing the summary of the NN built for the high level quantities
Higgs_high_NN.summary()

In [None]:
# Printing the summary of the DN built for the high level quantities
Higgs_high_DN.summary()

#### B. Training the ANNs

In [None]:
# Fitting the NN on the scaled high level training set:
# 150 passes over the data, in mini-batches of 100 samples
Higgs_high_NN.fit(
    x=Higgs_high_train_scaled,
    y=Higgs_high_class_train,
    batch_size=100,
    epochs=150,
)

In [None]:
# Fitting the DN on the scaled high level training set:
# 150 passes over the data, in mini-batches of 100 samples
Higgs_high_DN.fit(
    x=Higgs_high_train_scaled,
    y=Higgs_high_class_train,
    batch_size=100,
    epochs=150,
)

#### C. Making the predictions and evaluating the model

In [None]:
# Sigmoid outputs of the NN for the scaled high level test set
Higgs_high_NN_pred = Higgs_high_NN.predict(x=Higgs_high_test_scaled)

# Displaying the predictions
Higgs_high_NN_pred

In [None]:
# Sigmoid outputs of the DN for the scaled high level test set
Higgs_high_DN_pred = Higgs_high_DN.predict(x=Higgs_high_test_scaled)

# Displaying the predictions
Higgs_high_DN_pred

In [None]:
# NOTE: this cell uses the raw sigmoid outputs, so it has to run before the
# cells below that overwrite the *_pred arrays with True/False values.

# ROC curve points of both models (the returned thresholds are not kept)
fpr_Higgs_high_NN, tpr_Higgs_high_NN = roc_curve(Higgs_high_class_test, Higgs_high_NN_pred)[:2]
fpr_Higgs_high_DN, tpr_Higgs_high_DN = roc_curve(Higgs_high_class_test, Higgs_high_DN_pred)[:2]

# Area under each ROC curve
AUC_Higgs_high_NN = roc_auc_score(Higgs_high_class_test, Higgs_high_NN_pred)
AUC_Higgs_high_DN = roc_auc_score(Higgs_high_class_test, Higgs_high_DN_pred)

# One line per model, NN first; the AUC is written in the legend entry
for fpr, tpr, curve_label in (
    (fpr_Higgs_high_NN, tpr_Higgs_high_NN, "NN\nAUC: %.4f"%AUC_Higgs_high_NN),
    (fpr_Higgs_high_DN, tpr_Higgs_high_DN, "DN\nAUC: %.4f"%AUC_Higgs_high_DN),
):
    plt.plot(fpr, tpr, label=curve_label)

# Axes, title and legend of the figure
plt.title("ANNs models for high level quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=8)
plt.show()


# Printing the AUC scores with full precision
print(f'NN model AUC score: {AUC_Higgs_high_NN}')
print(f'DN model AUC score: {AUC_Higgs_high_DN}')

In [None]:
# Transforming the NN predictions into True/False with a 0.5 threshold.
# The sigmoid outputs stored under this name are overwritten.
Higgs_high_NN_pred = np.greater_equal(Higgs_high_NN_pred, 0.5)

# Displaying the boolean predictions
Higgs_high_NN_pred

In [None]:
# Transforming the DN predictions into True/False with a 0.5 threshold.
# The sigmoid outputs stored under this name are overwritten.
Higgs_high_DN_pred = np.greater_equal(Higgs_high_DN_pred, 0.5)

# Displaying the boolean predictions
Higgs_high_DN_pred

In [None]:
# Confusion matrix of the high level NN on the test set
cm_Higgs_high_NN = confusion_matrix(Higgs_high_class_test, Higgs_high_NN_pred)

# Printing the matrix and, on the next line, the accuracy score
print(cm_Higgs_high_NN, accuracy_score(Higgs_high_class_test, Higgs_high_NN_pred), sep="\n")

# Plotting the confusion matrix with the class labels 0 and 1
disp_cm_Higgs_high_NN = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_high_NN,
    display_labels=["0","1"],
)
disp_cm_Higgs_high_NN.plot()

In [None]:
# Confusion matrix of the high level DN on the test set
cm_Higgs_high_DN = confusion_matrix(Higgs_high_class_test, Higgs_high_DN_pred)

# Printing the matrix and, on the next line, the accuracy score
print(cm_Higgs_high_DN, accuracy_score(Higgs_high_class_test, Higgs_high_DN_pred), sep="\n")

# Plotting the confusion matrix with the class labels 0 and 1
disp_cm_Higgs_high_DN = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_high_DN,
    display_labels=["0","1"],
)
disp_cm_Higgs_high_DN.plot()

In [None]:
# Loss and accuracy of the high level NN on the scaled test set
loss, accuracy = Higgs_high_NN.evaluate(
    x=Higgs_high_test_scaled,
    y=Higgs_high_class_test,
)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

In [None]:
# Loss and accuracy of the high level DN on the scaled test set
loss, accuracy = Higgs_high_DN.evaluate(
    x=Higgs_high_test_scaled,
    y=Higgs_high_class_test,
)
print(f'Test loss: {loss}')
print(f'Test accuracy: {accuracy}')

### **2.4 Showing all ROC curves**

In [None]:
# All six ROC curves in one figure. Each entry holds the curve points, the
# extra positional format arguments (empty for the NN models, so they keep
# the default line; '--' for the DN models) and the legend label with the AUC.
roc_curves = [
    (fpr_Higgs_NN, tpr_Higgs_NN, (), "NN whole\nAUC: %.4f"%AUC_Higgs_NN),
    (fpr_Higgs_DN, tpr_Higgs_DN, ('--',), "DN whole\nAUC: %.4f"%AUC_Higgs_DN),
    (fpr_Higgs_low_NN, tpr_Higgs_low_NN, (), "NN low\nAUC: %.4f"%AUC_Higgs_low_NN),
    (fpr_Higgs_low_DN, tpr_Higgs_low_DN, ('--',), "DN low\nAUC: %.4f"%AUC_Higgs_low_DN),
    (fpr_Higgs_high_NN, tpr_Higgs_high_NN, (), "NN high\nAUC: %.4f"%AUC_Higgs_high_NN),
    (fpr_Higgs_high_DN, tpr_Higgs_high_DN, ('--',), "DN high\nAUC: %.4f"%AUC_Higgs_high_DN),
]

# Drawing the curves in the listed order
for fpr, tpr, fmt, curve_label in roc_curves:
    plt.plot(fpr, tpr, *fmt, label=curve_label)

# Axes, title and legend of the figure
plt.title("TensorFlow models")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=8)
plt.show()