**MSc Computational Physics AUTH**<br>
**Computational Quantum Physics**<br>
**Academic Year: 2023-2024**<br>
**Semester 2**<br>
**Implemented by: Ioannis Stergakis**<br>
**AEM: 4439**<br>

# **FINAL PROJECT 2:**<br>
# ***Part 1: Applying machine learning classification models to supersymmetric Higgs boson detection***

**Jupyter Notebook 1**<br> 

**Contents:**<br>
*->1. Data importing and preprocessing*<br>
*->2. SVM classification algorithm*<br>
*->3. Random Forest classification algorithm*<br>

## **1. Data Importing and Preprocessing**

In [1]:
# Numpy module import
import numpy as np

# Module for data import and presentation
import pandas as pd
from pandas import DataFrame

# Module for detecting and imputing missing data
import missingno as msno

# ML package for splitting the dataframe into train and test set
from sklearn.model_selection import train_test_split

# ML package for data scaling
from sklearn.preprocessing import StandardScaler

### **1.1 Data importation, imputation and presentation**

#### **A. Importation**

In [None]:
# Load the Higgs detection data set.
# header=None: the first line of the file is read as data, not as column
# names, so the columns are labelled with integers starting at 0.
higgs_csv_path = "HIGGS_8K.csv"
df_Higgs = pd.read_csv(higgs_csv_path, header=None)

# Bare expression so the notebook renders the loaded frame
df_Higgs


#### **B. Imputation**

In [3]:
# Inspect the first entry of column 17 (the 18th column when counting from 1).
# According to the original note, every entry of this column was read as a
# string, and this particular one cannot be converted to float.
df_Higgs.loc[0, 17]

'0.000000000000000000e+00.1'

In [None]:
# The first entry of column 17 is a malformed string that float() cannot
# parse, so it is overwritten with 0 to avoid errors in later calculations.
# A single .loc call is used on purpose: chained indexing (df[col][row] = ...)
# assigns through an intermediate object and is not guaranteed to modify the
# DataFrame itself (it has no effect when pandas Copy-on-Write is active).
# NOTE(review): the trailing ".1" looks like a duplicated-column-name artifact;
# confirm that row 0 is a real event and not a leftover header line.
df_Higgs.loc[0, 17] = 0

# Cast the whole column to float in one vectorized step. Besides being faster
# than an element-by-element loop, this gives the column a numeric dtype
# instead of leaving it as a column of Python objects.
df_Higgs[17] = df_Higgs[17].astype(float)

In [None]:
# Visualizing the missing data of the raw frame with missingno's bar chart
msno.bar(df_Higgs)

In [None]:
# Mean imputation: each missing value is replaced by the mean of its column
column_means = df_Higgs.mean()
df_Higgs_imputed = df_Higgs.fillna(column_means)

# Redraw the missingno bar chart on the imputed frame to confirm the imputation
msno.bar(df_Higgs_imputed)

#### **C. Presentation**

In [8]:
# Column 0 holds the class of each event
Higgs_class = df_Higgs_imputed.iloc[:, 0]

# Every remaining column is an explanatory variable
Higgs = df_Higgs_imputed.iloc[:, 1:]

# The explanatory variables are split by position:
#   - the first 21 of them (columns 1 to 21 of the full frame) are the low level quantities
#   - the rest (columns 22 to 28 of the full frame) are the high level quantities
n_low_level = 21
Higgs_low = Higgs.iloc[:, :n_low_level]
Higgs_high = Higgs.iloc[:, n_low_level:]

In [None]:
# Showing the head of all explanatory variables (low and high level quantities) data
# (head() without an argument returns the first 5 rows)
Higgs.head()

In [None]:
# Showing the head of low level quantities data
# (head() without an argument returns the first 5 rows)
Higgs_low.head()

In [None]:
# Showing the head of high level quantities data
# (head() without an argument returns the first 5 rows)
Higgs_high.head()

In [None]:
# Showing the head of class data
# (head() without an argument returns the first 5 rows)
Higgs_class.head()

### **1.2 Data splitting into train and test sets**

In [13]:
# Split every feature set into a training and a test part.
# test_size = 0.25 : 25% of the complete data set is held out for testing
# random_state = 50: fixed seed of the pseudorandom shuffling, so that the
#                    same split is produced every time the script is run
_split_options = {"test_size": 0.25, "random_state": 50}

# All explanatory quantities
(Higgs_train, Higgs_test,
 Higgs_class_train, Higgs_class_test) = train_test_split(Higgs, Higgs_class, **_split_options)

# Low level quantities
(Higgs_low_train, Higgs_low_test,
 Higgs_low_class_train, Higgs_low_class_test) = train_test_split(Higgs_low, Higgs_class, **_split_options)

# High level quantities
(Higgs_high_train, Higgs_high_test,
 Higgs_high_class_train, Higgs_high_class_test) = train_test_split(Higgs_high, Higgs_class, **_split_options)

### **1.3 Feature Scaling**

In [14]:
# One StandardScaler object is reused for the three feature sets.
# For each set it is fitted on the training part only, and the statistics
# learned there are then applied to the matching test part. Each new fit
# replaces the previous one, so a test set must be transformed right after
# its own training set has been fitted.
scaler = StandardScaler()

# All explanatory quantities
Higgs_train_scaled = scaler.fit(Higgs_train).transform(Higgs_train)
Higgs_test_scaled = scaler.transform(Higgs_test)

# Low level quantities
Higgs_low_train_scaled = scaler.fit(Higgs_low_train).transform(Higgs_low_train)
Higgs_low_test_scaled = scaler.transform(Higgs_low_test)

# High level quantities
Higgs_high_train_scaled = scaler.fit(Higgs_high_train).transform(Higgs_high_train)
Higgs_high_test_scaled = scaler.transform(Higgs_high_test)

## **2. SVM classification algorithm**

In [15]:
# The plotting module matplotlib
import matplotlib.pyplot as plt

# The SVM classifier package
from sklearn.svm import SVC

# Calculating the confusion matrix and the accuracy 
from sklearn.metrics import confusion_matrix, accuracy_score

# Display the confusion matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Summarize the results of the classification
from sklearn.metrics import classification_report

# Creating the ROC curve and calculating the AUC score
from sklearn.metrics import roc_curve,roc_auc_score

## Cross validation and grid search tools
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# Base SVC estimator with default settings; the grid search varies C and kernel
model_1 = SVC()

# 6-fold cross validation, shuffled with initial seed=50
kfold = KFold(n_splits=6, shuffle=True, random_state=50)

## Grid search over two hyperparameters: the constant C and the kernel type
grid_1 = dict(
    C=[0.1, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

### **2.1 Determining the best SVM model (combining K-fold and grid search)**

#### **A. Classification using all explanatory quantities**

In [None]:
# Grid search over grid_1 with K-fold cross validation, scored by accuracy,
# on the scaled training set of all explanatory quantities
SVC_cv_grid_Higgs = GridSearchCV(
    model_1,
    grid_1,
    scoring='accuracy',
    cv=kfold,
)
SVC_cv_grid_Higgs.fit(Higgs_train_scaled, Higgs_class_train)

In [None]:
# Report the winning hyperparameter combination and its cross-validation score
_svc_all_best_params = SVC_cv_grid_Higgs.best_params_
_svc_all_best_score = SVC_cv_grid_Higgs.best_score_
print("Best hyperparameters found: ", _svc_all_best_params)
print("Best cross-validation score: ", _svc_all_best_score)

In [None]:
# Build the best SVM model for all explanatory quantities.
# C and kernel are written out by hand - presumably copied from the grid
# search output; re-check them whenever the search is rerun.
# probability=True enables predict_proba(), which the ROC curve cells rely on.
_svc_all_settings = {"C": 2.0, "kernel": 'rbf', "random_state": 50, "probability": True}
SVC_Higgs_best = SVC(**_svc_all_settings)
SVC_Higgs_best.fit(Higgs_train_scaled, Higgs_class_train)

#### **B. Classification using the low level quantities**

In [None]:
# Grid search over grid_1 with K-fold cross validation, scored by accuracy.
# The search runs on the *scaled* low level training set, exactly like the
# searches for the other feature sets and like the final low level model,
# so the selected hyperparameters refer to the data the model is trained on.
SVC_cv_grid_Higgs_low = GridSearchCV(estimator=model_1, param_grid=grid_1, scoring='accuracy', cv=kfold)
SVC_cv_grid_Higgs_low.fit(Higgs_low_train_scaled, Higgs_low_class_train)

In [None]:
# Build the best SVM model for the low level quantities.
# C and kernel are written out by hand - presumably copied from the grid
# search output; re-check them whenever the search is rerun.
# probability=True enables predict_proba(), which the ROC curve cells rely on.
_svc_low_settings = {"C": 1.75, "kernel": 'rbf', "random_state": 50, "probability": True}
SVC_Higgs_low_best = SVC(**_svc_low_settings)
SVC_Higgs_low_best.fit(Higgs_low_train_scaled, Higgs_low_class_train)

#### **C. Classification using the high level quantities**

In [None]:
# Grid search over grid_1 with K-fold cross validation, scored by accuracy,
# on the scaled training set of the high level quantities
SVC_cv_grid_Higgs_high = GridSearchCV(
    model_1,
    grid_1,
    scoring='accuracy',
    cv=kfold,
)
SVC_cv_grid_Higgs_high.fit(Higgs_high_train_scaled, Higgs_high_class_train)

In [None]:
# Build the best SVM model for the high level quantities.
# C and kernel are written out by hand - presumably copied from the grid
# search output; re-check them whenever the search is rerun.
# probability=True enables predict_proba(), which the ROC curve cells rely on.
_svc_high_settings = {"C": 1.75, "kernel": 'rbf', "random_state": 50, "probability": True}
SVC_Higgs_high_best = SVC(**_svc_high_settings)
SVC_Higgs_high_best.fit(Higgs_high_train_scaled, Higgs_high_class_train)

### **2.2 Metrics of the best SVM model**

#### **A. Metrics for all explanatory quantities classification**

In [None]:
# Predict on the test set using the best estimator
# (the scaled test set of all explanatory quantities)
Higgs_test_pred_svm = SVC_Higgs_best.predict(Higgs_test_scaled)
# Bare expression so the notebook shows the predicted classes
Higgs_test_pred_svm

In [None]:
# Test-set evaluation of the best SVM model trained on all quantities
print(">ALL QUANTITIES CLASSIFICATION\n")

# Confusion matrix of the true test classes against the predicted ones
cm_Higgs_SVC = confusion_matrix(Higgs_class_test, Higgs_test_pred_svm)
print(">>Confusion matrix:")
print(cm_Higgs_SVC)

# Accuracy score on the test set
_svm_all_accuracy = accuracy_score(Higgs_class_test, Higgs_test_pred_svm)
print("\n>>Accuracy score:")
print(_svm_all_accuracy)

# Graphical confusion matrix, labelled with the classes known to the model
print("\n>>Display of the confusion matrix:")
disp_cm_Higgs_SVC = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_SVC,
    display_labels=SVC_Higgs_best.classes_,
)
disp_cm_Higgs_SVC.plot()
plt.show()

# Text summary of the classification results
_svm_all_report = classification_report(Higgs_class_test, Higgs_test_pred_svm)
print("\n>>Classification report:")
print(_svm_all_report)

In [None]:
# Predicted probabilities
# (possible because the model was created with probability=True)
Higgs_pred_prob_svm = SVC_Higgs_best.predict_proba(Higgs_test_scaled)
# display is not imported in this notebook - presumably the IPython/Jupyter built-in
display(Higgs_pred_prob_svm)

In [None]:
# ROC curve and AUC score are both computed from column 1 of the
# predicted probabilities
_svm_all_scores = Higgs_pred_prob_svm[:, 1]
fpr_Higgs_svm, tpr_Higgs_svm, _ = roc_curve(Higgs_class_test, _svm_all_scores)
AUC_Higgs_svm = roc_auc_score(Higgs_class_test, _svm_all_scores)

# Plot the ROC curve
plt.plot(fpr_Higgs_svm, tpr_Higgs_svm, label="svm")
plt.title("Best SVM model for all quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=15)
plt.show()

# Report the AUC score below the figure
print(f'SVM model AUC score: {AUC_Higgs_svm}')

#### **B. Metrics for low level quantities classification**

In [None]:
# Predict on the test set using the best estimator
# (the scaled test set of the low level quantities)
Higgs_low_test_pred_svm = SVC_Higgs_low_best.predict(Higgs_low_test_scaled)
# Bare expression so the notebook shows the predicted classes
Higgs_low_test_pred_svm

In [None]:
# Test-set evaluation of the best SVM model trained on the low level quantities
print(">LOW LEVEL QUANTITIES CLASSIFICATION\n")

# Confusion matrix of the true test classes against the predicted ones
cm_Higgs_low_SVC = confusion_matrix(Higgs_low_class_test, Higgs_low_test_pred_svm)
print(">>Confusion matrix:")
print(cm_Higgs_low_SVC)

# Accuracy score on the test set
_svm_low_accuracy = accuracy_score(Higgs_low_class_test, Higgs_low_test_pred_svm)
print("\n>>Accuracy score:")
print(_svm_low_accuracy)

# Graphical confusion matrix, labelled with the classes known to the model
print("\n>>Display of the confusion matrix:")
disp_cm_Higgs_low_SVC = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_low_SVC,
    display_labels=SVC_Higgs_low_best.classes_,
)
disp_cm_Higgs_low_SVC.plot()
plt.show()

# Text summary of the classification results
_svm_low_report = classification_report(Higgs_low_class_test, Higgs_low_test_pred_svm)
print("\n>>Classification report:")
print(_svm_low_report)

In [None]:
# Predicted probabilities
# (possible because the model was created with probability=True)
Higgs_low_pred_prob_svm = SVC_Higgs_low_best.predict_proba(Higgs_low_test_scaled)
# display is not imported in this notebook - presumably the IPython/Jupyter built-in
display(Higgs_low_pred_prob_svm)

In [None]:
# ROC curve and AUC score are both computed from column 1 of the
# predicted probabilities
_svm_low_scores = Higgs_low_pred_prob_svm[:, 1]
fpr_Higgs_low_svm, tpr_Higgs_low_svm, _ = roc_curve(Higgs_low_class_test, _svm_low_scores)
AUC_Higgs_low_svm = roc_auc_score(Higgs_low_class_test, _svm_low_scores)

# Plot the ROC curve
plt.plot(fpr_Higgs_low_svm, tpr_Higgs_low_svm, label="svm")
plt.title("Best SVC model for low level quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=15)
plt.show()

# Report the AUC score below the figure
print(f'SVM model AUC score: {AUC_Higgs_low_svm}')

#### **C. Metrics for high level quantities classification**

In [None]:
# Predict on the test set using the best estimator
# (the scaled test set of the high level quantities)
Higgs_high_test_pred_svm = SVC_Higgs_high_best.predict(Higgs_high_test_scaled)
# Bare expression so the notebook shows the predicted classes
Higgs_high_test_pred_svm

In [None]:
# Predicted probabilities
# (possible because the model was created with probability=True)
Higgs_high_pred_prob_svm = SVC_Higgs_high_best.predict_proba(Higgs_high_test_scaled)
# display is not imported in this notebook - presumably the IPython/Jupyter built-in
display(Higgs_high_pred_prob_svm)

In [None]:
# ROC curve and AUC score are both computed from column 1 of the
# predicted probabilities
_svm_high_scores = Higgs_high_pred_prob_svm[:, 1]
fpr_Higgs_high_svm, tpr_Higgs_high_svm, _ = roc_curve(Higgs_high_class_test, _svm_high_scores)
AUC_Higgs_high_svm = roc_auc_score(Higgs_high_class_test, _svm_high_scores)

# Plot the ROC curve
plt.plot(fpr_Higgs_high_svm, tpr_Higgs_high_svm, label="svm")
plt.title("Best SVC model for high level quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=15)
plt.show()

# Report the AUC score below the figure
print(f'SVM model AUC score: {AUC_Higgs_high_svm}')

#### **D. Combined ROC curves**

In [None]:
# Overlay the ROC curves of the three best SVM models on one figure;
# every legend entry also carries the AUC score of its curve
_svm_curves = [
    (fpr_Higgs_svm, tpr_Higgs_svm, "svm all\nAUC: %.4f"%AUC_Higgs_svm),
    (fpr_Higgs_low_svm, tpr_Higgs_low_svm, "svm low\nAUC: %.4f"%AUC_Higgs_low_svm),
    (fpr_Higgs_high_svm, tpr_Higgs_high_svm, "svm high\nAUC: %.4f"%AUC_Higgs_high_svm),
]
for _fpr, _tpr, _legend_text in _svm_curves:
    plt.plot(_fpr, _tpr, label=_legend_text)

plt.title("Best SVC model")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=8)
plt.show()

## **3. Random forest (RF) classification algorithm**

In [38]:
# Package of the random forest algorithm
from sklearn.ensemble import RandomForestClassifier

# Defining the RF model.
# The seed is fixed to 50, the value used for the data split, the K-fold
# and the SVM models: a random forest is built with random sampling, so
# without a seed the grid search winner and every reported metric would
# change from one run of the notebook to the next.
model_2 = RandomForestClassifier(random_state=50)

## Grid search initialization using two hyperparameters
# NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
# splitting criterion, so one of them is probably redundant - confirm.
grid_2 = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ["entropy", "gini", "log_loss"],
}

### **3.1 Determining the best RF model (combining K-fold and grid search)**

#### **A. Classification using all explanatory quantities**

In [None]:
# Grid search over grid_2 with K-fold cross validation, scored by accuracy,
# on the scaled training set of all explanatory quantities
RF_cv_grid_Higgs = GridSearchCV(
    model_2,
    grid_2,
    scoring='accuracy',
    cv=kfold,
)
RF_cv_grid_Higgs.fit(Higgs_train_scaled, Higgs_class_train)

In [None]:
# Report the winning hyperparameter combination and its cross-validation score
_rf_all_best_params = RF_cv_grid_Higgs.best_params_
_rf_all_best_score = RF_cv_grid_Higgs.best_score_
print("Best hyperparameters found: ", _rf_all_best_params)
print("Best cross-validation score: ", _rf_all_best_score)

# Keep the best estimator found by the grid search as the best RF model
RF_Higgs_best = RF_cv_grid_Higgs.best_estimator_

#### **B. Classification using low level quantities**

In [None]:
# Grid search over grid_2 with K-fold cross validation, scored by accuracy,
# on the scaled training set of the low level quantities
RF_cv_grid_Higgs_low = GridSearchCV(
    model_2,
    grid_2,
    scoring='accuracy',
    cv=kfold,
)
RF_cv_grid_Higgs_low.fit(Higgs_low_train_scaled, Higgs_low_class_train)

In [None]:
# Report the winning hyperparameter combination and its cross-validation score
_rf_low_best_params = RF_cv_grid_Higgs_low.best_params_
_rf_low_best_score = RF_cv_grid_Higgs_low.best_score_
print("Best hyperparameters found: ", _rf_low_best_params)
print("Best cross-validation score: ", _rf_low_best_score)

# Keep the best estimator found by the grid search as the best RF model
RF_Higgs_low_best = RF_cv_grid_Higgs_low.best_estimator_

#### **C. Classification using high level quantities**

In [None]:
# Grid search over grid_2 with K-fold cross validation, scored by accuracy,
# on the scaled training set of the high level quantities
RF_cv_grid_Higgs_high = GridSearchCV(
    model_2,
    grid_2,
    scoring='accuracy',
    cv=kfold,
)
RF_cv_grid_Higgs_high.fit(Higgs_high_train_scaled, Higgs_high_class_train)

In [None]:
# Report the winning hyperparameter combination and its cross-validation score
_rf_high_best_params = RF_cv_grid_Higgs_high.best_params_
_rf_high_best_score = RF_cv_grid_Higgs_high.best_score_
print("Best hyperparameters found: ", _rf_high_best_params)
print("Best cross-validation score: ", _rf_high_best_score)

# Keep the best estimator found by the grid search as the best RF model
RF_Higgs_high_best = RF_cv_grid_Higgs_high.best_estimator_

### **3.2 Metrics of the best RF model**

#### **A. Metrics for all explanatory quantities classification**

In [None]:
# Predict on the test set using the best estimator
# (the scaled test set of all explanatory quantities)
Higgs_test_pred_rf = RF_Higgs_best.predict(Higgs_test_scaled)
# Bare expression so the notebook shows the predicted classes
Higgs_test_pred_rf

In [None]:
# Test-set evaluation of the best RF model trained on all quantities
print(">ALL QUANTITIES CLASSIFICATION\n")

# Confusion matrix of the true test classes against the predicted ones
cm_Higgs_RF = confusion_matrix(Higgs_class_test, Higgs_test_pred_rf)
print(">>Confusion matrix:")
print(cm_Higgs_RF)

# Accuracy score on the test set
_rf_all_accuracy = accuracy_score(Higgs_class_test, Higgs_test_pred_rf)
print("\n>>Accuracy score:")
print(_rf_all_accuracy)

# Graphical confusion matrix, labelled with the classes known to the model
print("\n>>Display of the confusion matrix:")
disp_cm_Higgs_RF = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_RF,
    display_labels=RF_Higgs_best.classes_,
)
disp_cm_Higgs_RF.plot()
plt.show()

# Text summary of the classification results
_rf_all_report = classification_report(Higgs_class_test, Higgs_test_pred_rf)
print("\n>>Classification report:")
print(_rf_all_report)

In [None]:
# Predicted probabilities of the best RF model on the scaled test set
Higgs_pred_prob_rf = RF_Higgs_best.predict_proba(Higgs_test_scaled)
# display is not imported in this notebook - presumably the IPython/Jupyter built-in
display(Higgs_pred_prob_rf)

In [None]:
# ROC curve and AUC score are both computed from column 1 of the
# predicted probabilities
_rf_all_scores = Higgs_pred_prob_rf[:, 1]
fpr_Higgs_rf, tpr_Higgs_rf, _ = roc_curve(Higgs_class_test, _rf_all_scores)
AUC_Higgs_rf = roc_auc_score(Higgs_class_test, _rf_all_scores)

# Plot the ROC curve
plt.plot(fpr_Higgs_rf, tpr_Higgs_rf, label="rf")
plt.title("Best RF model for all quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=15)
plt.show()

# Report the AUC score below the figure
print(f'RF model AUC score: {AUC_Higgs_rf}')

#### **B. Metrics for low level quantities classification**

In [None]:
# Predict on the test set using the best estimator
# (the scaled test set of the low level quantities)
Higgs_low_test_pred_rf = RF_Higgs_low_best.predict(Higgs_low_test_scaled)
# Bare expression so the notebook shows the predicted classes
Higgs_low_test_pred_rf

In [None]:
# Test-set evaluation of the best RF model trained on the low level quantities
print(">LOW LEVEL QUANTITIES CLASSIFICATION\n")

# Confusion matrix of the true test classes against the predicted ones
cm_Higgs_low_RF = confusion_matrix(Higgs_low_class_test, Higgs_low_test_pred_rf)
print(">>Confusion matrix:")
print(cm_Higgs_low_RF)

# Accuracy score on the test set
_rf_low_accuracy = accuracy_score(Higgs_low_class_test, Higgs_low_test_pred_rf)
print("\n>>Accuracy score:")
print(_rf_low_accuracy)

# Graphical confusion matrix, labelled with the classes known to the model
print("\n>>Display of the confusion matrix:")
disp_cm_Higgs_low_RF = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_low_RF,
    display_labels=RF_Higgs_low_best.classes_,
)
disp_cm_Higgs_low_RF.plot()
plt.show()

# Text summary of the classification results
_rf_low_report = classification_report(Higgs_low_class_test, Higgs_low_test_pred_rf)
print("\n>>Classification report:")
print(_rf_low_report)

In [None]:
# Predicted probabilities of the best RF model on the scaled test set
Higgs_low_pred_prob_rf = RF_Higgs_low_best.predict_proba(Higgs_low_test_scaled)
# display is not imported in this notebook - presumably the IPython/Jupyter built-in
display(Higgs_low_pred_prob_rf)

In [None]:
# ROC curve and AUC score are both computed from column 1 of the
# predicted probabilities
_rf_low_scores = Higgs_low_pred_prob_rf[:, 1]
fpr_Higgs_low_rf, tpr_Higgs_low_rf, _ = roc_curve(Higgs_low_class_test, _rf_low_scores)
AUC_Higgs_low_rf = roc_auc_score(Higgs_low_class_test, _rf_low_scores)

# Plot the ROC curve
plt.plot(fpr_Higgs_low_rf, tpr_Higgs_low_rf, label="rf")
plt.title("Best RF model for low level quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=15)
plt.show()

# Report the AUC score below the figure
print(f'RF model AUC score: {AUC_Higgs_low_rf}')

#### **C. Metrics for high level quantities classification**

In [None]:
# Predict on the test set using the best estimator
# (the scaled test set of the high level quantities)
Higgs_high_test_pred_rf = RF_Higgs_high_best.predict(Higgs_high_test_scaled)
# Bare expression so the notebook shows the predicted classes
Higgs_high_test_pred_rf

In [None]:
# Test-set evaluation of the best RF model trained on the high level quantities
print(">HIGH LEVEL QUANTITIES CLASSIFICATION\n")

# Confusion matrix of the true test classes against the predicted ones
cm_Higgs_high_RF = confusion_matrix(Higgs_high_class_test, Higgs_high_test_pred_rf)
print(">>Confusion matrix:")
print(cm_Higgs_high_RF)

# Accuracy score on the test set
_rf_high_accuracy = accuracy_score(Higgs_high_class_test, Higgs_high_test_pred_rf)
print("\n>>Accuracy score:")
print(_rf_high_accuracy)

# Graphical confusion matrix, labelled with the classes known to the model
print("\n>>Display of the confusion matrix:")
disp_cm_Higgs_high_RF = ConfusionMatrixDisplay(
    confusion_matrix=cm_Higgs_high_RF,
    display_labels=RF_Higgs_high_best.classes_,
)
disp_cm_Higgs_high_RF.plot()
plt.show()

# Text summary of the classification results
_rf_high_report = classification_report(Higgs_high_class_test, Higgs_high_test_pred_rf)
print("\n>>Classification report:")
print(_rf_high_report)

In [None]:
# Predicted probabilities of the best RF model on the scaled test set
Higgs_high_pred_prob_rf = RF_Higgs_high_best.predict_proba(Higgs_high_test_scaled)
# display is not imported in this notebook - presumably the IPython/Jupyter built-in
display(Higgs_high_pred_prob_rf)

In [None]:
# ROC curve and AUC score are both computed from column 1 of the
# predicted probabilities
_rf_high_scores = Higgs_high_pred_prob_rf[:, 1]
fpr_Higgs_high_rf, tpr_Higgs_high_rf, _ = roc_curve(Higgs_high_class_test, _rf_high_scores)
AUC_Higgs_high_rf = roc_auc_score(Higgs_high_class_test, _rf_high_scores)

# Plot the ROC curve
plt.plot(fpr_Higgs_high_rf, tpr_Higgs_high_rf, label="rf")
plt.title("Best RF model for high level quantities")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=15)
plt.show()

# Report the AUC score below the figure
print(f'RF model AUC score: {AUC_Higgs_high_rf}')

#### **D. Combined ROC curves**

In [None]:
# Overlay the ROC curves of the three best RF models on one figure;
# every legend entry also carries the AUC score of its curve
_rf_curves = [
    (fpr_Higgs_rf, tpr_Higgs_rf, "rf all\nAUC: %.4f"%AUC_Higgs_rf),
    (fpr_Higgs_low_rf, tpr_Higgs_low_rf, "rf low\nAUC: %.4f"%AUC_Higgs_low_rf),
    (fpr_Higgs_high_rf, tpr_Higgs_high_rf, "rf high\nAUC: %.4f"%AUC_Higgs_high_rf),
]
for _fpr, _tpr, _legend_text in _rf_curves:
    plt.plot(_fpr, _tpr, label=_legend_text)

plt.title("Best RF model")
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.legend(fontsize=8)
plt.show()