# Higgs Boson Event Detection
This project trains artificial neural networks to detect the decay of the Higgs boson to tau leptons on a dataset of 82 million simulated collision events.

In [1]:
#libraries to use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from autoviz.AutoViz_Class import AutoViz_Class
from autoviz import data_cleaning_suggestions

import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly.io as pio

from utilerias import exact_values_table
from utilerias import features_by_type
# Make 'notebook' the default plotly renderer for figures shown in this notebook.
pio.renderers.default='notebook'

import warnings
# Install a blanket 'ignore' filter so warnings are not printed in cell outputs.
# NOTE(review): this also hides deprecation notices from the plotting/data libraries — confirm that is intended.
warnings.filterwarnings('ignore')
%matplotlib inline

<hr>
<p>Read the dataset</p>
<hr>

In [2]:
# Read the training events from the CSV file (path is relative to the notebook).
training_csv_path = '../datasets/training.csv'
df_bosson = pd.read_csv(training_csv_path)

In [3]:
# (rows, columns) of the frame that was just loaded.
df_bosson.shape

(250000, 33)

In [4]:
# Display the frame; the rendered output elides the middle rows and columns.
df_bosson

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.470,51.655,97.827,27.980,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.150,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.00,-999.000,-999.000,3.473,2.078,...,1,46.226,0.725,1.158,-999.000,-999.00,-999.000,46.226,2.233584,b
2,100002,-999.000,162.172,125.953,35.635,-999.00,-999.000,-999.000,3.148,9.336,...,1,44.251,2.053,-2.028,-999.000,-999.00,-999.000,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.00,-999.000,-999.000,3.310,0.414,...,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,-0.000,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.00,-999.000,-999.000,3.891,16.405,...,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,0.000,6.245333,b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,349995,-999.000,71.989,36.548,5.042,-999.00,-999.000,-999.000,1.392,5.042,...,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,0.000,4.505083,b
249996,349996,-999.000,58.179,68.083,22.439,-999.00,-999.000,-999.000,2.585,22.439,...,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,-0.000,2.497259,b
249997,349997,105.457,60.526,75.839,39.757,-999.00,-999.000,-999.000,2.390,22.183,...,1,41.992,1.800,-0.166,-999.000,-999.00,-999.000,41.992,0.018636,s
249998,349998,94.951,19.362,68.812,13.504,-999.00,-999.000,-999.000,3.365,13.504,...,0,-999.000,-999.000,-999.000,-999.000,-999.00,-999.000,0.000,1.681611,b


In [5]:
# Standardise the column labels before starting the EDA:
# spaces are turned into underscores and every name is lower-cased.
tidy_names = df_bosson.columns.str.replace(' ', '_').str.lower()
df_bosson.columns = tidy_names

In [6]:
# dtype of every column, listed under the normalised (lower-case) names.
df_bosson.dtypes

eventid                          int64
der_mass_mmc                   float64
der_mass_transverse_met_lep    float64
der_mass_vis                   float64
der_pt_h                       float64
der_deltaeta_jet_jet           float64
der_mass_jet_jet               float64
der_prodeta_jet_jet            float64
der_deltar_tau_lep             float64
der_pt_tot                     float64
der_sum_pt                     float64
der_pt_ratio_lep_tau           float64
der_met_phi_centrality         float64
der_lep_eta_centrality         float64
pri_tau_pt                     float64
pri_tau_eta                    float64
pri_tau_phi                    float64
pri_lep_pt                     float64
pri_lep_eta                    float64
pri_lep_phi                    float64
pri_met                        float64
pri_met_phi                    float64
pri_met_sumet                  float64
pri_jet_num                      int64
pri_jet_leading_pt             float64
pri_jet_leading_eta      

It can happen that for some entries some variables are meaningless or cannot be computed; in this case, their value is −999.0, which is outside the normal range of all variables.

In [7]:
# How many features fall under each dtype family (integer / float / object),
# computed by the project helper `features_by_type` and shown as a shaded table.
# NOTE(review): the heading reads "Number on" — probably meant "Number of"; text left unchanged here.
print('Number on features by type', '============================', sep='\n')
table = features_by_type(df_bosson)
table.style.background_gradient(cmap='Greens')

Number on features by type


Unnamed: 0,Integer,Float,Object
Features,2,30,1


In [6]:
# Ask autoviz for its per-column data-cleaning suggestions on the training frame.
data_cleaning_suggestions(df_bosson)

Data cleaning improvement suggestions. Complete them before proceeding to ML modeling.


Unnamed: 0,Nuniques,dtype,Nulls,Nullpercent,NuniquePercent,Value counts Min,Data cleaning improvement suggestions
eventid,250000,int64,0,0.0,100.0,0,possible ID column: drop
pri_met_sumet,179740,float64,0,0.0,71.896,0,skewed: cap or drop outliers
der_sum_pt,156098,float64,0,0.0,62.4392,0,skewed: cap or drop outliers
der_pt_h,115563,float64,0,0.0,46.2252,0,skewed: cap or drop outliers
der_mass_mmc,108338,float64,0,0.0,43.3352,0,skewed: cap or drop outliers
weight,104096,float64,0,0.0,41.6384,0,
pri_jet_all_pt,103559,float64,0,0.0,41.4236,0,skewed: cap or drop outliers
der_mass_transverse_met_lep,101637,float64,0,0.0,40.6548,0,skewed: cap or drop outliers
der_mass_vis,100558,float64,0,0.0,40.2232,0,skewed: cap or drop outliers
pri_met,87836,float64,0,0.0,35.1344,0,highly skewed: drop outliers or do box-cox transform


In [8]:
# Tabulate how often the -999 placeholder (used for undefined values) occurs,
# using the project helper `exact_values_table`, and show it as a shaded table.
missing_marker = -999
table = exact_values_table(df_bosson, missing_marker)
table.style.background_gradient(cmap='Greens')

Sumary :
Columns     :33
f values :33


Unnamed: 0,f Values,% of Total Values
pri_jet_subleading_pt,177457,71.0
pri_jet_subleading_phi,177457,71.0
der_deltaeta_jet_jet,177457,71.0
der_mass_jet_jet,177457,71.0
der_prodeta_jet_jet,177457,71.0
pri_jet_subleading_eta,177457,71.0
der_lep_eta_centrality,177457,71.0
pri_jet_leading_phi,99913,40.0
pri_jet_leading_eta,99913,40.0
pri_jet_leading_pt,99913,40.0


This is the real overview of missing values: the features that contain the placeholder value -999.

In [9]:
# Summary statistics restricted to the object-dtype (categorical) columns,
# transposed so that each column becomes one row.
df_bosson.describe(include=['O']).transpose()

Unnamed: 0,count,unique,top,freq
label,250000,2,b,164333


In [11]:
# Number of events per target label.
df_bosson['label'].value_counts()

b    164333
s     85667
Name: label, dtype: int64

In [12]:
# Percentage of events carrying each label.
events_per_label = df_bosson.groupby(['label']).size()
labelled_events = df_bosson["label"].count()
(events_per_label / labelled_events * 100).add_prefix('Events (in %) results in ')

label
Events (in %) results in b    65.7332
Events (in %) results in s    34.2668
dtype: float64

`label` is the target and is a binary variable: an event has a 65.73% probability of being "background noise" (`b`), versus 34.27% of being a "signal" (`s`).

In [13]:
# Count the rows that repeat an earlier row in full.
n_duplicates = df_bosson.duplicated().sum()
print("Number of duplicates: ", n_duplicates)

Number of duplicates:  0


In [16]:
# Number of events for each value of pri_jet_num.
df_bosson['pri_jet_num'].value_counts()

0    99913
1    77544
2    50379
3    22164
Name: pri_jet_num, dtype: int64