In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
from fitter import Fitter

from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import f_classif 
from sklearn.feature_selection import RFE 
from sklearn.ensemble import ExtraTreesClassifier  

In [None]:
# Load the training sample from the CSV file under ``datasets/`` and display it.
train = pd.read_csv(filepath_or_buffer="datasets/training.csv")
train

# Dividir en 3 subsets según sus características físicas:

1. Inicial (tau)
2. Decaimiento (muon)
3. Información del detector (detector)

In [None]:
# Column groups used to split ``train``. Both explicit subsets also carry the
# ``signal`` column; the detector subset is whatever remains after removing
# the tau columns, the muon columns and ``signal``.
columnas_tau = ["LifeTime", "FlightDistance", "dira", "mass", "pt", "IP", "IPSig", "VertexChi2"]
columnas_mu = ["p0_p", "p1_p", "p2_p", "p0_pt", "p1_pt", "p2_pt", "p0_eta", "p1_eta", "p2_eta"]

subset_tau = train[columnas_tau + ["signal"]]
subset_mu = train[columnas_mu + ["signal"]]
subset_detector = train.drop(columns=columnas_tau + ["signal"] + columnas_mu)

# Correlación entre señal y background de las variables numéricas (Ejemplo tau)


In [None]:
# One regression panel per numeric tau feature, each plotted against the
# ``signal`` column.
# "number" selects every numeric dtype. The previous 'int' alias is
# platform-dependent and can miss an int64 ``signal`` column, which would make
# the ``drop`` below raise.
columnas = subset_tau.select_dtypes(include="number").columns
columnas = columnas.drop('signal')  # target column, not a feature

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(9, 5))
axes = axes.flat

for i, colum in enumerate(columnas):
    sns.regplot(
        x           = subset_tau[colum],
        y           = subset_tau['signal'],
        color       = "gray",
        marker      = '.',
        scatter_kws = {"alpha":0.4},
        line_kws    = {"color":"r","alpha":0.7},
        ax          = axes[i]
    )
    axes[i].set_title(f"Señal/Background vs {colum}", fontsize = 7, fontweight = "bold")
    axes[i].tick_params(labelsize = 6)
    # Axis labels are blanked because the panel title already names the feature.
    axes[i].set_xlabel("")
    axes[i].set_ylabel("")

# Remove every panel that received no plot. The count is derived from the
# number of plotted columns instead of a hard-coded panel index.
for eje_vacio in axes[len(columnas):]:
    fig.delaxes(eje_vacio)

fig.tight_layout()
# Leave room at the top of the figure for the overall title.
plt.subplots_adjust(top=0.9)
fig.suptitle('Correlación con Señal/background', fontsize = 10, fontweight = "bold");

In [None]:
# True if any cell of ``subset_tau`` is missing (reduce per column, then overall).
subset_tau.isna().any().any()

# Selección de variables (Ejemplo tau)

1. Objetivo: la columna `signal` (variable `x` en el código)
2. Características: las demás columnas (variable `entrenar` en el código)

En este caso queremos separar la señal del background

In [None]:
# Univariate selection on the tau subset: keep the k features that score
# highest under ``f_classif`` against the ``signal`` column.
k = 4  # number of features to keep
x = subset_tau['signal']  # target
entrenar = subset_tau.drop(columns='signal')  # candidate features
columnas = entrenar.columns.tolist()

seleccionadas = SelectKBest(score_func=f_classif, k=k)
seleccionadas.fit(entrenar, x)

# Boolean mask aligned with ``columnas``; True marks a selected feature.
atrib = seleccionadas.get_support()
atributos = [nombre for nombre, elegido in zip(columnas, atrib) if elegido]
atributos

# Selección de variables: Todas las variables

In [None]:
# Recursive feature elimination over every column of ``train`` except the
# target (51 variables according to the original note).
# NOTE(review): all non-target columns are used as features; confirm that none
# of them is an identifier or otherwise unsuitable as a model input.
x_all = train['signal']  # target (a stray bare word here used to break the cell)
entrenar_all = train.drop(['signal'], axis=1)
columnas_all = list(entrenar_all.columns.values)
# Fixed seed so the randomized trees, and therefore the selected features,
# are reproducible across kernel restarts.
modelo_all = ExtraTreesClassifier(random_state=0)
# RFE is left at its default number of features to select.
erec_all = RFE(modelo_all)
erec_all = erec_all.fit(entrenar_all, x_all)

In [None]:
# Names of the columns kept by the fitted RFE selector, in their original order.
atrib_all = erec_all.support_  # boolean mask aligned with ``columnas_all``
atributos_all = [nombre for nombre, elegido in zip(columnas_all, atrib_all) if elegido]
atributos_all

In [None]:
# Feature importances.
# Fit the forest on all feature columns and show the first 24 importance
# values. The slice is positional (it follows the column order of
# ``entrenar_all``), so it is not a ranking of the most important features.
modelo_all.fit(entrenar_all, x_all).feature_importances_[:24]

# Ajustar distribuciones a los datos (Ejemplo tau)

In [None]:
# Histogram with a KDE overlay for every column of ``subset_tau`` (the
# ``signal`` column included), one panel per column.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(9, 5))
axes = axes.flat

# Colours of the active Matplotlib property cycle, read once instead of being
# rebuilt for every panel. The modulo below wraps around if there are more
# panels than colours.
colores = [estilo["color"] for estilo in plt.rcParams['axes.prop_cycle']]

for i, colum in enumerate(subset_tau):
    sns.histplot(
        data     = subset_tau,
        x        = colum,
        stat     = "count",
        kde      = True,
        color    = colores[i % len(colores)],
        line_kws = {'linewidth': 2},
        alpha    = 0.3,
        ax       = axes[i]
    )
    axes[i].set_title(colum, fontsize = 7, fontweight = "bold")
    axes[i].tick_params(labelsize = 6)
    # The x label is blanked because the panel title already names the column.
    axes[i].set_xlabel("")

# Lay the panels out once, after all of them are drawn, rather than on every
# iteration.
fig.tight_layout()
plt.subplots_adjust(top = 0.9)


# Ajustar distribución a la variable más importante (LifeTime)

In [None]:
# Candidate distributions to try (9 in total).
distribuciones = [
    'cauchy', 'chi2', 'expon', 'exponpow', 'gamma',
    'norm', 'powerlaw', 'beta', 'logistic',
]

# Fit each candidate to the ``LifeTime`` column of ``train`` and show the
# summary table for the best fits, without drawing the plot.
fitter = Fitter(train["LifeTime"], distributions=distribuciones)
fitter.fit()
fitter.summary(Nbest=10, plot=False)