<a href="https://colab.research.google.com/github/jeguns/EP7173/blob/main/Unidad%2006/Selecci%C3%B3n_de_atributos_en_modelos_de_clasificaci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalación de paquetes

In [1]:
pip install boruta

Collecting boruta
  Downloading Boruta-0.4.3-py3-none-any.whl.metadata (8.8 kB)
Downloading Boruta-0.4.3-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: boruta
Successfully installed boruta-0.4.3


## Lectura de datos

El archivo diabetes.csv es originalmente del Instituto Nacional de Diabetes y Enfermedades Digestivas y Renales. El objetivo del conjunto de datos es predecir diagnósticamente si un paciente tiene o no diabetes, en función de ciertas mediciones diagnósticas incluidas en el conjunto de datos. Se impusieron varias restricciones a la selección de estos casos de una base de datos más grande. En particular, todos los pacientes aquí son mujeres de al menos 21 años de edad de ascendencia indígena Pima.

Variables contenidas:
- Pregnancies: Número de embarazos

- Glucose: Concentración de glucosa en un test oral de tolerancia a la glucosa.

- BloodPressure: Presión arterial diastólica (en mmHg)

- SkinThickness: Grosor del pliegue cutáneo del tríceps (mm)

- Insulin: Insulina sérica a las 2 horas (mu U/ml)

- BMI: Índice de Masa Corporal (peso en kg / (altura en m)^2)

- DiabetesPedigreeFunction: Diabetes pedigree function

- Age: Edad (en años)

- Outcome: Target que indica si tiene o no diabetes

In [2]:
import pandas as pd
import numpy as np
# Load the dataset; the CSV is read from the current working directory.
datos = pd.read_csv('diabetes.csv')
# Preview the first rows.
datos.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Work on the raw NumPy matrix of the frame.
array = datos.to_numpy()
# Columns 0..7 are the predictors (Pregnancies through Age).
X = array[:, :8]
# Column 8 is the target / response variable.
Y = array[:, 8]

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
datos.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# Filtros

## Selección por umbral de varianza

In [11]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, SelectFpr, SelectFdr, SelectFwe

# Variance-threshold filter: low-variance columns of X are dropped, using 20 as the cut-off.
selector_var = VarianceThreshold(threshold=20)
selector_var.fit(X)
# Reduced matrix, shown as the cell output.
selector_var.transform(X)

array([[148. ,  72. ,  35. ,   0. ,  33.6,  50. ],
       [ 85. ,  66. ,  29. ,   0. ,  26.6,  31. ],
       [183. ,  64. ,   0. ,   0. ,  23.3,  32. ],
       ...,
       [121. ,  72. ,  23. , 112. ,  26.2,  30. ],
       [126. ,  60. ,   0. ,   0. ,  30.1,  47. ],
       [ 93. ,  70. ,  31. ,   0. ,  30.4,  23. ]])

In [12]:
selector_var.get_support() # True = the variable is kept /// False = the variable is dropped

array([False,  True,  True,  True,  True,  True, False,  True])

In [13]:
# Names of the predictors kept by the variance filter (boolean mask over the feature columns).
selected_columns = datos.drop(columns='Outcome').columns[selector_var.get_support()]
selected_columns

Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')

In [14]:
# Rebuild a labelled DataFrame from the reduced matrix and preview it.
df_selected = pd.DataFrame(data=selector_var.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age
0,148.0,72.0,35.0,0.0,33.6,50.0
1,85.0,66.0,29.0,0.0,26.6,31.0
2,183.0,64.0,0.0,0.0,23.3,32.0
3,89.0,66.0,23.0,94.0,28.1,21.0
4,137.0,40.0,35.0,168.0,43.1,33.0


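Sugerencia: Llevar las variables a una escala comparable antes de aplicar el umbral (por ejemplo, con `MinMaxScaler`), ya que la varianza depende de las unidades de cada variable. Ojo: la estandarización z-score no sirve para este filtro, porque deja todas las varianzas iguales a 1.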

## Selección por el estadístico Chi^2

In [15]:
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif # filter scoring criteria

# Chi-squared filter configured to keep the 4 best-scoring variables, then fitted on (X, Y).
selector_chi2 = SelectKBest(score_func=chi2, k=4)
selector_chi2.fit(X, Y)
selector_chi2

In [16]:
selector_chi2.scores_ # chi-squared statistic of each X column against Y

array([ 111.51969064, 1411.88704064,   17.60537322,   53.10803984,
       2175.56527292,  127.66934333,    5.39268155,  181.30368904])

In [17]:
# p-values that accompany the chi-squared scores, one per input column.
selector_chi2.pvalues_

array([4.55261043e-026, 5.48728628e-309, 2.71819252e-005, 3.15697650e-013,
       0.00000000e+000, 1.32590849e-029, 2.02213728e-002, 2.51638830e-041])

In [18]:
selector_chi2.n_features_in_ # initial number of variables

8

In [19]:
# Boolean mask over the input columns: True marks a selected variable.
selector_chi2.get_support()

array([False,  True, False, False,  True,  True, False,  True])

In [20]:
# Names of the predictors chosen by the chi-squared filter.
selected_columns = datos.drop(columns='Outcome').columns[selector_chi2.get_support()]
selected_columns

Index(['Glucose', 'Insulin', 'BMI', 'Age'], dtype='object')

In [21]:
# X restricted to the selected columns.
selector_chi2.transform(X)

array([[148. ,   0. ,  33.6,  50. ],
       [ 85. ,   0. ,  26.6,  31. ],
       [183. ,   0. ,  23.3,  32. ],
       ...,
       [121. , 112. ,  26.2,  30. ],
       [126. ,   0. ,  30.1,  47. ],
       [ 93. ,   0. ,  30.4,  23. ]])

In [22]:
# Labelled DataFrame with only the chi-squared-selected columns.
df_selected = pd.DataFrame(data=selector_chi2.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Glucose,Insulin,BMI,Age
0,148.0,0.0,33.6,50.0
1,85.0,0.0,26.6,31.0
2,183.0,0.0,23.3,32.0
3,89.0,94.0,28.1,21.0
4,137.0,168.0,43.1,33.0


In [23]:
# SelectKBest + chi2: keep the 4 variables with the highest chi-squared score.
selector_chi2_k = SelectKBest(score_func=chi2, k=4)
selector_chi2_k.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_chi2_k.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_k.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Glucose,Insulin,BMI,Age
0,148.0,0.0,33.6,50.0
1,85.0,0.0,26.6,31.0
2,183.0,0.0,23.3,32.0
3,89.0,94.0,28.1,21.0
4,137.0,168.0,43.1,33.0


In [26]:
# SelectPercentile + chi2 with percentile=50 (the output below shows 4 of the 8 variables retained).
selector_chi2_perc = SelectPercentile(score_func=chi2, percentile=50)
selector_chi2_perc.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_chi2_perc.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_perc.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Glucose,Insulin,BMI,Age
0,148.0,0.0,33.6,50.0
1,85.0,0.0,26.6,31.0
2,183.0,0.0,23.3,32.0
3,89.0,94.0,28.1,21.0
4,137.0,168.0,43.1,33.0


In [27]:
# SelectFpr + chi2 at alpha = 0.01: variables are selected by comparing their p-value with alpha.
selector_chi2_fpr = SelectFpr(score_func=chi2, alpha=1e-2)
selector_chi2_fpr.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_chi2_fpr.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_fpr.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age
0,6.0,148.0,72.0,35.0,0.0,33.6,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,33.0


In [28]:
# p-values the FPR selector compares against alpha.
selector_chi2_fpr.pvalues_

array([4.55261043e-026, 5.48728628e-309, 2.71819252e-005, 3.15697650e-013,
       0.00000000e+000, 1.32590849e-029, 2.02213728e-002, 2.51638830e-041])

In [29]:
# Selection mask of the FPR selector (True = kept).
selector_chi2_fpr.get_support()

array([ True,  True,  True,  True,  True,  True, False,  True])

El p-valor de la séptima variable (DiabetesPedigreeFunction) es 2.02213728e-002 = 0.02022 > 0.01 (alpha), entonces esa variable se retira.

In [30]:
# Same FPR filter with a stricter alpha = 1e-5.
# FPR (false positive rate) here refers to alpha, not to a confusion matrix.
selector_chi2_fpr = SelectFpr(score_func=chi2, alpha=1e-5)
selector_chi2_fpr.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_chi2_fpr.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_fpr.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,SkinThickness,Insulin,BMI,Age
0,6.0,148.0,35.0,0.0,33.6,50.0
1,1.0,85.0,29.0,0.0,26.6,31.0
2,8.0,183.0,0.0,0.0,23.3,32.0
3,1.0,89.0,23.0,94.0,28.1,21.0
4,0.0,137.0,35.0,168.0,43.1,33.0


In [31]:
# p-values are unchanged by alpha; only the cut-off applied to them differs.
selector_chi2_fpr.pvalues_

array([4.55261043e-026, 5.48728628e-309, 2.71819252e-005, 3.15697650e-013,
       0.00000000e+000, 1.32590849e-029, 2.02213728e-002, 2.51638830e-041])

In [32]:
# Selection mask under the stricter alpha (True = kept).
selector_chi2_fpr.get_support()

array([ True,  True, False,  True,  True,  True, False,  True])

In [35]:
# SelectFdr + chi2 at alpha = 0.01 (Benjamini-Hochberg correction / adjustment).
selector_chi2_fdr = SelectFdr(score_func=chi2, alpha=0.01)
selector_chi2_fdr.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_chi2_fdr.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_fdr.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age
0,6.0,148.0,72.0,35.0,0.0,33.6,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,33.0


In [36]:
# SelectFwe + chi2 at alpha = 0.01: each p-value is compared with alpha/n (n = number of variables).
selector_chi2_fwe = SelectFwe(score_func=chi2, alpha=0.01)
selector_chi2_fwe.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_chi2_fwe.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_fwe.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age
0,6.0,148.0,72.0,35.0,0.0,33.6,50.0
1,1.0,85.0,66.0,29.0,0.0,26.6,31.0
2,8.0,183.0,64.0,0.0,0.0,23.3,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,33.0


## Selección por el estadístico F de ANOVA

In [37]:
# ANOVA F filter keeping the 4 best-scoring variables; display the F scores.
selector_anova = SelectKBest(score_func=f_classif, k=4)
selector_anova.fit(X, Y)
selector_anova.scores_

array([ 39.67022739, 213.16175218,   3.2569504 ,   4.30438091,
        13.28110753,  71.7720721 ,  23.8713002 ,  46.14061124])

In [38]:
# p-values that accompany the ANOVA F scores, one per input column.
selector_anova.pvalues_

array([5.06512730e-10, 8.93543165e-43, 7.15139001e-02, 3.83477048e-02,
       2.86186460e-04, 1.22980749e-16, 1.25460701e-06, 2.20997546e-11])

In [39]:
# Number of input variables seen during fit.
selector_anova.n_features_in_

8

In [40]:
# Selection mask of the ANOVA filter (True = kept).
selector_anova.get_support()

array([ True,  True, False, False, False,  True, False,  True])

In [41]:
# Labelled DataFrame with only the ANOVA-selected columns.
selected_columns = datos.drop(columns='Outcome').columns[selector_anova.get_support()]
df_selected = pd.DataFrame(data=selector_anova.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI,Age
0,6.0,148.0,33.6,50.0
1,1.0,85.0,26.6,31.0
2,8.0,183.0,23.3,32.0
3,1.0,89.0,28.1,21.0
4,0.0,137.0,43.1,33.0


In [42]:
# SelectKBest + ANOVA F: keep the 4 variables with the highest F score.
selector_anova_k = SelectKBest(score_func=f_classif, k=4)
selector_anova_k.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_anova_k.get_support()]
df_selected = pd.DataFrame(data=selector_anova_k.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI,Age
0,6.0,148.0,33.6,50.0
1,1.0,85.0,26.6,31.0
2,8.0,183.0,23.3,32.0
3,1.0,89.0,28.1,21.0
4,0.0,137.0,43.1,33.0


In [43]:
# SelectPercentile + ANOVA F with percentile=50 (the output below shows 4 of the 8 variables retained).
selector_anova_perc = SelectPercentile(score_func=f_classif, percentile=50)
selector_anova_perc.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_anova_perc.get_support()]
df_selected = pd.DataFrame(data=selector_anova_perc.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI,Age
0,6.0,148.0,33.6,50.0
1,1.0,85.0,26.6,31.0
2,8.0,183.0,23.3,32.0
3,1.0,89.0,28.1,21.0
4,0.0,137.0,43.1,33.0


In [44]:
# SelectFpr + ANOVA F at alpha = 0.01.
selector_anova_fpr = SelectFpr(score_func=f_classif, alpha=0.01)
selector_anova_fpr.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_anova_fpr.get_support()]
df_selected = pd.DataFrame(data=selector_anova_fpr.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,0.0,33.6,0.627,50.0
1,1.0,85.0,0.0,26.6,0.351,31.0
2,8.0,183.0,0.0,23.3,0.672,32.0
3,1.0,89.0,94.0,28.1,0.167,21.0
4,0.0,137.0,168.0,43.1,2.288,33.0


In [45]:
# SelectFdr + ANOVA F at alpha = 0.01.
selector_anova_fdr = SelectFdr(score_func=f_classif, alpha=0.01)
selector_anova_fdr.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_anova_fdr.get_support()]
df_selected = pd.DataFrame(data=selector_anova_fdr.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,0.0,33.6,0.627,50.0
1,1.0,85.0,0.0,26.6,0.351,31.0
2,8.0,183.0,0.0,23.3,0.672,32.0
3,1.0,89.0,94.0,28.1,0.167,21.0
4,0.0,137.0,168.0,43.1,2.288,33.0


In [46]:
# SelectFwe + ANOVA F at alpha = 0.01: looks for p-values < alpha/n.
selector_anova_fwe = SelectFwe(score_func=f_classif, alpha=0.01)
selector_anova_fwe.fit(X, Y)
selected_columns = datos.drop(columns='Outcome').columns[selector_anova_fwe.get_support()]
df_selected = pd.DataFrame(data=selector_anova_fwe.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,0.0,33.6,0.627,50.0
1,1.0,85.0,0.0,26.6,0.351,31.0
2,8.0,183.0,0.0,23.3,0.672,32.0
3,1.0,89.0,94.0,28.1,0.167,21.0
4,0.0,137.0,168.0,43.1,2.288,33.0


## Selección por el indicador de información mutua

$$I(X;Y) = \sum_x\sum_y{p(x,y)\log\left(\frac{p(x,y)}{p(x)p(y)}\right)}$$

In [47]:
# Mutual-information filter keeping the 4 highest-scoring variables.
# The scoring callable must score the (features, target) pair the selector hands it.
# The earlier lambda ignored its `y` argument and read the global `Y` instead, which
# only gave the right answer because fit() happened to receive that same Y.
# random_state is fixed so the mutual-information estimate is reproducible.
selector_im = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=4,
).fit(X, Y)
selector_im.scores_ # mutual-information scores

array([0.06098491, 0.11463368, 0.        , 0.00466746, 0.01194782,
       0.08006052, 0.01458981, 0.05139532])

In [48]:
# Selection mask of the mutual-information filter (True = kept).
selector_im.get_support()

array([ True,  True, False, False, False,  True, False,  True])

In [49]:
# Labelled DataFrame with only the columns chosen by mutual information.
selected_columns = datos.drop(columns='Outcome').columns[selector_im.get_support()]
df_selected = pd.DataFrame(data=selector_im.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI,Age
0,6.0,148.0,33.6,50.0
1,1.0,85.0,26.6,31.0
2,8.0,183.0,23.3,32.0
3,1.0,89.0,28.1,21.0
4,0.0,137.0,43.1,33.0


In [50]:
# SelectKBest + mutual information: keep the 4 highest-scoring variables.
# The scoring callable uses the (features, target) it receives; the earlier lambda
# ignored its `y` argument and silently read the global `Y` instead.
selector_im_k = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=4,
).fit(X, Y)
selected_columns = datos.drop('Outcome', axis=1).columns[selector_im_k.get_support()]
df_selected = pd.DataFrame(selector_im_k.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI,Age
0,6.0,148.0,33.6,50.0
1,1.0,85.0,26.6,31.0
2,8.0,183.0,23.3,32.0
3,1.0,89.0,28.1,21.0
4,0.0,137.0,43.1,33.0


In [51]:
# SelectPercentile + mutual information with percentile=50.
# The scoring callable uses the (features, target) it receives; the earlier lambda
# ignored its `y` argument and silently read the global `Y` instead.
selector_im_perc = SelectPercentile(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=42),
    percentile=50,
).fit(X, Y)
selected_columns = datos.drop('Outcome', axis=1).columns[selector_im_perc.get_support()]
df_selected = pd.DataFrame(selector_im_perc.transform(X), columns=selected_columns)
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI,Age
0,6.0,148.0,33.6,50.0
1,1.0,85.0,26.6,31.0
2,8.0,183.0,23.3,32.0
3,1.0,89.0,28.1,21.0
4,0.0,137.0,43.1,33.0


# Wrappers

In [52]:
from sklearn.linear_model import LogisticRegression
# Base estimator shared by the wrapper selectors below; max_iter=400 caps the solver iterations.
model = LogisticRegression(max_iter = 400)

In [53]:
# Predictors as a DataFrame (column names preserved) and the target as a Series.
x = datos.drop(columns='Outcome')
y = datos.loc[:, 'Outcome']

## Forward selection

Forward = Hacia adelante

In [54]:
from sklearn.feature_selection import SequentialFeatureSelector, RFE
# Forward sequential selection scored with F1; with n_features_to_select='auto',
# tol=0.001 is the stopping tolerance on the score improvement.
selector_forward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='forward',
    tol=0.001,
)
selector_forward.fit(x, y)

Revisar [aquí](https://scikit-learn.org/dev/modules/model_evaluation.html#scoring-parameter) los posibles valores para el argumento scoring

In [55]:
# Number of input variables seen during fit.
selector_forward.n_features_in_

8

In [56]:
# Column names of the DataFrame the selector was fitted on.
selector_forward.feature_names_in_

array(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'], dtype=object)

In [57]:
# Selection mask of the forward selector (True = kept).
selector_forward.get_support()

array([ True,  True, False, False, False,  True, False, False])

In [58]:
# Names of the selected variables.
selector_forward.get_feature_names_out()

array(['Pregnancies', 'Glucose', 'BMI'], dtype=object)

In [59]:
# x restricted to the selected columns (returned as a NumPy array, per the output).
selector_forward.transform(x)

array([[  6. , 148. ,  33.6],
       [  1. ,  85. ,  26.6],
       [  8. , 183. ,  23.3],
       ...,
       [  5. , 121. ,  26.2],
       [  1. , 126. ,  30.1],
       [  1. ,  93. ,  30.4]])

In [60]:
# Slice the original frame by the selected names so column labels and dtypes are kept.
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI
0,6,148,33.6
1,1,85,26.6
2,8,183,23.3
3,1,89,28.1
4,0,137,43.1


In [61]:
# Forward selection with a large tolerance (tol = 0.10).
selector_forward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='forward',
    tol=0.10,
)
selector_forward.fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Glucose
0,148
1,85
2,183
3,89
4,137


In [62]:
# Forward selection with tol = 1e-3.
selector_forward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='forward',
    tol=1e-3,
)
selector_forward.fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI
0,6,148,33.6
1,1,85,26.6
2,8,183,23.3
3,1,89,28.1
4,0,137,43.1


In [63]:
# Forward selection with a very small tolerance (tol = 1e-5).
selector_forward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='forward',
    tol=1e-5,
)
selector_forward.fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,Age
0,6,148,72,35,33.6,50
1,1,85,66,29,26.6,31
2,8,183,64,0,23.3,32
3,1,89,66,23,28.1,21
4,0,137,40,35,43.1,33


In [64]:
# Forward selection with no tol given (the output below shows 4 of the 8 variables selected).
selector_forward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='forward',
)
selector_forward.fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI
0,6,148,72,33.6
1,1,85,66,26.6
2,8,183,64,23.3
3,1,89,66,28.1
4,0,137,40,43.1


## Backward selection

Backward = Hacia atrás

In [65]:
# Backward elimination scored with F1, using a negative tolerance (tol = -1e-1).
selector_backward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='backward',
    tol=-1e-1,
)
selector_backward.fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Glucose
0,148
1,85
2,183
3,89
4,137


In [66]:
# Backward elimination with tol = -1e-3.
selector_backward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='backward',
    tol=-1e-3,
)
selector_backward.fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,Age
0,6,148,72,33.6,50
1,1,85,66,26.6,31
2,8,183,64,23.3,32
3,1,89,66,28.1,21
4,0,137,40,43.1,33


In [67]:
# Backward elimination with tol = -1e-8.
selector_backward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='backward',
    tol=-1e-8,
)
selector_backward.fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,Age
0,6,148,72,0,33.6,50
1,1,85,66,0,26.6,31
2,8,183,64,0,23.3,32
3,1,89,66,94,28.1,21
4,0,137,40,168,43.1,33


In [68]:
# Backward elimination with no tol given (the output below shows 4 of the 8 variables kept).
selector_backward = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',
    scoring='f1',
    direction='backward',
)
selector_backward.fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI
0,6,148,72,33.6
1,1,85,66,26.6
2,8,183,64,23.3
3,1,89,66,28.1
4,0,137,40,43.1


## Recursive Feature Elimination (RFE)

(RFECV, la versión de RFE con validación cruzada, permite encontrar el valor óptimo de n_features_to_select)

In [70]:
# Recursive feature elimination around the logistic model, asking for exactly 3 variables.
selector_rfe = RFE(model, n_features_to_select=3)
selector_rfe.fit(x, y)

In [71]:
# Per-variable ranking; the selected variables carry rank 1 (compare with the support mask).
selector_rfe.ranking_

array([1, 2, 4, 6, 5, 1, 1, 3])

In [72]:
# Selection mask of RFE (True = kept).
selector_rfe.get_support()

array([ True, False, False, False, False,  True,  True, False])

In [73]:
# Names of the variables RFE kept.
selector_rfe.get_feature_names_out()

array(['Pregnancies', 'BMI', 'DiabetesPedigreeFunction'], dtype=object)

In [74]:
# Slice the original frame by the names RFE kept.
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,BMI,DiabetesPedigreeFunction
0,6,33.6,0.627
1,1,26.6,0.351
2,8,23.3,0.672
3,1,28.1,0.167
4,0,43.1,2.288


In [76]:
# RFE with a fractional target: 0.25 of the 8 variables (the output below shows 2 kept).
selector_rfe = RFE(model, n_features_to_select=0.25)
selector_rfe.fit(x, y)
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,DiabetesPedigreeFunction
0,6,0.627
1,1,0.351
2,8,0.672
3,1,0.167
4,0,2.288


In [77]:
# RFE with its default target size (the output below shows 4 of the 8 variables kept).
selector_rfe = RFE(model)
selector_rfe.fit(x, y)
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BMI,DiabetesPedigreeFunction
0,6,148,33.6,0.627
1,1,85,26.6,0.351
2,8,183,23.3,0.672
3,1,89,28.1,0.167
4,0,137,43.1,2.288


Otros modelos

In [78]:
from sklearn.tree import DecisionTreeClassifier #### another model
# Seed the tree: an unseeded DecisionTreeClassifier can build a different tree on each
# run, so the set of selected variables would not be reproducible.
model_tree = DecisionTreeClassifier(random_state=42)
# No `scoring` is passed here, so the selector falls back to its default scoring.
selector_forward = SequentialFeatureSelector(model_tree, n_features_to_select = 'auto', direction='forward', tol = 1e-5).fit(x, y)
df_selected = datos[selector_forward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Glucose
0,148
1,85
2,183
3,89
4,137


In [79]:
from sklearn.ensemble import RandomForestClassifier #### another model
# Seeded random forest (100 trees) as the wrapped estimator for forward selection.
model_rf = RandomForestClassifier(random_state=42, n_estimators=100)
selector_forward = SequentialFeatureSelector(
    model_rf,
    n_features_to_select='auto',
    direction='forward',
    tol=1e-5,
)
selector_forward.fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,Age
0,6,148,72,33.6,50
1,1,85,66,26.6,31
2,8,183,64,23.3,32
3,1,89,66,28.1,21
4,0,137,40,43.1,33


In [80]:
from sklearn.svm import SVC #### another model
# Linear-kernel SVM as the wrapped estimator for forward selection.
model_svm = SVC(kernel='linear')
selector_forward = SequentialFeatureSelector(
    model_svm,
    n_features_to_select='auto',
    direction='forward',
    tol=1e-5,
)
selector_forward.fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

Unnamed: 0,Glucose,BMI,DiabetesPedigreeFunction
0,148,33.6,0.627
1,85,26.6,0.351
2,183,23.3,0.672
3,89,28.1,0.167
4,137,43.1,2.288


## Boruta

In [81]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [82]:
# Predictors and target for Boruta.
x = datos.drop(columns='Outcome')
y = datos.loc[:, 'Outcome']

# Shallow, class-balanced random forest using all available cores; Boruta wraps this estimator.
rf = RandomForestClassifier(max_depth=5, class_weight='balanced', n_jobs=-1)

In [89]:
# Boruta wrapper around the forest: automatic number of trees, per-iteration
# progress logging (verbose=2) and a fixed seed.
boruta_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',
    verbose=2,
    random_state=2024,
)

In [90]:
# Fit Boruta on plain NumPy arrays rather than on the pandas objects.
boruta_selector.fit(x.to_numpy(), y.to_numpy())

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	8
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	1
Iteration: 	9 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	1
Iteration: 	10 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	1
Iteration: 	11 / 100
Confirmed: 	5
Tentative: 	2
Rejected: 	1
Iteration: 	12 / 100
Confirmed: 	5
Tentative: 	1
Rejected: 	2
Iteration: 	13 / 100
Confirmed: 	5
Tentative: 	1
Rejected: 	2
Iteration: 	14 / 100
Confirmed: 	5
Tentative: 	1
Rejected: 	2
Iteration: 	15 / 100
Confirmed: 	5
Tentative: 	1
Rejected: 	2
Iteration: 	16 / 100
Confirmed: 	5
Tentative: 	1
Rejected: 	2
Iteration: 	17 / 

In [91]:
# Boolean mask over the input columns; it is used below to pick the selected feature names.
boruta_selector.support_

array([ True,  True, False, False,  True,  True,  True,  True])

In [92]:
# Per-variable ranking; in the output, rank 1 coincides with the True entries of support_.
boruta_selector.ranking_

array([1, 1, 2, 2, 1, 1, 1, 1])

In [93]:
# Names of the variables flagged by Boruta's support mask, as a plain Python list.
selected_features = list(x.columns[boruta_selector.support_])
selected_features

['Pregnancies', 'Glucose', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

In [94]:
# Final dataset: the Boruta-selected predictors plus the target column.
datos.loc[:, [*selected_features, 'Outcome']]

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,0,33.6,0.627,50,1
1,1,85,0,26.6,0.351,31,0
2,8,183,0,23.3,0.672,32,1
3,1,89,94,28.1,0.167,21,0
4,0,137,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...
763,10,101,180,32.9,0.171,63,0
764,2,122,0,36.8,0.340,27,0
765,5,121,112,26.2,0.245,30,0
766,1,126,0,30.1,0.349,47,1
