<a href="https://colab.research.google.com/github/jeguns/EP7173/blob/main/Unidad%2006/Selecci%C3%B3n_de_atributos_en_modelos_de_clasificaci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalación de paquetes

In [None]:
pip install boruta

## Lectura de datos

El archivo diabetes.csv proviene originalmente del Instituto Nacional de Diabetes y Enfermedades Digestivas y Renales. El objetivo del conjunto de datos es predecir si una paciente tiene o no diabetes a partir de ciertas mediciones diagnósticas incluidas en el conjunto de datos. Se impusieron varias restricciones a la selección de estos casos de una base de datos más grande. En particular, todas las pacientes aquí son mujeres de al menos 21 años de edad de ascendencia indígena Pima.

Variables contenidas:
- Pregnancies: Número de embarazos

- Glucose: Concentración de glucosa en un test oral de tolerancia a la glucosa.

- BloodPressure: Presión arterial diastólica (en mmHg)

- SkinThickness: Grosor del pliegue cutáneo del tríceps (mm)

- Insulin: Insulina sérica a las 2 horas (en mu U/ml)

- BMI: Índice de Masa Corporal (peso en kg / (altura en m)^2)

- DiabetesPedigreeFunction: Función de pedigrí de diabetes (puntaje de propensión a la diabetes según los antecedentes familiares)

- Age: Edad (en años)

- Outcome: Target que indica si tiene o no diabetes

In [None]:
import pandas as pd
import numpy as np
# Load the diabetes dataset from the working directory and preview its first rows.
datos = pd.read_csv(filepath_or_buffer='diabetes.csv')
datos.head(n=5)

In [None]:
# Raw value matrix: the first 8 columns are the predictors, the 9th is the target.
array = datos.to_numpy()
X, Y = array[:, :8], array[:, 8]

In [None]:
# Descriptive statistics of every column (useful to judge the variance threshold below).
datos.describe()

# Filtros

## Selección por umbral de varianza

In [None]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, SelectFpr, SelectFdr, SelectFwe

# Drop every feature whose variance does not exceed 20 and show the reduced matrix.
selector_var = VarianceThreshold(threshold=20)
selector_var.fit(X).transform(X)

In [None]:
# Names of the predictors that passed the variance filter.
selected_columns = datos.columns.drop('Outcome')[selector_var.get_support()]
selected_columns

In [None]:
# Rebuild a labelled DataFrame from the variance-filtered matrix.
df_selected = pd.DataFrame(data=selector_var.transform(X), columns=selected_columns)
df_selected.head(n=5)

## Selección por el estadístico Chi^2

In [None]:
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

# Score every feature with the chi-squared test and keep the 4 best.
selector_chi2 = SelectKBest(chi2, k=4)
selector_chi2.fit(X, Y)
selector_chi2

In [None]:
# Chi-squared score of each feature.
selector_chi2.scores_

In [None]:
# p-value associated with each feature score.
selector_chi2.pvalues_

In [None]:
# Number of features seen during fit.
selector_chi2.n_features_in_

In [None]:
# Boolean mask marking the features that were kept.
selector_chi2.get_support()

In [None]:
# Names of the predictors kept by the chi-squared selector.
selected_columns = datos.columns.drop('Outcome')[selector_chi2.get_support()]
selected_columns

In [None]:
# Feature matrix reduced to the columns kept by the chi-squared selector.
selector_chi2.transform(X)

In [None]:
# Labelled DataFrame holding only the chi-squared-selected features.
df_selected = pd.DataFrame(data=selector_chi2.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# Chi-squared filter, fixed-size variant: keep the 4 highest-scoring features.
selector_chi2_k = SelectKBest(chi2, k=4)
selector_chi2_k.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_chi2_k.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_k.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# Chi-squared filter, percentile variant: keep the top 50% of the features.
selector_chi2_perc = SelectPercentile(chi2, percentile=50)
selector_chi2_perc.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_chi2_perc.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_perc.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# Chi-squared filter, false-positive-rate variant: keep features with p-value below 0.01.
selector_chi2_fpr = SelectFpr(chi2, alpha=1e-2)
selector_chi2_fpr.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_chi2_fpr.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_fpr.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# p-values that are compared against alpha = 1e-2.
selector_chi2_fpr.pvalues_

In [None]:
# Boolean mask of the features kept at alpha = 1e-2.
selector_chi2_fpr.get_support()

In [None]:
# Same false-positive-rate filter with a much stricter threshold (alpha = 1e-5).
selector_chi2_fpr = SelectFpr(chi2, alpha=1e-5)
selector_chi2_fpr.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_chi2_fpr.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_fpr.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# p-values that are compared against alpha = 1e-5.
selector_chi2_fpr.pvalues_

In [None]:
# Boolean mask of the features kept at alpha = 1e-5.
selector_chi2_fpr.get_support()

In [None]:
# Chi-squared filter controlling the false discovery rate (Benjamini-Hochberg) at 0.01.
selector_chi2_fdr = SelectFdr(chi2, alpha=0.01)
selector_chi2_fdr.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_chi2_fdr.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_fdr.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# Chi-squared filter controlling the family-wise error rate at 0.01.
selector_chi2_fwe = SelectFwe(chi2, alpha=0.01)
selector_chi2_fwe.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_chi2_fwe.get_support()]
df_selected = pd.DataFrame(data=selector_chi2_fwe.transform(X), columns=selected_columns)
df_selected.head(n=5)

## Selección por el estadístico F de ANOVA

In [None]:
# Score every feature with the ANOVA F-test, keep the 4 best and show the F scores.
selector_anova = SelectKBest(f_classif, k=4)
selector_anova.fit(X, Y)
selector_anova.scores_

In [None]:
# p-value associated with each F score.
selector_anova.pvalues_

In [None]:
# Number of features seen during fit.
selector_anova.n_features_in_

In [None]:
# Boolean mask marking the features that were kept.
selector_anova.get_support()

In [None]:
# Labelled DataFrame holding only the features kept by the ANOVA selector.
selected_columns = datos.columns.drop('Outcome')[selector_anova.get_support()]
df_selected = pd.DataFrame(data=selector_anova.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# ANOVA F-test filter, fixed-size variant: keep the 4 highest-scoring features.
selector_anova_k = SelectKBest(f_classif, k=4)
selector_anova_k.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_anova_k.get_support()]
df_selected = pd.DataFrame(data=selector_anova_k.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# ANOVA F-test filter, percentile variant: keep the top 50% of the features.
selector_anova_perc = SelectPercentile(f_classif, percentile=50)
selector_anova_perc.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_anova_perc.get_support()]
df_selected = pd.DataFrame(data=selector_anova_perc.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# ANOVA F-test filter, false-positive-rate variant: keep features with p-value below 0.01.
selector_anova_fpr = SelectFpr(f_classif, alpha=0.01)
selector_anova_fpr.fit(X, Y)
selected_columns = datos.columns.drop('Outcome')[selector_anova_fpr.get_support()]
df_selected = pd.DataFrame(data=selector_anova_fpr.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# ANOVA F-test filter controlling the false discovery rate (Benjamini-Hochberg) at 0.01.
# SelectFdr is required here: SelectFpr would only repeat the false-positive-rate
# filter and apply no multiple-testing correction.
selector_anova_fdr = SelectFdr(score_func=f_classif, alpha=0.01).fit(X, Y)
selected_columns = datos.drop('Outcome', axis=1).columns[selector_anova_fdr.get_support()]
df_selected = pd.DataFrame(selector_anova_fdr.transform(X), columns=selected_columns)
df_selected.head()

In [None]:
# ANOVA F-test filter controlling the family-wise error rate at 0.01.
# SelectFwe is required here: SelectFpr would only repeat the false-positive-rate
# filter and apply no family-wise correction.
selector_anova_fwe = SelectFwe(score_func=f_classif, alpha=0.01).fit(X, Y)
selected_columns = datos.drop('Outcome', axis=1).columns[selector_anova_fwe.get_support()]
df_selected = pd.DataFrame(selector_anova_fwe.transform(X), columns=selected_columns)
df_selected.head()

## Selección por el indicador de información mutua

$$I(X;Y) = \sum_x\sum_y{p(x,y)\log\left(\frac{p(x,y)}{p(x)p(y)}\right)}$$

In [None]:
# Rank the features by estimated mutual information with the target and keep the 4 best.
# The scoring wrapper must use the `y` it receives: reading the global `Y` instead
# would score against the wrong labels whenever the selector is fitted on other
# data (e.g. a cross-validation fold). random_state=42 keeps the estimate reproducible.
selector_im = SelectKBest(
    score_func=lambda X, y: mutual_info_classif(X, y, random_state=42),
    k=4,
).fit(X, Y)
selector_im.scores_

In [None]:
# Boolean mask marking the features that were kept.
selector_im.get_support()

In [None]:
# Labelled DataFrame holding only the features kept by the mutual-information selector.
selected_columns = datos.columns.drop('Outcome')[selector_im.get_support()]
df_selected = pd.DataFrame(data=selector_im.transform(X), columns=selected_columns)
df_selected.head(n=5)

In [None]:
# Mutual-information filter, fixed-size variant: keep the 4 highest-scoring features.
# The scoring wrapper uses the `y` it receives rather than the global `Y`, so the
# scores always match the data passed to fit; random_state=42 keeps them reproducible.
selector_im_k = SelectKBest(
    score_func=lambda X, y: mutual_info_classif(X, y, random_state=42),
    k=4,
).fit(X, Y)
selected_columns = datos.drop('Outcome', axis=1).columns[selector_im_k.get_support()]
df_selected = pd.DataFrame(selector_im_k.transform(X), columns=selected_columns)
df_selected.head()

In [None]:
# Mutual-information filter, percentile variant: keep the top 50% of the features.
# The scoring wrapper uses the `y` it receives rather than the global `Y`, so the
# scores always match the data passed to fit; random_state=42 keeps them reproducible.
selector_im_perc = SelectPercentile(
    score_func=lambda X, y: mutual_info_classif(X, y, random_state=42),
    percentile=50,
).fit(X, Y)
selected_columns = datos.drop('Outcome', axis=1).columns[selector_im_perc.get_support()]
df_selected = pd.DataFrame(selector_im_perc.transform(X), columns=selected_columns)
df_selected.head()

# Wrappers

In [None]:
from sklearn.linear_model import LogisticRegression
# Base estimator for the wrapper methods; max_iter is set to 400 iterations.
model = LogisticRegression(max_iter = 400)

In [None]:
# Predictors as a DataFrame (keeps the column names) and the target as a Series.
x, y = datos.drop(columns='Outcome'), datos['Outcome']

## Forward selection

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector, RFE
# Forward selection with logistic regression, scored by cross-validated F1.
# With n_features_to_select='auto' and a tol, features are added until the score
# gain between two consecutive additions falls below tol (1e-3).
selector_forward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    tol=1e-3,
    direction='forward',
    scoring='f1',
).fit(x, y)

Revisar [aquí](https://scikit-learn.org/dev/modules/model_evaluation.html#scoring-parameter) los posibles valores para el argumento scoring

In [None]:
# Number of features seen during fit.
selector_forward.n_features_in_

In [None]:
# Column names seen during fit (available because x is a DataFrame).
selector_forward.feature_names_in_

In [None]:
# Boolean mask marking the features that were kept.
selector_forward.get_support()

In [None]:
# Names of the features that were kept.
selector_forward.get_feature_names_out()

In [None]:
# Feature matrix reduced to the selected columns.
selector_forward.transform(x)

In [None]:
# Slice the original DataFrame down to the columns chosen by forward selection.
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# Forward selection with a large tolerance (tol = 1e-1): an addition must raise F1 by at least 0.1.
selector_forward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    tol=1e-1,
    direction='forward',
    scoring='f1',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# Forward selection with a moderate tolerance (tol = 1e-3).
selector_forward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    tol=1e-3,
    direction='forward',
    scoring='f1',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# Forward selection with a very small tolerance (tol = 1e-5).
selector_forward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    tol=1e-5,
    direction='forward',
    scoring='f1',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# Forward selection without tol: with 'auto' and no tolerance, half of the features are selected.
selector_forward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    direction='forward',
    scoring='f1',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head(n=5)

## Backward selection

In [None]:
# Backward elimination with a negative tol (-1e-1): removals that cost up to 0.1 of F1 are accepted.
selector_backward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    tol=-1e-1,
    direction='backward',
    scoring='f1',
).fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# Backward elimination with tol = -1e-3.
selector_backward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    tol=-1e-3,
    direction='backward',
    scoring='f1',
).fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# Backward elimination with tol = -1e-8 (practically no loss of F1 tolerated).
selector_backward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    tol=-1e-8,
    direction='backward',
    scoring='f1',
).fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# Backward elimination without tol: with 'auto' and no tolerance, half of the features are kept.
selector_backward = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select='auto',
    direction='backward',
    scoring='f1',
).fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head(n=5)

## RFE

In [None]:
# Recursive feature elimination with logistic regression, stopping at 3 features.
selector_rfe = RFE(
    estimator=model,
    n_features_to_select=3,
).fit(x, y)

In [None]:
# Elimination ranking of each feature; selected features have rank 1.
selector_rfe.ranking_

In [None]:
# Boolean mask marking the features that were kept.
selector_rfe.get_support()

In [None]:
# Names of the features that were kept.
selector_rfe.get_feature_names_out()

In [None]:
# Slice the original DataFrame down to the columns chosen by RFE.
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# RFE with a fractional target: a float in (0, 1) is read as the share of features to keep (70%).
selector_rfe = RFE(
    estimator=model,
    n_features_to_select=0.7,
).fit(x, y)
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
# RFE with the default target: half of the features are kept.
selector_rfe = RFE(estimator=model).fit(x, y)
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Forward selection driven by a decision tree, using the estimator's default score.
# A fixed random_state makes the tree (whose split search shuffles the features)
# and therefore the selected subset reproducible between runs, matching the
# seeded random forest used in the next example.
model_tree = DecisionTreeClassifier(random_state=42)
selector_forward = SequentialFeatureSelector(model_tree, n_features_to_select = 'auto', direction='forward', tol = 1e-5).fit(x, y)
df_selected = datos[selector_forward.get_feature_names_out()]
df_selected.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Forward selection driven by a seeded random forest of 100 trees (default scoring).
model_rf = RandomForestClassifier(random_state=42, n_estimators=100)
selector_forward = SequentialFeatureSelector(
    estimator=model_rf,
    n_features_to_select='auto',
    tol=1e-5,
    direction='forward',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head(n=5)

In [None]:
from sklearn.svm import SVC
# Forward selection driven by a linear-kernel support vector classifier (default scoring).
model_svm = SVC(kernel='linear')
selector_forward = SequentialFeatureSelector(
    estimator=model_svm,
    n_features_to_select='auto',
    tol=1e-5,
    direction='forward',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head(n=5)

## Boruta

In [None]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [None]:
# Predictors and target as pandas objects, plus the random forest that Boruta will wrap:
# depth-limited trees, class weights balanced, all CPU cores.
x, y = datos.drop(columns='Outcome'), datos['Outcome']

rf = RandomForestClassifier(max_depth=5, class_weight='balanced', n_jobs=-1)

In [None]:
# Boruta selector built on the random forest above, silent (verbose=0) and seeded (2024).
# NOTE(review): n_estimators='auto' presumably lets Boruta choose the number of trees itself - confirm in the BorutaPy docs.
boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', verbose=0, random_state=2024)

In [None]:
# Fit on the underlying NumPy arrays rather than on the pandas objects.
boruta_selector.fit(x.values, y.values)

In [None]:
# Column names selected by the boolean mask boruta_selector.support_.
selected_features = list(x.columns[boruta_selector.support_])
selected_features

In [None]:
# Final dataset: the Boruta-selected predictors followed by the target column.
datos[[*selected_features, 'Outcome']]