<a href="https://colab.research.google.com/github/jeguns/EP7173/blob/main/Unidad%2006/Selecci%C3%B3n_de_atributos_en_modelos_de_regresi%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalación de paquetes

In [None]:
pip install boruta

## Lectura de datos

El dataset Boston tiene su origen en un estudio publicado en 1978 por los economistas David Harrison Jr. y Daniel L. Rubinfeld («Hedonic housing prices and the demand for clean air»), elaborado con datos del censo de 1970 para el área metropolitana de Boston.

El objetivo del dataset es predecir el valor medio de las casas en diferentes áreas de Boston basándose en diversas características socioeconómicas, geográficas y ambientales.

Variables contenidas:

CRIM - Tasa de criminalidad per cápita por ciudad.

ZN - Proporción de suelo residencial zonificado para lotes de más de 25,000 pies cuadrados.

INDUS - Proporción de acres comerciales no minoristas por ciudad.

CHAS - Variable indicadora de Charles River (1 si el área está al lado del río; 0 en caso contrario).

NOX - Concentración de óxidos de nitrógeno (partes por 10 millones).

RM - Número medio de habitaciones por vivienda.

AGE - Proporción de unidades ocupadas por propietarios construidas antes de 1940.

DIS - Distancia ponderada a cinco centros de empleo de Boston.

RAD - Índice de accesibilidad a carreteras radiales.

TAX - Tasa de impuesto a la propiedad por cada $10,000.

PTRATIO - Ratio de alumnos por maestro en la ciudad.

B - $1000(B_k - 0.63)^2$, donde $B_k$ es la proporción de personas de raza negra en la ciudad.

LSTAT - Porcentaje de población con bajo estatus socioeconómico.

MEDV - Valor medio de las viviendas ocupadas por propietarios en $1,000 (variable objetivo).

In [None]:
import pandas as pd
import numpy as np

# Column labels assigned to the file's fields, in order; the target MEDV is last.
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

# Fields are separated by runs of whitespace and labelled via `names`.
# sep=r'\s+' is the supported replacement for delim_whitespace=True, which pandas has deprecated.
datos = pd.read_csv('housing.csv', sep=r'\s+', names=column_names)
datos.head()

In [None]:
# Split the raw value matrix into predictors X (the first 13 columns)
# and the target Y (the column at position 13).
array = datos.values
X, Y = array[:, :13], array[:, 13]

In [None]:
# Display pandas' descriptive statistics for datos.
datos.describe()

# Filtros

## Selección por umbral de varianza

In [None]:
from sklearn.feature_selection import (
    VarianceThreshold,
    SelectKBest,
    SelectPercentile,
    SelectFpr,
    SelectFdr,
    SelectFwe,
)

# Variance filter with threshold 0, fitted on X; the cell displays the filtered matrix.
selector_var = VarianceThreshold(threshold=0)
selector_var.fit_transform(X)

In [None]:
# Names of the predictor columns (everything except MEDV) kept by the variance filter.
selected_columns = datos.drop(columns='MEDV').columns[selector_var.get_support()]
selected_columns

In [None]:
# Put the filtered matrix back into a labelled DataFrame and preview it.
df_selected = pd.DataFrame(
    data=selector_var.transform(X),
    columns=selected_columns,
)
df_selected.head()

## Selección por correlación

In [None]:
from sklearn.feature_selection import (
    r_regression,
    f_regression,
    mutual_info_regression,
)

# SelectKBest with r_regression as the scoring function and k=4, fitted on (X, Y).
selector_cor = SelectKBest(score_func=r_regression, k=4)
selector_cor.fit(X, Y)
selector_cor

In [None]:
# Scores produced by r_regression during fit.
selector_cor.scores_

In [None]:
# Number of input features recorded by the selector at fit time.
selector_cor.n_features_in_

In [None]:
# Selection mask of the fitted selector (used below to index the column names).
selector_cor.get_support()

In [None]:
# Names of the predictor columns selected by selector_cor.
datos.drop(columns='MEDV').columns[selector_cor.get_support()]

Seleccionando las variables más asociadas (independientemente de si la asociación es directa o inversa):

In [None]:
# Order the features by the absolute value of their score and keep the last four
# positions of the ascending argsort, i.e. the four largest magnitudes.
selected_indices = np.abs(selector_cor.scores_).argsort()[-4:]
selected_indices

In [None]:
# selected_indices are positions within X, which holds only the predictor columns.
# Look the names up in the predictor columns (MEDV dropped), like the other cells do,
# so the lookup does not depend on MEDV happening to be the last column of datos.
selected_columns = datos.drop('MEDV', axis=1).columns[selected_indices]
selected_columns

In [None]:
# Slice the chosen columns out of X and label them with their names.
df_selected = pd.DataFrame(
    data=X[:, selected_indices],
    columns=selected_columns,
)
df_selected.head()

## Selección por el estadístico F de ANOVA

In [None]:
# SelectKBest with f_regression as the scoring function and k=4; display the scores.
selector_anova = SelectKBest(score_func=f_regression, k=4)
selector_anova.fit(X, Y)
selector_anova.scores_

In [None]:
# p-values stored by the fitted f_regression selector.
selector_anova.pvalues_

In [None]:
# Number of input features recorded by the selector at fit time.
selector_anova.n_features_in_

In [None]:
# Selection mask of the fitted selector (used below to index the column names).
selector_anova.get_support()

In [None]:
# Label the columns kept by selector_anova and preview the reduced table.
selected_columns = datos.drop(columns='MEDV').columns[selector_anova.get_support()]
df_selected = pd.DataFrame(
    data=selector_anova.transform(X),
    columns=selected_columns,
)
df_selected.head()

In [None]:
# f_regression scores with a fixed number of features to keep (k=4).
selector_anova_k = SelectKBest(score_func=f_regression, k=4)
selector_anova_k.fit(X, Y)
selected_columns = datos.drop(columns='MEDV').columns[selector_anova_k.get_support()]
df_selected = pd.DataFrame(
    data=selector_anova_k.transform(X),
    columns=selected_columns,
)
df_selected.head()

In [None]:
# f_regression scores with a percentile-based cut (percentile=50).
selector_anova_perc = SelectPercentile(score_func=f_regression, percentile=50)
selector_anova_perc.fit(X, Y)
selected_columns = datos.drop(columns='MEDV').columns[selector_anova_perc.get_support()]
df_selected = pd.DataFrame(
    data=selector_anova_perc.transform(X),
    columns=selected_columns,
)
df_selected.head()

In [None]:
# f_regression p-values filtered with SelectFpr at alpha=0.01.
selector_anova_fpr = SelectFpr(score_func=f_regression, alpha=0.01)
selector_anova_fpr.fit(X, Y)
selected_columns = datos.drop(columns='MEDV').columns[selector_anova_fpr.get_support()]
df_selected = pd.DataFrame(
    data=selector_anova_fpr.transform(X),
    columns=selected_columns,
)
df_selected.head()

In [None]:
# False-discovery-rate selection on the f_regression p-values at alpha=0.01.
# The original built this selector with SelectFpr, which merely repeated the
# previous cell; SelectFdr is the class the variable name refers to.
selector_anova_fdr = SelectFdr(score_func=f_regression, alpha=0.01).fit(X, Y)
selected_columns = datos.drop('MEDV', axis=1).columns[selector_anova_fdr.get_support()]
df_selected = pd.DataFrame(selector_anova_fdr.transform(X), columns=selected_columns)
df_selected.head()

In [None]:
# Family-wise-error selection on the f_regression p-values at alpha=0.01.
# The original built this selector with SelectFpr, which merely repeated an
# earlier cell; SelectFwe is the class the variable name refers to.
selector_anova_fwe = SelectFwe(score_func=f_regression, alpha=0.01).fit(X, Y)
selected_columns = datos.drop('MEDV', axis=1).columns[selector_anova_fwe.get_support()]
df_selected = pd.DataFrame(selector_anova_fwe.transform(X), columns=selected_columns)
df_selected.head()

## Selección por el indicador de información mutua

In [None]:
# Mutual-information scores with a fixed seed, keeping k=4 features.
# The scorer must use the arguments it is called with: the original lambda ignored
# its `y` parameter and read the notebook-level `Y` instead, which only worked
# because the selector happened to be fitted on that same `Y`.
selector_im = SelectKBest(
    score_func=lambda features, target: mutual_info_regression(features, target, random_state=42),
    k=4,
).fit(X, Y)
selector_im.scores_

In [None]:
# Selection mask of the fitted selector (used below to index the column names).
selector_im.get_support()

In [None]:
# Label the columns kept by selector_im and preview the reduced table.
selected_columns = datos.drop(columns='MEDV').columns[selector_im.get_support()]
df_selected = pd.DataFrame(
    data=selector_im.transform(X),
    columns=selected_columns,
)
df_selected.head()

In [None]:
# Mutual-information scores (fixed seed) with a fixed number of features, k=4.
# The scorer uses the (features, target) it receives; the original lambda ignored
# its `y` parameter and silently read the notebook-level `Y`.
selector_im_k = SelectKBest(
    score_func=lambda features, target: mutual_info_regression(features, target, random_state=42),
    k=4,
).fit(X, Y)
selected_columns = datos.drop('MEDV', axis=1).columns[selector_im_k.get_support()]
df_selected = pd.DataFrame(selector_im_k.transform(X), columns=selected_columns)
df_selected.head()

In [None]:
# Mutual-information scores (fixed seed) with a percentile-based cut (percentile=50).
# The scorer uses the (features, target) it receives; the original lambda ignored
# its `y` parameter and silently read the notebook-level `Y`.
selector_im_perc = SelectPercentile(
    score_func=lambda features, target: mutual_info_regression(features, target, random_state=42),
    percentile=50,
).fit(X, Y)
selected_columns = datos.drop('MEDV', axis=1).columns[selector_im_perc.get_support()]
df_selected = pd.DataFrame(selector_im_perc.transform(X), columns=selected_columns)
df_selected.head()

# Wrappers

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression with default settings; passed as the estimator to the wrapper selectors.
model = LinearRegression()

In [None]:
# Predictors as a DataFrame (column names preserved) and MEDV as the target Series.
x = datos.drop(columns='MEDV')
y = datos.loc[:, 'MEDV']

## Forward selection

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector, RFE

# Forward sequential selection around the linear model, scored by negative MSE,
# with n_features_to_select='auto' and tol=1e-3.
selector_forward = SequentialFeatureSelector(
    estimator=model,
    direction='forward',
    n_features_to_select='auto',
    tol=1e-3,
    scoring='neg_mean_squared_error',
).fit(x, y)

Revisar [aquí](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter) los posibles valores para el argumento `scoring`.

In [None]:
# Number of input features recorded by the selector at fit time.
selector_forward.n_features_in_

In [None]:
# Input feature names recorded at fit time (x was passed as a DataFrame).
selector_forward.feature_names_in_

In [None]:
# Selection mask of the fitted forward selector.
selector_forward.get_support()

In [None]:
# Names of the selected features; used below to index datos.
selector_forward.get_feature_names_out()

In [None]:
# Apply the fitted selector to x and display the reduced data.
selector_forward.transform(x)

In [None]:
# Pick the selected columns out of datos by name and preview them.
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

In [None]:
# Same forward search, now with tol=5e-1.
selector_forward = SequentialFeatureSelector(
    estimator=model,
    direction='forward',
    n_features_to_select='auto',
    tol=5e-1,
    scoring='neg_mean_squared_error',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

In [None]:
# Same forward search, now with tol=1e-1.
selector_forward = SequentialFeatureSelector(
    estimator=model,
    direction='forward',
    n_features_to_select='auto',
    tol=1e-1,
    scoring='neg_mean_squared_error',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

In [None]:
# Same forward search, now with tol=1e-3.
selector_forward = SequentialFeatureSelector(
    estimator=model,
    direction='forward',
    n_features_to_select='auto',
    tol=1e-3,
    scoring='neg_mean_squared_error',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

In [None]:
# Same forward search without passing tol, so the selector's default applies.
selector_forward = SequentialFeatureSelector(
    estimator=model,
    direction='forward',
    n_features_to_select='auto',
    scoring='neg_mean_squared_error',
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

## Backward selection

In [None]:
# Backward sequential selection scored by negative MSE, with a negative tol of -8e-1.
selector_backward = SequentialFeatureSelector(
    estimator=model,
    direction='backward',
    n_features_to_select='auto',
    tol=-8e-1,
    scoring='neg_mean_squared_error',
).fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head()

In [None]:
# Same backward search, now with tol=-1e-1.
selector_backward = SequentialFeatureSelector(
    estimator=model,
    direction='backward',
    n_features_to_select='auto',
    tol=-1e-1,
    scoring='neg_mean_squared_error',
).fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head()

In [None]:
# Same backward search without passing tol, so the selector's default applies.
selector_backward = SequentialFeatureSelector(
    estimator=model,
    direction='backward',
    n_features_to_select='auto',
    scoring='neg_mean_squared_error',
).fit(x, y)
df_selected = datos.loc[:, selector_backward.get_feature_names_out()]
df_selected.head()

## Recursive Feature Elimination (RFE)

In [None]:
# Recursive feature elimination around the linear model, asking for 3 features.
selector_rfe = RFE(
    estimator=model,
    n_features_to_select=3,
).fit(x, y)

In [None]:
# Feature ranking stored by the fitted RFE selector.
selector_rfe.ranking_

In [None]:
# Selection mask of the fitted RFE selector.
selector_rfe.get_support()

In [None]:
# Names of the features kept by RFE; used below to index datos.
selector_rfe.get_feature_names_out()

In [None]:
# Pick the RFE-selected columns out of datos by name and preview them.
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head()

In [None]:
# RFE again, this time passing n_features_to_select as the float 0.7.
selector_rfe = RFE(
    estimator=model,
    n_features_to_select=0.7,
).fit(x, y)
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head()

In [None]:
# RFE without n_features_to_select, so the selector's default applies.
selector_rfe = RFE(estimator=model).fit(x, y)
df_selected = datos.loc[:, selector_rfe.get_feature_names_out()]
df_selected.head()

## Exhaustive Feature Selection

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector

# Exhaustive search over feature subsets of size 1 to 4 around the linear model,
# scored by negative MSE with cv=5. Only configured here; fitting happens in a later cell.
selector_efs = ExhaustiveFeatureSelector(
    estimator=model,
    min_features=1,
    max_features=4,
    scoring='neg_mean_squared_error',
    cv=5,
    print_progress=True,
)

In [None]:
import warnings

# Silence warnings only while the exhaustive search runs. catch_warnings() restores the
# previous filter state on exit, even if fit raises, instead of leaving a global
# 'ignore' filter installed until another cell undoes it.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    selector_efs.fit(x, y)

# fit returns the selector itself; end the cell with it so it is still displayed.
selector_efs

In [None]:
# Switch warnings back on after the fit by installing a catch-all 'default' filter.
warnings.filterwarnings('default')

In [None]:
# Entry 0 of the subsets record kept by the fitted exhaustive selector.
selector_efs.subsets_[0]

In [None]:
# Entry 20 of the subsets record kept by the fitted exhaustive selector.
selector_efs.subsets_[20]

In [None]:
# Score of the best subset found (the scoring passed was 'neg_mean_squared_error').
selector_efs.best_score_

In [None]:
# Column indices of the best subset found.
selector_efs.best_idx_

In [None]:
# Feature names of the best subset; kept in a variable to index datos in the next cell.
selected_features = selector_efs.best_feature_names_
selected_features

In [None]:
# Convert the best-subset names to a list and use it to pick those columns from datos.
df_selected = datos.loc[:, list(selected_features)]
df_selected.head()

Otros modelos:

In [None]:
from sklearn.svm import SVR

# Forward selection repeated with a linear-kernel SVR as the estimator.
# No scoring argument is passed here, and tol is 1e-5.
model_svr = SVR(kernel='linear')
selector_forward = SequentialFeatureSelector(
    estimator=model_svr,
    direction='forward',
    n_features_to_select='auto',
    tol=1e-5,
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Forward selection repeated with a 5-neighbour KNN regressor as the estimator.
# No scoring argument is passed here, and tol is 1e-5.
model_knr = KNeighborsRegressor(n_neighbors=5)
selector_forward = SequentialFeatureSelector(
    estimator=model_knr,
    direction='forward',
    n_features_to_select='auto',
    tol=1e-5,
).fit(x, y)
df_selected = datos.loc[:, selector_forward.get_feature_names_out()]
df_selected.head()

## Boruta

In [None]:
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

In [None]:
# Rebuild predictors and target, then create the random forest handed to Boruta.
x = datos.drop(columns='MEDV')
y = datos.loc[:, 'MEDV']

rf = RandomForestRegressor(n_estimators=100)

In [None]:
# Boruta wrapper around the random forest, with n_estimators='auto', no verbose output
# and a fixed random_state.
boruta_selector = BorutaPy(estimator=rf, n_estimators='auto', verbose=0, random_state=2024)

In [None]:
# Fit Boruta on NumPy arrays (.values) rather than on the pandas objects.
boruta_selector.fit(x.values, y.values)

In [None]:
# Support mask of the fitted Boruta selector; used below to index x.columns.
boruta_selector.support_

In [None]:
# Feature ranking stored by the fitted Boruta selector.
boruta_selector.ranking_

In [None]:
# Column names of x flagged by Boruta's support mask, as a plain Python list.
selected_features = list(x.columns[boruta_selector.support_])
selected_features

In [None]:
# Show the Boruta-selected predictors together with the MEDV target.
datos[[*selected_features, 'MEDV']]