# Eliminación Recursiva de Atributos
Este ejemplo muestra cómo utilizar RFE para reducir el número de atributos.

## Conjunto de datos
El conjunto de datos es la caracterización de tumores benignos y malignos asociados al cáncer de mama. Cuenta con 30 atributos obtenidos de imágenes de tumores y dos clases posibles de tumores.

In [1]:
import pandas as pd

from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Build the attribute table in a single step, labelling every column with
# its feature name.
atributos = pd.DataFrame(cancer.data, columns=cancer.feature_names)

# Class label of each sample.
etiquetas = cancer.target

# Preview the first five rows of the attribute table.
atributos.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Partición de datos

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test split; the fixed random_state
# makes the split repeatable between runs.
(X_entrenamiento, X_prueba,
 y_entrenamiento, y_prueba) = train_test_split(
    atributos,
    etiquetas,
    test_size=0.3,
    random_state=0,
)

## Estandarización de los datos

In [3]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so no statistics from the test
# split are used. fit() returns the fitted scaler itself.
estandarizacion = StandardScaler().fit(X_entrenamiento)

# RFE works better when the data are standardized.
# NOTE(review): wrapping the transformed arrays in a bare DataFrame replaces
# the feature names with integer column labels and resets the row index.
Z_entrenamiento, Z_prueba = (
    pd.DataFrame(estandarizacion.transform(particion))
    for particion in (X_entrenamiento, X_prueba)
)

## Entrenamiento de un modelo de Regresión Logística sin RFE
Se utilizan los 30 atributos del conjunto de datos.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Baseline: logistic regression with default settings, trained on every
# standardized attribute. fit() returns the fitted estimator.
modelo = LogisticRegression().fit(Z_entrenamiento, y_entrenamiento)

# Predict the held-out split and print per-class precision/recall/F1.
y_sin_rfe = modelo.predict(Z_prueba)
print(classification_report(y_prueba, y_sin_rfe))

              precision    recall  f1-score   support

           0       0.98      0.95      0.97        63
           1       0.97      0.99      0.98       108

    accuracy                           0.98       171
   macro avg       0.98      0.97      0.97       171
weighted avg       0.98      0.98      0.98       171



## Entrenamiento de un modelo de Regresión Logística con RFE
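Al final, se utilizan 15 de los 30 atributos para el modelo: como no se indica `n_features_to_select`, RFE conserva por defecto la mitad de los atributos.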

### (1) Obtención del subconjunto de atributos con RFE

In [5]:
from sklearn.feature_selection import RFE

# These techniques only apply to supervised models.
clasificador = LogisticRegression()

# Eliminate one attribute per iteration (step=1). No target count is passed
# to RFE here, so its default number of attributes to keep applies.
seleccionador = RFE(estimator=clasificador, step=1).fit(
    Z_entrenamiento, y_entrenamiento
)

# support_ is the boolean mask of kept attributes; ranking_ assigns rank 1 to
# every kept attribute and higher ranks to the discarded ones.
print(seleccionador.support_, seleccionador.ranking_, sep="\n")

[False False False False False False  True  True  True False  True False
  True  True False  True False False False  True  True  True  True  True
 False False  True  True False  True]
[10  2  6  3 14  5  1  1  1 16  1 13  1  1  9  1 15  8 12  1  1  1  1  1
  4 11  1  1  7  1]


### (2) Entrenamiento del modelo de Regresión Logística con el subconjunto de atributos

In [9]:
# Keep only the columns that RFE marked as selected, applying the same
# boolean mask to the training and the test split.
Z_rfe = Z_entrenamiento.loc[:, seleccionador.get_support()]
Z_rfe_prueba = Z_prueba.loc[:, seleccionador.get_support()]

# Preview of the selected subset of attributes.
Z_rfe.head(5)

Unnamed: 0,6,7,8,10,12,13,15,19,20,21,22,23,26,27,29
0,-0.45755,-0.76055,-0.09986,-0.700612,-0.616731,-0.543408,-0.235489,-0.045963,-0.798483,-0.591967,-0.746602,-0.714529,-0.046272,-0.623597,0.450628
1,-0.843301,-0.808805,-1.159759,-0.887604,-0.869191,-0.629005,-0.936002,-0.158707,-1.068703,-0.161981,-1.074343,-0.868941,-0.954894,-0.761238,-0.295414
2,-0.639856,-0.668125,0.581758,-0.644071,-0.656119,-0.499806,-0.563799,-0.474919,-0.558512,-0.051227,-0.61411,-0.552748,-0.384602,-0.504704,-0.133716
3,-0.091016,0.221141,-0.663712,-0.515663,-0.498094,-0.250449,-0.946767,-1.00523,0.998276,0.134448,0.91419,0.86376,0.441794,0.638339,-0.799466
4,-1.137578,-1.263267,0.439011,-0.630106,-0.623375,-0.633968,-0.848096,0.138526,-1.699888,-0.976348,-1.665615,-1.18748,-1.391304,-1.756275,0.56326


In [10]:
# Train a fresh logistic regression on the RFE-selected attributes only.
modelo_rfe = LogisticRegression().fit(Z_rfe, y_entrenamiento)

# Evaluate on the test split restricted to the same attributes and print the
# per-class report.
y_con_rfe = modelo_rfe.predict(Z_rfe_prueba)
print(classification_report(y_prueba, y_con_rfe))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95        63
           1       0.96      0.98      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171

