# Recursive Feature Elimination (RFE)
ref: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## data dictionary
source: https://www.kaggle.com/uciml/pima-indians-diabetes-database  
* Pregnancies: Number of times pregnant
* Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
* BloodPressure: Diastolic blood pressure (mm Hg)
* SkinThickness: Triceps skin fold thickness (mm)
* Insulin: 2-Hour serum insulin (mu U/ml)
* BMI: Body mass index (weight in kg/(height in m)^2)
* DiabetesPedigreeFunction: Diabetes pedigree function
* Age: Age (years)
* Outcome: Class variable (0 or 1); 268 of the 768 rows are 1, the others are 0

In [None]:
# Read the Pima Indians diabetes CSV from GitHub into a DataFrame and preview it.
url = 'https://github.com/mathawanup/basic_dataset/raw/master/diabetes.csv'
df = pd.read_csv(url)
df.head()

In [None]:
# Print a concise summary of the frame: column dtypes and non-null counts.
df.info()

## sklearn: Classifier

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

# RFE needs an estimator that exposes coef_ or feature_importances_ after fitting.
# Other suitable classifiers: GradientBoostingClassifier, RandomForestClassifier,
# AdaBoostClassifier, ExtraTreesClassifier, SVC (coef_ is only available with kernel='linear')
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.feature_selection import RFE 

In [None]:
# Show the column labels available in the loaded frame.
df.columns

In [None]:
# Predictors are the eight measurement columns; 'Outcome' is the binary target.
cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
        'BMI', 'DiabetesPedigreeFunction', 'Age']
X = df[cols]
y = df['Outcome']

# Hold out 30% of the rows for testing; stratify on y so both splits keep
# the same class ratio, and seed the split so it is repeatable.
test_size = .3
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=test_size,
                                                    stratify=y,
                                                    random_state=7)

# Seed the tree as well: an unseeded DecisionTreeClassifier can break ties
# between equally good splits differently on each run, which would make the
# set of features RFE keeps change from run to run.
estimator = DecisionTreeClassifier(random_state=7)

# n_features_to_select is passed by keyword because recent scikit-learn
# releases reject it as a positional argument.
selector = RFE(estimator, n_features_to_select=4, step=1)  # select 4 features
selector = selector.fit(X_train, y_train)

In [None]:
# Boolean mask over the input features: True where RFE kept the feature.
selector.support_ 

In [None]:
# Rank of every input feature; the selected features are assigned rank 1.
selector.ranking_

In [None]:
# Number of features the fitted selector retained.
selector.n_features_

In [None]:
# Apply the boolean support mask to the column names to get the kept features.
sel_cols = np.asarray(cols)[selector.support_]
sel_cols

In [None]:
# selector.transform() returns a bare ndarray, so wrap it back into a DataFrame
# with the kept column names. Pass the original row index through as well:
# without it the rows are renumbered 0..n-1 and no longer line up label-wise
# with y_train / y_test, which still carry the index from the split.
X_train_sel = pd.DataFrame(selector.transform(X_train),
                           columns=np.array(cols)[selector.support_],
                           index=X_train.index)
X_test_sel = pd.DataFrame(selector.transform(X_test),
                          columns=np.array(cols)[selector.support_],
                          index=X_test.index)

In [None]:
# Preview the reduced training set (RFE-selected columns only).
X_train_sel.head()

In [None]:
# Preview the reduced test set (RFE-selected columns only).
X_test_sel.head()

### run model with selected features

In [None]:
# Train a fresh tree on the selected features only. The seed is fixed so the
# fitted tree, and therefore the score compared against the all-features
# model, is the same on every run.
model_sel = DecisionTreeClassifier(random_state=7)
model_sel.fit(X_train_sel, y_train)

In [None]:
# Mean accuracy of the selected-features model on the held-out test set.
score_sel = model_sel.score(X_test_sel, y_test)
score_sel

In [None]:
# Importance the tree assigned to each retained feature, labelled by column
# name and sorted in ascending order.
fs = pd.Series(model_sel.feature_importances_, index=X_train_sel.columns)
fs = fs.sort_values(ascending=True)
fs

In [None]:
# Horizontal bar chart of the importances; the trailing ';' keeps the
# notebook from echoing the returned Axes object.
fs.plot(kind='barh');

### run model with all features

In [None]:
# Preview the full training set (all eight feature columns).
X_train.head()

In [None]:
# Baseline: the same kind of tree trained on every feature. The seed is fixed
# so the fitted tree, and therefore the score compared against the
# selected-features model, is the same on every run.
model = DecisionTreeClassifier(random_state=7)
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the all-features model on the held-out test set.
score = model.score(X_test, y_test)
score

In [None]:
# Predicted class labels for the test rows, reused by the metric cells that follow.
y_pred=model.predict(X_test)

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision, recall, F1 and support as a text table.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Importance the all-features tree assigned to each column, labelled by
# column name and sorted in ascending order.
fs = pd.Series(model.feature_importances_, index=X.columns)
fs = fs.sort_values(ascending=True)
fs

In [None]:
# Horizontal bar chart of the all-features importances, drawn in orange;
# the trailing ';' keeps the notebook from echoing the returned Axes object.
fs.plot(kind='barh', color='orange');

---