# What is Recursive Feauture Elimination?

<div  style="color:blue;font-family:Candara,arial,helvetica;line-height:20px"><strong>

## Recursive feature elimination (RFE) is a feature selection method that fits a model and removes the weakest feature (or features) until the specified number of features is reached.

<img src="https://topepo.github.io/caret/premade/Algo1.png" alt="drawing" width="600" height="300"/>  

</strong></div> 

# Build Production Model without RFE

In [2]:
# -----------------------------------------------------------------
# Implement Recursive Feature Elimination.
# Predict product purchase for the Bank Telemarketing dataset
# -----------------------------------------------------------------

# Import libraries
import pandas as pd

# Read the file
f = pd.read_csv('bank.csv')
f = f.drop("duration", axis = 1)

display(f)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [5]:
# Split the columns into Dependent (Y) and independent (X) features
x = f.iloc[:,:-1]
y = f.iloc[:, -1]


# Create dummy variables
x = pd.get_dummies(x, drop_first=True)
y = pd.get_dummies(y, drop_first=True)


# Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = \
train_test_split(x, y, test_size = 0.3, random_state = 1234, stratify=y)

# Import Randon Forest Classifier
from sklearn.ensemble import RandomForestClassifier 

# Default Random Forest Object
rfc1 = RandomForestClassifier(random_state=1234)
rfc1.fit(X_train, Y_train)
Y_predict1 = rfc1.predict(X_test)


# Score and Evaluate the model 
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(Y_test, Y_predict1)
score1 = rfc1.score(X_test, Y_test)




In [6]:
print(cm1)
print(score1)

[[10653   312]
 [ 1031   361]]
0.8913166626203771


# Build Production Model without RFE

In [7]:
# Apply Recursive Feature Elimination
from sklearn.feature_selection import RFE
rfc2 = RandomForestClassifier(random_state=1234)

# Create an RFE selector object using RFC as an estimator
rfe = RFE(estimator=rfc2, n_features_to_select=30, step=1)

# Fit the data to the rfe selector
rfe.fit(x, y)

# Create new Train and Test datasets
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Fit the Random Forest classifier to the new train and test with 80 features
rfc2.fit(X_train_rfe, Y_train)

# Test the model with new Test dataset
Y_predict = rfc2.predict(X_test_rfe)

# Score and Evaluate the new model 
from sklearn.metrics import confusion_matrix
cm_rfe = confusion_matrix(Y_test, Y_predict)
score_rfe = rfc2.score(X_test_rfe, Y_test)

  y = column_or_1d(y, warn=True)
  app.launch_new_instance()


In [8]:
print(cm_rfe)
print(score_rfe)

[[10640   325]
 [ 1027   365]]
0.8905883305009307


# Get Column detials

In [9]:
# Get column names
columns = list(x.columns)

# Get the ranking of the features. Ranking 1 for selected features
ranking = rfe.ranking_

# Get the feature importance scores
feature_importance = rfc1.feature_importances_

# Create the dataframe of the Features selected, Ranking and their importance
rfe_selected = pd.DataFrame()
rfe_selected = pd.concat([pd.DataFrame(columns), 
                          pd.DataFrame(ranking),
                          pd.DataFrame(feature_importance)], axis=1)

rfe_selected.columns = ["Feature Name", "Ranking", "Feature Importance"]

In [10]:
display(rfe_selected)

Unnamed: 0,Feature Name,Ranking,Feature Importance
0,age,1,0.1733147
1,campaign,1,0.08701761
2,pdays,1,0.02126111
3,previous,1,0.0151896
4,emp.var.rate,1,0.03259009
5,cons.price.idx,1,0.01477479
6,cons.conf.idx,1,0.02813089
7,euribor3m,1,0.1136265
8,nr.employed,1,0.06261372
9,job_blue-collar,1,0.0150574
