### StepForward,StepBackward,Exhaustive Selection

In [2]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.17.2-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.17.2


In [3]:
import mlxtend

In [7]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [27]:
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler

In [28]:
data = load_wine()

In [29]:
X = pd.DataFrame(data.data,columns = data.feature_names)

In [30]:
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


## Tree based algorithm doesnt require standardization and normalization

In [32]:
y=pd.DataFrame(data.target)

In [33]:
y

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
173,2
174,2
175,2
176,2


In [34]:
X.isnull().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64

In [35]:
X_train ,X_test ,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [36]:
X_train.shape,X_test.shape

((142, 13), (36, 13))

###  Step Forward Selection (SFS)

In [44]:
sfs = SFS(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1),
          k_features=9,
          forward = True,
          floating = False,
          verbose =2,
          scoring = 'accuracy',
          cv = 4,
          n_jobs=-1         
         ).fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    7.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    7.2s finished

[2020-04-05 13:02:26] Features: 1/9 -- score: 0.7674603174603174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    5.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    5.8s finished

[2020-04-05 13:02:32] Features: 2/9 -- score: 0.9718253968253968[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    5.1s finished

[2020-04-05 13:02:38] Features: 3/9 -- score: 0.9859126984126985[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.8s finished

[2020-04-05 13:02:43] Features: 4/9 -- score: 0.978968

In [45]:
sfs.k_feature_names_

('alcohol',
 'ash',
 'magnesium',
 'flavanoids',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline')

In [46]:
sfs.k_feature_idx_

(0, 2, 4, 6, 8, 9, 10, 11, 12)

In [47]:
sfs.k_score_

0.9861111111111112

In [48]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(6,)","[0.7222222222222222, 0.8333333333333334, 0.742...",0.76746,"(flavanoids,)",0.0670901,0.0418533,0.024164
2,"(6, 9)","[0.9444444444444444, 1.0, 0.9714285714285714, ...",0.971825,"(flavanoids, color_intensity)",0.031492,0.0196459,0.0113425
3,"(4, 6, 9)","[0.9722222222222222, 1.0, 0.9714285714285714, ...",0.985913,"(magnesium, flavanoids, color_intensity)",0.0225862,0.0140901,0.00813492
4,"(4, 6, 9, 12)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(magnesium, flavanoids, color_intensity, proline)",0.0194714,0.012147,0.00701308
5,"(2, 4, 6, 9, 12)","[0.9444444444444444, 0.9722222222222222, 0.971...",0.972024,"(ash, magnesium, flavanoids, color_intensity, ...",0.0314903,0.0196449,0.011342
6,"(2, 4, 6, 8, 9, 12)","[0.9722222222222222, 0.9722222222222222, 0.971...",0.978968,"(ash, magnesium, flavanoids, proanthocyanins, ...",0.0194714,0.012147,0.00701308
7,"(0, 2, 4, 6, 8, 9, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.979167,"(alcohol, ash, magnesium, flavanoids, proantho...",0.0369201,0.0230321,0.0132976
8,"(0, 2, 4, 6, 8, 9, 11, 12)","[0.9444444444444444, 0.9722222222222222, 1.0, ...",0.979167,"(alcohol, ash, magnesium, flavanoids, proantho...",0.0369201,0.0230321,0.0132976
9,"(0, 2, 4, 6, 8, 9, 10, 11, 12)","[0.9722222222222222, 0.9722222222222222, 1.0, ...",0.986111,"(alcohol, ash, magnesium, flavanoids, proantho...",0.0222636,0.0138889,0.00801875


## Select programitically

In [49]:
sfs = SFS(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1),
          k_features=(1,9),
          forward = True,
          floating = False,
          verbose =2,
          scoring = 'accuracy',
          cv = 4,
          n_jobs=-1         
         ).fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   10.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   10.0s finished

[2020-04-05 13:28:57] Features: 1/9 -- score: 0.7674603174603174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    6.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    6.0s finished

[2020-04-05 13:29:03] Features: 2/9 -- score: 0.9718253968253968[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    6.3s finished

[2020-04-05 13:29:10] Features: 3/9 -- score: 0.9859126984126985[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.6s finished

[2020-04-05 13:29:17] Features: 4/9 -- score: 0.978968

In [50]:
sfs.k_score_

0.9861111111111112

In [51]:
sfs.k_feature_names_

('alcohol',
 'ash',
 'magnesium',
 'flavanoids',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline')

## Step Backward Feature Selection

In [54]:
sfs = SFS(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1),
          k_features=(1,9),
          forward = False,
          floating = False,
          verbose =2,
          scoring = 'accuracy',
          cv = 4,
          n_jobs=-1         
         ).fit(X_train,y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    6.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:    6.3s finished

[2020-04-05 13:32:25] Features: 12/1 -- score: 0.9861111111111112[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    4.9s finished

[2020-04-05 13:32:30] Features: 11/1 -- score: 0.9861111111111112[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    5.3s finished

[2020-04-05 13:32:36] Features: 10/1 -- score: 0.9791666666666666[Parall

In [55]:
sfs.k_features

(1, 9)

In [56]:
sfs.k_score_

0.9861111111111112

In [57]:
sfs.k_feature_names_

('alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'color_intensity')

## Exhaustive Feature Selection

In [58]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [60]:
%%time
efs = EFS(RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1),
         min_features = 4,
         max_features = 5,
         scoring = 'accuracy',
         cv=None,
         n_jobs=-1).fit(X_train,y_train)


Features: 2002/2002

Wall time: 4min 16s


In [61]:
efs.best_feature_names_

('alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash')

In [62]:
efs.best_score_

1.0

In [63]:
efs.best_idx_

(0, 1, 2, 3)

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plt_sfs
plt_sfs(efs.get_metric_dict())