# Feature Selection

In [3]:
import pandas as pd
import numpy as np
import sklearn.feature_selection as selection
from sklearn.ensemble import RandomForestClassifier

In [16]:
glass_data = pd.read_csv('data/dati/glass.data.csv')

In [17]:
glass_data.head()

Unnamed: 0,Id number,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [12]:
glass_data.drop('Id number', axis=1,inplace=True)

### Recursive Feature Elimination

The Recursive Feature Elimination (RFE) method is a feature selection approach. It works by recursively removing attributes and building a model on those attributes that remain. It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute.

In [5]:
rfc = RandomForestClassifier()
rfe = selection.RFE(rfc, n_features_to_select=3)

In [13]:
rfe = rfe.fit(glass_data.drop('Type of glass',axis=1), glass_data['Type of glass'])

In [14]:
print(rfe.support_)
print(rfe.ranking_)

[ True False False  True False False  True False False]
[1 3 2 1 5 4 1 6 7]


### Feature importance Decision Trees

Methods that use ensembles of decision trees (like Random Forest or Extra Trees) can also compute the relative importance of each attribute. These importance values can be used to inform a feature selection process.

In [18]:
rfc = rfc.fit(glass_data.drop('Type of glass',axis=1), glass_data['Type of glass'])

In [21]:
rfc.feature_importances_

array([ 0.43447702,  0.11154621,  0.06410203,  0.09441517,  0.05751135,
        0.03918151,  0.05647195,  0.03562611,  0.09973989,  0.00692876])

### Univariate feature selection

In [37]:
X = glass_data.drop('Type of glass',axis=1)
y = glass_data['Type of glass']
X.shape

(214, 10)

In [35]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [36]:
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
X_new.shape

(214, 2)

In [2]:
from sklearn.feature_selection import SelectPercentile, f_classif

In [None]:
selector = SelectPercentile(f_classif, percentile=50)
selector.fit(X, y)
selector.get_support()