In [2]:
# Univariate Selection
# Score every input column against the class label with f_classif
# (ANOVA F-value) and keep the four highest-scoring columns.
from numpy import set_printoptions
from pandas import read_csv
from sklearn.feature_selection import SelectKBest, f_classif

# load data; column names are supplied explicitly through `names`
url = 'https://goo.gl/bDdBiA'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# the first eight columns are the inputs, the ninth is the target
X, y = array[:, :8], array[:, 8]

# feature extraction
test = SelectKBest(score_func=f_classif, k=4)
fit = test.fit(X, y)

# summarize scores (one value per input column, printed to 3 decimals)
set_printoptions(precision=3)
print(fit.scores_)

# summarize selected features: first five rows of the reduced matrix
features = fit.transform(X)
print(features[:5, :])


[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]
[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 [  1.   89.   28.1  21. ]
 [  0.  137.   43.1  33. ]]


In [None]:
# Feature selection with RFE (recursive feature elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data; column names are supplied explicitly through `names`
url = 'https://goo.gl/bDdBiA'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# the first eight columns are the inputs, the ninth is the target
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = LogisticRegression(solver='liblinear')
# Keep the selector under its own lowercase name: assigning it to `RFE`
# would replace the imported class with an instance, and any later
# `RFE(...)` call in the same kernel would fail with
# "TypeError: 'RFE' object is not callable".
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# support_ is a boolean mask over the input columns; ranking_ is 1 for
# every selected column and grows for columns eliminated earlier
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)


Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


### Notes
Notice that the RFE code block above only calls the `fit` method (the fit-and-multiple-transform idiom). We haven't called the `transform` method yet because we're simply at the feature selection stage. RFE is a method that recursively eliminates features: it repeatedly fits the model to the training set and discards the weakest features, to see which ones most influence the output variable (`class`, in the case of the Logistic Regression model above). Feature selection of this kind is great for reducing overfitting, improving accuracy, and reducing training time.



In [4]:
# Feature selection with RFE, this time keeping five features
# Re-import the class here so this cell does not depend on whatever the
# name `RFE` was last bound to in the running kernel.
from sklearn.feature_selection import RFE
model = LogisticRegression(solver='liblinear')
# Bind the selector to `rfe`, not `RFE`: reusing the class name would
# overwrite the class with an instance and make the next `RFE(...)` call
# raise "TypeError: 'RFE' object is not callable".
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X, Y)
# print RFE rankings
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 5
Selected Features: [ True  True  True False False  True  True False]
Feature Ranking: [1 1 1 3 4 1 1 2]


### Principal Component Analysis (PCA)

PCA is a dimensionality-reduction technique: it transforms the data into a compressed form.

You can choose the number of components in the transformed result.

In [7]:
# PCA for feature extraction: reduce the inputs to three components
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
# only X is passed here; no target argument is given to fit
fit = pca.fit(X)

# Summarize components
# fraction of the total variance attributed to each retained component
variance_ratio = fit.explained_variance_ratio_
print("Explained Variance: %s" % variance_ratio)
# one row per component, one column per input feature
print(fit.components_)

Explained Variance: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [ 2.265e-02  9.722e-01  1.419e-01 -5.786e-02 -9.463e-02  4.697e-02
   8.168e-04  1.402e-01]
 [ 2.246e-02 -1.434e-01  9.225e-01  3.070e-01 -2.098e-02  1.324e-01
   6.400e-04  1.255e-01]]


In [8]:
# Feature importance estimated by an ExtraTreesClassifier ensemble
from sklearn.ensemble import ExtraTreesClassifier

# feature extraction
# random_state is fixed so repeated runs give the same importances
model = ExtraTreesClassifier(n_estimators=100, random_state=0)
fit = model.fit(X, Y)

# one importance value per input column
importances = fit.feature_importances_
print(importances)


[0.109 0.232 0.099 0.082 0.077 0.141 0.118 0.14 ]
