https://machinelearningmastery.com/feature-selection-machine-learning-python/

## 1. Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)

In [20]:
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
df = pandas.read_csv(url, names=names)
array = df.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
# summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5,:])

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


In [21]:
print(df.shape)
df.head()

(768, 9)


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [30]:
df.dtypes

preg       int64
plas       int64
pres       int64
skin       int64
test       int64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object

In [28]:
from scipy.stats import ks_2samp

In [29]:
df['class'].value_counts()

0    500
1    268
Name: class, dtype: int64

In [54]:
f = 'plas'
f = 'preg'
f = 'pedi'
s0 = df[ df['class']==0 ][f]
s1 = df[ df['class']==1 ][f]
compare = ks_2samp(s0, s1)
print(compare)

Ks_2sampResult(statistic=0.1701492537313433, pvalue=6.841012156844073e-05)


In [60]:
f = 'plas'
#f = 'preg'
#f = 'pedi'
from scipy.stats import chi2_contingency
from scipy.stats import chi2 as chi2_stats

df_tmp = df[[f, 'class']].copy()
df_tmp['one'] = 1
dfc = df_tmp.groupby([f, 'class']).count().reset_index()
dfcp = dfc.pivot(index=f, columns='class', values='one').fillna(0)
table = dfcp.values.tolist()

stat, p, dof, expected = chi2_contingency(table)

# interpret test-statistic
threshold = 0.95
critical = chi2_stats.ppf(threshold, dof)

In [61]:
print(stat, p, dof, critical)

269.7332418198132 5.1045667798171764e-11 135 163.11610079386037


In [62]:
print(dfcp.shape)
dfcp

(136, 2)


class,0,1
plas,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3.0,2.0
44,1.0,0.0
56,1.0,0.0
57,2.0,0.0
61,1.0,0.0
62,1.0,0.0
65,1.0,0.0
67,1.0,0.0
68,3.0,0.0
71,4.0,0.0


In [43]:
df[df['class'] == 1]['age'].value_counts()

25    14
31    13
29    13
41    13
43    11
22    11
38    10
28    10
36    10
33    10
32     9
45     8
24     8
27     8
26     8
23     7
42     7
46     7
52     7
30     6
37     6
40     6
35     5
21     5
50     5
51     5
44     5
54     4
47     4
34     4
53     4
39     3
49     3
58     3
60     2
66     2
62     2
56     2
59     2
61     1
67     1
57     1
55     1
48     1
70     1
Name: age, dtype: int64

## 2. Feature Extraction with RFE

In [65]:
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]




In [66]:
fit

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='warn', n_jobs=None, penalty='l2',
                                 random_state=None, solver='warn', tol=0.0001,
                                 verbose=0, warm_start=False),
    n_features_to_select=3, step=1, verbose=0)

In [67]:
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## 3. Principal Component Analysis

In [69]:
import numpy
from pandas import read_csv
from sklearn.decomposition import PCA
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


## 4. Feature Importance with Extra Trees Classifier

In [70]:
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)

[0.108 0.211 0.097 0.082 0.088 0.154 0.12  0.14 ]


