## 1 通过卡方检验选定数据

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
filename = 'pima_data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv(filename, names=names)

In [3]:
data.head(5)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
array = data.values
x = array[:, :8]
y = array[:, 8]
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(x, y)
np.set_printoptions(precision=3)
print (fit.scores_)
features = fit.transform(x)
print (features)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 ...
 [121.  112.   26.2  30. ]
 [126.    0.   30.1  47. ]
 [ 93.    0.   30.4  23. ]]


## 2 通过递归消除（RFE）来选定特征

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [8]:
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(x, y)
print ("feature num:{}".format(fit.n_features_))
print ("selected feature:{}".format(fit.support_))
print ("feature rank:{}".format(fit.ranking_))

feature num:3
selected feature:[ True False False False False  True  True False]
feature rank:[1 2 4 5 6 1 1 3]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## 3 通过主要成分分析选定数据特征

In [9]:
from sklearn.decomposition import PCA

In [10]:
pca = PCA(n_components=3)
fit = pca.fit(x)
print ("explained var: %s" % fit.explained_variance_ratio_)
print (fit.components_)

explained var: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


## 4 通过决策树计算特征的重要性

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

In [14]:
model = ExtraTreesClassifier()
fit = model.fit(x, y)
print (fit.feature_importances_)

[0.107 0.229 0.101 0.079 0.074 0.141 0.122 0.147]
