In [1]:
#sklearn 特征选取
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
# chi2适用于分类任务
from sklearn.feature_selection import chi2

# url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
url = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv(url, names=names, header=None)

narr = data.values
X = narr[:, 0:8]
y = narr[:, 8]

# 使用chi2作为度量函数，选取前四个特征
skb= SelectKBest(score_func=chi2, k=4)

"""
score_func 的可选值有：

f_classif:  使用label和feature之间方差分析的F值作为选取准则，适用于分类
mutual_info_classif: 使用离散target的互信息作为选取准则，适用于分类.
chi2: 使用非负features的卡方测试作为选取准则，适用于分类.
f_regression:  使用label和feature的F值作为选取准则，适用于回归.
mutual_info_regression: 使用连续变量的互信息作为选取准则，适用于回归.
SelectPercentile: Select features based on percentile of the highest scores.
SelectFpr: Select features based on a false positive rate test.
SelectFdr: Select features based on an estimated false discovery rate.
SelectFwe: Select features based on family-wise error rate.
GenericUnivariateSelect: Univariate feature selector with configurable mode.

"""
fit =  skb.fit(X, y)
# 查看各个feature对应的score, score越高，该feature越重要
print(fit.scores_)
# 获取筛选之后的features
X_selected =  fit.transform(X)
# 执行到这一步之后，特征选取过程也就基本结束。

[ 111.51969064 1411.88704064   17.60537322   53.10803984 2175.56527292
  127.66934333    5.39268155  181.30368904]


In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.datasets import load_iris 


iris = load_iris()
X = iris.data
y = iris.target
logreg = LogisticRegression()
# 选取两个特征
rfe = RFE(logreg, 2)
fit = rfe.fit(X, y)

# fit.support_返回一个True/False 组成的list, 每个元素与每一个特征对应，元素为True时，
# 表示该特征被选取了
print(fit.support_)
# 返回一个由integer组成的list，值为1表示对应的特征被选取了
print(fit.ranking_)
# 打印出选出的特征
print([iris.feature_names[index] for index, ret in  enumerate(fit.support_) if ret])


[False  True False  True]
[3 1 2 1]
['sepal width (cm)', 'petal width (cm)']




In [3]:
from sklearn.decomposition import PCA

# 选取4个主要成分，其实PCA是特殊的SVD
pca = PCA(n_components=4)
fit = pca.fit(X, y)
print("每个主成分的方差: %s" % fit.explained_variance_ratio_)
# 打印出变换矩阵
print(fit.components_)
# 得到PCA变换的之后数据
print(fit.components_.dot(X.T).T)

每个主成分的方差: [0.92461872 0.05306648 0.01710261 0.00521218]
[[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
 [ 0.65658877  0.73016143 -0.17337266 -0.07548102]
 [-0.58202985  0.59791083  0.07623608  0.54583143]
 [-0.31548719  0.3197231   0.47983899 -0.75365743]]
[[ 2.81823951e+00  5.64634982e+00 -6.59767544e-01  3.10892758e-02]
 [ 2.78822345e+00  5.14995135e+00 -8.42316989e-01 -6.56748375e-02]
 [ 2.61337456e+00  5.18200315e+00 -6.13952460e-01  1.33833231e-02]
 [ 2.75702228e+00  5.00865360e+00 -6.00293343e-01  1.08927529e-01]
 [ 2.77364860e+00  5.65370709e+00 -5.41773476e-01  9.46103054e-02]
 [ 3.22150550e+00  6.06828303e+00 -4.63175058e-01  5.75525704e-02]
 [ 2.68182738e+00  5.23749119e+00 -3.73960558e-01  8.14948193e-02]
 [ 2.87622016e+00  5.49033754e+00 -6.53732034e-01  7.86495834e-02]
 [ 2.61598240e+00  4.74864082e+00 -6.11093146e-01  6.00964486e-02]
 [ 2.82960933e+00  5.21317833e+00 -8.29485441e-01  8.96471141e-02]
 [ 2.99541804e+00  5.97202148e+00 -7.07170726e-01  4.83716373e-02]
 [

In [6]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

tr = ExtraTreesClassifier()
fit = tr.fit(X, y)
# 打印出每个feature的重要性score
print(fit.feature_importances_)
# 打印出Top 2 important features
d  = list(enumerate(fit.feature_importances_))
d.sort(key = lambda a:a[1], reverse=True)
print([iris.feature_names[item[0]] for index, item in enumerate(d) if index < 2])

[0.04959334 0.05810149 0.27501935 0.61728582]
['petal width (cm)', 'petal length (cm)']


