# 特征工程

## 特征提取

## 特征创造

## 特征选择

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
import matplotlib.pyplot as plt


In [None]:
# Load the training table: column 0 is used as the label, all remaining columns as features.
data = pd.read_csv('train.csv')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]
# Baseline: mean 5-fold cross-validation score of a 10-tree random forest on every feature.
cross_val_score(RFC(n_estimators=10, random_state=0), X, Y, cv = 5).mean()

0.9373571428571429

In [8]:
X.shape
# There are a lot of features (second entry of the shape)

(42000, 784)

### 过滤法 

一般先方差过滤，再互信息法过滤

#### 方差过滤

消除方差小的特征  
主要用于会遍历所有特征的算法  
对随机森林没用，对决策树有用

In [None]:
from sklearn.feature_selection import VarianceThreshold


In [34]:
# VarianceThreshold() with no argument keeps its default threshold, which drops zero-variance columns.
X_var0 = VarianceThreshold().fit_transform(X)
# Re-score the same 10-tree forest on the reduced feature matrix.
cross_val_score(RFC(n_estimators=10, random_state=0), X_var0, Y, cv = 5).mean()

0.9370238095238095

In [17]:
X_var0.shape
# The smaller column count shows that the zero-variance features have been removed

(42000, 708)

找到方差中位数，删除一半特征

In [21]:
# Median of the per-column variances of X; used in the next cell as the filtering threshold.
np.median(X.var().values)

1352.286703180131

In [35]:
# Use the median variance as the threshold, which removes about half of the features.
X_var_median = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
# Re-score the same 10-tree forest on the remaining features.
cross_val_score(RFC(n_estimators=10, random_state=0), X_var_median, Y, cv = 5).mean()

0.9390476190476191

In [24]:
# Shape after median-variance filtering.
X_var_median.shape

(42000, 392)

In [25]:
# If a feature is a Bernoulli random variable (every value is either 1 or 0), its variance
# is p * (1 - p); 0.8 * 0.2 is that variance at p = 0.8, so the threshold targets features
# where one of the two values covers more than 80% of the samples.
X_bvar = VarianceThreshold(0.8 * 0.2).fit_transform(X)
X_bvar.shape

(42000, 685)

#### 相关性过滤

用下面三种方法检验特征与标签之间的相关性

##### 卡方过滤$\chi^2$

专门针对离散型标签（即分类问题）的相关性过滤。

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


In [31]:
# Start from the median-variance-filtered data X_var_median
# and keep the 300 features with the highest chi-square scores against the label.
X_chi2 = SelectKBest(chi2, k = 300).fit_transform(X_var_median, Y)
X_chi2.shape 

(42000, 300)

In [33]:
# Check how the model performs on the chi-square-selected features
cross_val_score(RFC(n_estimators=10, random_state=0), X_chi2, Y, cv = 5).mean()

0.9344761904761905

In [None]:
# The score actually dropped, which means k = 300 discarded some useful features.
# Pick k with a learning curve instead: score the forest for k = 390, 380, ..., 210.
k_grid = range(390, 200, -10)
score = [
    cross_val_score(
        RFC(n_estimators=10, random_state=0),
        SelectKBest(chi2, k=n_keep).fit_transform(X_var_median, Y),
        Y,
        cv=5,
    ).mean()
    for n_keep in k_grid
]

# Plot the mean CV score against the number of retained features.
plt.plot(k_grid, score)
plt.show()

# Takes too long, so the run was abandoned

KeyboardInterrupt: 

In [40]:
# Select features by p-value instead of a fixed k.
# Reading of the p-values used here:
#   p < 0.05 -> the feature and the label are treated as related
#   p > 0.05 -> the feature and the label are treated as independent
'''
p<0.05 两组数据就相关
p>0.05 两组数据独立
'''
# chi2 returns the chi-square statistic and the p-value of every feature.
chi2value, pvalue = chi2(X_var_median, Y)

In [None]:
# Number of features to keep: total count minus those whose chi-square p-value is above 0.05.
k = chi2value.shape[0] - (pvalue > 0.05).sum()
k # Every feature that survived the median filter is related to the label

392

##### F检验(ANOVA)

只能捕捉线性关系

In [42]:
from sklearn.feature_selection import f_classif

In [43]:
# ANOVA F-test of every median-filtered feature against the label:
# F holds the F statistics, pvalue_f the matching p-values (displayed below).
F, pvalue_f = f_classif(X_var_median, Y)
pvalue_f

array([0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 4.71193533e-220,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0

In [None]:
# Number of features to keep: total count minus those whose F-test p-value is above 0.05.
k = F.shape[0] - (pvalue_f > 0.05).sum()
k
# The F-test cannot usefully remove any more features either

392

##### 互信息法

捕捉标签和特征之间的任意关系（线性非线性）  
互信息的估计值为非负数：0表示两个变量相互独立，值越大表示依赖关系越强

In [50]:
from sklearn.feature_selection import mutual_info_classif as MIC

In [51]:
# Estimate the mutual information between each median-filtered feature and the label.
result = MIC(X_var_median, Y)
result

array([0.07390524, 0.08637313, 0.10037046, 0.10916775, 0.11600727,
       0.10309818, 0.07901149, 0.05761935, 0.07532323, 0.0936405 ,
       0.12171314, 0.13828053, 0.15673188, 0.16713907, 0.15095287,
       0.1310024 , 0.09137681, 0.06009781, 0.03932831, 0.02759602,
       0.06897024, 0.09732816, 0.12323781, 0.15359126, 0.1718669 ,
       0.20387153, 0.22567108, 0.23848653, 0.22131858, 0.17874015,
       0.14178139, 0.10172765, 0.07732508, 0.06751436, 0.04470299,
       0.0338991 , 0.0521822 , 0.07372836, 0.10296574, 0.12012267,
       0.13596952, 0.15084789, 0.1630685 , 0.17877314, 0.1864663 ,
       0.16910898, 0.15560988, 0.13239599, 0.11071021, 0.09723356,
       0.07607583, 0.06146645, 0.06635341, 0.04801223, 0.07006331,
       0.07714146, 0.087065  , 0.09388751, 0.10774141, 0.10857233,
       0.11033953, 0.11573556, 0.11559478, 0.11622077, 0.1123884 ,
       0.1150373 , 0.11087119, 0.1073912 , 0.09770074, 0.07787573,
       0.07602304, 0.04019894, 0.0525671 , 0.06472279, 0.07592

In [53]:
# Number of features to keep: total count minus those whose estimated mutual
# information with the label is not positive.
# The ndarray .sum() method is used, matching the chi-square and F-test cells; the
# built-in sum() would walk the boolean array element by element in Python.
k = result.shape[0] - (result <= 0).sum()
k
# So every feature is related to the label

392

### 嵌入法Embedded

In [None]:
# Parameter notes (presumably for sklearn's SelectFromModel, imported in the next cell):
#   estimator  - the model used as the evaluator
#   threshold  - listed without a description
#   prefit     - defaults to False; whether an already-instantiated model is passed
#                directly to the constructor
#   norm_order - listed without a description
'''
参数：
estimator：使用的模型评估器
threshold
prefit：默认False，判断是否将实例化后的模型直接传递给构造函数
norm_order
'''

In [54]:
from sklearn.feature_selection import SelectFromModel

In [56]:
# Shared 10-tree random forest with a fixed seed, reused by the selection and scoring cells below.
rfc = RFC(n_estimators=10, random_state=0)

In [None]:
# Run on the original, unfiltered data X:
# fit the forest and keep the features selected at an importance threshold of 0.005.
X_embedded = SelectFromModel(rfc, threshold=0.005).fit_transform(X, Y)

In [58]:
# Shape after embedded selection at threshold 0.005.
X_embedded.shape

(42000, 47)

In [None]:
# Find the importance threshold with a learning curve.
# (This cell was not executed.)
# 20 evenly spaced candidates between 0 and the largest feature importance of the forest.
rfc.fit(X, Y)
threshold = np.linspace(0, rfc.feature_importances_.max(), 20)


def _cv_score_at(cutoff):
    """Mean 5-fold CV score of rfc on the features selected at importance >= cutoff."""
    kept = SelectFromModel(rfc, threshold=cutoff).fit_transform(X, Y)
    return cross_val_score(rfc, kept, Y, cv=5).mean()


score = list(map(_cv_score_at, threshold))

# Plot the mean CV score against the candidate threshold.
plt.plot(threshold, score)
plt.show()

In [59]:
# Re-select with threshold 0.00067 (presumably read off the learning curve) and check the shape.
X_embedded = SelectFromModel(rfc, threshold=0.00067).fit_transform(X, Y)
X_embedded.shape

(42000, 324)

In [61]:
# Mean 5-fold CV score of the shared forest on the current embedded selection.
cross_val_score(rfc, X_embedded, Y, cv = 5).mean()

0.9391190476190475

In [None]:
# Refine the threshold with a second, finer learning curve.
# (This cell was not executed.)
# The grid is built once and shared by the loop and the plot, so the thresholds that
# are scored are exactly the ones drawn on the x axis. Its upper bound is 0.00134:
# that is twice the coarse optimum 0.00067 and contains the reported best value 0.000564.
fine_thresholds = np.linspace(0, 0.00134, 20)

score = []
for cutoff in fine_thresholds:
    # Keep the features whose importance reaches the cutoff, then score the forest on them.
    x_embedded = SelectFromModel(rfc, threshold=cutoff).fit_transform(X, Y)
    score.append(cross_val_score(rfc, x_embedded, Y, cv=5).mean())

plt.plot(fine_thresholds, score)
plt.show()
# 0.000564 turned out to give the best result

In [62]:
# Re-select with the refined threshold 0.000564 and check the resulting shape.
X_embedded = SelectFromModel(rfc, threshold=0.000564).fit_transform(X, Y)
X_embedded.shape

(42000, 340)

In [None]:
# Mean 5-fold CV score of the shared forest on the features selected at threshold 0.000564.
cross_val_score(rfc, X_embedded, Y, cv = 5).mean()

0.9392857142857144

In [64]:
# From here the model itself can be tuned, e.g. by adjusting n_estimators
# (100 trees instead of 10 on the same selected features).
cross_val_score(RFC(n_estimators=100, random_state=0), X_embedded, Y, cv = 5).mean()

0.9634285714285715

### 包装法

计算量介于过滤法和嵌入法之间

In [65]:
# Notes on RFE (imported just below): constructor parameters and fitted attributes.
#   estimator            - the model driving the elimination
#   n_features_to_select - number of features to keep
#   step                 - how many features to remove in each iteration
#   support_             - boolean mask telling which features were selected
#   ranking_             - importance ranking of the features across the iterations
'''
参数：
estimators
n_features_to_select：想要选择的特征个数
step：每次迭代中希望移除的特征
support_：返回特征是否被选中的布尔矩阵
ranking：返回特征在迭代中的重要性
'''
from sklearn.feature_selection import RFE

In [66]:
# Recursive feature elimination with the shared forest: remove 50 features per iteration until 340 remain.
selector = RFE(rfc, n_features_to_select=340, step=50).fit(X, Y)

In [67]:
# Count the True entries of the selection mask, i.e. how many features were kept.
selector.support_.sum()

340

In [68]:
# Per-feature ranking from the elimination; per sklearn's RFE documentation, selected features get rank 1.
selector.ranking_

array([10,  9,  8,  7,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  6,  6,
        5,  6,  5,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,  6,  7,  7,
        7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  5,  4,
        4,  5,  3,  4,  4,  4,  5,  4,  5,  7,  6,  7,  7,  7,  8,  8,  8,
        8,  8,  8,  8,  8,  6,  7,  4,  3,  1,  2,  3,  3,  1,  1,  1,  1,
        1,  3,  3,  4,  5,  5,  5,  8,  8,  9,  9,  9,  9,  8,  9,  9,  4,
        4,  3,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  4,
        5,  5,  9,  9, 10, 10, 10, 10,  7,  4,  4,  3,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  3,  5,  8, 10, 10, 10,
       10,  9,  4,  4,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  3,  4, 10, 10, 10, 10,  9,  7,  4,  3,  2,  2,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        4,  4, 10,  9, 10

In [69]:
# Reduce X to the features the fitted RFE selector kept.
X_wrapper = selector.transform(X)

In [70]:
# Mean 5-fold CV score of the shared forest on the RFE-selected features.
cross_val_score(rfc, X_wrapper, Y, cv = 5).mean()

0.9379761904761905

In [None]:
# A learning curve can be used here as well.
# (Not run.)
# Candidate numbers of features to keep: 1, 51, 101, ..., 701.
feature_counts = range(1, 751, 50)

score = []
for n_keep in feature_counts:
    # Eliminate 50 features per iteration down to n_keep, then score the forest on the survivors.
    reduced = RFE(rfc, n_features_to_select=n_keep, step=50).fit_transform(X, Y)
    score.append(cross_val_score(rfc, reduced, Y, cv=5).mean())

plt.plot(feature_counts, score)
plt.show()

# 总结

当数据量很大的时候，优先使用方差过滤和互信息法，然后再考虑其他方法  
使用逻辑回归时，优先使用嵌入法  
使用SVM时，优先包装法  
迷茫的时候，优先过滤法，边看边调