### 基于统计的特征选择

#### 基于皮尔逊相关系数的特征选择

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('data/credit_card_default.csv')
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
data['default payment next month'].value_counts(normalize=True)  # 空准确率 77.9%

0    0.7788
1    0.2212
Name: default payment next month, dtype: float64

In [4]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [5]:
pearson = data.corr()     # 皮尔逊相关系数
index = pearson['default payment next month'][:-1].abs() > 0.1
X_subset = X.loc[:, index]
X.columns[index]          # 选择出的重要特征 

Index(['LIMIT_BAL', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'], dtype='object')

#### 特征选择前后模型模型精度对比

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [7]:
cross_val_score(LogisticRegression(), X, y, cv=10).mean()



0.7787667025592632

In [8]:
cross_val_score(KNeighborsClassifier(), X, y, cv=10).mean()

0.7557685117187235

In [9]:
cross_val_score(DecisionTreeClassifier(), X, y, cv=10).mean()

0.7244023755410047

In [21]:
cross_val_score(RandomForestClassifier(), X, y, cv=10).mean()



0.8066361050040116

In [24]:
cross_val_score(LogisticRegression(), X_subset, y, cv=10).mean()



0.7788000247851878

In [25]:
cross_val_score(DecisionTreeClassifier(), X_subset, y, cv=10).mean()

0.7965024706595338

In [26]:
cross_val_score(RandomForestClassifier(), X_subset, y, cv=10).mean()     # 精度为 0.8154359279151032
cross_val_score(KNeighborsClassifier(), X_subset, y, cv=10).mean()       # 精度为 0.7843364916966472



0.796002215115061

#### 基于假设检验的特征选择

In [39]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif
select_model = SelectKBest(score_func=f_classif, k=7)
select_model.fit_transform(X, y)
select_model.get_support()
res = pd.DataFrame({'features': X.columns, 'pvalue': select_model.pvalues_})
res.sort_values('pvalue', ascending=False)

Unnamed: 0,features,pvalue
16,BILL_AMT6,0.3521225
15,BILL_AMT5,0.2416344
14,BILL_AMT4,0.07855564
4,AGE,0.01613685
13,BILL_AMT3,0.01476998
12,BILL_AMT2,0.01395736
11,BILL_AMT1,0.0006673295
3,MARRIAGE,2.485364e-05
2,EDUCATION,1.225038e-06
1,SEX,4.395249e-12


### 基于模型的特征选择

#### 基于决策树的特征选择

In [29]:
from sklearn.tree import DecisionTreeClassifier

In [30]:
Dct = DecisionTreeClassifier()
Dct.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [32]:
Dct.feature_importances_

array([0.05947877, 0.00987531, 0.02481077, 0.01320902, 0.0698004 ,
       0.16124793, 0.03591847, 0.01156139, 0.00706038, 0.00900505,
       0.01182627, 0.06584552, 0.05392542, 0.04456481, 0.04144437,
       0.03931018, 0.05346209, 0.04874335, 0.04941375, 0.05335213,
       0.0400552 , 0.04767658, 0.04841285])

In [43]:
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': Dct.feature_importances_})  # 各特征的重要程度比较
feature_importance

Unnamed: 0,feature,importance
0,LIMIT_BAL,0.059479
1,SEX,0.009875
2,EDUCATION,0.024811
3,MARRIAGE,0.013209
4,AGE,0.0698
5,PAY_0,0.161248
6,PAY_2,0.035918
7,PAY_3,0.011561
8,PAY_4,0.00706
9,PAY_5,0.009005


In [44]:
feature_importance.sort_values(by='importance', ascending=False, inplace=True)   # 特征重要程度排序
feature_importance

Unnamed: 0,feature,importance
5,PAY_0,0.161248
4,AGE,0.0698
11,BILL_AMT1,0.065846
0,LIMIT_BAL,0.059479
12,BILL_AMT2,0.053925
16,BILL_AMT6,0.053462
19,PAY_AMT3,0.053352
18,PAY_AMT2,0.049414
17,PAY_AMT1,0.048743
22,PAY_AMT6,0.048413


In [45]:
feature_tree = feature_importance['feature'][:7]   # 取前7个最重要的属性
X_subset_tree = X[feature_tree]

In [46]:
cross_val_score(LogisticRegression(), X_subset_tree, y, cv=20).mean()           # 精度为 0.778800049600022
cross_val_score(DecisionTreeClassifier(), X_subset_tree, y, cv=20).mean()       # 精度为 0.7266684852748824
cross_val_score(RandomForestClassifier(), X_subset_tree, y, cv=20).mean()       # 精度为 0.8054684982822955
cross_val_score(KNeighborsClassifier(), X_subset_tree, y, cv=20).mean()         # 精度为 0.7489013111561382





0.7489013111561382

#### 基于线模型的特征选择

In [47]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC

select_model = SelectFromModel(estimator=LinearSVC(penalty="l2"), threshold='mean')
select_model.fit(X, y)
index = select_model.get_support()
X_subset_linear = X[X.columns[index]]



In [48]:
cross_val_score(LogisticRegression(), X_subset_linear, y, cv=20).mean()         # 精度为 0.808201298904281
cross_val_score(DecisionTreeClassifier(), X_subset_linear, y, cv=20).mean()     # 精度为 0.7940022074824625
cross_val_score(RandomForestClassifier(), X_subset_linear, y, cv=20).mean()     # 精度为 0.7964684521489416
cross_val_score(KNeighborsClassifier(), X_subset_linear, y, cv=20).mean()       # 精度为 0.793968273778492





0.793968273778492

In [49]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))),
    ('classification', RandomForestClassifier())
])
# clf.fit(X, y)
# clf.score(X, y)    # 0.86433
cross_val_score(clf, X_subset_linear, y, cv=10).mean()    # 精度为 0.8202359283151031



0.8193025393336153