In [1]:
#引用以下包做数据处理
import numpy as np   
import pandas as pd     
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import make_regression,make_blobs
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector,ExhaustiveFeatureSelector
#引用以下包做数据可视化
import seaborn as sns   
import matplotlib.pyplot as plt
from warnings import filterwarnings

In [2]:
filterwarnings('ignore')
pd.set_option('display.float_format',lambda x:'%.2f'%x) #不使用科学计数法
sns.set(font_scale=1.5) #设置统计图字体大小
plt.rcParams['font.sans-serif']=['SimHei'] #在统计图上显示中文
plt.style.use({'figure.figsize':(24, 8)})  #设置画布大小

# 前向特征选择

In [3]:
#随机生成训练数据：100个样本，有5个特征，样本分布有3个中心
features = 5
X, y_train = make_blobs(n_samples=100,n_features=features,centers=3)
x_train=pd.DataFrame(X,columns=['F{:d}'.format(i) for i in range(features)])
#定义特征选择对象，使用随机森林作为评估模型
rf=RandomForestClassifier()
sfs=SequentialFeatureSelector(rf,k_features=2,forward=True,scoring='accuracy',cv=2)
sfs = sfs.fit(x_train, y_train) #进行特征选择
selected_features = x_train.columns[list(sfs.k_feature_idx_)]#输出最佳特征组合和模型得分
print(selected_features,sfs.k_score_)
#只使用最优特征组合作为训练数据
x_train_sfs = sfs.transform(x_train)
x_train_sfs

Index(['F0', 'F3'], dtype='object') 1.0


array([[  8.94659337,  -7.85232293],
       [  8.6426595 ,  -7.93005909],
       [ -6.84798443,   9.04156003],
       [ -7.16846429,  10.38428247],
       [  6.52518394,  -8.52909252],
       [ -5.62084602,  -0.45324491],
       [ -8.69464717,   9.1894949 ],
       [  8.31134683,  -9.24839218],
       [ -7.11046876,   0.24021171],
       [  6.91587889,  -7.9868507 ],
       [-10.08072251,   6.73185837],
       [ -5.16114826,  -1.94725637],
       [ -6.96307877,  -1.46613174],
       [ -6.75987416,  -0.1779367 ],
       [ -9.2156508 ,   7.61766375],
       [ -5.90736388,  -1.25158491],
       [ -5.71773355,  -2.77292062],
       [  7.42026395,  -7.02581381],
       [ -9.53022469,   9.98444545],
       [ -7.41440594,  -1.97668304],
       [ -7.93518026,  10.38491793],
       [ -6.10365132,  -1.85434503],
       [  7.82403881,  -8.55003309],
       [  9.27113293, -10.44247893],
       [-10.02520583,  10.77346546],
       [  8.10421148,  -8.64770036],
       [ -8.96378609,   6.96335776],
 

# 向后特征消除

In [4]:
#随机生成训练数据：100个样本，有5个特征，样本分布有3个中心
features = 5
X, y_train = make_blobs(n_samples=100,n_features=features,centers=3)
x_train=pd.DataFrame(X,columns=['F{:d}'.format(i) for i in range(features)])
#定义特征选择对象，使用随机森林作为评估模型
rf=RandomForestClassifier()
sfs=SequentialFeatureSelector(rf,k_features=2,forward=False,scoring='accuracy',cv=2)
sfs = sfs.fit(x_train, y_train)#进行特征选择
selected_features = x_train.columns[list(sfs.k_feature_idx_)]#输出最佳特征组合和模型得分
print(selected_features,sfs.k_score_)
#只使用最优特征组合作为训练数据
x_train_sfs = sfs.transform(x_train)
#x_train_sfs

Index(['F0', 'F3'], dtype='object') 1.0


# 穷举特征选择

In [5]:
#随机生成训练数据：100个样本，有5个特征，样本分布有3个中心
features = 3
X, y_train = make_blobs(n_samples=100,n_features=features,centers=2)
x_train=pd.DataFrame(X,columns=['F{:d}'.format(i) for i in range(features)])
#定义特征选择对象，使用随机森林作为评估模型
rf=RandomForestClassifier()
efs=ExhaustiveFeatureSelector(rf,min_features=2,max_features=3,scoring='roc_auc',cv=2)
efs = efs.fit(x_train, y_train)#进行特征选择
selected_features = list(efs.best_feature_names_)#输出最佳特征组合和模型得分
print(selected_features,efs.best_score_)
#只使用最优特征组合作为训练数据
x_train_efs = efs.transform(x_train)
#x_train_efs

Features: 4/4

['F0', 'F1'] 1.0
