使用过滤法进行特征选择

作者：谢文伟

邮件：jim.xie.cn@outlook.com

主页：https://github.com/jim-xie-cn/ai-cv

In [None]:
#引用以下包做数据处理
import numpy as np   
import pandas as pd     
from sklearn.linear_model import Lasso
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.datasets import make_regression,make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest,VarianceThreshold,chi2,f_classif
from mlxtend.feature_selection import SequentialFeatureSelector,ExhaustiveFeatureSelector
#引用以下包做数据可视化
import seaborn as sns   
import matplotlib.pyplot as plt
from warnings import filterwarnings

In [None]:
# Suppress every warning category so that library notices do not clutter the
# cell outputs that follow.
from warnings import filterwarnings
filterwarnings(action='ignore')

In [None]:
# Notebook-wide display configuration.
filterwarnings(action='ignore')  # keep warnings out of the cell output


def _two_decimals(value):
    """Render a float with two decimals instead of scientific notation."""
    return '%.2f' % value


pd.set_option('display.float_format', _two_decimals)
sns.set(font_scale=1.5)  # font scale used for the statistical charts
# Select the SimHei font so that Chinese text can be shown on the charts.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.style.use({'figure.figsize': (24, 8)})  # default canvas size

# 基本特征过滤方法

In [None]:
# Constant-feature check demo: features whose variance does not exceed the
# threshold (100) are discarded.
df = pd.read_csv("./dataset/bmi/500_Person_Gender_Height_Weight_Index.csv")
# Encode the gender labels as integers in place (Male -> 1, Female -> 0).
df.replace({"Male": 1, "Female": 0}, inplace=True)
# Every column of the frame is handed to the selector, as a list of row tuples.
X = df.to_records(index=False).tolist()
variance = VarianceThreshold(threshold=100)  # keep features with variance above 100
variance.fit(X)
newX = variance.transform(X)
feature_count_before = df.shape[1]
feature_count_after = newX.shape[1]
print("特征数量：%d(过滤前),%d(过滤后)" % (feature_count_before, feature_count_after))

In [None]:
# Quasi-constant check demo: drop every feature whose most frequent value
# accounts for at least 50% of all rows.
df = pd.read_csv("./dataset/bmi/500_Person_Gender_Height_Weight_Index.csv")
# Encode the gender labels as integers in place (Male -> 1, Female -> 0).
df.replace({"Male": 1, "Female": 0}, inplace=True)

# The row count is the same for every feature, so compute it once. Plain
# len() with true division replaces the former np.float(...) call: the
# np.float alias was removed from NumPy and raises AttributeError there.
row_count = len(df)

# For each feature, value_counts().max() is the number of rows holding the
# most frequent value; dividing by the total row count gives its share.
drop_feature = [
    feature
    for feature in df.columns
    if df[feature].value_counts().max() / row_count >= 0.5
]

df1 = df.drop(labels=drop_feature, axis=1)
print("特征数量：%d(过滤前),%d(过滤后)" % (df.shape[1], df1.shape[1]))

In [None]:
# Duplicate-feature check demo: drop features that are exact copies of an
# earlier feature.
df = pd.read_csv("./dataset/bmi/500_Person_Gender_Height_Weight_Index.csv")
# Encode the gender labels as integers in place (Male -> 1, Female -> 0).
df.replace({"Male": 1, "Female": 0}, inplace=True)
# Plant a duplicate feature named "Height1" for the demonstration.
df['Height1'] = df['Height']
# Transpose so that every feature becomes a row; duplicated rows of the
# transposed frame are then duplicated features of the original frame.
df_T = df.T
duplicated_mask = df_T.duplicated()
d_columns = df_T.index[duplicated_mask.to_numpy()].values  # names of duplicated features
df1 = df.drop(labels=d_columns, axis=1)
feature_count_before = df.shape[1]
feature_count_after = df1.shape[1]
print("特征数量：%d(过滤前),%d(过滤后)" % (feature_count_before, feature_count_after))

# 相关系数法

In [None]:
# Correlation-coefficient demo: when two features are strongly correlated,
# one of them is dropped.
df = pd.read_csv("./dataset/bmi/500_Person_Gender_Height_Weight_Index.csv")
# For demonstration, add an "Age" feature of random integers in [0, 100).
# The length follows the frame instead of a hard-coded row count.
df['Age'] = np.random.randint(0, 100, len(df))

# Pearson correlation is only defined for numeric data. "Gender" is still a
# string column here, and recent pandas versions raise on DataFrame.corr()
# when non-numeric columns are present, so restrict to numeric columns.
pear_matrix = df.select_dtypes(include="number").corr()

# If the absolute correlation between two features is greater than 0.5 they
# are considered similar and the later one (in column order) is discarded.
corr_features = set()
for i, colname in enumerate(pear_matrix.columns):
    # Row i restricted to columns before i: the lower triangle of the matrix,
    # i.e. the correlation of this feature with every earlier feature.
    if (pear_matrix.iloc[i, :i].abs() > 0.5).any():
        corr_features.add(colname)

df1 = df.drop(labels=list(corr_features), axis=1)
print("特征数量：%d(过滤前),%d(过滤后)" % (df.shape[1], df1.shape[1]))

# 通过统计量过滤

In [None]:
# Statistic-based filtering demo: score each feature against the target and
# keep only the best one.
df = pd.read_csv("./dataset/bmi/500_Person_Gender_Height_Weight_Index.csv")
le = preprocessing.LabelEncoder()  # turns the category labels into integer codes
df['Gender'] = le.fit_transform(df['Gender'])
# Height and weight form the input matrix; gender is the target vector.
df1 = df[['Height', 'Weight']]
X = np.array(df1.values.tolist())
y = np.array(list(df['Gender']))
# Feature selector based on the ANOVA F-test (f_classif); chi2 would be used
# for a chi-squared test instead. k=1 keeps a single feature.
sel = SelectKBest(f_classif, k=1)
X_new = sel.fit(X, y).transform(X)
print("特征得分:%r,P值:%r" % (sel.scores_, sel.pvalues_))
feature_count_before = X.shape[1]
feature_count_after = X_new.shape[1]
print("特征数量：%d(过滤前),%d(过滤后)" % (feature_count_before, feature_count_after))