学习sklearn特征选择技术。

# 筛选零方差与低方差变量

In [3]:
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Toy dataset: five samples, three features with very different spreads.
#   feature 0: constant column (variance = 0)
#   feature 1: low-variance column
#   feature 2: high-variance column
X = np.array(
    [
        [1, 0.1, 10],
        [1, 0.2, 25],
        [1, 0.0, 15],
        [1, 0.1, 12],
        [1, 0.2, 21],
    ]
)

# Variance cut-off for the filter. Fitting and transforming are chained as
# two steps; the transformed array is the cell's displayed value.
filter_var = VarianceThreshold(threshold=0.1)
filter_var.fit(X).transform(X)

array([[10.],
       [25.],
       [15.],
       [12.],
       [21.]])

# 单变量筛选

In [4]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# Load the iris data as a (features, labels) pair.
X, y = load_iris(return_X_y=True)

# Univariate selection: score each feature with f_classif and keep the top 2.
X_new = SelectKBest(f_classif, k=2).fit_transform(X, y)

# Shape of the reduced matrix; as the cell's last expression it is displayed.
X_new.shape

(150, 2)

# 递归特征筛选

In [9]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier

# Synthetic binary classification problem; the fixed seed keeps it reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=25,
    n_informative=15,
    n_redundant=5,
    n_repeated=0,
    n_classes=2,
    random_state=42,
)

# Recursive feature elimination driven by a seeded random forest, asked to
# keep 10 features with an elimination step of 1.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
rfe_rf = RFE(estimator=forest, n_features_to_select=10, step=1)

# Fit, reduce X, and display the shape of the reduced matrix.
rfe_rf.fit(X, y).transform(X).shape

(1000, 10)

In [10]:
# RFE with cross-validation: no target feature count is passed here, so the
# selector chooses one itself using 5-fold CV. Relies on X and y already
# being defined by an earlier cell.
cv_forest = RandomForestClassifier(n_estimators=100, random_state=42)
rfecv = RFECV(estimator=cv_forest, step=1, cv=5).fit(X, y)

# Number of features the cross-validated search ended up keeping.
rfecv.n_features_

21

# 模型筛选法

In [11]:
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
# Load the iris data as a (features, labels) pair.
X, y = load_iris(return_X_y=True)

# Fit an L1-penalised linear SVM (small C means strong regularisation).
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)

# prefit=True: reuse the already-fitted model instead of refitting it.
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)

# Shape after selection; as the cell's last expression it is displayed.
X_new.shape



(150, 3)

In [12]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
# Load the iris data as a (features, labels) pair.
X, y = load_iris(return_X_y=True)

# Extra-trees is a randomised ensemble: pin the seed so the fitted importances,
# and therefore the selected columns, are the same on every run.
clf = ExtraTreesClassifier(n_estimators=50, random_state=42)
clf = clf.fit(X, y)

# prefit=True: select on the already-fitted forest rather than refitting it.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)

# Shape after selection; as the cell's last expression it is displayed.
X_new.shape

(150, 2)

# 顺序筛选

In [14]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector

# Iris features/labels plus the 3-nearest-neighbours model used for scoring.
X, y = load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=3)

# Forward sequential selection of 2 features, scored with 5-fold CV.
sfs_forward = SequentialFeatureSelector(
    estimator=knn,
    direction='forward',
    n_features_to_select=2,  # target number of features
    cv=5,  # 5-fold cross-validation
)

# Fit, reduce X, and display the shape of the reduced matrix.
sfs_forward.fit(X, y).transform(X).shape

(150, 2)

In [15]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector

# Iris features/labels plus the 3-nearest-neighbours model used for scoring.
X, y = load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=3)

# Backward sequential selection of 2 features, scored with 5-fold CV.
# Bound to its own name so it matches the direction and does not overwrite
# the forward selector built in the previous cell.
sfs_backward = SequentialFeatureSelector(
    estimator=knn,
    n_features_to_select=2,  # target number of features
    direction='backward',
    cv=5,  # 5-fold cross-validation
)

# Fit, reduce X, and display the shape of the reduced matrix.
sfs_backward.fit_transform(X, y).shape

(150, 2)