# 过滤式选择

In [9]:
# 导入必要的包
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## 方差选择法

In [16]:
# 方差选择法
def variance_selection(data, threshold=0.0):
    import numpy as np
    # 计算方差
    var = np.var(data, axis=0)
    
    # 选择方差大于阈值的特征
    selected = var > threshold
    return data[:, selected]

In [18]:
data = np.array([[1, 2, 3], [1, 4, 3], [1, 2, 3], [1, 2, 3]])

variance_selection(data, 0.0)

array([0.  , 0.75, 0.  ])

## 相关系数法

In [23]:
# 相关系数法
def correlation_coefficient_selection(X, y, threshold=0.0):
    import numpy as np
    
    # 合并X和y
    data = np.hstack((X, y.reshape(-1, 1)))
    
    # 计算相关系数
    corr = np.corrcoef(data, rowvar=False)
    
    # 删除y的相关系数
    corr_y = corr[-1, :-1]
    
    # 选择相关系数大于阈值的特征
    selected = np.abs(corr_y) > threshold
    
    return X[:, selected]

In [25]:
X = np.array([[1, 2, 3], [1, 4, 3], [1, 2, 3], [2, 2, 3]])
y = np.array([1, 2, 3, 4])

correlation_coefficient_selection(X, y, 0.0)

array([[1, 2],
       [1, 4],
       [1, 2],
       [2, 2]])

## $\chi^2$检验

In [26]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, chi2

In [27]:
# 加载乳腺癌数据集
data = load_breast_cancer()
X = data.data
y = data.target
X, y

(array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
 

In [28]:
# 将数据离散化
X_discrete = np.apply_along_axis(pd.cut, axis=0, arr=X, bins=5, labels=False)

In [29]:
X_discrete

array([[2, 0, 2, ..., 4, 2, 2],
       [3, 1, 3, ..., 3, 1, 1],
       [3, 1, 2, ..., 4, 2, 1],
       ...,
       [2, 3, 2, ..., 2, 0, 0],
       [3, 3, 3, ..., 4, 2, 2],
       [0, 2, 0, ..., 0, 1, 0]], dtype=int64)

In [30]:
# 计算卡方值并选择最重要的前5个特征
selector = SelectKBest(score_func=chi2, k=5)
selector.fit(X_discrete, y)


In [31]:
selector.get_support()

array([False, False, False,  True, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
        True, False, False])

In [41]:
selector.transform(X_discrete)

array([[1, 3, 3, 2, 4],
       [2, 1, 1, 2, 3],
       [2, 2, 3, 1, 4],
       ...,
       [1, 1, 1, 1, 2],
       [2, 4, 3, 2, 4],
       [0, 0, 0, 0, 0]], dtype=int64)

In [32]:
# 获取选择的特征
selected_features = data.feature_names[selector.get_support()]
selected_features

array(['mean area', 'mean concavity', 'mean concave points', 'worst area',
       'worst concave points'], dtype='<U23')

In [33]:
selected_data = X_discrete[:, selector.get_support()]
selected_data

array([[1, 3, 3, 2, 4],
       [2, 1, 1, 2, 3],
       [2, 2, 3, 1, 4],
       ...,
       [1, 1, 1, 1, 2],
       [2, 4, 3, 2, 4],
       [0, 0, 0, 0, 0]], dtype=int64)

## 互信息

In [35]:
# 加载包
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import mutual_info_classif, SelectKBest

In [37]:
# 加载数据集
iris = load_iris()
X, y = iris.data, iris.target
X, y

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [38]:
# 计算每个特征与目标变量之间的互信息
scores = mutual_info_classif(X, y)

scores

array([0.5022743 , 0.28177697, 0.99128281, 0.98735734])

In [40]:
# 根据得分进行特征选择，选择得分排名前2的特征
k_best = SelectKBest(mutual_info_classif, k=2).fit(X, y)
X_new = k_best.transform(X)
X_new

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.7, 0.4],
       [1.4, 0.3],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.5, 0.1],
       [1.5, 0.2],
       [1.6, 0.2],
       [1.4, 0.1],
       [1.1, 0.1],
       [1.2, 0.2],
       [1.5, 0.4],
       [1.3, 0.4],
       [1.4, 0.3],
       [1.7, 0.3],
       [1.5, 0.3],
       [1.7, 0.2],
       [1.5, 0.4],
       [1. , 0.2],
       [1.7, 0.5],
       [1.9, 0.2],
       [1.6, 0.2],
       [1.6, 0.4],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.6, 0.2],
       [1.6, 0.2],
       [1.5, 0.4],
       [1.5, 0.1],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.2, 0.2],
       [1.3, 0.2],
       [1.4, 0.1],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.3, 0.3],
       [1.3, 0.3],
       [1.3, 0.2],
       [1.6, 0.6],
       [1.9, 0.4],
       [1.4, 0.3],
       [1.6, 0.2],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [4.7, 1.4],
       [4.5, 1.5],
       [4.9,

## L1正则化

In [42]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso

# 生成模拟数据集
X, y = make_regression(n_samples=100, n_features=10, n_informative=5, noise=0.1, random_state=42)

# 使用Lasso模型进行特征选择
lasso = Lasso(alpha=0.1)
lasso.fit(X, y)

# 输出选择的特征及其系数
selected_features = np.where(lasso.coef_ != 0)[0]
selected_coefs = lasso.coef_[selected_features]
print("Selected features: ", selected_features)
print("Selected coefficients: ", selected_coefs)

Selected features:  [0 3 5 7 8]
Selected coefficients:  [16.61667804 63.57170498 70.5360479  10.34875749  3.08067278]


In [44]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegressionCV

# 加载数据集
data = load_breast_cancer()
X, y = data.data, data.target

# 构建L1正则化逻辑回归模型
clf = LogisticRegressionCV(Cs=10, penalty='l1', solver='liblinear', cv=10)

# 训练模型并输出选择的特征
clf.fit(X, y)
selected_features = data.feature_names[clf.coef_[0] != 0]
print("Selected features: ", selected_features)



Selected features:  ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'compactness error' 'concavity error' 'concave points error'
 'symmetry error' 'fractal dimension error' 'worst radius' 'worst texture'
 'worst perimeter' 'worst area' 'worst compactness' 'worst concavity'
 'worst symmetry' 'worst fractal dimension']


# 包裹式选择

## Forward Selection

In [1]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [3]:
# 加载数据集
iris = load_iris()
X, y = iris.data, iris.target
X,y

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [4]:
# 定义模型
lr = LogisticRegression()

In [5]:
# 定义前向选择算法，选择3个特征
sfs = SFS(lr, k_features=3, forward=True, scoring='accuracy', cv=5)

# 运行前向选择算法
sfs.fit(X, y)

# 输出选择的特征
print("Selected features: ", sfs.k_feature_names_)



Selected features:  ('0', '2', '3')




## Backward Elimination

In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 加载数据集
data = load_breast_cancer()
X, y = data.data, data.target

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练逻辑回归模型
model = LogisticRegression()
model.fit(X_train, y_train)

# 计算所有特征的重要性得分
scores = np.abs(model.coef_[0])

# 按重要性得分排序
sorted_idx = np.argsort(scores)[::-1]

# 选择前5个特征进行后向消元
num_features = 5
for i in range(X_train.shape[1], num_features, -1):
    # 选择前i个特征
    idx = sorted_idx[:i]
    X_train_subset = X_train[:, idx]
    X_test_subset = X_test[:, idx]
    
    # 训练模型并计算准确率
    model.fit(X_train_subset, y_train)
    y_pred = model.predict(X_test_subset)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"Selected features: {data.feature_names[idx]}")
    print(f"Accuracy: {acc:.4f}")
    
    # 找到最低得分的特征并删除
    if i > num_features:
        worst_idx = np.argmin(scores[idx])
        sorted_idx = np.delete(sorted_idx, np.where(sorted_idx == idx[worst_idx]))

# 最终选择的特征子集
final_idx = sorted_idx[:num_features]
final_features = data.feature_names[final_idx]
print(f"Final selected features: {final_features}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected features: ['worst radius' 'mean radius' 'worst concavity' 'worst compactness'
 'texture error' 'mean concavity' 'worst texture' 'worst concave points'
 'worst symmetry' 'mean compactness' 'mean concave points'
 'worst perimeter' 'mean texture' 'worst smoothness' 'mean symmetry'
 'worst fractal dimension' 'concavity error' 'area error'
 'mean smoothness' 'perimeter error' 'compactness error' 'radius error'
 'mean perimeter' 'worst area' 'concave points error' 'symmetry error'
 'mean fractal dimension' 'mean area' 'smoothness error'
 'fractal dimension error']
Accuracy: 0.9649
Selected features: ['worst radius' 'mean radius' 'worst concavity' 'worst compactness'
 'texture error' 'mean concavity' 'worst texture' 'worst concave points'
 'worst symmetry' 'mean compactness' 'mean concave points'
 'worst perimeter' 'mean texture' 'worst smoothness' 'mean symmetry'
 'worst fractal dimension' 'concavity error' 'area error'
 'mean smoothness' 'perimeter error' 'compactness error' 'radiu

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

## Recursive Feature Elimination

In [7]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

# 加载数据集
iris = load_iris()
X, y = iris.data, iris.target

# 初始化逻辑回归模型
model = LogisticRegression()

# 使用递归特征消除选择2个最佳特征
rfe = RFE(model, n_features_to_select=2)
X_new = rfe.fit_transform(X, y)

# 输出选择的特征
print("Selected features: ", np.array(iris.feature_names)[rfe.get_support()])

Selected features:  ['petal length (cm)' 'petal width (cm)']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# 自上而下RFE
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# 加载数据集
digits = load_digits()
X, y = digits.data, digits.target

# 使用逻辑回归作为基模型，进行特征选择
model = LogisticRegression(solver='liblinear')
rfe = RFE(estimator=model, n_features_to_select=32, step=1)  # 设定参数step=1，表示每次迭代删除一个特征
X_new = rfe.fit_transform(X, y)

# 输出选择的特征
print("Selected features: ", rfe.support_)

In [8]:
### 生成数据
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000,         # 样本个数
                           n_features=25,          # 特征个数
                           n_informative=3,        # 有效特征个数
                           n_redundant=2,          # 冗余特征个数（有效特征的随机组合）
                           n_repeated=0,           # 重复特征个数（有效特征和冗余特征的随机组合）
                           n_classes=8,            # 样本类别
                           n_clusters_per_class=1, # 簇的个数
                           random_state=0)
### 特征选择
 
# RFECV
from sklearn.svm import SVC
svc = SVC(kernel="linear")
 
 
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
rfecv = RFECV(estimator=svc,          # 学习器
              min_features_to_select=2, # 最小选择的特征数量
              step=1,                 # 移除特征个数
              cv=StratifiedKFold(2),  # 交叉验证次数
              scoring='accuracy',     # 学习器的评价标准
              verbose = 0,
              n_jobs = 1
              ).fit(X, y)
X_RFECV = rfecv.transform(X)
print("RFECV特征选择结果——————————————————————————————————————————————————")
print("有效特征个数 : %d" % rfecv.n_features_)
print("全部特征等级 : %s" % list(rfecv.ranking_))

RFECV特征选择结果——————————————————————————————————————————————————
有效特征个数 : 3
全部特征等级 : [5, 1, 12, 19, 15, 6, 17, 1, 2, 21, 23, 11, 16, 10, 13, 22, 8, 14, 1, 20, 7, 9, 3, 4, 18]


## 随机选择

### LVW法

In [1]:
# LVW算法
def LVW(X, y, model, T=1000):
    # 初始化特征索引与特征数量
    num_features = X.shape[1]
    features = np.arange(num_features)
    
    # 计算所有特征的得分
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(model, X, y, cv=5)
    
    t=0
    while t < T:
        # 产生随机子集
        num_features_new = np.random.randint(1, num_features+1)
        features_new = np.random.choice(features, num_features_new, replace=False)
        
        # 计算子集的得分
        scores_new = cross_val_score(model, X[:, features_new], y, cv=5)
        
        # 比较得分
        # 如果子集得分更高或得分相同但特征数量更少，则更新特征索引
        if scores_new.mean() >= scores.mean() and len(features_new) < len(features):
            num_features = num_features_new
            features = features_new
            scores = scores_new
        else:
            t += 1
    return X[:, features]
            

In [2]:
# 导入必要的包
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [4]:
X, y = load_iris(return_X_y=True)
model = LogisticRegression()

X_LVW = LVW(X, y, model, 1000)
print("LVW特征选择结果——————————————————————————————————————————————————")
X_LVW

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LVW特征选择结果——————————————————————————————————————————————————


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

### Null Importance算法

In [5]:
# Null Importance算法

In [8]:
def null_importance(X, y, n_iterations=10):
    """
    计算特征的重要性评分（Null Importance）。

    参数:
    X (pd.DataFrame): 特征数据集。
    y (pd.Series): 目标变量。
    model: 机器学习模型。
    n_iterations (int): 每个特征打乱的重复次数。
    metric (function): 性能评估指标函数。

    返回:
    dict: 每个特征的重要性评分。
    """
    # 训练基准模型
    from sklearn.base import clone

    base_model = clone(model)
    base_model.fit(X, y)
    # 获取特征重要性
    gains = 

    # 计算特征重要性
    null_importance_scores = {}

    for col in X.columns:
        scores = []
        for _ in range(n_iterations):
            X_shuffled = X.copy()
            X_shuffled[col] = np.random.permutation(X_shuffled[col])
            shuffled_model = clone(model)
            shuffled_model.fit(X_shuffled, y)
            shuffled_score =cross_val_score(shuffled_model)
            scores.append(baseline_score - shuffled_score)
        null_importance_scores[col] = np.mean(scores)

    return null_importance_scores

In [None]:
# 测试
