In [1]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

In [2]:
# Create a seeded NumPy random number generator (legacy RandomState API).
# The seed is 0: with the same seed the generator yields the same sequence
# of random numbers on every run, which keeps the notebook reproducible.
rng = np.random.RandomState(seed=0)

In [3]:
# Load the Boston housing data and split it into features and target.
# NOTE(review): load_boston is no longer shipped with recent scikit-learn
# releases (removed in 1.2) -- confirm the pinned version before re-running.
dataset = load_boston()
X_full = dataset.data
y_full = dataset.target
# Rows are samples, columns are features: (506, 13) in the recorded output.
n_samples, n_features = X_full.shape
print(X_full.shape)

(506, 13)


## 第一步：在完整数据集上评估模型

创建一个由100棵树组成的随机森林回归估计器estimator，并把随机数种子random_state设为0。随机森林是一种在数据集的多个子样本上拟合多棵决策树（此处为回归树）的估计器，它通过对各棵树的预测取平均来提高预测准确率并控制过拟合。构建每棵决策树所用的子样本是原始样本的bootstrap样本。

In [4]:
# Random forest regressor: 100 trees, seeded with 0 so repeated fits agree.
estimator = RandomForestRegressor(n_estimators=100, random_state=0)

使用交叉验证法评价分数，取分数的平均值，保留小数点后两位打印出来。

In [5]:
# Cross-validate the forest on the untouched data and average the fold scores.
score = np.mean(cross_val_score(estimator, X_full, y_full))
# Report the mean score rounded to two decimals.
print("Score with the entire dataset = %.2f" % (score,))

Score with the entire dataset = 0.56


In [6]:
# Display the unrounded mean score computed in the previous cell.
# The estimator is seeded (random_state=0), so re-running the whole
# cross-validation here would only repeat the same fits and yield the same value.
score

0.5619274613104018

## 第二步：在75%的数据行里增加缺失值

In [7]:
# Fraction of the rows that will receive a missing value.
missing_rate = 0.75
# Number of rows to corrupt, rounded down (379 of the 506 rows).
n_missing_samples = int(np.floor(missing_rate * n_samples))

In [8]:
# Boolean row mask of length n_samples (506 here): False for rows that stay
# complete, True for rows that will get a missing value. The mask is built
# as "all False, then all True"; it is still unshuffled at this point.
# The builtin `bool` is used as dtype because the `np.bool` alias is
# deprecated since NumPy 1.20 and raises AttributeError from NumPy 1.24 on.
missing_samples = np.hstack(
    (
        np.zeros(n_samples - n_missing_samples, dtype=bool),
        np.ones(n_missing_samples, dtype=bool),
    )
)

In [9]:
# Permute the row mask in place so the rows to corrupt are spread randomly.
# Re-running this cell reshuffles the mask and advances rng again.
rng.shuffle(missing_samples)
# Draw one column index in [0, n_features) for every row that gets a missing value.
missing_features = rng.randint(low=0, high=n_features, size=n_missing_samples)

## 第三步：估计不包括缺失行的子集分数

In [10]:
# Keep only the rows that were not flagged to receive a missing value.
X_filtered = X_full[np.logical_not(missing_samples)]
y_filtered = y_full[np.logical_not(missing_samples)]
# Fresh forest with the same settings as for the full dataset.
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
# Mean cross-validation score on the reduced dataset.
score = np.mean(cross_val_score(estimator, X_filtered, y_filtered))
print("Score without the samples containing missing values = %.2f" % (score,))

Score without the samples containing missing values = 0.48


## 第四步：估计填补缺失值后的数据集分数

将缺失值处标记为0，再将数据集里取值为0的项用该项所在列的均值代替。由于列表示特征，此即用该特征的均值代替缺失值。注意：这样做会把特征中原本就等于0的真实取值也一并当作缺失值填补。最后，在填补后的数据集上计算分数。

In [12]:
# Work on copies so X_full and y_full stay intact.
y_missing = y_full.copy()
X_missing = X_full.copy()
# Pair the i-th flagged row with the i-th drawn column and overwrite that
# single entry with 0, which serves as the missing-value marker.
# NOTE(review): any entry of X_full that is genuinely 0 becomes
# indistinguishable from a missing value -- confirm this is acceptable.
X_missing[np.flatnonzero(missing_samples), missing_features] = 0

In [13]:
# Two-stage model: mean imputation followed by the same seeded random forest.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22
# (successor: sklearn.impute.SimpleImputer) -- confirm the pinned version.
estimator = Pipeline(
    steps=[
        # Entries equal to 0 are treated as missing and replaced by the mean
        # of their column (axis=0).
        ("imputer", Imputer(strategy="mean", missing_values=0, axis=0)),
        ("forest", RandomForestRegressor(n_estimators=100, random_state=0)),
    ]
)


In [14]:
# Cross-validate the impute-then-forest pipeline on the data with missing values.
score = np.mean(cross_val_score(estimator, X_missing, y_missing))
print("Score after imputation of the missing values = %.2f" % (score,))

Score after imputation of the missing values = 0.57


让我们来比较三种情况下的分数值：

Score with the entire dataset = 0.56

Score without the samples containing missing values = 0.48

Score after imputation of the missing values = 0.57

由此可见，填补缺失值后的分数（0.57）比直接删除含缺失值样本后的分数（0.48）更接近完整数据集上的分数（0.56）。