##  缺失数据补全

In [7]:
import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score

In [8]:
rng = np.random.RandomState(0)

dataset = load_boston()
X_full,y_full = dataset.data,dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

# 交叉验证,预测
estimator = RandomForestRegressor(random_state=0,n_estimators=100)
score = cross_val_score(estimator,X_full,y_full).mean()
print "Score with entire dataset: %.2f"  % score

Score with entire dataset: 0.56


In [17]:
X_full.shape  # (506,13)
missing_rate = 0.75
n_missing_samples = np.floor(n_samples * missing_rate) # 379
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                     dtype=np.bool),
                            np.ones(n_missing_samples,
                                   dtype=np.bool)))
rng.shuffle(missing_samples)  #  打乱 missing_samples
missing_features = rng.randint(0,n_features,n_missing_samples) # 0-n_feature 之间, n_missing_samples 个缺失值

#print missing_features

#  删除缺失值的剩余数据预测
X_filtered = X_full[~missing_samples,:]  # X_filtered.shape (127,3)
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0,n_estimators=100)
score = cross_val_score(estimator,X_filtered,y_filtered).mean()
print "Score without the samples containing missing values = %.2f" % score



Score without the samples containing missing values = 0.23


In [24]:
#  补全缺失值重新估计
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])  # 先用平均值补全，再用随机森林训练
score = cross_val_score(estimator, X_missing, y_missing).mean()
print 'Score after imutation of the missing vales: %.2f' % score

Score after imutation of the missing vales: 0.54
