随机森林回归的用法与决策树回归基本相似，这里主要演示如何用随机森林回归来填补缺失值。

In [3]:
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer #填补缺失值常用
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the Boston housing data and record its dimensions.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the scikit-learn version this notebook is pinned to.
dataset = load_boston()
X_full = dataset.data
y_full = dataset.target
n_samples, n_features = X_full.shape

In [5]:
# Turn the complete dataset into one with missing values so that there is
# something to impute later. The target share of missing entries is 50%.
missing_rate = 0.5
rng = np.random.RandomState(0)
# Despite its name, this counts individual matrix entries (rows x columns),
# not whole samples.
n_missing_samples = int(np.floor(n_samples * n_features * missing_rate))
n_missing_samples


3289

In [6]:
# Confirm the dataset dimensions as (n_samples, n_features).
X_full.shape

(506, 13)

In [7]:
# Scatter the missing entries at random over rows and columns.
# randint(low, high, size) draws `size` integers between the lower bound
# (inclusive) and the upper bound (exclusive).
missing_features = rng.randint(low=0, high=n_features, size=n_missing_samples)
missing_samples = rng.randint(low=0, high=n_samples, size=n_missing_samples)
# These draws can repeat, so the same (row, column) pair may be picked more
# than once and the realised share of NaNs may be lower than missing_rate.

# Alternative that never repeats a row index:
#   missing_samples = rng.choice(n_samples, n_missing_samples, replace=False)
# It is only usable when n_missing_samples <= n_samples, which does not hold
# for the values computed in this notebook.

In [8]:
# Work on copies so that X_full / y_full keep the complete data.
X_missing=X_full.copy()
Y_missing=y_full.copy()# the labels must not contain missing values

In [9]:
# Build the NaN-injected matrix from the complete array every time this cell
# runs, using a separate name for the intermediate ndarray. X_missing is only
# ever bound to the final DataFrame, so the cell can be re-executed safely:
# paired (row_indices, column_indices) assignment is an ndarray operation and
# would fail if X_missing were already a DataFrame from a previous run.
X_missing_values = X_full.copy()
X_missing_values[missing_samples, missing_features] = np.nan
X_missing = pd.DataFrame(X_missing_values)

In [10]:
# Inspect the feature matrix after the NaNs have been injected.
X_missing

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,18.0,,,0.538,,65.2,4.0900,1.0,296.0,,,4.98
1,0.02731,0.0,,0.0,0.469,,78.9,4.9671,2.0,,,396.90,9.14
2,0.02729,,7.07,0.0,,7.185,61.1,,2.0,242.0,,,
3,,,,0.0,0.458,,45.8,,,222.0,18.7,,
4,,0.0,2.18,0.0,,7.147,,,,,18.7,,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,0.0,0.573,,69.1,,1.0,,21.0,,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,,396.90,9.08
503,,,11.93,,0.573,6.976,91.0,,,,21.0,,5.64
504,0.10959,0.0,11.93,,0.573,,89.3,,1.0,,21.0,393.45,6.48


In [11]:
# Strategy 1: replace every NaN with the mean of its column.
imp_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
X_missing_mean = imp_mean.fit_transform(X_missing)

In [12]:
# Wrap the mean-imputed result in a DataFrame for display.
pd.DataFrame(X_missing_mean)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,3.627579,18.000000,11.163464,0.066007,0.538000,6.305921,65.2,4.090000,1.000000,296.000000,18.521192,352.741952,4.980000
1,0.027310,0.000000,11.163464,0.000000,0.469000,6.305921,78.9,4.967100,2.000000,405.935275,18.521192,396.900000,9.140000
2,0.027290,10.722951,7.070000,0.000000,0.564128,7.185000,61.1,3.856371,2.000000,242.000000,18.521192,352.741952,12.991767
3,3.627579,10.722951,11.163464,0.000000,0.458000,6.305921,45.8,3.856371,9.383871,222.000000,18.700000,352.741952,12.991767
4,3.627579,0.000000,2.180000,0.000000,0.564128,7.147000,67.4,3.856371,9.383871,405.935275,18.700000,352.741952,5.330000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,3.627579,10.722951,11.163464,0.000000,0.573000,6.305921,69.1,3.856371,1.000000,405.935275,21.000000,352.741952,9.670000
502,0.045270,0.000000,11.930000,0.000000,0.573000,6.120000,76.7,2.287500,1.000000,273.000000,18.521192,396.900000,9.080000
503,3.627579,10.722951,11.930000,0.066007,0.573000,6.976000,91.0,3.856371,9.383871,405.935275,21.000000,352.741952,5.640000
504,0.109590,0.000000,11.930000,0.066007,0.573000,6.305921,89.3,3.856371,1.000000,405.935275,21.000000,393.450000,6.480000


In [13]:
# Strategy 2: replace every NaN with the constant 0.
imp_zero = SimpleImputer(
    missing_values=np.nan,
    strategy="constant",
    fill_value=0,
)
X_missing_zero = imp_zero.fit_transform(X_missing)

In [14]:
# Wrap the zero-imputed result in a DataFrame for display.
pd.DataFrame(X_missing_zero)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00000,18.0,0.00,0.0,0.538,0.000,65.2,4.0900,1.0,296.0,0.0,0.00,4.98
1,0.02731,0.0,0.00,0.0,0.469,0.000,78.9,4.9671,2.0,0.0,0.0,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.000,7.185,61.1,0.0000,2.0,242.0,0.0,0.00,0.00
3,0.00000,0.0,0.00,0.0,0.458,0.000,45.8,0.0000,0.0,222.0,18.7,0.00,0.00
4,0.00000,0.0,2.18,0.0,0.000,7.147,0.0,0.0000,0.0,0.0,18.7,0.00,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.00000,0.0,0.00,0.0,0.573,0.000,69.1,0.0000,1.0,0.0,21.0,0.00,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,0.0,396.90,9.08
503,0.00000,0.0,11.93,0.0,0.573,6.976,91.0,0.0000,0.0,0.0,21.0,0.00,5.64
504,0.10959,0.0,11.93,0.0,0.573,0.000,89.3,0.0000,1.0,0.0,21.0,393.45,6.48


用随机森林填补缺失值的思想：对于有 n 个特征的数据，若特征 T 有缺失值，就把 T 当作标签，把其余 n-1 个特征和原标签组成新的特征矩阵，用来预测 T。
T 没有缺失的那部分样本就是训练集，T 缺失的那部分样本就是需要预测的测试集。
又因为通常缺失的不止一列，所以按缺失值从少到多的顺序逐列填补：填补某个特征时，先将其他特征的缺失值用 0 代替；每完成一次回归预测，就把预测值写回特征矩阵，再继续填补下一个特征。

In [15]:
# Impute on a copy so that X_missing itself is not modified.
X_missing_reg = X_missing.copy()
# Column labels ordered by their NaN count, fewest first; this is the order
# in which the columns are imputed.
sortindex = (
    X_missing_reg.isna()
    .sum(axis=0)
    .sort_values()
    .index
)


In [16]:
# Show the imputation order (column labels, fewest NaNs first).
sortindex

Int64Index([6, 12, 8, 7, 9, 0, 2, 1, 5, 4, 3, 10, 11], dtype='int64')

In [17]:
# Impute one column per iteration with a random-forest regressor, walking the
# columns in `sortindex` order. Values predicted in one iteration are written
# back into X_missing_reg and are therefore visible to the following ones.
for i in sortindex:
    df = X_missing_reg
    # Column i is the regression target of this iteration.
    fillc = df.loc[:, i]
    # Boolean mask of the rows to impute. Masks keep the selection correct
    # even if the frame's index is not the default 0..n-1 range.
    null_mask = fillc.isnull().to_numpy()
    if not null_mask.any():
        # Nothing to fill; predicting on an empty matrix would raise.
        continue
    # Predictors: every other column plus the original label.
    # NOTE(review): y_full is used as a predictor here and the final
    # comparison scores models against the same y_full, so the score of this
    # strategy may be optimistic — confirm this is intended.
    df = pd.concat(
        [df.drop(i, axis="columns"), pd.DataFrame(y_full, index=df.index)],
        axis=1,
    )
    # NaNs still present in the predictor columns are replaced with 0.
    df_zero = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(df)
    # Rows where the target is known train the model; the rest get predicted.
    Ytrain = fillc[~null_mask]
    Xtrain = df_zero[~null_mask, :]
    Xtest = df_zero[null_mask, :]
    # A fixed seed makes the imputed values reproducible across runs.
    rfc = RandomForestRegressor(n_estimators=100, random_state=0)
    rfc = rfc.fit(Xtrain, Ytrain)
    Ypredict = rfc.predict(Xtest)
    X_missing_reg.loc[null_mask, i] = Ypredict

In [18]:
# Preview the first rows of the random-forest-imputed feature matrix.
X_missing_reg.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.275061,18.0,7.2284,0.1,0.538,6.74073,65.2,4.09,1.0,296.0,17.963,389.3563,4.98
1,0.02731,0.0,6.0293,0.0,0.469,6.14486,78.9,4.9671,2.0,301.54,17.868,396.9,9.14
2,0.02729,14.685,7.07,0.0,0.468746,7.185,61.1,4.332461,2.0,242.0,17.886,390.017,4.8477
3,0.09358,23.16,3.4245,0.0,0.458,6.80256,45.8,4.645884,3.41,222.0,18.7,392.6378,5.8808
4,0.094204,0.0,2.18,0.0,0.465844,7.147,59.57,5.072401,3.85,245.09,18.7,391.6614,5.33
5,0.079276,0.0,5.7939,0.0,0.462687,6.43,58.7,6.0622,3.4,222.0,18.7,388.4296,5.21
6,0.248925,12.5,8.4176,0.0,0.524,6.06178,66.6,5.5605,4.17,304.69,15.2,392.3964,12.43
7,0.541312,12.5,8.9741,0.57,0.524,6.4607,96.1,5.9505,5.68,311.0,15.2,396.9,19.15
8,0.349771,13.255,7.87,0.0,0.524,5.631,77.325,6.0821,5.0,311.0,17.648,370.3874,29.93
9,0.17004,17.92,7.87,0.0,0.524,5.85622,63.334,6.5921,5.43,311.0,17.083,386.71,13.8114


In [21]:
# Score the same random-forest configuration on the complete data and on each
# imputed variant using 5-fold cross-validation.
X = [X_full, X_missing_mean, X_missing_zero, X_missing_reg]
mse = []
for x in X:
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    fold_scores = cross_val_score(
        estimator,
        x,
        y_full,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    # The scorer returns negated MSE; flip the sign of the fold average.
    mse.append(fold_scores.mean() * -1)

In [27]:
# Pair each MSE with the name of the dataset it was computed on.
list(zip(mse, ['X_full', 'X_missing_mean', 'X_missing_zero', 'X_missing_reg']))

[(21.571667100368845, 'X_full'),
 (40.848037216676374, 'X_missing_mean'),
 (49.626793201980185, 'X_missing_zero'),
 (17.917782104872828, 'X_missing_reg')]