In [1]:
"""
======================================================
Imputing missing values before building an estimator
======================================================

This example shows that imputing the missing values can give better results
than discarding the samples containing any missing value.
Imputing does not always improve the predictions, so please check via cross-validation.
Sometimes dropping rows or using marker values is more effective.

Missing values can be replaced by the mean, the median or the most frequent
value using the ``strategy`` hyper-parameter.
The median is a more robust estimator when a feature contains a few very
large values that would otherwise dominate the mean (a 'long tail').

Script output::

  Score with the entire dataset = 0.56
  Score without the samples containing missing values = 0.48
  Score after imputation of the missing values = 0.55

In this case, imputing helps the regressor get close to the original score.
  
"""
import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

# Fixed seed so the injected missing-value pattern is reproducible.
rng = np.random.RandomState(0)

# Load the dataset and split it into the feature matrix and the target vector.
dataset = load_boston()
X_full = dataset.data
y_full = dataset.target
# Rows are samples, columns are features.
n_samples, n_features = X_full.shape

# Baseline: mean cross-validated score of a random forest on the complete
# data, before any values are removed.
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
score = np.mean(cross_val_score(estimator, X_full, y_full))
print("Score with the entire dataset = %.2f" % score)

# Add missing values in 75% of the lines
missing_rate = 0.75
# np.floor returns a float, but array sizes must be integers: cast explicitly
# so np.zeros / np.ones / rng.randint receive a valid size.
n_missing_samples = int(np.floor(n_samples * missing_rate))
# Boolean row mask with exactly n_missing_samples True entries; True marks a
# sample that will receive a missing value. The builtin ``bool`` is used as
# the dtype because the ``np.bool`` alias is deprecated/removed in NumPy.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
# Shuffle in place so the affected rows are spread randomly over the dataset.
rng.shuffle(missing_samples)
# One randomly drawn feature index per affected sample.
missing_features = rng.randint(0, n_features, n_missing_samples)

Score with the entire dataset = 0.56


  a = empty(shape, dtype, order)


In [3]:
# Sanity check: shape of the full feature matrix.
dataset.data.shape

(506, 13)

In [24]:
# Inspect the feature index drawn for each sample that receives a missing value.
# NOTE(review): this displays the whole array; a slice would keep the output short.
missing_features

array([ 8,  3,  6,  9, 10,  3,  2, 11,  7,  0,  3, 10,  0,  3,  6,  1, 12,
       11,  9,  2,  9,  4,  9, 11,  1,  3,  2,  4,  9,  7,  4,  9,  4,  1,
        2,  7,  2,  3,  9, 10,  7, 10,  6,  6, 12,  2, 10,  3,  6,  0, 12,
       10,  8,  0, 10, 11,  7,  6,  5,  9,  6,  5, 11,  2,  7, 12,  1,  9,
        2,  2, 12,  5, 11,  6,  4,  2, 12,  2,  1, 12,  0,  9,  0,  2,  8,
        3, 10,  0, 10, 11,  8,  8,  1,  0,  5,  8,  2,  3, 12,  5, 11,  3,
        8,  6,  4,  6, 12,  3,  6,  2, 12, 12,  6,  5, 11,  5,  9,  4,  6,
        5, 11,  1,  3,  3, 10,  8,  9,  5,  5,  6,  0,  9,  7,  5,  1,  5,
        6, 12, 10, 10, 11,  6,  8,  7,  5, 11, 10,  3, 10,  2,  9, 11,  9,
        3,  2,  5, 10,  4,  1,  5,  8,  3,  5,  8,  4, 10,  1,  7,  8,  1,
        2,  1,  1,  7,  5, 11,  0,  4,  1,  1, 12,  6,  6,  0,  2,  3,  7,
       12,  9,  2, 11,  4,  9,  0, 12,  6,  9,  2,  4,  7,  3,  0, 12,  5,
        4,  0,  2,  3,  1,  7, 10,  1,  3, 10,  4, 10,  1,  7,  4,  0, 10,
        2, 12, 10,  7,  4

In [7]:
# Sanity check: the row mask has one entry per sample.
missing_samples.shape

(506,)

In [8]:
# Keep only the samples that were not selected to receive a missing value.
X_filtered = X_full[np.logical_not(missing_samples)]
y_filtered = y_full[np.logical_not(missing_samples)]

In [18]:
# Mean cross-validated score of the same random forest, trained only on the
# samples that contain no missing value.
estimator = RandomForestRegressor(n_estimators=100, random_state=0)
score = np.mean(cross_val_score(estimator, X_filtered, y_filtered))
print("Score without the samples containing missing values = %.2f" % score)

# Build the dataset with missing entries: for every selected row, overwrite
# one feature with the placeholder value 0.
# NOTE(review): 0 doubles as the missing-value marker, so any genuine zeros
# in X_full would be indistinguishable from the injected ones - confirm this
# is acceptable for the data at hand.
X_missing = X_full.copy()
X_missing[np.flatnonzero(missing_samples), missing_features] = 0
y_missing = y_full.copy()

Score without the samples containing missing values = 0.48


In [25]:
# Chain mean imputation (entries equal to 0 are treated as missing) with the
# same random forest, then cross-validate the whole pipeline so the imputer
# is fitted on each training fold.
estimator = Pipeline(steps=[
    ("imputer", Imputer(axis=0, strategy="mean", missing_values=0)),
    ("forest", RandomForestRegressor(n_estimators=100, random_state=0)),
])
score = np.mean(cross_val_score(estimator, X_missing, y_missing))
print("Score after imputation of the missing values = %.2f" % score)

Score after imputation of the missing values = 0.57


In [None]:
# NOTE(review): looks like a leftover scratch cell (never executed); the call
# passes no steps - confirm whether it can be removed.
Pipeline()