In [19]:
import numpy as np
from sklearn.impute import SimpleImputer

print("replace missing values using the mean value of the columns that contain missing values")

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit([[1, 2], [np.nan, 3], [7, 6]])
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))

print("simple imputer also supports sparse matrices")
import scipy.sparse as sp
X = sp.csc_matrix([[1, 2], [0, -1], [8, 4]])
imp = SimpleImputer(missing_values=-1, strategy='mean')
imp.fit(X)
X_test = sp.csc_matrix([[-1, 2], [6, -1], [7, 6]])
print(imp.transform(X_test).toarray())

print("categorical data supported as string values")
import pandas as pd
df=pd.DataFrame([["a", "x"],
                [np.nan, "y"],
                ["a", np.nan],
                 ["b", "y"]], dtype="category")
imp = SimpleImputer(strategy="most_frequent")
print(imp.fit_transform(df))

print("models each feature with missing values as a function of the other features")
print("a regressor is fit on (X, y) for known y and used to predict missing values")
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])
X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
# the model learns that the second feature is double the first
print(np.round(imp.transform(X_test)))


print("preserving information about which values are missing")
print("missing_values allows specification of other placeholds than data type float")

from sklearn.impute import MissingIndicator
X = np.array([[-1,-1,1,3],
              [4, -1, 0, -1],
             [8, -1, 1, 0]])
indicator = MissingIndicator(missing_values=-1)
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only

print("the features parameter can be set to return all features without regard to missing values")
indicator = MissingIndicator(missing_values=-1, features="all")
mask_all = indicator.fit_transform(X)
print(mask_all)

print(indicator.features_)





replace missing values using the mean value of the columns that contain missing values
[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]
simple imputer also supports sparse matrices
[[3. 2.]
 [6. 3.]
 [7. 6.]]
categorical data supported as string values
[['a' 'x']
 ['a' 'y']
 ['a' 'y']
 ['b' 'y']]
models each feature with missing values as a function of the other features
a regressor is fit on (X, y) for known y and used to predict missing values
[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]
preserving information about which values are missing
missing_values allows specification of other placeholds than data type float
[[ True  True False False]
 [False  True False  True]
 [False  True False False]]
[0 1 2 3]


In [25]:
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.tree import DecisionTreeClassifier

print('missing indicator in a pipeline')
print('loading iris dataset and adding missing values')
X, y = load_iris(return_X_y=True)
mask = np.random.randint(0, 2, size=X.shape).astype(np.bool)
X[mask]= np.nan
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
                                              random_state=0)
print('feature union is used to ass indicators features to the regualr features')

transformer = FeatureUnion(
    transformer_list=[
        ('features', SimpleImputer(strategy='mean')),
        ('indicators', MissingIndicator())])
transformer = transformer.fit(X_train, y_train)
results = transformer.transform(X_test)
print(results.shape)

print('in order to make predictions wrap the data in a pipeline with a classifier')

clf = make_pipeline(transformer, DecisionTreeClassifier())
clf = clf.fit(X_train, y_train)
results = clf.predict(X_test)
print(results.shape)


missing indicator in a pipeline
loading iris dataset and adding missing values
feature union is used to ass indicators features to the regualr features
(100, 8)
in order to make predictions wrap the data in a pipeline with a classifier
(100,)
