# Keywords
- imputer: SimpleImputer
- decomposition: PCA
- pipeline: Pipeline

In [1]:
import numpy as np

from sklearn import datasets
from sklearn import impute, pipeline, decomposition

# Demo: knock out random entries of the iris feature matrix, fill them back in
# with a mean imputer, project with PCA via a Pipeline, then invert the PCA step.
iris = datasets.load_iris()
# Copy so that the NaN masking below does not overwrite iris.data in place.
X = iris.data.copy()
print("X mean, std")
print(X.mean(axis=0))
print(X.std(axis=0))
print()

# init mask array randomly: each cell is independently flagged with p = .25
masking_array = np.random.binomial(1, .25, X.shape).astype(bool)

X[masking_array] = np.nan
print("masked X[:5]")
print(X[:5])
print()

# Only the settings that matter are spelled out; everything else stays at the
# library default. The `verbose` argument of SimpleImputer is intentionally not
# passed: it was deprecated in scikit-learn 1.1 and removed in 1.3, where
# passing it raises TypeError.
pca = decomposition.PCA(n_components=None)
imputer = impute.SimpleImputer(missing_values=np.nan, strategy='mean')

# The imputer runs first so that PCA never sees NaN values.
pipe = pipeline.Pipeline([('imputer', imputer), ('pca', pca)])
X_transformed = pipe.fit_transform(X)
print("transformed X[:5] (imputer->pca)")
print(X_transformed[:5])
print()

# Only the PCA step is inverted here, so recovered_X corresponds to the
# mean-imputed matrix that was fed into PCA, not to the original unmasked data.
recovered_X = pca.inverse_transform(X_transformed)
print("recovered X mean, std")
print(recovered_X.mean(axis=0))
print(recovered_X.std(axis=0))

X mean, std
[5.84333333 3.05733333 3.758      1.19933333]
[0.82530129 0.43441097 1.75940407 0.75969263]

masked X[:5]
[[nan 3.5 1.4 nan]
 [nan nan 1.4 0.2]
 [nan nan 1.3 0.2]
 [4.6 3.1 1.5 nan]
 [5.  nan 1.4 nan]]

transformed X[:5] (imputer->pca)
[[-2.12414941  0.59470941  0.75889869 -0.08961   ]
 [-2.39305284  0.67802664 -0.31765177 -0.15059087]
 [-2.48472376  0.70011121 -0.29185242 -0.17164098]
 [-2.37146519 -0.72648886  0.56050194 -0.16680433]
 [-2.35054401 -0.33609653  0.57820286 -0.30169872]]

recovered X mean, std
[5.89693878 3.06320755 3.69322034 1.22525253]
[0.6918048  0.36080012 1.62142854 0.62910979]
