Commit ce0a52d

first commit

sergeyf committed Sep 13, 2018
1 parent 82db8c5 commit ce0a52d
Showing 15 changed files with 86 additions and 94 deletions.
25 changes: 19 additions & 6 deletions README.md
@@ -15,21 +15,21 @@ from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer

# Model each feature with missing values as a function of other features, and
# use that estimate for imputation.
-X_filled_ii = IterativeImputer().complete(X_incomplete)
+X_filled_ii = IterativeImputer().fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
-X_filled_knn = KNN(k=3).complete(X_incomplete)
+X_filled_knn = KNN(k=3).fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
-X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)
+X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)
-X_filled_softimpute = SoftImpute().complete(X_incomplete_normalized)
+X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized)

-# print mean squared error for the three imputation methods above
+# print mean squared error for the four imputation methods above
ii_mse = ((X_filled_ii[missing_mask] - X[missing_mask]) ** 2).mean()
print("IterativeImputer MSE: %f" % ii_mse)

@@ -64,6 +64,19 @@ on features for which two rows both have observed data.
* `BiScaler`: Iterative estimation of row/column means and standard deviations to get a doubly normalized
matrix. Not guaranteed to converge, but works well in practice. Taken from [Matrix Completion and Low-Rank SVD via Fast Alternating Least Squares](http://arxiv.org/abs/1410.2596).

## Note about Inductive vs Transductive Imputation
Most imputation algorithms in `fancyimpute` are *transductive*. In the elegant language of `scikit-learn`'s API,
this means that you can only call `solver.fit_transform(X_incomplete)`; the "fitted" `solver` cannot then
be applied to new data via a call to `solver.transform`. A simple example is the `MatrixFactorization`
imputer, which finds low-rank matrices `A` and `B` whose product is close to `X_incomplete` on its
non-missing values. How, then, can we apply the learned `A` and `B` matrices to held-out data? It is not
doable in general, only in special cases, and since `fancyimpute` aims to be of general use, we have not
implemented an inductive mode for `MatrixFactorization`.

There are some imputation algorithms that are *inductive*, meaning they can be applied to new data after a call to
`solver.fit` or `solver.fit_transform`. Currently only `IterativeImputer` supports the full `scikit-learn` API (`fit`,
`fit_transform`, and `transform`), but we are actively looking for contributions that extend other imputers to
support induction. At least the `KNN` and `SimpleFill` imputers can be extended in a straightforward manner.
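
As a minimal sketch of the difference (an editor's illustration, not part of the commit; the toy
matrices and the settings `k=2` and `n_iter=10` are arbitrary):

import numpy as np
from fancyimpute import KNN, IterativeImputer

X_train = np.array([[1.0, 2.0], [2.0, 4.1], [3.0, np.nan], [4.0, 8.2]])
X_test = np.array([[5.0, np.nan], [6.0, 11.9]])

# Transductive: KNN can only complete the matrix it is handed.
X_train_filled = KNN(k=2).fit_transform(X_train)

# Inductive: IterativeImputer can fit on training data, then impute new data.
imputer = IterativeImputer(n_iter=10, random_state=0)
imputer.fit(X_train)
X_test_filled = imputer.transform(X_test)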

## Note about Multiple vs. Single Imputation
(From `scikit-learn`'s documentation)
@@ -94,7 +107,7 @@ n_imputations = 5
XY_completed = []
for i in range(n_imputations):
    imputer = IterativeImputer(n_iter=5, sample_posterior=True, random_state=i)
-    XY_completed.append(imputer.complete(XY_incomplete))
+    XY_completed.append(imputer.fit_transform(XY_incomplete))

XY_completed_mean = np.mean(XY_completed, 0)
XY_completed_std = np.std(XY_completed, 0)
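
Editor's aside, not part of the commit: continuing from the snippet above, the spread across the
n_imputations draws gives a rough per-entry uncertainty band, under the (unverified) assumption that
the draws are approximately normal:

# Rough 95% interval for each imputed entry; normality across draws is assumed.
XY_lower_95 = XY_completed_mean - 1.96 * XY_completed_std
XY_upper_95 = XY_completed_mean + 1.96 * XY_completed_std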
13 changes: 5 additions & 8 deletions experiments/complete_faces.py
@@ -12,8 +12,7 @@
    SoftImpute,
    BiScaler,
    KNN,
-    MICE,
-    BayesianRidgeRegression,
+    IterativeImputer,
)
from fancyimpute.common import masked_mae, masked_mse

@@ -172,7 +171,7 @@ def save_images(self, images, base_filename, flattened=True):

    def add_entry(self, solver, name):
        print("Running %s" % name)
-        completed_normalized = solver.complete(self.incomplete_normalized)
+        completed_normalized = solver.fit_transform(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)

        mae = masked_mae(
@@ -266,14 +265,12 @@ def get_lfw(max_size=None):
for negative_log_regularization_weight in [2, 3, 4]:
    regularization_weight = 10.0 ** -negative_log_regularization_weight
    table.add_entry(
-        solver=MICE(
+        solver=IterativeImputer(
-            n_nearest_columns=80,
-            n_imputations=100,
+            n_iter=50,
-            n_burn_in=5,
-            model=BayesianRidgeRegression(lambda_reg=regularization_weight),
-            init_fill_method="mean",
        ),
-        name="MICE_%d" % negative_log_regularization_weight)
+        name="IterativeImputer_%d" % negative_log_regularization_weight)

for fill_method in ["mean", "median"]:
    table.add_entry(
19 changes: 14 additions & 5 deletions experiments/readme_example.py
@@ -2,6 +2,7 @@
from fancyimpute import (
    BiScaler,
    KNN,
+    IterativeImputer,
    NuclearNormMinimization,
    SoftImpute,
    SimpleFill
@@ -20,15 +21,19 @@
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
-X_filled_mean = meanFill.complete(X_incomplete)
+X_filled_mean = meanFill.fit_transform(X_incomplete)

+# Model each feature with missing values as a function of other features, and
+# use that estimate for imputation.
+X_filled_ii = IterativeImputer().fit_transform(X_incomplete)
+
# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
-X_filled_knn = knnImpute.complete(X_incomplete)
+X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
-X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)
+X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
@@ -41,15 +46,19 @@
# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

-X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized)
+X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)

-X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete)
+X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete)

meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean()
print("meanFill MSE: %f" % meanfill_mse)

-# print mean squared error for the four imputation methods above
+# print mean squared error for the three imputation methods above
+ii_mse = ((X_filled_ii[missing_mask] - X[missing_mask]) ** 2).mean()
+print("IterativeImputer MSE: %f" % ii_mse)

nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
print("Nuclear norm minimization MSE: %f" % nnm_mse)

44 changes: 0 additions & 44 deletions experiments/test_knn_timings.py

This file was deleted.

1 change: 1 addition & 0 deletions fancyimpute/common.py
@@ -28,6 +28,7 @@ def import_from(module, name):
    module = importlib.import_module(module)
    return getattr(module, name)

+
def masked_mae(X_true, X_pred, mask):
    masked_diff = X_true[mask] - X_pred[mask]
    return np.mean(np.abs(masked_diff))
18 changes: 0 additions & 18 deletions fancyimpute/iterative_imputer.py
@@ -985,24 +985,6 @@ def transform(self, X):
        Xt[~mask_missing_values] = X[~mask_missing_values]
        return Xt

-    def complete(self, X, y=None):
-        """Completes X.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Input data, where "n_samples" is the number of samples and
-            "n_features" is the number of features.
-
-        y : ignored.
-
-        Returns
-        -------
-        Xt : array-like, shape (n_samples, n_features)
-            The imputed input data.
-        """
-        return self.fit_transform(X)


    def fit(self, X, y=None):
        """Fits the imputer on X and returns self.
2 changes: 1 addition & 1 deletion fancyimpute/similarity_weighted_averaging.py
@@ -152,7 +152,7 @@ def complete_dict(
        result = transpose_nested_dictionary(result)
        return result

-    def complete(self, X):
+    def fit_transform(self, X):
        X = check_array(X, force_all_finite=False)

        if self.verbose:
36 changes: 35 additions & 1 deletion fancyimpute/solver.py
@@ -165,7 +165,15 @@ def solve(self, X, missing_mask):
raise ValueError("%s.solve not yet implemented!" % (
self.__class__.__name__,))

def complete(self, X):
def fit_transform(self, X, y=None):
"""
Fit the imputer and then transform input `X`
Note: all imputations should have a `fit_transform` method,
but only some (like IterativeImputer) also support inductive mode
using `fit` or `fit_transform` on `X_train` and then `transform`
on new `X_test`.
"""
X_original, missing_mask = self.prepare_input_data(X)
observed_mask = ~missing_mask
X = X_original.copy()
@@ -188,3 +196,29 @@ def complete(self, X):
        X_result = self.project_result(X=X_result)
        X_result[observed_mask] = X_original[observed_mask]
        return X_result

+    def fit(self, X, y=None):
+        """
+        Fit the imputer on input `X`.
+
+        Note: all imputers should have a `fit_transform` method, but only
+        some (like IterativeImputer) also support inductive mode, using
+        `fit` or `fit_transform` on `X_train` and then `transform` on new
+        `X_test`.
+        """
+        raise ValueError("%s.fit not implemented! This imputation algorithm "
+                         "likely doesn't support inductive mode." % (
+                             self.__class__.__name__,))
+
+    def transform(self, X, y=None):
+        """
+        Transform input `X`.
+
+        Note: all imputers should have a `fit_transform` method, but only
+        some (like IterativeImputer) also support inductive mode, using
+        `fit` or `fit_transform` on `X_train` and then `transform` on new
+        `X_test`.
+        """
+        raise ValueError("%s.transform not implemented! This imputation "
+                         "algorithm likely doesn't support inductive mode." % (
+                             self.__class__.__name__,))
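
Editor's note, not part of the commit: with these stubs, every solver inherits a working `fit_transform`,
while `fit` and `transform` fail loudly unless a subclass (like `IterativeImputer`) overrides them. A
quick sketch of that contract:

import numpy as np
from fancyimpute import SoftImpute

X = np.array([[1.0, np.nan], [2.0, 4.0], [3.0, 6.1]])
X_filled = SoftImpute().fit_transform(X)  # works: transductive completion

try:
    SoftImpute().fit(X)  # SoftImpute does not override Solver.fit
except ValueError as error:
    print(error)  # "SoftImpute.fit not implemented! ..."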
6 changes: 3 additions & 3 deletions test/test_iterative_imputer.py
@@ -7,7 +7,7 @@

def test_iterative_imputer_with_low_rank_random_matrix():
    imputer = IterativeImputer(n_iter=50, random_state=0)
-    XY_completed = imputer.complete(XY_incomplete)
+    XY_completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
@@ -18,7 +18,7 @@ def test_iterative_imputer_with_low_rank_random_matrix():

def test_iterative_imputer_with_low_rank_random_matrix_approximate():
    imputer = IterativeImputer(n_iter=50, n_nearest_features=5, random_state=0)
-    XY_completed = imputer.complete(XY_incomplete)
+    XY_completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
@@ -33,7 +33,7 @@ def test_iterative_imputer_as_mice_with_low_rank_random_matrix_approximate():
    XY_completed = []
    for i in range(n_imputations):
        imputer = IterativeImputer(n_iter=5, sample_posterior=True, random_state=i)
-        XY_completed.append(imputer.complete(XY_incomplete))
+        XY_completed.append(imputer.fit_transform(XY_incomplete))
    _, missing_mae = reconstruction_error(
        XY,
        np.mean(XY_completed, axis=0),
2 changes: 1 addition & 1 deletion test/test_iterative_svd.py
@@ -5,7 +5,7 @@

def test_iterative_svd_with_low_rank_random_matrix():
    solver = IterativeSVD(rank=3)
-    XY_completed = solver.complete(XY_incomplete)
+    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
2 changes: 1 addition & 1 deletion test/test_knn.py
@@ -13,7 +13,7 @@ def test_knn():
print("MAD zero-fill = ", mad_zero_fill)
for k in [5, 15, 30]:
print("-- k=", k)
XY_completed = KNN(k).complete(XY_incomplete)
XY_completed = KNN(k).fit_transform(XY_incomplete)
mask = np.isfinite(XY_completed)
eq_((~mask).sum(), 0)
diff = (XY_completed - XY)[missing_mask]
2 changes: 1 addition & 1 deletion test/test_matrix_factorization.py
@@ -10,7 +10,7 @@ def test_matrix_factorization_with_low_rank_random_matrix():
        rank=3,
        l2_penalty=0,
        min_improvement=1e-6)
-    XY_completed = solver.complete(XY_incomplete)
+    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
6 changes: 3 additions & 3 deletions test/test_nuclear_norm_minimization.py
@@ -31,7 +31,7 @@ def create_rank1_data(symmetric=False):
def test_rank1_convex_solver():
    XY_rank1, XY_missing_rank1 = create_rank1_data(symmetric=False)
    solver = NuclearNormMinimization(max_iters=50000)
-    XY_completed_rank1 = solver.complete(XY_missing_rank1)
+    XY_completed_rank1 = solver.fit_transform(XY_missing_rank1)
    assert abs(XY_completed_rank1[1, 2] - XY_rank1[1, 2]) < 0.01, \
        "Expected %0.4f but got %0.4f" % (
            XY_rank1[1, 2], XY_completed_rank1[1, 2])
@@ -40,15 +40,15 @@ def test_rank1_convex_solver():
def test_rank1_symmetric_convex_solver():
    XYXY_rank1, XYXY_missing_rank1 = create_rank1_data(symmetric=True)
    solver = NuclearNormMinimization(require_symmetric_solution=True)
-    completed = solver.complete(XYXY_missing_rank1)
+    completed = solver.fit_transform(XYXY_missing_rank1)
    assert abs(completed[1, 2] - XYXY_rank1[1, 2]) < 0.01, \
        "Expected %0.4f but got %0.4f" % (
            XYXY_rank1[1, 2], completed[1, 2])


def test_nuclear_norm_minimization_with_low_rank_random_matrix():
    solver = NuclearNormMinimization(max_iters=2000)
-    XY_completed = solver.complete(XY_incomplete[:100])
+    XY_completed = solver.fit_transform(XY_incomplete[:100])
    _, missing_mae = reconstruction_error(
        XY[:100], XY_completed, missing_mask[:100], name="NuclearNorm")
    assert missing_mae < 0.1, "Error too high!"
2 changes: 1 addition & 1 deletion test/test_similarity_weighted_averaging.py
@@ -17,7 +17,7 @@ def test_similarity_weighted_column_averaging():
    missing_mask = np.isnan(X_incomplete)

    solver = SimilarityWeightedAveraging()
-    X_filled = solver.complete(X_incomplete)
+    X_filled = solver.fit_transform(X_incomplete)
    eq_(X_incomplete.shape, X_filled.shape)
    diff = (X - X_filled)[missing_mask]
    abs_diff = np.abs(diff)
2 changes: 1 addition & 1 deletion test/test_soft_impute.py
@@ -5,7 +5,7 @@

def test_soft_impute_with_low_rank_random_matrix():
    solver = SoftImpute()
-    XY_completed = solver.complete(XY_incomplete)
+    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
