Commit ce0a52d

first commit

sergeyf committed Sep 13, 2018
1 parent 82db8c5 commit ce0a52d
Showing 15 changed files with 86 additions and 94 deletions.
25 changes: 19 additions & 6 deletions README.md
@@ -15,21 +15,21 @@ from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer

# Model each feature with missing values as a function of other features, and
# use that estimate for imputation.
-X_filled_ii = IterativeImputer().complete(X_incomplete)
+X_filled_ii = IterativeImputer().fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
-X_filled_knn = KNN(k=3).complete(X_incomplete)
+X_filled_knn = KNN(k=3).fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
-X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)
+X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)
-X_filled_softimpute = SoftImpute().complete(X_incomplete_normalized)
+X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized)

-# print mean squared error for the three imputation methods above
+# print mean squared error for the four imputation methods above
ii_mse = ((X_filled_ii[missing_mask] - X[missing_mask]) ** 2).mean()
print("IterativeImputer MSE: %f" % ii_mse)

@@ -64,6 +64,19 @@ on features for which two rows both have observed data.
* `BiScaler`: Iterative estimation of row/column means and standard deviations to get a doubly normalized
matrix. Not guaranteed to converge, but works well in practice. Taken from [Matrix Completion and Low-Rank SVD via Fast Alternating Least Squares](http://arxiv.org/abs/1410.2596).

## Note about Inductive vs Transductive Imputation
Most imputation algorithms in `fancyimpute` are *transductive*. In the elegant language of `scikit-learn`'s API,
this means that you can only call `solver.fit_transform(X_incomplete)`; the "fitted" `solver` cannot then
be applied to new data via a call to `solver.transform`. A simple example is the `MatrixFactorization`
imputer, which finds low-rank matrices `A` and `B` whose product is close to `X_incomplete` on its
non-missing values. How, then, can we apply the learned `A` and `B` matrices to held-out data? It is not
doable in general, only in special cases, and since `fancyimpute` aims to be of general use, we have not
implemented an inductive mode for `MatrixFactorization`.

There are some imputation algorithms that are *inductive*, meaning they can be applied to new data after a call to
`solver.fit` or `solver.fit_transform`. Currently only `IterativeImputer` supports the full `scikit-learn` API (`fit`,
`fit_transform`, and `transform`), but we are actively looking for contributions that extend other imputers to
support induction. At least the `KNN` and `SimpleFill` imputers can be extended in a straightforward manner.
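
As a minimal sketch of the difference (an editor's illustration, not part of the commit; the toy
matrices and the settings `k=2` and `n_iter=10` are arbitrary):

import numpy as np
from fancyimpute import KNN, IterativeImputer

X_train = np.array([[1.0, 2.0], [2.0, 4.1], [3.0, np.nan], [4.0, 8.2]])
X_test = np.array([[5.0, np.nan], [6.0, 11.9]])

# Transductive: KNN can only complete the matrix it is handed.
X_train_filled = KNN(k=2).fit_transform(X_train)

# Inductive: IterativeImputer can fit on training data, then impute new data.
imputer = IterativeImputer(n_iter=10, random_state=0)
imputer.fit(X_train)
X_test_filled = imputer.transform(X_test)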

## Note about Multiple vs. Single Imputation
(From `scikit-learn`'s documentation)
@@ -94,7 +107,7 @@ n_imputations = 5
XY_completed = []
for i in range(n_imputations):
    imputer = IterativeImputer(n_iter=5, sample_posterior=True, random_state=i)
-    XY_completed.append(imputer.complete(XY_incomplete))
+    XY_completed.append(imputer.fit_transform(XY_incomplete))

XY_completed_mean = np.mean(XY_completed, 0)
XY_completed_std = np.std(XY_completed, 0)
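
Editor's aside, not part of the commit: continuing from the snippet above, the spread across the
n_imputations draws gives a rough per-entry uncertainty band, under the (unverified) assumption that
the draws are approximately normal:

# Rough 95% interval for each imputed entry; normality across draws is assumed.
XY_lower_95 = XY_completed_mean - 1.96 * XY_completed_std
XY_upper_95 = XY_completed_mean + 1.96 * XY_completed_std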
13 changes: 5 additions & 8 deletions experiments/complete_faces.py
@@ -12,8 +12,7 @@
    SoftImpute,
    BiScaler,
    KNN,
-    MICE,
-    BayesianRidgeRegression,
+    IterativeImputer,
)
from fancyimpute.common import masked_mae, masked_mse

@@ -172,7 +171,7 @@ def save_images(self, images, base_filename, flattened=True):

    def add_entry(self, solver, name):
        print("Running %s" % name)
-        completed_normalized = solver.complete(self.incomplete_normalized)
+        completed_normalized = solver.fit_transform(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)

        mae = masked_mae(
@@ -266,14 +265,12 @@ def get_lfw(max_size=None):
for negative_log_regularization_weight in [2, 3, 4]:
    regularization_weight = 10.0 ** -negative_log_regularization_weight
    table.add_entry(
-        solver=MICE(
+        solver=IterativeImputer(
-            n_nearest_columns=80,
-            n_imputations=100,
+            n_iter=50,
-            n_burn_in=5,
-            model=BayesianRidgeRegression(lambda_reg=regularization_weight),
-            init_fill_method="mean",
        ),
-        name="MICE_%d" % negative_log_regularization_weight)
+        name="IterativeImputer_%d" % negative_log_regularization_weight)

for fill_method in ["mean", "median"]:
    table.add_entry(
19 changes: 14 additions & 5 deletions experiments/readme_example.py
@@ -2,6 +2,7 @@
from fancyimpute import (
    BiScaler,
    KNN,
+    IterativeImputer,
    NuclearNormMinimization,
    SoftImpute,
    SimpleFill
@@ -20,15 +21,19 @@
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
-X_filled_mean = meanFill.complete(X_incomplete)
+X_filled_mean = meanFill.fit_transform(X_incomplete)

+# Model each feature with missing values as a function of other features, and
+# use that estimate for imputation.
+X_filled_ii = IterativeImputer().fit_transform(X_incomplete)
+
# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
-X_filled_knn = knnImpute.complete(X_incomplete)
+X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
-X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)
+X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
@@ -41,15 +46,19 @@
# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

-X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized)
+X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)

-X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete)
+X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete)

meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean()
print("meanFill MSE: %f" % meanfill_mse)

-# print mean squared error for the four imputation methods above
+# print mean squared error for the three imputation methods above
+ii_mse = ((X_filled_ii[missing_mask] - X[missing_mask]) ** 2).mean()
+print("IterativeImputer MSE: %f" % ii_mse)

nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
print("Nuclear norm minimization MSE: %f" % nnm_mse)

44 changes: 0 additions & 44 deletions experiments/test_knn_timings.py

This file was deleted.

1 change: 1 addition & 0 deletions fancyimpute/common.py
@@ -28,6 +28,7 @@ def import_from(module, name):
    module = importlib.import_module(module)
    return getattr(module, name)

+
def masked_mae(X_true, X_pred, mask):
    masked_diff = X_true[mask] - X_pred[mask]
    return np.mean(np.abs(masked_diff))
18 changes: 0 additions & 18 deletions fancyimpute/iterative_imputer.py
@@ -985,24 +985,6 @@ def transform(self, X):
        Xt[~mask_missing_values] = X[~mask_missing_values]
        return Xt

-    def complete(self, X, y=None):
-        """Completes X.
-
-        Parameters
-        ----------
-        X : array-like, shape (n_samples, n_features)
-            Input data, where "n_samples" is the number of samples and
-            "n_features" is the number of features.
-
-        y : ignored.
-
-        Returns
-        -------
-        Xt : array-like, shape (n_samples, n_features)
-            The imputed input data.
-        """
-        return self.fit_transform(X)


    def fit(self, X, y=None):
        """Fits the imputer on X and returns self.
2 changes: 1 addition & 1 deletion fancyimpute/similarity_weighted_averaging.py
@@ -152,7 +152,7 @@ def complete_dict(
        result = transpose_nested_dictionary(result)
        return result

-    def complete(self, X):
+    def fit_transform(self, X):
        X = check_array(X, force_all_finite=False)

        if self.verbose:
36 changes: 35 additions & 1 deletion fancyimpute/solver.py
@@ -165,7 +165,15 @@ def solve(self, X, missing_mask):
raise ValueError("%s.solve not yet implemented!" % (
self.__class__.__name__,))

def complete(self, X):
def fit_transform(self, X, y=None):
"""
Fit the imputer and then transform input `X`
Note: all imputations should have a `fit_transform` method,
but only some (like IterativeImputer) also support inductive mode
using `fit` or `fit_transform` on `X_train` and then `transform`
on new `X_test`.
"""
X_original, missing_mask = self.prepare_input_data(X)
observed_mask = ~missing_mask
X = X_original.copy()
@@ -188,3 +196,29 @@ def complete(self, X):
        X_result = self.project_result(X=X_result)
        X_result[observed_mask] = X_original[observed_mask]
        return X_result

+    def fit(self, X, y=None):
+        """
+        Fit the imputer on input `X`.
+
+        Note: all imputers should have a `fit_transform` method, but only
+        some (like IterativeImputer) also support inductive mode, using
+        `fit` or `fit_transform` on `X_train` and then `transform` on new
+        `X_test`.
+        """
+        raise ValueError("%s.fit not implemented! This imputation algorithm "
+                         "likely doesn't support inductive mode." % (
+                             self.__class__.__name__,))
+
+    def transform(self, X, y=None):
+        """
+        Transform input `X`.
+
+        Note: all imputers should have a `fit_transform` method, but only
+        some (like IterativeImputer) also support inductive mode, using
+        `fit` or `fit_transform` on `X_train` and then `transform` on new
+        `X_test`.
+        """
+        raise ValueError("%s.transform not implemented! This imputation "
+                         "algorithm likely doesn't support inductive mode." % (
+                             self.__class__.__name__,))
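
Editor's note, not part of the commit: with these stubs, every solver inherits a working `fit_transform`,
while `fit` and `transform` fail loudly unless a subclass (like `IterativeImputer`) overrides them. A
quick sketch of that contract:

import numpy as np
from fancyimpute import SoftImpute

X = np.array([[1.0, np.nan], [2.0, 4.0], [3.0, 6.1]])
X_filled = SoftImpute().fit_transform(X)  # works: transductive completion

try:
    SoftImpute().fit(X)  # SoftImpute does not override Solver.fit
except ValueError as error:
    print(error)  # "SoftImpute.fit not implemented! ..."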
6 changes: 3 additions & 3 deletions test/test_iterative_imputer.py
@@ -7,7 +7,7 @@

def test_iterative_imputer_with_low_rank_random_matrix():
    imputer = IterativeImputer(n_iter=50, random_state=0)
-    XY_completed = imputer.complete(XY_incomplete)
+    XY_completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
@@ -18,7 +18,7 @@ def test_iterative_imputer_with_low_rank_random_matrix():

def test_iterative_imputer_with_low_rank_random_matrix_approximate():
    imputer = IterativeImputer(n_iter=50, n_nearest_features=5, random_state=0)
-    XY_completed = imputer.complete(XY_incomplete)
+    XY_completed = imputer.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
@@ -33,7 +33,7 @@ def test_iterative_imputer_as_mice_with_low_rank_random_matrix_approximate():
    XY_completed = []
    for i in range(n_imputations):
        imputer = IterativeImputer(n_iter=5, sample_posterior=True, random_state=i)
-        XY_completed.append(imputer.complete(XY_incomplete))
+        XY_completed.append(imputer.fit_transform(XY_incomplete))
    _, missing_mae = reconstruction_error(
        XY,
        np.mean(XY_completed, axis=0),
2 changes: 1 addition & 1 deletion test/test_iterative_svd.py
@@ -5,7 +5,7 @@

def test_iterative_svd_with_low_rank_random_matrix():
    solver = IterativeSVD(rank=3)
-    XY_completed = solver.complete(XY_incomplete)
+    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
2 changes: 1 addition & 1 deletion test/test_knn.py
@@ -13,7 +13,7 @@ def test_knn():
print("MAD zero-fill = ", mad_zero_fill)
for k in [5, 15, 30]:
print("-- k=", k)
XY_completed = KNN(k).complete(XY_incomplete)
XY_completed = KNN(k).fit_transform(XY_incomplete)
mask = np.isfinite(XY_completed)
eq_((~mask).sum(), 0)
diff = (XY_completed - XY)[missing_mask]
2 changes: 1 addition & 1 deletion test/test_matrix_factorization.py
@@ -10,7 +10,7 @@ def test_matrix_factorization_with_low_rank_random_matrix():
        rank=3,
        l2_penalty=0,
        min_improvement=1e-6)
-    XY_completed = solver.complete(XY_incomplete)
+    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
6 changes: 3 additions & 3 deletions test/test_nuclear_norm_minimization.py
@@ -31,7 +31,7 @@ def create_rank1_data(symmetric=False):
def test_rank1_convex_solver():
    XY_rank1, XY_missing_rank1 = create_rank1_data(symmetric=False)
    solver = NuclearNormMinimization(max_iters=50000)
-    XY_completed_rank1 = solver.complete(XY_missing_rank1)
+    XY_completed_rank1 = solver.fit_transform(XY_missing_rank1)
    assert abs(XY_completed_rank1[1, 2] - XY_rank1[1, 2]) < 0.01, \
        "Expected %0.4f but got %0.4f" % (
            XY_rank1[1, 2], XY_completed_rank1[1, 2])
@@ -40,15 +40,15 @@ def test_rank1_convex_solver():
def test_rank1_symmetric_convex_solver():
    XYXY_rank1, XYXY_missing_rank1 = create_rank1_data(symmetric=True)
    solver = NuclearNormMinimization(require_symmetric_solution=True)
-    completed = solver.complete(XYXY_missing_rank1)
+    completed = solver.fit_transform(XYXY_missing_rank1)
    assert abs(completed[1, 2] - XYXY_rank1[1, 2]) < 0.01, \
        "Expected %0.4f but got %0.4f" % (
            XYXY_rank1[1, 2], completed[1, 2])


def test_nuclear_norm_minimization_with_low_rank_random_matrix():
    solver = NuclearNormMinimization(max_iters=2000)
-    XY_completed = solver.complete(XY_incomplete[:100])
+    XY_completed = solver.fit_transform(XY_incomplete[:100])
    _, missing_mae = reconstruction_error(
        XY[:100], XY_completed, missing_mask[:100], name="NuclearNorm")
    assert missing_mae < 0.1, "Error too high!"
2 changes: 1 addition & 1 deletion test/test_similarity_weighted_averaging.py
@@ -17,7 +17,7 @@ def test_similarity_weighted_column_averaging():
    missing_mask = np.isnan(X_incomplete)

    solver = SimilarityWeightedAveraging()
-    X_filled = solver.complete(X_incomplete)
+    X_filled = solver.fit_transform(X_incomplete)
    eq_(X_incomplete.shape, X_filled.shape)
    diff = (X - X_filled)[missing_mask]
    abs_diff = np.abs(diff)
2 changes: 1 addition & 1 deletion test/test_soft_impute.py
@@ -5,7 +5,7 @@

def test_soft_impute_with_low_rank_random_matrix():
    solver = SoftImpute()
-    XY_completed = solver.complete(XY_incomplete)
+    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
