Skip to content

Commit

Permalink
Merge pull request #4 from hammerlab/zero_and_inf_fix
Browse files Browse the repository at this point in the history
Zero and inf fix
  • Loading branch information
iskandr committed Mar 15, 2017
2 parents d045b5c + 324309e commit 8ec2611
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 12 deletions.
1 change: 0 additions & 1 deletion knnimpute/argpartition.py
Expand Up @@ -53,7 +53,6 @@ def knn_impute_with_argpartition(
# one column at a time
missing_mask_column_major = np.asarray(missing_mask, order="F")
X_row_major, D = knn_initialize(X, missing_mask, verbose=verbose)
# D[~np.isfinite(D)] = very_large_value
D_reciprocal = 1.0 / D
neighbor_weights = np.zeros(k, dtype="float32")
dot = np.dot
Expand Down
17 changes: 14 additions & 3 deletions knnimpute/common.py
Expand Up @@ -19,7 +19,11 @@
from .normalized_distance import all_pairs_normalized_distances


def knn_initialize(X, missing_mask, verbose=False):
def knn_initialize(X,
missing_mask,
verbose=False,
min_dist=1e-6,
max_dist_multiplier=1e6):
"""
Fill X with NaN values if necessary, construct the n_samples x n_samples
distance matrix, set the self-distance of each row to a large finite value
(max_dist) and clip all distances to the range [min_dist, max_dist].
Expand All @@ -30,7 +34,14 @@ def knn_initialize(X, missing_mask, verbose=False):
# to put NaN's back in the data matrix for the distances function
X_row_major[missing_mask] = np.nan
D = all_pairs_normalized_distances(X_row_major, verbose=verbose)
# set diagonal of distance matrix to infinity since we don't want
# set diagonal of distance matrix to a large value since we don't want
# points considering themselves as neighbors
np.fill_diagonal(D, np.inf)
D_finite_flat = D[np.isfinite(D)]
if len(D_finite_flat) > 0:
max_dist = max_dist_multiplier * np.maximum(1, D_finite_flat.max())
else:
max_dist = max_dist_multiplier
np.fill_diagonal(D, max_dist)
D[D < min_dist] = min_dist # prevents 0s
D[D > max_dist] = max_dist # prevents infinities
return X_row_major, D
4 changes: 1 addition & 3 deletions knnimpute/few_observed_entries.py
Expand Up @@ -49,11 +49,9 @@ def knn_impute_few_observed(
X_column_major = X.copy(order="F")
X_row_major, D = knn_initialize(X, missing_mask, verbose=verbose)
# infinities were already replaced with a very large number (max_dist) by knn_initialize
finite_distance_distance_mask = np.isfinite(D)
effective_infinity = 10 ** 6 * D[finite_distance_distance_mask].max()
D[~finite_distance_distance_mask] = effective_infinity
D_sorted = np.argsort(D, axis=1)
inv_D = 1.0 / D
effective_infinity = D[0, 0] # since diagonal was replaced by max_dist
D_valid_mask = D < effective_infinity
valid_distances_per_row = D_valid_mask.sum(axis=1)

Expand Down
6 changes: 1 addition & 5 deletions knnimpute/reference.py
Expand Up @@ -28,11 +28,7 @@ def knn_impute_reference(
"""
n_rows, n_cols = X.shape
X_result, D = knn_initialize(X, missing_mask, verbose=verbose)

# get rid of infinities, replace them with a very large number
finite_distance_distance_mask = np.isfinite(D)
effective_infinity = 10 ** 6 * D[finite_distance_distance_mask].max()
D[~finite_distance_distance_mask] = effective_infinity
effective_infinity = D[0, 0] # since diagonal was replaced by max_dist

for i in range(n_rows):
for j in np.where(missing_mask[i, :])[0]:
Expand Down
6 changes: 6 additions & 0 deletions test/test_knn.py
Expand Up @@ -33,3 +33,9 @@ def test_knn_optimistic_same_as_reference():

# Runs the shared _knn_implementation check (defined outside this view)
# against the knn_impute_few_observed implementation.
def test_knn_optimistic_few_observed():
_knn_implementation(knn_impute_few_observed)

def test_knn_minimal():
    """Check that a tiny 2x3 matrix with one missing entry is fully imputed.

    The first row is missing its last value and the second row is fully
    observed, so with k=1 the result must contain no NaN.
    """
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0.
    X = np.array([[1, 1, np.nan], [1, 1, 1]])
    res = knn_impute_few_observed(X, np.isnan(X), k=1)
    assert not np.isnan(res).any(), \
        "Basic example did not get imputed: %s" % res

0 comments on commit 8ec2611

Please sign in to comment.