From 536085ea283ff33e83b001da8f9aa1e25057356b Mon Sep 17 00:00:00 2001
From: Nikolay Mayorov
Date: Sat, 19 Apr 2014 18:04:45 +0400
Subject: [PATCH] FIX bug with set_params on minkowski 'p'. Fixes #2609.

There was a bug where set_params did not effectively change the value of
the minkowski parameter 'p': 'p' was copied into self.metric_kwds in
_init_params and that dict was never updated afterwards. The fix stores
metric keyword arguments in an explicit metric_params dict and resolves
the effective value of 'p' at fit time, so set_params followed by a refit
takes effect. Passing metric keyword arguments as **kwargs is deprecated
in favor of metric_params.
---
 sklearn/neighbors/base.py                 | 52 ++++++++++++++++-------
 sklearn/neighbors/classification.py       | 23 +++++-----
 sklearn/neighbors/regression.py           | 20 ++++-----
 sklearn/neighbors/tests/test_neighbors.py | 18 +++++---
 sklearn/neighbors/unsupervised.py         |  9 +++-
 5 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py
index dbf8126f8a889..c5fb86430876e 100644
--- a/sklearn/neighbors/base.py
+++ b/sklearn/neighbors/base.py
@@ -98,13 +98,22 @@ def __init__(self):
 
     def _init_params(self, n_neighbors=None, radius=None,
                      algorithm='auto', leaf_size=30, metric='minkowski',
-                     p=2, **kwargs):
+                     p=2, metric_params=None, **kwargs):
+        if kwargs:
+            warnings.warn("Passing additional arguments to the metric "
+                          "function as **kwargs is deprecated. "
+                          "Use metric_params dict instead.",
+                          DeprecationWarning, stacklevel=3)
+            if metric_params is None:
+                metric_params = {}
+            metric_params.update(kwargs)
+
         self.n_neighbors = n_neighbors
         self.radius = radius
         self.algorithm = algorithm
         self.leaf_size = leaf_size
         self.metric = metric
-        self.metric_kwds = kwargs
+        self.metric_params = metric_params
         self.p = p
 
         if algorithm not in ['auto', 'brute',
@@ -126,24 +135,35 @@ def _init_params(self, n_neighbors=None, radius=None,
             raise ValueError("Metric '%s' not valid for algorithm '%s'"
                              % (metric, algorithm))
 
-        if self.metric in ['wminkowski', 'minkowski']:
-            self.metric_kwds['p'] = p
-            if p < 1:
-                raise ValueError("p must be greater than one "
-                                 "for minkowski metric")
+        if self.metric_params is not None and 'p' in self.metric_params:
+            warnings.warn("Parameter p is found in metric_params. "
" + "The corresponding parameter from __init__ " + "is ignored.", SyntaxWarning, stacklevel=3) + effective_p = metric_params['p'] + else: + effective_p = self.p + + if self.metric in ['wminkowski', 'minkowski'] and effective_p < 1: + raise ValueError("p must be greater than one for minkowski metric") self._fit_X = None self._tree = None self._fit_method = None def _fit(self, X): - self.effective_metric_ = self.metric - self.effective_metric_kwds_ = self.metric_kwds + if self.metric_params is None: + self.effective_metric_params_ = {} + else: + self.effective_metric_params_ = self.metric_params.copy() + effective_p = self.effective_metric_params_.get('p', self.p) + if self.metric in ['wminkowski', 'minkowski']: + self.effective_metric_params_['p'] = effective_p + + self.effective_metric_ = self.metric # For minkowski distance, use more efficient methods where available if self.metric == 'minkowski': - self.effective_metric_kwds_ = self.metric_kwds.copy() - p = self.effective_metric_kwds_.pop('p', 2) + p = self.effective_metric_params_.pop('p', 2) if p < 1: raise ValueError("p must be greater than one " "for minkowski metric") @@ -154,7 +174,7 @@ def _fit(self, X): elif p == np.inf: self.effective_metric_ = 'chebyshev' else: - self.effective_metric_kwds_['p'] = p + self.effective_metric_params_['p'] = p if isinstance(X, NeighborsBase): self._fit_X = X._fit_X @@ -210,11 +230,11 @@ def _fit(self, X): if self._fit_method == 'ball_tree': self._tree = BallTree(X, self.leaf_size, metric=self.effective_metric_, - **self.effective_metric_kwds_) + **self.effective_metric_params_) elif self._fit_method == 'kd_tree': self._tree = KDTree(X, self.leaf_size, metric=self.effective_metric_, - **self.effective_metric_kwds_) + **self.effective_metric_params_) elif self._fit_method == 'brute': self._tree = None else: @@ -292,7 +312,7 @@ class from an array representing our data set and ask who's else: dist = pairwise_distances(X, self._fit_X, self.effective_metric_, - **self.effective_metric_kwds_) + **self.effective_metric_params_) neigh_ind = argpartition(dist, n_neighbors - 1, axis=1) neigh_ind = neigh_ind[:, :n_neighbors] @@ -456,7 +476,7 @@ class from an array representing our data set and ask who's else: dist = pairwise_distances(X, self._fit_X, self.effective_metric_, - **self.effective_metric_kwds_) + **self.effective_metric_params_) neigh_ind = [np.where(d < radius)[0] for d in dist] # if there are the same number of neighbors for each point, diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py index 36d7c07108da0..8ac1050279d89 100644 --- a/sklearn/neighbors/classification.py +++ b/sklearn/neighbors/classification.py @@ -61,7 +61,7 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, required to store the tree. The optimal value depends on the nature of the problem. - metric : string or DistanceMetric object (default='minkowski') + metric : string or DistanceMetric object (default = 'minkowski') the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a @@ -72,9 +72,8 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - **kwargs : - additional keyword arguments are passed to the distance function as - additional arguments. 
+    metric_params: dict, optional (default = None)
+        additional keyword arguments for the metric function.
 
     Examples
     --------
@@ -113,10 +112,12 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin,
 
     def __init__(self, n_neighbors=5,
                  weights='uniform', algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', **kwargs):
+                 p=2, metric='minkowski', metric_params=None, **kwargs):
+
         self._init_params(n_neighbors=n_neighbors,
                           algorithm=algorithm,
-                          leaf_size=leaf_size, metric=metric, p=p, **kwargs)
+                          leaf_size=leaf_size, metric=metric, p=p,
+                          metric_params=metric_params, **kwargs)
         self.weights = _check_weights(weights)
 
     def predict(self, X):
@@ -277,9 +278,8 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
         neighbors on given radius).
         If set to None, ValueError is raised, when outlier is detected.
 
-    **kwargs :
-        additional keyword arguments are passed to the distance function as
-        additional arguments.
+    metric_params: dict, optional (default = None)
+        additional keyword arguments for the metric function.
 
     Examples
     --------
@@ -309,11 +309,12 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
 
     def __init__(self, radius=1.0, weights='uniform',
                  algorithm='auto', leaf_size=30, p=2, metric='minkowski',
-                 outlier_label=None, **kwargs):
+                 outlier_label=None, metric_params=None, **kwargs):
         self._init_params(radius=radius,
                           algorithm=algorithm,
                           leaf_size=leaf_size,
-                          metric=metric, p=p, **kwargs)
+                          metric=metric, p=p, metric_params=metric_params,
+                          **kwargs)
         self.weights = _check_weights(weights)
         self.outlier_label = outlier_label
 
diff --git a/sklearn/neighbors/regression.py b/sklearn/neighbors/regression.py
index 05bddb2e3c0ff..a6285c77d9fef 100644
--- a/sklearn/neighbors/regression.py
+++ b/sklearn/neighbors/regression.py
@@ -72,9 +72,8 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin,
         equivalent to using manhattan_distance (l1), and euclidean_distance
         (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
 
-    **kwargs :
-        additional keyword arguments are passed to the distance function as
-        additional arguments.
+    metric_params: dict, optional (default = None)
+        additional keyword arguments for the metric function.
 
     Examples
     --------
@@ -111,10 +110,11 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin,
 
     def __init__(self, n_neighbors=5, weights='uniform',
                  algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', **kwargs):
+                 p=2, metric='minkowski', metric_params=None, **kwargs):
         self._init_params(n_neighbors=n_neighbors,
                           algorithm=algorithm,
-                          leaf_size=leaf_size, metric=metric, p=p, **kwargs)
+                          leaf_size=leaf_size, metric=metric, p=p,
+                          metric_params=metric_params, **kwargs)
         self.weights = _check_weights(weights)
 
     def predict(self, X):
@@ -213,9 +213,8 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin,
         equivalent to using manhattan_distance (l1), and euclidean_distance
         (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
 
-    **kwargs :
-        additional keyword arguments are passed to the distance function as
-        additional arguments.
+    metric_params: dict, optional (default = None)
+        additional keyword arguments for the metric function.
 
     Examples
     --------
@@ -245,11 +244,12 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin,
 
     def __init__(self, radius=1.0, weights='uniform',
                  algorithm='auto', leaf_size=30,
-                 p=2, metric='minkowski', **kwargs):
+                 p=2, metric='minkowski', metric_params=None, **kwargs):
         self._init_params(radius=radius,
                           algorithm=algorithm,
                           leaf_size=leaf_size,
-                          p=p, metric=metric, **kwargs)
+                          p=p, metric=metric, metric_params=metric_params,
+                          **kwargs)
         self.weights = _check_weights(weights)
 
     def predict(self, X):
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 4dcb45c488ea1..9c671dc83cd91 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -10,6 +10,7 @@
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_true
+from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.validation import check_random_state
 from sklearn import neighbors, datasets
@@ -209,7 +210,6 @@ def test_kneighbors_classifier_predict_proba():
     assert_array_almost_equal(real_prob, y_prob)
 
 
-
 def test_radius_neighbors_classifier(n_samples=40,
                                      n_features=5,
                                      n_test_pts=10,
@@ -810,20 +810,21 @@ def test_neighbors_metrics(n_samples=20, n_features=3,
     test = rng.rand(n_query_pts, n_features)
 
-    for metric, kwds in metrics:
+    for metric, metric_params in metrics:
         results = []
-
+        p = metric_params.pop('p', 2)
         for algorithm in algorithms:
             # KD tree doesn't support all metrics
             if (algorithm == 'kd_tree' and
                 metric not in neighbors.KDTree.valid_metrics):
                 assert_raises(ValueError,
                               neighbors.NearestNeighbors,
                               algorithm=algorithm,
-                              metric=metric, **kwds)
+                              metric=metric, metric_params=metric_params)
                 continue
 
             neigh = neighbors.NearestNeighbors(n_neighbors=n_neighbors,
                                                algorithm=algorithm,
-                                               metric=metric, **kwds)
+                                               metric=metric, p=p,
+                                               metric_params=metric_params)
             neigh.fit(X)
             results.append(neigh.kneighbors(test, return_distance=True))
@@ -849,6 +850,13 @@ def test_callable_metric():
     assert_array_almost_equal(dist1, dist2)
 
 
+def test_metric_params_interface():
+    assert_warns(DeprecationWarning, neighbors.KNeighborsClassifier,
+                 metric='wminkowski', w=np.ones(10))
+    assert_warns(SyntaxWarning, neighbors.KNeighborsClassifier,
+                 metric_params={'p': 3})
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/sklearn/neighbors/unsupervised.py b/sklearn/neighbors/unsupervised.py
index 175b1c43cc669..0dd5a75861cf1 100644
--- a/sklearn/neighbors/unsupervised.py
+++ b/sklearn/neighbors/unsupervised.py
@@ -43,6 +43,9 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin,
         equivalent to using manhattan_distance (l1), and euclidean_distance
         (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
 
+    metric_params: dict, optional (default = None)
+        additional keyword arguments for the metric function.
+
     Examples
     --------
     >>> from sklearn.neighbors import NearestNeighbors
@@ -76,8 +79,10 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin,
     """
 
     def __init__(self, n_neighbors=5, radius=1.0,
-                 algorithm='auto', leaf_size=30, metric='minkowski', **kwargs):
+                 algorithm='auto', leaf_size=30, metric='minkowski',
+                 p=2, metric_params=None, **kwargs):
         self._init_params(n_neighbors=n_neighbors,
                           radius=radius,
                           algorithm=algorithm,
-                          leaf_size=leaf_size, metric=metric, **kwargs)
+                          leaf_size=leaf_size, metric=metric, p=p,
+                          metric_params=metric_params, **kwargs)
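
Illustration (not part of the patch): the sketch below shows the behavior
this change fixes. It assumes a scikit-learn build that includes this
patch; the toy data, the variable names, and the distance values in the
comments are illustrative only.

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    X = np.array([[0.0, 0.0], [3.0, 4.0]])

    # Fit with the Euclidean special case of minkowski (p=2).
    nn = NearestNeighbors(n_neighbors=2, metric='minkowski', p=2).fit(X)
    dist, ind = nn.kneighbors(np.array([[0.0, 0.0]]))
    # The neighbor of (0, 0) besides itself is (3, 4) at Euclidean
    # distance 5.0, so dist == [[0., 5.]].

    # Before this patch, set_params(p=1) was silently ignored: 'p' had
    # already been frozen into self.metric_kwds at __init__ time. With
    # the fix, the refit below picks up the new value.
    nn.set_params(p=1)
    nn.fit(X)
    dist, ind = nn.kneighbors(np.array([[0.0, 0.0]]))
    # Manhattan (p=1) distance, so dist == [[0., 7.]].

    # Extra metric arguments now travel in an explicit metric_params
    # dict; passing them as **kwargs still works but emits a
    # DeprecationWarning, and putting 'p' inside metric_params emits a
    # SyntaxWarning because it overrides the constructor's p.
    nn_w = NearestNeighbors(n_neighbors=2, metric='wminkowski', p=2,
                            metric_params={'w': np.ones(2)}).fit(X)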