Permalink
Browse files

Merge branch 'nn'

  • Loading branch information...
2 parents 19526ad + aa7d739 commit e97df3da48cbd69d3143498ab60228e637302101 @kwgoodman committed May 18, 2012
View
@@ -279,6 +279,6 @@ After you have installed Bottleneck, run the suite of unit tests::
>>> import bottleneck as bn
>>> bn.test()
<snip>
- Ran 84 tests in 28.169s
+ Ran 85 tests in 28.169s
OK
- <nose.result.TextTestResult run=84 errors=0 failures=0>
+ <nose.result.TextTestResult run=85 errors=0 failures=0>
View
@@ -16,6 +16,7 @@ Thanks to Dougal Sutherland, Bottleneck now runs on Python 3.2.
**New functions**
- replace(arr, old, new), e.g, replace(arr, np.nan, 0)
+- nn(arr, arr0, axis) nearest neighbor and its index of 1d arr0 in 2d arr
**Enhancements**
@@ -12,7 +12,7 @@
try:
from .func import (nansum, nanmax, nanmin, nanmean, nanstd, nanvar, median,
nanmedian, nanargmin, nanargmax, rankdata, nanrankdata,
- ss, partsort, argpartsort, replace)
+ ss, nn, partsort, argpartsort, replace)
except:
pass
try:
@@ -3,7 +3,7 @@
__all__ = ['median', 'nanmedian', 'nansum', 'nanmean', 'nanvar', 'nanstd',
'nanmin', 'nanmax', 'nanargmin', 'nanargmax', 'rankdata',
- 'nanrankdata', 'ss', 'partsort', 'argpartsort', 'replace']
+ 'nanrankdata', 'ss', 'nn', 'partsort', 'argpartsort', 'replace']
def median(arr, axis=None):
"Slow median function used for unaccelerated ndim/dtype combinations."
@@ -145,6 +145,22 @@ def ss(arr, axis=0):
"Slow sum of squares used for unaccelerated ndim/dtype combinations."
return scipy_ss(arr, axis)
+def nn(arr, arr0, axis=1):
+ "Slow nearest neighbor used for unaccelerated ndim/dtype combinations."
+ if arr.ndim != 2:
+ raise ValueError("`arr` must be 2d")
+ if arr0.ndim != 1:
+ raise ValueError("`arr0` must be 1d")
+ if axis == 1:
+ d = (arr - arr0) ** 2
+ elif axis == 0:
+ d = (arr - arr0.reshape(-1,1)) ** 2
+ else:
+ raise ValueError("`axis` must be 0 or 1.")
+ d = d.sum(axis)
+ idx = np.argmin(d)
+ return np.sqrt(d[idx]), idx
+
def partsort(arr, n, axis=-1):
"Slow partial sort used for unaccelerated ndim/dtype combinations."
return np.sort(arr, axis)
@@ -17,6 +17,7 @@
from .rankdata import rankdata
from .nanrankdata import nanrankdata
from .ss import ss
+from .nn import nn
from .partsort import partsort
from .argpartsort import argpartsort
from .replace import replace
@@ -35,6 +36,7 @@
funcs['rankdata'] = rankdata
funcs['nanrankdata'] = nanrankdata
funcs['ss'] = ss
+funcs['nn'] = nn
funcs['partsport'] = partsort
funcs['argpartsort'] = argpartsort
funcs['replace'] = replace
@@ -103,6 +105,7 @@
include "rankdata.pyx"
include "nanrankdata.pyx"
include "ss.pyx"
+include "nn.pyx"
include "partsort.pyx"
include "argpartsort.pyx"
include "replace.pyx"
@@ -0,0 +1,248 @@
+"nn template"
+
+from copy import deepcopy
+import bottleneck as bn
+
+__all__ = ["nn"]
+
+FLOAT_DTYPES = [x for x in bn.dtypes if 'float' in x]
+INT_DTYPES = [x for x in bn.dtypes if 'int' in x]
+
+
+# Float dtypes (not axis=None) ----------------------------------------------
+
+floats = {}
+floats['dtypes'] = FLOAT_DTYPES
+floats['axisNone'] = False
+floats['force_output_dtype'] = False
+floats['reuse_non_nan_func'] = False
+
+floats['top'] = """
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def NAME_NDIMd_DTYPE_axisAXIS(np.ndarray[np.DTYPE_t, ndim=2] a,
+ np.ndarray[np.DTYPE_t, ndim=1] a0):
+ "Nearest neighbor of 1d `a0` in 2d `a` with dtype=DTYPE, axis=AXIS."
+ cdef:
+ np.float64_t xsum = 0, d, xsummin=np.inf, dist
+ Py_ssize_t imin = -1, n, a0size
+"""
+
+loop = {}
+loop[2] = """\
+ a0size = PyArray_SIZE(a0)
+ if nAXIS != a0size:
+ raise ValueError("`a0` must match size of `a` along specified axis")
+ for iINDEX0 in range(nINDEX0):
+ xsum = 0
+ for iINDEX1 in range(nINDEX1):
+ d = a[INDEXALL] - a0[iINDEX1]
+ xsum += d * d
+ if xsum < xsummin:
+ xsummin = xsum
+ imin = iINDEX0
+ if imin == -1:
+ dist = NAN
+ imin = 0
+ else:
+ dist = sqrt(xsummin)
+ return dist, imin
+"""
+floats['loop'] = loop
+
+# Int dtypes (not axis=None) ------------------------------------------------
+
+ints = deepcopy(floats)
+ints['dtypes'] = INT_DTYPES
+
+# Slow, unaccelerated ndim/dtype --------------------------------------------
+
+slow = {}
+slow['name'] = "nn"
+slow['signature'] = "arr, arr0"
+slow['func'] = "bn.slow.nn(arr, arr0, axis=AXIS)"
+
+# Template ------------------------------------------------------------------
+
+nn = {}
+nn['name'] = 'nn'
+nn['is_reducing_function'] = False
+nn['cdef_output'] = False
+nn['slow'] = slow
+nn['templates'] = {}
+nn['templates']['float'] = floats
+nn['templates']['int'] = ints
+nn['pyx_file'] = 'func/%sbit/nn.pyx'
+
+nn['main'] = '''"nn auto-generated from template"
+
+def nn(arr, arr0, int axis=1):
+ """
+ Distance of nearest neighbor (and its index) along specified axis.
+
+ The Euclidian distance between `arr0` and its nearest neighbor in
+ `arr` is returned along with the index of the nearest neighbor in
+ `arr`.
+
+ The squared distance used to determine the nearest neighbor of `arr0`
+ is equivalent to np.sum((arr - arr0) ** 2), axis) where `arr` is 2d
+ and `arr0` is 1d and `arr0` must be reshaped if `axis` is 1.
+
+ If all distances are NaN then the distance returned is NaN and the
+ index is zero.
+
+ Parameters
+ ----------
+ arr : array_like
+ A 2d array. If `arr` is not an array, a conversion is attempted.
+ arr0 : array_like
+ A 1d array. If `arr0` is not an array, a conversion is attempted.
+ axis : int, optional
+ Axis along which the distance is computed. The default (axis=1)
+ is to compute the distance along rows.
+
+ Returns
+ -------
+ dist : np.float64
+ The Euclidian distance between `arr0` and the nearest neighbor
+ in `arr`. If all distances are NaN then the distance returned
+ is NaN.
+ idx : int
+ Index of nearest neighbor in `arr`. If all distances are NaN
+ then the index returned is zero.
+
+ Notes
+ -----
+ A brute force algorithm is used to find the nearest neighbor.
+
+ Depending on the shapes of `arr` and `arr0`, SciPy's cKDTree may
+ be faster than bn.nn(). So benchmark if speed is important.
+
+ The relative speed also depends on how many times you will use
+ the same array `arr` to find nearest neighbors with different
+ `arr0`. That is because it takes time to set up SciPy's cKDTree.
+
+ If `arr` fits into your memory's cache then bn.nn() is fast.
+
+ Examples
+ --------
+ Create the input arrays:
+
+ >>> arr = np.array([[1, 2], [3, 4]])
+ >>> arr0 = np.array([2, 4])
+
+ Find nearest neighbor of `arr0` in `arr` along axis 1:
+
+ >>> dist, idx = bn.nn(arr, arr0, axis=1)
+ >>> dist
+ 1.0
+ >>> idx
+ 1
+
+ Find nearest neighbor of `arr0` in `arr` along axis 0:
+
+ >>> dist, idx = bn.nn(arr, arr0, axis=0)
+ >>> dist
+ 0.0
+ >>> idx
+ 1
+
+ """
+ func, a, a0 = nn_selector(arr, arr0, axis)
+ return func(a, a0)
+
+def nn_selector(arr, arr0, int axis):
+ """
+ Return nn function and arrays.
+
+ Under the hood Bottleneck uses a separate Cython function for each
+ combination of dtype, and axis. A lot of the overhead in bn.nn() is in
+ checking that `axis` is within range, converting `arr` into an array
+ (if it is not already an array), and selecting the function to use to
+ find the nearest neighbor.
+
+ You can get rid of the overhead by doing all this before you, for example,
+ enter an inner loop, by using this function.
+
+ Parameters
+ ----------
+ arr : array_like
+ A 2d array. If `arr` is not an array, a conversion is attempted.
+ arr0 : array_like
+ A 1d array. If `arr0` is not an array, a conversion is attempted.
+ axis : int, optional
+ Axis along which the distance is computed. The default (axis=1)
+ is to compute the distance along rows.
+
+ Returns
+ -------
+ func : function
+ The nn function that is appropriate to use with the given input.
+ a : ndarray
+ If the input array `arr` is not a ndarray, then `a` will contain the
+ result of converting `arr` into a ndarray.
+ a0 : ndarray
+ If the input array `arr0` is not a ndarray, then `a0` will contain the
+ result of converting `arr0` into a ndarray.
+
+ Examples
+ --------
+ Create the input arrays:
+
+ >>> arr = np.array([[1, 2], [3, 4]])
+ >>> arr0 = np.array([2, 4])
+
+ Obtain the function needed to find the nearest neighbor of `arr0`
+ in `arr0` along axis 0:
+
+ >>> func, a, a0 = bn.func.nn_selector(arr, arr0, axis=0)
+ >>> func
+ <function nn_2d_int64_axis0>
+
+ Use the returned function and arrays to determine the nearest
+ neighbor:
+
+ >>> dist, idx = func(a, a0)
+ >>> dist
+ 0.0
+ >>> idx
+ 1
+
+
+ """
+ cdef np.ndarray a
+ if type(arr) is np.ndarray:
+ a = arr
+ else:
+ a = np.array(arr, copy=False)
+ cdef np.ndarray a0
+ if type(arr0) is np.ndarray:
+ a0 = arr0
+ else:
+ a0 = np.array(arr0, copy=False)
+ cdef int dtype = PyArray_TYPE(a)
+ cdef int dtype0 = PyArray_TYPE(a0)
+ if dtype != dtype0:
+ raise ValueError("`arr` and `arr0` must be of the same dtype.")
+ cdef int ndim = PyArray_NDIM(a)
+ if ndim != 2:
+ raise ValueError("`arr` must be 2d")
+ cdef int ndim0 = PyArray_NDIM(a0)
+ if ndim0 != 1:
+ raise ValueError("`arr0` must be 1d")
+ if axis < 0:
+ axis += ndim
+ cdef tuple key = (ndim, dtype, axis)
+ try:
+ func = nn_dict[key]
+ except KeyError:
+ if axis is not None:
+ if (axis < 0) or (axis >= ndim):
+ raise ValueError("axis(=%d) out of bounds" % axis)
+ try:
+ func = nn_slow_dict[axis]
+ except KeyError:
+ tup = (str(ndim), str(a.dtype), str(axis))
+ raise TypeError("Unsupported ndim/dtype/axis (%s/%s/%s)." % tup)
+ return func, a, a0
+'''
@@ -5,7 +5,7 @@
import numpy as np
from numpy.testing import (assert_equal, assert_array_equal, assert_raises,
- assert_array_almost_equal)
+ assert_array_almost_equal, assert_almost_equal)
nan = np.nan
import bottleneck as bn
@@ -152,3 +152,46 @@ def test_nanmin_size_zero(dtypes=bn.dtypes):
a = np.zeros(shape, dtype=dtype)
assert_raises(ValueError, bn.nanmin, a)
assert_raises(ValueError, bn.slow.nanmin, a)
+
+# ---------------------------------------------------------------------------
+# Check nn
+
+def arrays2(dtypes=bn.dtypes):
+ "Iterator that yield arrays to use for unit testing."
+ for dtype in bn.dtypes:
+ arr = np.random.randint(0, 10, (2, 3)).astype(dtype)
+ arr0 = np.random.randint(0, 10, 3).astype(dtype)
+ axis = 0
+ yield arr.copy(), arr0[:2].copy(), axis
+ axis = 1
+ yield arr.copy(), arr0.copy(), axis
+ # Make sure ties are handled in the same way
+ arr.fill(0)
+ arr0.fill(0)
+ axis = 0
+ yield arr.copy(), arr0[:2].copy(), axis
+ axis = 1
+ yield arr.copy(), arr0.copy(), axis
+ if issubclass(arr.dtype.type, np.inexact):
+ # Make sure NaNs are handled in the same way
+ arr.fill(np.nan)
+ arr0.fill(np.nan)
+ axis = 0
+ yield arr.copy(), arr0[:2].copy(), axis
+ axis = 1
+ yield arr.copy(), arr0.copy(), axis
+
+def test_nn():
+ "Test that bn.nn gives the same output as bn.slow.nn."
+ msg = '\nfunc %s | input %s (%s) | axis %s\n'
+ msg += '\nInput array `arr`:\n%s\n'
+ msg += '\nInput array `arr0`:\n%s\n'
+ count = -1
+ for arr, arr0, axis in arrays2():
+ actual = bn.nn(arr, arr0, axis)
+ desired = bn.slow.nn(arr, arr0, axis)
+ count += 1
+ tup = ('bn.nn', 'arr'+str(count), str(arr.dtype),
+ str(axis), arr, arr0)
+ err_msg = msg % tup
+ assert_almost_equal(actual, desired, decimal=5, err_msg=err_msg)
Oops, something went wrong.

0 comments on commit e97df3d

Please sign in to comment.