Skip to content

Commit

Permalink
ENH: add STRtree nearest neighbor(s) function (pygeos/pygeos#272)
Browse files Browse the repository at this point in the history
  • Loading branch information
brendan-ward committed Feb 26, 2021
1 parent 14e1f82 commit 1b721fc
Show file tree
Hide file tree
Showing 10 changed files with 1,277 additions and 12 deletions.
9 changes: 6 additions & 3 deletions CHANGELOG.rst
Expand Up @@ -4,9 +4,10 @@ Changelog
Version 0.10 (unreleased)
------------------------

**Highlights of this release**
**Major enhancements**

* ...
* Addition of ``nearest`` and ``nearest_all`` functions to ``STRtree`` for
GEOS >= 3.6 to find the nearest neighbors (#272).

**API Changes**

Expand All @@ -28,7 +29,9 @@ Version 0.10 (unreleased)
Thanks to everyone who contributed to this release!
People with a "+" by their names contributed a patch for the first time.

* ...
* Brendan Ward
* Casper van der Wel
* Joris Van den Bossche


Version 0.9 (2021-01-23)
Expand Down
134 changes: 128 additions & 6 deletions benchmarks/benchmarks.py
Expand Up @@ -8,6 +8,7 @@

class PointPolygonTimeSuite:
"""Benchmarks running on 100000 points and one polygon"""

def setup(self):
self.points = pygeos.points(np.random.random((100000, 2)))
self.polygon = pygeos.polygons(np.random.random((3, 2)))
Expand All @@ -24,6 +25,7 @@ def time_intersection(self):

class IOSuite:
"""Benchmarks I/O operations (WKT and WKB) on a set of 10000 polygons"""

def setup(self):
self.to_write = pygeos.polygons(np.random.random((10000, 100, 2)))
self.to_read_wkt = pygeos.to_wkt(self.to_write)
Expand All @@ -44,6 +46,7 @@ def time_read_from_wkb(self):

class ConstructiveSuite:
"""Benchmarks constructive functions on a set of 10,000 points"""

def setup(self):
self.points = pygeos.points(np.random.random((10000, 2)))

Expand All @@ -66,16 +69,15 @@ class ClipSuite:
def setup(self):
# create irregular polygons by merging overlapping point buffers
self.polygon = pygeos.union_all(
pygeos.buffer(pygeos.points(np.random.random((1000, 2)) * 500), 10)
)
pygeos.buffer(pygeos.points(np.random.random((1000, 2)) * 500), 10)
)
xmin = np.random.random(100) * 100
xmax = xmin + 100
ymin = np.random.random(100) * 100
ymax = ymin + 100
self.bounds = np.array([xmin, ymin, xmax, ymax]).T
self.boxes = pygeos.box(xmin, ymin, xmax, ymax)


def time_clip_by_box(self):
pygeos.intersection(self.polygon, self.boxes)

Expand All @@ -88,7 +90,13 @@ class GetParts:
"""Benchmarks for getting individual parts from 100 multipolygons of 100 polygons each"""

def setup(self):
self.multipolygons = np.array([pygeos.multipolygons(pygeos.polygons(np.random.random((2, 100, 2)))) for i in range(10000)], dtype=object)
self.multipolygons = np.array(
[
pygeos.multipolygons(pygeos.polygons(np.random.random((2, 100, 2))))
for i in range(10000)
],
dtype=object,
)

def time_get_parts(self):
"""Cython implementation of get_parts"""
Expand All @@ -110,9 +118,11 @@ class OverlaySuite:

def setup(self):
# create irregular polygons by merging overlapping point buffers
self.left = pygeos.union_all(pygeos.buffer(pygeos.points(np.random.random((500, 2)) * 500), 15))
self.left = pygeos.union_all(
pygeos.buffer(pygeos.points(np.random.random((500, 2)) * 500), 15)
)
# shift this up and right
self.right = pygeos.apply(self.left, lambda x: x+50)
self.right = pygeos.apply(self.left, lambda x: x + 50)

def time_difference(self):
pygeos.difference(self.left, self.right)
Expand Down Expand Up @@ -174,6 +184,20 @@ def setup(self):
# initialize the tree by making a tiny query first
self.tree.query(pygeos.points(0, 0))

# create points that extend beyond the domain of the above polygons to ensure
# some don't overlap
self.points = pygeos.points((np.random.random((2000, 2)) * 750) - 125)
self.point_tree = pygeos.STRtree(
pygeos.points(np.random.random((2000, 2)) * 750)
)
self.point_tree.query(pygeos.points(0, 0))

# create points on a grid for testing equidistant nearest neighbors
# creates 2025 points
grid_coords = np.mgrid[:45, :45].T.reshape(-1, 2)
self.grid_point_tree = pygeos.STRtree(pygeos.points(grid_coords))
self.grid_points = pygeos.points(grid_coords + 0.5)

def time_tree_create(self):
tree = pygeos.STRtree(self.polygons)
tree.query(pygeos.points(0, 0))
Expand Down Expand Up @@ -207,3 +231,101 @@ def time_tree_query_bulk_covered_by(self):

def time_tree_query_bulk_contains_properly(self):
self.tree.query_bulk(self.polygons, predicate="contains_properly")

def time_tree_nearest_points(self):
self.point_tree.nearest(self.points)

def time_tree_nearest_points_equidistant(self):
self.grid_point_tree.nearest(self.grid_points)

def time_tree_nearest_points_equidistant_manual_all(self):
# This benchmark approximates nearest_all for equidistant results
# starting from singular nearest neighbors and searching for more
# within same distance.

# try to find all equidistant neighbors ourselves given single nearest
# result
l, r = self.grid_point_tree.nearest(self.grid_points)
# calculate distance to nearest neighbor
dist = pygeos.distance(self.grid_points.take(l), self.grid_point_tree.geometries.take(r))
# include a slight epsilon to ensure nearest are within this radius
b = pygeos.buffer(self.grid_points, dist + 1e-8)

# query the tree for others in the same buffer distance
left, right = self.grid_point_tree.query_bulk(b, predicate='intersects')
dist = pygeos.distance(
self.grid_points.take(left), self.grid_point_tree.geometries.take(right)
)

# sort by left, distance
ix = np.lexsort((right, dist, left))
left = left[ix]
right = right[ix]
dist = dist[ix]

run_start = np.r_[True, left[:-1] != left[1:]]
run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])

mins = dist[run_start]

# spread to rest of array so we can extract out all within each group that match
all_mins = np.repeat(mins, run_counts)
ix = dist == all_mins
left = left[ix]
right = right[ix]
dist = dist[ix]

def time_tree_nearest_all_points(self):
self.point_tree.nearest_all(self.points)

def time_tree_nearest_all_points_equidistant(self):
self.grid_point_tree.nearest_all(self.grid_points)

def time_tree_nearest_all_points_small_max_distance(self):
# returns >300 results
self.point_tree.nearest_all(self.points, max_distance=5)

def time_tree_nearest_all_points_large_max_distance(self):
# measures the overhead of using a distance that would encompass all tree points
self.point_tree.nearest_all(self.points, max_distance=1000)

def time_tree_nearest_poly(self):
self.tree.nearest(self.points)

def time_tree_nearest_all_poly(self):
self.tree.nearest_all(self.points)

def time_tree_nearest_all_poly_small_max_distance(self):
# returns >300 results
self.tree.nearest_all(self.points, max_distance=5)

def time_tree_nearest_all_poly_python(self):
# returns all input points

# use an arbitrary search tolerance that seems appropriate for the density of
# geometries
tolerance = 200
b = pygeos.buffer(self.points, tolerance, quadsegs=1)
left, right = self.tree.query_bulk(b)
dist = pygeos.distance(self.points.take(left), self.polygons.take(right))

# sort by left, distance
ix = np.lexsort((right, dist, left))
left = left[ix]
right = right[ix]
dist = dist[ix]

run_start = np.r_[True, left[:-1] != left[1:]]
run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])

mins = dist[run_start]

# spread to rest of array so we can extract out all within each group that match
all_mins = np.repeat(mins, run_counts)
ix = dist == all_mins
left = left[ix]
right = right[ix]
dist = dist[ix]

# arrays are now roughly representative of what tree.nearest_all would provide, though
# some nearest_all neighbors may be missed if they are outside tolerance
126 changes: 126 additions & 0 deletions pygeos/strtree.py
@@ -1,6 +1,7 @@
from enum import IntEnum
import numpy as np
from . import lib
from .decorators import requires_geos


__all__ = ["STRtree"]
Expand Down Expand Up @@ -49,6 +50,8 @@ class STRtree:
>>> # Query geometries that overlap envelopes of `geoms`
>>> tree.query_bulk([pygeos.box(2, 2, 4, 4), pygeos.box(5, 5, 6, 6)]).tolist()
[[0, 0, 0, 1, 1], [2, 3, 4, 5, 6]]
>>> tree.nearest([pygeos.points(1,1), pygeos.points(3,5)]).tolist() # doctest: +SKIP
[[0, 1], [1, 4]]
"""

def __init__(self, geometries, leafsize=10):
Expand Down Expand Up @@ -186,3 +189,126 @@ def query_bulk(self, geometry, predicate=None):
predicate = BinaryPredicate[predicate].value

return self._tree.query_bulk(geometry, predicate)

@requires_geos("3.6.0")
def nearest(self, geometry):
"""Returns the index of the nearest item in the tree for each input
geometry.
If there are multiple equidistant or intersected geometries in the tree,
only a single result is returned for each input geometry, based on the
order that tree geometries are visited; this order may be
nondeterministic.
Any geometry that is None or empty in the input geometries is omitted
from the output.
Parameters
----------
geometry : Geometry or array_like
Input geometries to query the tree.
Returns
-------
ndarray with shape (2, n)
The first subarray contains input geometry indexes.
The second subarray contains tree geometry indexes.
See also
--------
nearest_all: returns all equidistant geometries and optional distances
Examples
--------
>>> import pygeos
>>> tree = pygeos.STRtree(pygeos.points(np.arange(10), np.arange(10)))
>>> tree.nearest(pygeos.points(1,1)).tolist() # doctest: +SKIP
[[0], [1]]
>>> tree.nearest([pygeos.box(1,1,3,3)]).tolist() # doctest: +SKIP
[[0], [1]]
>>> points = pygeos.points(0.5,0.5)
>>> tree.nearest([None, pygeos.points(10,10)]).tolist() # doctest: +SKIP
[[1], [9]]
"""

geometry = np.asarray(geometry, dtype=object)
if geometry.ndim == 0:
geometry = np.expand_dims(geometry, 0)

return self._tree.nearest(geometry)

@requires_geos("3.6.0")
def nearest_all(self, geometry, max_distance=None, return_distance=False):
"""Returns the index of the nearest item(s) in the tree for each input
geometry.
If there are multiple equidistant or intersected geometries in tree, all
are returned. Tree indexes are returned in the order they are visited
for each input geometry and may not be in ascending index order; no meaningful
order is implied.
The max_distance used to search for nearest items in the tree may have a
significant impact on performance by reducing the number of input geometries
that are evaluated for nearest items in the tree. Only those input geometries
with at least one tree item within +/- max_distance beyond their envelope will
be evaluated.
The distance, if returned, will be 0 for any intersected geometries in the tree.
Any geometry that is None or empty in the input geometries is omitted from
the output.
Parameters
----------
geometry : Geometry or array_like
Input geometries to query the tree.
max_distance : float, optional (default: None)
Maximum distance within which to query for nearest items in tree.
Must be greater than 0.
return_distance : bool, optional (default: False)
If True, will return distances in addition to indexes.
Returns
-------
indices or tuple of (indices, distances)
indices is an ndarray of shape (2,n) and distances (if present) an
ndarray of shape (n).
The first subarray of indices contains input geometry indices.
The second subarray of indices contains tree geometry indices.
See also
--------
nearest: returns singular nearest geometry for each input
Examples
--------
>>> import pygeos
>>> tree = pygeos.STRtree(pygeos.points(np.arange(10), np.arange(10)))
>>> tree.nearest_all(pygeos.points(1,1)).tolist() # doctest: +SKIP
[[0], [1]]
>>> tree.nearest_all([pygeos.box(1,1,3,3)]).tolist() # doctest: +SKIP
[[0, 0, 0], [1, 2, 3]]
>>> points = pygeos.points(0.5,0.5)
>>> index, distance = tree.nearest_all(points, return_distance=True) # doctest: +SKIP
>>> index.tolist() # doctest: +SKIP
[[0, 0], [0, 1]]
>>> distance.round(4).tolist() # doctest: +SKIP
[0.7071, 0.7071]
>>> tree.nearest_all(None).tolist() # doctest: +SKIP
[[], []]
"""

geometry = np.asarray(geometry, dtype=object)
if geometry.ndim == 0:
geometry = np.expand_dims(geometry, 0)

if max_distance is not None and max_distance <= 0:
raise ValueError("max_distance must be greater than 0")

# a distance of 0 means no max_distance is used
max_distance = max_distance or 0

if return_distance:
return self._tree.nearest_all(geometry, max_distance)

return self._tree.nearest_all(geometry, max_distance)[0]

0 comments on commit 1b721fc

Please sign in to comment.