In [1]:
import numpy as np
import pandas as pd

# Loading Data

Let's start by loading a large geographic dataset. In this notebook we will be using the Dublin Bus dataset as loaded and prepared on another [repository](https://github.com/joaofig/dublin-bus "Dublin Buses"). You just need the very first notebook to download and prepare the data, so we can use it here. Please be patient as it may take some time. Once you have the data file, please copy it to the data folder.

Note: Make sure you have the pyarrow package installed for the code below to work.

In [2]:
columns_to_read = ['Timestamp', 'LineID', 'Direction', 'PatternID', 
                   'JourneyID', 'Congestion', 'Lon', 'Lat', 
                   'Delay', 'BlockID', 'VehicleID', 'StopID', 'AtStop']
df = pd.read_parquet("data/sir010113-310113.parquet")

Note that, for convenience, the data frame is read in with a reset index, which may be quite useful in the forthcoming computations.

# Building a BallTree

In this section, we will create a BallTree to perform fast searches on our Dublin geographic data. The queries we will look into are *k-nearest neighbors* and *neighbors within a given radius*. Both the tree object and the distance metric object live in the scikit-learn *neighbors* namespace. Let's import those first.

In [None]:
from sklearn.neighbors import BallTree, DistanceMetric

Before we build the tree, we must select the latitude and longitude columns of the data frame. The distance measure for geographic coordinates is the *haversine distance* and the DistanceMetric class requires that we feed the locations as an array of latitude and longitude in radians.

In [None]:
positions = np.radians(df[['Lat', 'Lon']].to_numpy())

Now we can create the BallTree using the *positions* array. Please be patient as the next line may take some time to run.

In [None]:
%%timeit -r1 -n1
tree = BallTree(positions, metric="haversine")

In [3]:
import math

In [4]:
earth_radius = 6371000.0

In [5]:
guiness_lat = 53.3428673
guiness_lon = -6.2717738

In [6]:
guiness_lat_r = math.radians(guiness_lat) 
guiness_lon_r = math.radians(guiness_lon)

In [None]:
guiness = np.array([guiness_lat_r, guiness_lon_r]).reshape(1, -1)

In [None]:
# dist, ind = tree.query(guiness, k=100) 

In [None]:
# ind = tree.query_radius(guiness, r=100.0 / earth_radius) 

In [7]:
from geo.geomath import vec_haversine, num_haversine

In [38]:
lats = df['Lat'].to_numpy()
lons = df['Lon'].to_numpy()

In [40]:
dist0 = vec_haversine(lats, lons, 0.0, 0.0)

In [39]:
dist1 = vec_haversine(lats, lons, 90.0, 0.0)

In [10]:
idx0 = np.argsort(dist0)

In [11]:
idx1 = np.argsort(dist1)

In [19]:
sorted0 = dist0[idx0]

In [20]:
sorted1 = dist1[idx1]

In [12]:
gui0 = num_haversine(guiness_lat, guiness_lon, 0.0, 0.0)
gui1 = num_haversine(guiness_lat, guiness_lon, 90.0, 0.0)

In [17]:
gui0, gui1

(5959787.1282983115, 4076087.181576355)

In [21]:
np.searchsorted(sorted0, gui0)

18151963

In [22]:
np.searchsorted(sorted1, gui1)

25614622

In [25]:
sorted0[18151963], dist0[idx0[18151963]]

(5959787.128902709, 5959787.128902709)

In [26]:
sorted1[25614622], dist1[idx1[25614622]]

(4076087.4373246864, 4076087.4373246864)

# Range Search

Perform a range search using the spoke method

In [30]:
i0 = np.searchsorted(sorted0, gui0 - 100.0)
i1 = np.searchsorted(sorted0, gui0 + 100.0)
match0 = idx0[i0:i1+1]

In [31]:
i0 = np.searchsorted(sorted1, gui1 - 100.0)
i1 = np.searchsorted(sorted1, gui1 + 100.0)
match1 = idx1[i0:i1+1]

In [32]:
match0.shape

(1511660,)

In [33]:
match1.shape

(1508138,)

In [34]:
intersect = np.intersect1d(match0, match1)

In [35]:
intersect.shape

(435177,)

In [43]:
radii = vec_haversine(lats[intersect], lons[intersect], guiness_lat, guiness_lon)

In [46]:
valid_r = radii <= 100.0

In [48]:
valid_r.sum()

97867

In [49]:
intersect[valid_r]

array([    2263,     2282,     4178, ..., 44454409, 44454712, 44454941])