In [72]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
from sklearn.metrics import DistanceMetric

# Other methods that were considered, but none of them offer a Haversine metric option
# from sklearn.neighbors import KDTree
# from scipy.spatial import KDTree
# import pykdtree


In [73]:
sb_cols = ['starbucksid', 'name', 'lat', 'lon']
sb_df = pd.read_csv('data/starbucks_us.csv', usecols=sb_cols)

# Save space: shrink every float64 column to the smallest float dtype
# that can still hold its values
for col in sb_df.select_dtypes(include='float64').columns:
    sb_df[col] = pd.to_numeric(sb_df[col], downcast='float')

In [74]:
# Map the raw CSV headers onto the descriptive names used in the rest of the notebook
# (labels that are not present in sb_df are simply skipped by rename)
sb_renames = {'starbucksid': 'starbucks_id', 'name': 'store_name', 'street1': 'store_address',
              'city': 'store_city', 'countrysubdivisioncode': 'store_state',
              'lat': 'store_lat', 'lon': 'store_long'}
sb_df.rename(columns=sb_renames, inplace=True)

# discard rows (not columns) that are missing either coordinate
sb_df.dropna(subset=['store_lat', 'store_long'], inplace=True)

In [75]:
# US Starbucks locations: report how many rows sb_df holds, then preview the first few
print(f'Total Starbucks Locations: {len(sb_df)}')
sb_df.head()

Total Starbucks Locations: 13620


Unnamed: 0,starbucks_id,store_name,store_lat,store_long
0,10357,Target Virginia T-847,47.511105,-92.549225
1,1006159,Target Vadnais Heights T-751,45.050411,-93.061546
2,1008940,Highway 169 and Freeport-Elk River,45.319901,-93.564262
3,14598,St. Paul-Grand Ave & Oxford St,44.939873,-93.144943
4,12449,Target Minneapolis T-1375,44.970516,-93.27771


In [76]:
h_cols = ['PROPERTY_ID', 'PROPERTY_NAME', 'ADDRESS', 'CITY', 'STATE_NAME', 'LATITUDE', 'LONGITUDE']
h_df = pd.read_csv("data/multifamily_physical_inspection_scores_0321.csv", usecols=h_cols)

# Save space: shrink every float64 column to the smallest float dtype
# that can still hold its values
for col in h_df.select_dtypes(include='float64').columns:
    h_df[col] = pd.to_numeric(h_df[col], downcast='float')

In [77]:
# Normalise the upper-case CSV headers to lower case
h_df.columns = h_df.columns.str.lower()
# discard rows (not columns) that are missing either coordinate
h_df.dropna(subset=['latitude', 'longitude'], inplace=True)

In [78]:
# US housing dataset: report how many rows h_df holds, then preview the first few
print(f'Total Housing Locations: {len(h_df)}')
h_df.head()

Total Housing Locations: 27415


Unnamed: 0,property_id,property_name,address,city,state_name,latitude,longitude
0,800006929,PARKVIEW APARTMENTS I,1334 Memorial Ln,Huntington,IN,40.879551,-85.513695
1,800005604,Maple West Apartments,350 W Maple St,Pocatello,ID,42.880722,-112.454483
2,800001529,FOSTER AVE APTS,13630 Foster Ave,Baldwin Park,CA,34.074928,-117.976234
3,800027853,ST. LUKE RESIDENTIAL H.C.F.,299 E River Rd,Oswego,NY,43.444981,-76.493813
4,800225287,Adams Street Place,1024 Adams St,Jefferson City,MO,38.566467,-92.177773


## Brute force method to find nearest starbucks to each house

In [79]:
# Brute-force method: compute the distance from every point in one array to every point
# in the other and keep the minimum. Time and memory grow as O(n * m), i.e. quadratically
# when both arrays are large.

def nearest_locations_pairwise(df1, df2):
    '''
    Brute-force search for the Starbucks store nearest to each housing property.

    The full houses x stores Haversine distance matrix is built and the
    per-row minimum is taken, so time and memory grow with len(df1) * len(df2).

    df1 is the dataframe you want to append the nearest starbucks to; it must
        contain 'latitude' and 'longitude' columns in degrees. It is not modified.
    df2 is the dataframe with all starbucks locations; it must contain
        'starbucks_id', 'store_lat' and 'store_long' (degrees). It is not modified.

    Returns a new dataframe with a fresh 0..n-1 index, one row per row of df1
    (in df1's order), holding every column of df1 plus 'lat_radians',
    'long_radians', 'starbucks_id' (id of the nearest store) and
    'closest_starbucks (mi)' (distance to it in miles). When several stores are
    equally close, the one that appears first in df2 is reported.
    '''
    # Work on a real copy: the radian columns added below must not leak into
    # the caller's dataframe
    houses_df = df1.copy()
    houses_df['lat_radians'] = np.radians(houses_df['latitude'])
    houses_df['long_radians'] = np.radians(houses_df['longitude'])

    # df2 is only read, never written, so it needs no copy
    house_radians = houses_df[['lat_radians', 'long_radians']].to_numpy()
    store_radians = np.radians(df2[['store_lat', 'store_long']]).to_numpy()

    # use Haversine distance since earth is (approximately) a sphere;
    # the metric expects [latitude, longitude] pairs in radians
    dist = DistanceMetric.get_metric('haversine')

    # 3959 is the radius of the earth in miles, multiply to convert radians to miles
    # dist_matrix[i, j] is the distance between house i and store j
    dist_matrix = dist.pairwise(house_radians, store_radians) * 3959

    # Take the closest store per house straight from the matrix. This is done
    # row by row (not per property_id), so rows that share a property_id but
    # have different coordinates each get their own nearest store.
    nearest_idx = dist_matrix.argmin(axis=1)

    merged_df = houses_df.reset_index(drop=True)
    merged_df['starbucks_id'] = df2['starbucks_id'].to_numpy()[nearest_idx]
    merged_df['closest_starbucks (mi)'] = dist_matrix.min(axis=1)

    return merged_df

In [80]:
%%time 
# Time the brute-force search over every housing property and every Starbucks store.
# current fastest runtime 20:24 (presumably mm:ss -- TODO confirm)
pairwise_df = nearest_locations_pairwise(h_df, sb_df)

CPU times: total: 0 ns
Wall time: 0 ns


In [81]:
# Mean distance from each housing property to its nearest Starbucks (brute-force result),
# followed by a preview of the result rows.
# Avg distance to a starbucks nationwide = 8.15 miles
print(f'Avg distance to a starbucks nationwide: {pairwise_df["closest_starbucks (mi)"].mean()}')
pairwise_df.head()

## Ball tree method to find nearest starbucks to each house

In [82]:
def nearest_locations_balltree(df1, df2):
    '''
    Find the Starbucks store nearest to each housing property with a ball tree.

    df1 is the housing dataframe; it must contain 'latitude' and 'longitude'
        columns in degrees. It is not modified.
    df2 is the dataframe with all starbucks locations; it must contain
        'starbucks_id', 'store_name', 'store_lat' and 'store_long' (degrees).
        It is not modified.

    Returns a new dataframe with a fresh 0..n-1 index, one row per row of df1
    (in df1's order), holding df1's columns minus the coordinate columns
    ('latitude', 'longitude' and, if present, 'lat_radians', 'long_radians'),
    plus 'starbucks_id' and 'store_name' of the nearest store and
    'closest_starbucks (mi)', the distance to it in miles.
    '''
    # The Haversine metric expects [latitude, longitude] pairs in radians.
    # Only new arrays are created here, so the callers' dataframes stay untouched.
    house_radians = np.radians(df1[['latitude', 'longitude']]).to_numpy()
    store_radians = np.radians(df2[['store_lat', 'store_long']]).to_numpy()

    # Construct ball tree with starbucks coordinates
    ball = BallTree(store_radians, metric='haversine')

    # Query the tree with the housing coordinates; with k=1 both results have
    # shape (n_houses, 1)
    dist, idx = ball.query(house_radians, k=1)

    # Take the single neighbour column explicitly. np.squeeze would collapse a
    # one-house query to a 0-d array and break the positional lookup below.
    nearest_idx = idx[:, 0]
    # 3959 is the radius of the earth in miles, multiply to convert radians to miles
    miles = pd.Series(dist[:, 0] * 3959, name='closest_starbucks (mi)')

    # Look up the nearest store for every house by position
    sb_nearest = df2[['starbucks_id', 'store_name']].iloc[nearest_idx]

    # Coordinates are dropped from the result; the radian columns are dropped
    # as well when the caller's frame already carries them
    houses_filtered = df1.drop(
        columns=['latitude', 'longitude', 'lat_radians', 'long_radians'],
        errors='ignore',
    )

    # Align the three pieces by position, not by their original index labels
    final_df = pd.concat([
        houses_filtered.reset_index(drop=True),
        sb_nearest.reset_index(drop=True),
        miles,
    ], axis=1)

    return final_df

In [83]:
%%time
# Time the ball-tree search over every housing property.
# current fastest runtime 00:01.44 (mm:ss, matching the 1.44 s wall time recorded below)
balltree_df = nearest_locations_balltree(h_df, sb_df)

CPU times: total: 1.44 s
Wall time: 1.44 s


In [84]:
# Mean distance from each housing property to its nearest Starbucks (ball-tree result),
# followed by a preview of the result rows.
# Avg distance to a starbucks nationwide = 8.15mi
print(f'Avg distance to a starbucks nationwide: {balltree_df["closest_starbucks (mi)"].mean()}')
balltree_df.head()

Avg distance to a starbucks nationwide: 8.149464937759666


Unnamed: 0,property_id,property_name,address,city,state_name,starbucks_id,store_name,closest_starbucks (mi)
0,800006929,PARKVIEW APARTMENTS I,1334 Memorial Ln,Huntington,IN,1009517,Kroger - Ft Wayne #990,17.004137
1,800005604,Maple West Apartments,350 W Maple St,Pocatello,ID,7436,Fred Meyer-Pocatello #260,0.63602
2,800001529,FOSTER AVE APTS,13630 Foster Ave,Baldwin Park,CA,1015032,Target Baldwinpark T-1033,0.360616
3,800027853,ST. LUKE RESIDENTIAL H.C.F.,299 E River Rd,Oswego,NY,13542,Clay,25.169139
4,800225287,Adams Street Place,1024 Adams St,Jefferson City,MO,8388,Target Jefferson City T-752,2.16071


In [85]:
# Cross-check the two methods row by row: True means at least one distance disagrees.
# The distances are floats computed along two different code paths, so they are
# compared with a small tolerance (about a millionth of a mile) instead of exact equality.
(~np.isclose(pairwise_df['closest_starbucks (mi)'].to_numpy(),
             balltree_df['closest_starbucks (mi)'].to_numpy(),
             rtol=1e-6, atol=1e-6)).any()