In [34]:
import pandas as pd
import numpy as np
import sklearn.neighbors

In [35]:
sb_df = pd.read_csv('starbucks_us.csv')

In [36]:
sb_df = sb_df[['starbucksid', 'name', 'street1', 'city', 'countrysubdivisioncode', 'postalcode', 'lat', 'lon']]

sb_df.rename(columns={'starbucksid': 'starbucks_id', 'name': 'store_name', 'street1': 'store_address', 
                      'city': 'store_city', 'countrysubdivisioncode': 'store_state','postalcode': 'store_zip', 
                      'lat': 'store_lat', 'lon': 'store_long'}, inplace=True)

In [37]:
sb_df.head()

Unnamed: 0,starbucks_id,store_name,store_address,store_city,store_state,store_zip,store_lat,store_long
0,10357,Target Virginia T-847,1001 13th St S,Virginia,MN,557923254.0,47.511105,-92.549222
1,1006159,Target Vadnais Heights T-751,975 County Rd E,Vadnais Heights,MN,55127.0,45.05041,-93.061545
2,1008940,Highway 169 and Freeport-Elk River,19179 Freeport Ave,Elk River,MN,55330.0,45.319902,-93.564261
3,14598,St. Paul-Grand Ave & Oxford St,1062 Grand Ave.,St. Paul,MN,551053805.0,44.939874,-93.144947
4,12449,Target Minneapolis T-1375,900 Nicollet Mall,Minneapolis,MN,554032530.0,44.970518,-93.277707


In [38]:
h_df = pd.read_csv("multifamily_physical_inspection_scores_0321.csv")

In [39]:
h_df = h_df[['PROPERTY_ID', 'PROPERTY_NAME', 'ADDRESS', 'CITY', 'STATE_NAME', 'ZIPCODE', 'LATITUDE', 'LONGITUDE']]

h_df.columns = [col.lower() for col in h_df.columns]

In [40]:
h_df.head()

Unnamed: 0,property_id,property_name,address,city,state_name,zipcode,latitude,longitude
0,800006929,PARKVIEW APARTMENTS I,1334 Memorial Ln,Huntington,IN,46750.0,40.87955,-85.513698
1,800005604,Maple West Apartments,350 W Maple St,Pocatello,ID,83201.0,42.880722,-112.45448
2,800001529,FOSTER AVE APTS,13630 Foster Ave,Baldwin Park,CA,91706.0,34.074928,-117.976232
3,800027853,ST. LUKE RESIDENTIAL H.C.F.,299 E River Rd,Oswego,NY,13126.0,43.44498,-76.493814
4,800225287,Adams Street Place,1024 Adams St,Jefferson City,MO,65101.0,38.566468,-92.177775


## Define function to find nearest starbucks to each house

In [41]:
# create function to find closest starbucks and add to column of df

def nearest_location(df1, df2):
    '''
    df1 is the dataframe you want to append the nearest starbucks to
    df2 is the dataframe with all starbucks locations
    '''
    # make a copy of dfs since we will be modifying them
    houses_df = df1
    starbucks_df = df2
    
    # add columns to convert latitude and longitude to radians
    houses_df[['lat_radians', 'long_radians']] = (np.radians(houses_df.loc[:, ['latitude', 'longitude']]))
    starbucks_df[['sb_lat_radians', 'sb_long_radians']] = (np.radians(starbucks_df.loc[:, ['store_lat', 'store_long']]))
    
    # use Haversine distance since earth is sphere
    dist = sklearn.neighbors.DistanceMetric.get_metric('haversine')

    # 3959 is the radius of the earth in miles, multiply to convert radians to miles
    dist_matrix = (dist.pairwise(houses_df[['lat_radians','long_radians']],
                                 starbucks_df[['sb_lat_radians','sb_long_radians']])*3959)

    # resultant df is a matrix of distances between all properties and starbucks stores
    distances_df = (pd.DataFrame(dist_matrix, 
                                 index=houses_df['property_id'], 
                                 columns=starbucks_df['starbucks_id']))
    
    # Unpivot above dataframe from wide format to long format.
    distances_df = (pd.melt(distances_df.reset_index(), id_vars='property_id'))
    
    # When you unpivot, the data in the pivot table becomes a column named 'value'. 
    # Rename this column to 'miles' for clarity.
    distances_df.rename(columns={'value':'miles'}, inplace=True)
    
    # Sort all location pairs by distance, keep the closest distance.
    distances_df.sort_values(['property_id', 'miles'], ascending = [False, True], inplace=True)
    distances_df.drop_duplicates(subset = ['property_id'], keep = 'first', inplace=True)
    distances_df.reset_index(drop = True, inplace=True)
    
    # Merge df with distance pairs back into original df
    merged_df = houses_df.merge(distances_df, on='property_id')
    merged_df.rename(columns={'miles': 'closest_starbucks (mi)'}, inplace=True)
    
    return merged_df

In [42]:
%%time

final_df = nearest_location(h_df, sb_df)

CPU times: user 17min 20s, sys: 3min 4s, total: 20min 24s
Wall time: 22min 6s


In [43]:
final_df.head()

Unnamed: 0,property_id,property_name,address,city,state_name,zipcode,latitude,longitude,lat_radians,long_radians,starbucks_id,closest_starbucks (mi)
0,800006929,PARKVIEW APARTMENTS I,1334 Memorial Ln,Huntington,IN,46750.0,40.87955,-85.513698,0.713483,-1.492496,1009517,17.004152
1,800005604,Maple West Apartments,350 W Maple St,Pocatello,ID,83201.0,42.880722,-112.45448,0.74841,-1.962701,7436,0.636081
2,800001529,FOSTER AVE APTS,13630 Foster Ave,Baldwin Park,CA,91706.0,34.074928,-117.976232,0.59472,-2.059074,1015032,0.360384
3,800027853,ST. LUKE RESIDENTIAL H.C.F.,299 E River Rd,Oswego,NY,13126.0,43.44498,-76.493814,0.758258,-1.335069,13542,25.169199
4,800225287,Adams Street Place,1024 Adams St,Jefferson City,MO,65101.0,38.566468,-92.177775,0.673112,-1.608806,8388,2.160519


In [46]:
# Avg distance to a starbucks nationwide
final_df['closest_starbucks (mi)'].mean()

8.149462758320178

In [None]:
#df.to_csv('nearest_starbucks.csv', index = False)