Find neighbourhoods in a city that has a similar venue profile to an address in another city.
In this implementation, the code segments and clusters Toronto neighborhoods based on data scraped from the web and then calculates what neighbourhood profiles most closely match a given address (assumed to be in New York City). 

In [1]:
import pandas as pd
import numpy as np
import requests
import json

!pip install geocoder
import geocoder # import geocoder, used to source neighbourhood coordinates via Google

from scipy.spatial import distance



Gather user input and define variables

In [2]:
#define Google API key
API_key = input("Google API key: ")

CLIENT_ID = input('Foursquare client ID: ') # Foursquare ID
CLIENT_SECRET = input('Foursquare client secret: ') # Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

# reference address for neighbourhood match
ref_address = input('Reference address: ') 

Google API key: AIzaSyArcy6kc5IKhDGE3_IlQ2zjU1pY5pE2tew
Foursquare client ID: FSE4ZUITGDEMRGRLETJA1MYLJHATSRUS2ZNRCX3IKILU3R5V
Foursquare client secret: K5WH3QB2JB0JTPXBU34G4LKKRTTAEBAPUORGJHOG4OJUBILJ
Reference address: 150 W83rd Street, New York


Look up coordinates of reference address and determine nearby venues

In [3]:
# initialize variable
lat_lng_coords = None

#lookup and store reference address coordinates
g = geocoder.google(ref_address,key=API_key)
lat_lng_coords = g.latlng    
ref_latitude = lat_lng_coords[0]
ref_longitude = lat_lng_coords[1]

Define function that gets the top 100 venues within 500 meters of a given neighborhood

In [4]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
                   
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        if len(results) == 0: 
            # if no venues were found within the radius specified
            venues_list.append([(
                name, 
                lat, 
                lng, 
                "No venues nearby", 
                lat, 
                lng,  
                "None")])     
        else:
            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [5]:
ref_venues = getNearbyVenues(names=[ref_address], 
                                 latitudes=[ref_latitude],
                                 longitudes=[ref_longitude]
                             )

Encode venue labels to binary

In [6]:
# one hot encoding
ref_onehot = pd.get_dummies(ref_venues[['Venue Category']], prefix="", prefix_sep="")

# add label in neighborhood column to allign structure with neighbourhood dataframe
ref_onehot['Neighbourhood'] = "Current address"

# move neighborhood column to the first column
fixed_columns = [ref_onehot.columns[-1]] + list(ref_onehot.columns[:-1])
ref_onehot = ref_onehot[fixed_columns]

ref_grouped = ref_onehot.groupby('Neighbourhood').mean().reset_index()

Define function that sorts venues in descending order and then create a dataframe that shows the top 10 venues for the reference address as well as a dataframe that stores the venue profile for the top n venues

In [7]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)

    return row_categories_sorted.index.values[0:num_top_venues]

In [8]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
ref_venues_sorted = pd.DataFrame(columns=columns)
ref_venues_sorted['Neighbourhood'] = ref_grouped['Neighbourhood']

for ind in np.arange(ref_grouped.shape[0]):
    ref_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ref_grouped.iloc[ind, :], num_top_venues)

In [9]:
#extract the venue profile for the top n venues in the reference neighbourhood

top_venues = pd.DataFrame(ref_grouped[ref_venues_sorted.iloc[0,1:]])
top_venues['Neighbourhood']=ref_address
top_venues.set_index('Neighbourhood',inplace=True)
#top_venues.head()

Scrape Toronto neighborhood data from Wikipedia

In [10]:
#set wikipedia url that shows Toronto neighborhoods and postal codes
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [11]:
df_raw = pd.read_html(wiki_url)

In [12]:
#Read the first table on the page into a data frame (assuming that the first table contains the neighborhood data)
neighborhoods = pd.DataFrame(df_raw[0])

In [13]:
#drop unassigned post codes
neighborhoods=neighborhoods[neighborhoods['Borough']!='Not assigned'].reset_index(drop=True)

Get neighborhood coordinates from Google

In [14]:
# initialize variables
lat_lng_coords = None
latitude = []
longitude = []

#Lookup coordinates for postal codes and store in latitude, longitude lists
for index, postal_code in zip(range(0,len(neighborhoods['Postal Code'])),neighborhoods['Postal Code']):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code),key=API_key)
    lat_lng_coords = g.latlng    
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

In [15]:
neighborhoods['Latitude'] =  latitude
neighborhoods['Longitude'] =  longitude

Create new data frame that includes venue data for each neighborhood in Toronto

In [16]:
Toronto_venues = getNearbyVenues(names=neighborhoods['Neighbourhood'], 
                                 latitudes=neighborhoods['Latitude'],
                                 longitudes=neighborhoods['Longitude']
                                  )

Encode venue labels to binary

In [17]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighbourhood'] = Toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_grouped = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()

In [18]:
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = Toronto_grouped['Neighbourhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

#neighborhoods_venues_sorted.head()

Clean data set where a neighbourhood has less that the num_top_venues number of venues

In [19]:
neighborhoods_venues_sorted_clean = neighborhoods_venues_sorted.copy()
for nhood in range(neighborhoods_venues_sorted_clean.shape[0]):
    for ven in range(num_top_venues):
        current_venue = neighborhoods_venues_sorted_clean.iloc[nhood,ven+1]
        if Toronto_grouped.loc[nhood,current_venue] == 0.0:
            neighborhoods_venues_sorted_clean.iloc[nhood,ven+1] = "None"

Subset of neighbourhood data that shows the overlapping profile with the refrence address

In [20]:
Toronto_venue_profile = pd.DataFrame(Toronto_grouped[ref_venues_sorted.iloc[0,1:]])
Toronto_venue_profile['Neighbourhood'] = Toronto_grouped['Neighbourhood']
Toronto_venue_profile.set_index('Neighbourhood',inplace=True)

KeyError: "['Planetarium'] not in index"

Calculate similarity of Toronto neighbourhood venue profile to the reference address, using cosine distance

In [None]:
similarity_cos = pd.Series(dtype='float64')
for candidate in Toronto_grouped['Neighbourhood']:
       similarity_cos[candidate] = distance.cosine(top_venues.iloc[0,:], Toronto_venue_profile.loc[candidate])
similarity_cos.sort_values(ascending=True,inplace=True)

In [None]:
similarity

In [None]:
Toronto_venue_profile.head()

In [None]:
Toronto_grouped.head()

In [None]:
ref_grouped.head()

In [None]:
neighborhoods_venues_sorted.head()

In [None]:
ref_venues_sorted.head()

In [None]:
neighborhoods_venues_sorted_clean.head()

In [None]:
ref_venues.head()

In [None]:
ref_grouped.head()

In [None]:
Toronto_venue_profile.loc['St. James Town']

In [None]:
top_venues.iloc[0,:]

In [None]:
Toronto_venue_profile.loc['Kensington Market, Chinatown, Grange Park']

In [None]:
top_venues.iloc[0,:]

In [None]:
test1 = 0.25*top_venues.iloc[0,:]

In [None]:
test1

In [None]:
test2 = pd.Series(top_venues.iloc[0,:],copy=True)
test3 = pd.Series(top_venues.iloc[0,:],copy=True)

In [None]:
test2.iloc[0] = 0.0
test2.iloc[1] = 0.0
test3.iloc[-1] = 0.0
test3.iloc[-2] = 0.0

In [None]:
test2

In [None]:
test3

In [None]:
similarity['0.25 values'] = distance.euclidean(top_venues.iloc[0,:], test1)
similarity['1st 2 missing'] = distance.euclidean(top_venues.iloc[0,:], test2)
similarity['last 2 missing'] = distance.euclidean(top_venues.iloc[0,:], test3)
similarity.sort_values(ascending=True,inplace=True)

In [None]:
similarity

In [None]:
distance.cosine(top_venues.iloc[0,:], test2)

In [None]:
distance.cosine(top_venues.iloc[0,:], Toronto_venue_profile.loc['St. James Town'])

In [None]:
similarity_cos = pd.Series(dtype='float64')
for candidate in Toronto_grouped['Neighbourhood']:
       similarity_cos[candidate] = distance.cosine(top_venues.iloc[0,:], Toronto_venue_profile.loc[candidate])
similarity_cos.sort_values(ascending=True,inplace=True)

In [None]:
similarity_cos

In [None]:
similarity_cos['0.25 values'] = distance.cosine(top_venues.iloc[0,:], test1)
similarity_cos['1st 2 missing'] = distance.cosine(top_venues.iloc[0,:], test2)
similarity_cos['last 2 missing'] = distance.cosine(top_venues.iloc[0,:], test3)
similarity_cos.sort_values(ascending=True,inplace=True)

In [None]:
similarity_cos[0:9]

In [None]:
top_venues.iloc[0,:]

In [None]:
Toronto_venue_profile.loc['St. James Town']

In [None]:
Toronto_venue_profile.loc['Studio District']

In [21]:
ref_venues_sorted.iloc[0,1:]

1st Most Common Venue                         Café
2nd Most Common Venue                          Bar
3rd Most Common Venue                  Coffee Shop
4th Most Common Venue                  Planetarium
5th Most Common Venue             Sushi Restaurant
6th Most Common Venue     Mediterranean Restaurant
7th Most Common Venue                     Wine Bar
8th Most Common Venue                       Bakery
9th Most Common Venue           Italian Restaurant
10th Most Common Venue              Ice Cream Shop
Name: 0, dtype: object

In [24]:
Toronto_grouped.iloc[0,205:]

Pizza Place        0
Plane              0
Playground         0
Plaza              0
Poke Place         0
                  ..
Warehouse Store    0
Wine Bar           0
Wings Joint        0
Women's Store      0
Yoga Studio        0
Name: 0, Length: 66, dtype: object