### Toronto Neighbourhoods Notebook

Start by installing the required libraries

In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
import folium
import json
import requests
from pandas.io.json import json_normalize
import numpy as np
from sklearn.cluster import KMeans 
import matplotlib.cm as cm
import matplotlib.colors as colors

### Create the dataframe containing Neighbourhoods
Used pandas function read_html to pull the data from the Wikipedia page

In [2]:
d = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

Convert the relevant data to a dataframe and display the first 10 rows so we can understand the data better

In [3]:
df = pd.DataFrame(d[0])
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


Rename the Postal Code column and then drop any rows without a Borough assigned

In [4]:
df.rename(columns = {'Postal Code': 'PostalCode'}, inplace = True)

In [5]:
df.drop(df[df['Borough']=='Not assigned'].index,axis = 0, inplace = True)
df.reset_index(inplace = True, drop = True)

Import dataset containing Latitude and Longitude for Toronto postal codes; merge datasets on the PostalCode column

In [6]:
coords = pd.read_csv('Geospatial_Coordinates.csv')
df2 = df.merge(coords, on = 'PostalCode')
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Plot the neighborhoods on a map
Get the lat and long values for Toronto

In [7]:
geolocator = Nominatim(user_agent="explorer")

toronto = 'Toronto, Canada'

location = geolocator.geocode(toronto)
toronto_lat = location.latitude
toronto_lng = location.longitude

print("Toronto's lat and long are {} and {}".format(toronto_lat, toronto_lng))

Toronto's lat and long are 43.6534817 and -79.3839347


Create the map

In [8]:
map = folium.Map(zoom_start = 11, location = [toronto_lat+0.05, toronto_lng])

for neighborhood, borough, lat, lng in zip(df2['Neighborhood'], df2['Borough'], df2['Latitude'],df2['Longitude']):
    label = (neighborhood+', '+borough.upper())
    
    folium.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            icon = None,
            color='black',
            fill=True,
            fill_color='yellow',
            fill_opacity=1,
            popup = label).add_to(map)

map

Filter Boroughs that contain the word Toronto

In [9]:
df_toronto = df2[df2['Borough'].str.contains('Toronto')]
df_toronto.reset_index(inplace = True, drop = True)

Plot only central Toronto Neighborhoods

In [10]:
map_toronto = folium.Map(zoom_start = 12, location = [toronto_lat+0.02, toronto_lng])

for neighborhood, borough, lat, lng in zip(df_toronto['Neighborhood'], df_toronto['Borough'], df_toronto['Latitude'],df_toronto['Longitude']):
    label = (neighborhood+', '+borough.upper())
    
    folium.CircleMarker(
            [lat, lng],
            radius=5, # define how big you want the circle markers to be
            icon = None,
            color='black',
            fill=True,
            fill_color='yellow',
            fill_opacity=1,
            popup = label).add_to(map_toronto)

map_toronto

### Explore the neighborhoods using the Foursquare API
First, define Foursquare credentials and version

In [11]:
CLIENT_ID = 'LCBA4FF22TQHU5ETZ0GAWTQ4FR01AANEYOOCQM3YTLB0H2E4'
CLIENT_SECRET = 'TKF23YSE4JOY0MLDUHLPVRSEFQOIXTOK5ZHQ3NZ2PIVSJZ3N'
VERSION = '20180605'

In each case we'll aim to get the top 50 venues in a Neighborhood, within a radius of 500 meters

In [12]:
LIMIT = 50
RADIUS = 500

We'll use the explore endpoint of the Foursquare API to run a test for postal code M5A

In [13]:
postal_code = 'M5A'



url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&limit={}&ll={},{}&radius={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    LIMIT,
    float(df_toronto[df_toronto['PostalCode']==postal_code]['Latitude']),
    float(df_toronto[df_toronto['PostalCode']==postal_code]['Longitude']),
    RADIUS)

url

'https://api.foursquare.com/v2/venues/explore?client_id=LCBA4FF22TQHU5ETZ0GAWTQ4FR01AANEYOOCQM3YTLB0H2E4&client_secret=TKF23YSE4JOY0MLDUHLPVRSEFQOIXTOK5ZHQ3NZ2PIVSJZ3N&v=20180605&limit=50&ll=43.6542599,-79.3606359&radius=500'

In [14]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5eb928a10de0d9001ba9aa72'},
 'response': {'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 48,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.653446723052674,
          'lng': -79.3620167174383}],
        'distance': 143,
       

Clean the JSON and structure into a pandas dataframe

In [15]:
#Create the dataframe
venues = results['response']['groups'][0]['items']
venues = json_normalize(venues)
venues = venues.loc[:,['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']]

#Add the categories column
category_list=[]

for i in range(len(venues)):
    category = venues['venue.categories'][i][0]['name']
    category_list.append(category)

venues['category'] = category_list

#Tidy up the columns
venues.drop('venue.categories', inplace = True, axis = 1)
venues.columns = [col.split('.')[-1] for col in venues.columns]

venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,lat,lng,category
0,Roselle Desserts,43.653447,-79.362017,Bakery
1,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,Body Blitz Spa East,43.654735,-79.359874,Spa


Plot the locations within postal code M5A

In [16]:
map_m5a = folium.Map(location = [float(df_toronto[df_toronto['PostalCode']==postal_code]['Latitude']),
    float(df_toronto[df_toronto['PostalCode']==postal_code]['Longitude'])], zoom_start = 16)

for name, lat, lng in zip(venues['name'], venues['lat'], venues['lng']):
    folium.CircleMarker(
    [lat,lng],
    radius = 3,
    icon = None,
    color = 'red', 
    popup = name).add_to(map_m5a)
    
folium.CircleMarker([float(df_toronto[df_toronto['PostalCode']==postal_code]['Latitude']),
    float(df_toronto[df_toronto['PostalCode']==postal_code]['Longitude'])], 
                    radius = 10, 
                    color = 'black',
                   fill_color = 'black',
                   fill_opacity = 1).add_to(map_m5a)

map_m5a

### Create function to repeat the same process for all Toronto neighborhoods

In [17]:
df_venues = pd.DataFrame(columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'category', 'neighborhood', 'latitude', 'longitude'])

for neighborhood, lat, lng in zip(df_toronto['Neighborhood'], df_toronto['Latitude'], df_toronto['Longitude']):

    #Create the URL
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&limit={}&ll={},{}&radius={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    LIMIT,
    lat,
    lng,
    RADIUS)

    #Make the request, get the results
    results = requests.get(url).json()

    #Get the relevant information
    venues = results['response']['groups'][0]['items']
    venues = json_normalize(venues)
    venues = venues.loc[:,['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']]
    print(neighborhood+' has '+str(len(venues))+' venues')
    
    #Add the categories column
    category_list=[]
    for i in range(len(venues)):
        category = venues['venue.categories'][i][0]['name']
        category_list.append(category)
        
    venues['category'] = category_list

    venues['neighborhood'] = neighborhood
    venues['latitude'] = lat
    venues['longitude'] = lng
    
    df_venues = df_venues.append(venues, ignore_index = True)



Regent Park, Harbourfront has 48 venues
Queen's Park, Ontario Provincial Government has 34 venues
Garden District, Ryerson has 50 venues
St. James Town has 50 venues
The Beaches has 4 venues
Berczy Park has 50 venues
Central Bay Street has 50 venues
Christie has 17 venues
Richmond, Adelaide, King has 50 venues
Dufferin, Dovercourt Village has 18 venues
Harbourfront East, Union Station, Toronto Islands has 50 venues
Little Portugal, Trinity has 42 venues
The Danforth West, Riverdale has 42 venues
Toronto Dominion Centre, Design Exchange has 50 venues
Brockton, Parkdale Village, Exhibition Place has 23 venues
India Bazaar, The Beaches West has 20 venues
Commerce Court, Victoria Hotel has 50 venues
Studio District has 40 venues
Lawrence Park has 3 venues
Roselawn has 3 venues
Davisville North has 7 venues
Forest Hill North & West has 4 venues
High Park, The Junction South has 23 venues
North Toronto West has 20 venues
The Annex, North Midtown, Yorkville has 24 venues
Parkdale, Roncesvalle

Tidy up the dataframe

In [18]:
toronto = df_venues.copy()
toronto.drop('venue.categories', axis = 1, inplace = True)
toronto.columns = ['Venue', 'Venue Latitude', 'Venue Longitude', 'Category', 'Neighborhood', 'Neighborhood Lat', 'Neighborhood Lng']
cols = toronto.columns.to_list()[-3:]+toronto.columns.to_list()[:-3]
toronto = toronto[cols]
toronto.head()

Unnamed: 0,Neighborhood,Neighborhood Lat,Neighborhood Lng,Venue,Venue Latitude,Venue Longitude,Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


### Analyse each neighborhood
Start by creating dummies for each venue category

In [19]:
toronto_onehot = pd.get_dummies(toronto[['Category']], prefix = '', prefix_sep = '')

toronto_onehot['Neighborhood_name'] = toronto['Neighborhood']

columns = [toronto_onehot.columns[-1]]+list(toronto_onehot.columns[:-1])

toronto_onehot = toronto_onehot[columns]
toronto_onehot.head()

Unnamed: 0,Neighborhood_name,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group the rows by Neighborhood and take the mean of the frequency of each catrgory 

In [20]:
toronto_grouped = toronto_onehot.groupby('Neighborhood_name', as_index = False).sum()
toronto_grouped.head()

Unnamed: 0,Neighborhood_name,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Berczy Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,"Brockton, Parkdale Village, Exhibition Place",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Business reply mail Processing Centre,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,"CN Tower, King and Spadina, Railway Lands, Har...",1,1,1,2,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Central Bay Street,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


Next, write a function to put the venues in descending order

In [21]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [22]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood_name']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood_name'] = toronto_grouped['Neighborhood_name']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood_name,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Restaurant,Bakery,Café,Cheese Shop,Seafood Restaurant,Farmers Market,Fish Market
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Burrito Place,Nightclub,Restaurant,Italian Restaurant,Intersection,Bar,Stadium
2,Business reply mail Processing Centre,Light Rail Station,Yoga Studio,Spa,Garden Center,Garden,Fast Food Restaurant,Farmers Market,Comic Shop,Pizza Place,Burrito Place
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport,Harbor / Marina,Coffee Shop,Plane,Rental Car Location,Sculpture Garden,Boutique,Bar
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Bubble Tea Shop,Burger Joint,Ice Cream Shop,Department Store,Ramen Restaurant,Park


### Now we can cluster the neighborhoods

Run k-means clustering with 4 clusters

In [23]:
#Select the number of clusters
clusters = 4

toronto_clustering = toronto_grouped.drop('Neighborhood_name', axis = 1)

#Define the KMeans function
k_means = KMeans(init = 'k-means++', n_clusters = clusters, n_init = 12)

#Fit the model to the data
k_means.fit(toronto_clustering)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=12, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

Add the cluster label into the dataframe

In [24]:
neighborhoods_venues_sorted['Cluster'] = k_means.labels_
neighborhoods_venues_sorted

Unnamed: 0,Neighborhood_name,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster
0,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Restaurant,Bakery,Café,Cheese Shop,Seafood Restaurant,Farmers Market,Fish Market,0
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Coffee Shop,Burrito Place,Nightclub,Restaurant,Italian Restaurant,Intersection,Bar,Stadium,1
2,Business reply mail Processing Centre,Light Rail Station,Yoga Studio,Spa,Garden Center,Garden,Fast Food Restaurant,Farmers Market,Comic Shop,Pizza Place,Burrito Place,1
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport,Harbor / Marina,Coffee Shop,Plane,Rental Car Location,Sculpture Garden,Boutique,Bar,1
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Bubble Tea Shop,Burger Joint,Ice Cream Shop,Department Store,Ramen Restaurant,Park,2
5,Christie,Grocery Store,Café,Park,Candy Store,Baby Store,Nightclub,Diner,Coffee Shop,Athletics & Sports,Restaurant,1
6,Church and Wellesley,Sushi Restaurant,Coffee Shop,Restaurant,Yoga Studio,Japanese Restaurant,Men's Store,Gastropub,Burger Joint,Bookstore,Sake Bar,3
7,"Commerce Court, Victoria Hotel",Café,Coffee Shop,Hotel,Gym,American Restaurant,Deli / Bodega,Beer Bar,Seafood Restaurant,Japanese Restaurant,Restaurant,0
8,Davisville,Sandwich Place,Dessert Shop,Café,Pizza Place,Coffee Shop,Gym,Sushi Restaurant,Italian Restaurant,Pharmacy,Restaurant,3
9,Davisville North,Park,Department Store,Breakfast Spot,Sandwich Place,Food & Drink Shop,Hotel,Gym,Concert Hall,Discount Store,Electronics Store,1


Merge this dataframe with the dataframe containing latitude and longitude

In [29]:
df_toronto.rename(columns = {'Neighborhood': 'Neighborhood_name'}, inplace = True)

toronto_merged = df_toronto.copy()

toronto_merged = toronto_merged.merge(neighborhoods_venues_sorted, on = 'Neighborhood_name')

Map the clusters

In [27]:
map_cluster = folium.Map(zoom_start = 12, location = [toronto_lat+0.02, toronto_lng],tiles = 'Stamen Toner')

#Set color scheme for clusters
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


for lat, lng, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood_name'],toronto_merged['Cluster']):
        label = folium.Popup(str(poi) + ': Cluster ' + str(cluster), parse_html=True)
        folium.CircleMarker([lat, lng],
                            radius = 5,
                            popup = label,
                            color = rainbow[cluster-1],
                            fill = True,
                            fill_color = rainbow[cluster-1]
                            ).add_to(map_cluster)


map_cluster