This notebook will cluster Toronto neighborhoods by healthy life alternatives

Import required libraries

In [1]:
import pandas as pd
import numpy as np
#!conda install -c conda-forge folium=0.5.0 --yes     #uncomment to install folium
import folium
import requests # library to handle requests
import wget
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

Read Toronto Postal Codes from Wikipedia

In [2]:

#read webpage #########################################
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
tables = pd.read_html(url, header=0, keep_default_na=False)

# find the table with matching header #########################################
headings = ['Postcode', 'Borough', 'Neighbourhood']
for df in tables:     #pandas.core.frame.DataFrame ###
    current_headings = df.columns.values[:len(headings)]
    if len(current_headings) != len(headings):   # same header lenght #######
        continue
    if all(current_headings == headings):    # same header ###########
        break


 Clean, correct and prepare the information

In [3]:

#Clean unwanted charachers ########
df['Borough'].replace({r'.*!(.*)': r'\1'}, regex=True, inplace=True)  
df['Neighbourhood'].replace({r'.*!(.*)': r'\1'}, regex=True, inplace=True)

#Correct wrong input in Wikipedia for Queen's Park ######
df.loc[df.Postcode == 'M7A', 'Borough'] = 'Downtown Toronto'
df.loc[df.Postcode == 'M7A', 'Neighbourhood'] = "Queen's Park"
df.loc[df.Postcode == 'M9A', 'Borough'] = 'Etobicoke'
df.loc[df.Postcode == 'M9A', 'Neighbourhood'] = "Humber Valley Village"

# Ignore cells with a Borough that is Not assigned ##########
df = df[df.Borough != "Not assigned"]

# Not assigned Neighbourhood, use Borough #############
pd.set_option('mode.chained_assignment', None)  #clear warning ####
df.loc[df.Neighbourhood == 'Not assigned', 'Neighbourhood'] = df.Borough

# Merge Boroughs under same Postcode, separating by commas #############
df2 = df[['Postcode', 'Borough']]
df2.drop_duplicates(inplace = True)
df2 = df2.groupby('Postcode')['Borough'].agg(lambda col: ', '.join(col)).to_frame()

# Merge Neighbourhoods under same Postcode, separating by commas #############
df3 = df[['Postcode', 'Neighbourhood']]
df3.drop_duplicates(inplace = True)
df3 = df3.groupby('Postcode')['Neighbourhood'].agg(lambda col: ', '.join(col)).to_frame()

# Join table with Boroughs and Neighbourhoods for same Postcode #######
df4 = pd.merge(df2, df3, left_on='Postcode', right_on='Postcode')
df4.head(15)


Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
M1N,Scarborough,"Birch Cliff, Cliffside West"


In [4]:
df4.shape

(103, 2)

######################## End of first part #############################################
######################## End of first part #############################################
######################## End of first part #############################################

Include coordinates from csv file, filter Toronto Boroughs

In [5]:
!wget -O Geospatial_Coordinates.csv https://cocl.us/Geospatial_data
dfCoord = pd.read_csv('Geospatial_Coordinates.csv')
city_data = pd.merge(df4, dfCoord, left_on='Postcode', right_on='Postal Code')
city_data = city_data[['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']]

city_data = city_data[city_data['Borough'].str.contains("Toronto")]
city_data.head(15)


"wget" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


######################## End of second part #############################################
######################## End of second part #############################################
######################## End of second part #############################################

Show Toronto Neighbourhoods on map

In [6]:
#Downtown Toronto coordinates (University of Toronto)
latitude = 43.662574
longitude = -79.395566
map_City = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighbourhood, postcode in zip(city_data['Latitude'], city_data['Longitude'], city_data['Borough'], city_data['Neighbourhood'] , city_data['Postal Code']  ):
    label = '{}. {}. {}'.format(postcode, borough, neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_City)  
map_City

In [7]:
# @hidden_cell
CLIENT_ID = 'K23D1GECXQJW35IMLTISFPWJCTL41WFEFUDH3ITHRZBPQ3JU'
CLIENT_SECRET = 'YE3120DLTKKX3H10W5XKGQ5CQ24TI3YRGSKZMLQKRO2AJ1GE'
VERSION = '20180605'
#https://scipython.com/blog/scraping-a-wikipedia-table-with-pandas/


Parameters for retrieving venues

In [8]:
LIMIT = 200 # limit of number of venues returned by Foursquare API
radius = 400 # define radius
num_top_venues = 5

Function for retrieving venues from Foursquare

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Obtain venues from Foursquare for each neighbourhood

In [10]:
city_venues = getNearbyVenues(names=city_data['Neighbourhood'],
                                   latitudes=city_data['Latitude'],
                                   longitudes=city_data['Longitude'])

Backup Foursquare results before filtering by venue type

In [11]:
city_venues_unfiltered = city_venues   #backup

Filtering venues by healthy choices. Some neighbourhoods do not have healthy venues, will by clustered together in Cluster -1. 

In [12]:
Healthy = ['Trail', 'Health Food Store', 'Other Great Outdoors', 'Yoga Studio', 
           'Fruit & Vegetable Store', 'Gym', 'Gym / Fitness Center', 
           'Swim School', 'Dance Studio', 'Athletics & Sports']
city_venues = city_venues_unfiltered[city_venues_unfiltered['Venue Category'].isin(Healthy)]
city_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
13,"The Danforth West, Riverdale",43.679557,-79.352188,Moksha Yoga Danforth,43.677622,-79.352116,Yoga Studio
15,"The Danforth West, Riverdale",43.679557,-79.352188,Valley Farm Produce,43.677999,-79.349969,Fruit & Vegetable Store


Analyze each neighbourhood. Obtain frequencies for each venue type

In [13]:
# one hot encoding
city_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
city_onehot['Neighbourhood'] = city_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [city_onehot.columns[-1]] + list(city_onehot.columns[:-1])
city_onehot = city_onehot[fixed_columns]

# group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category
city_grouped = city_onehot.groupby('Neighbourhood').mean().reset_index()
city_grouped.head(5)

Unnamed: 0,Neighbourhood,Athletics & Sports,Dance Studio,Fruit & Vegetable Store,Gym,Gym / Fitness Center,Health Food Store,Other Great Outdoors,Swim School,Trail,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5
2,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.333333
4,Christie,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Function for obtaining most common venues per neighbourhood. 

In [14]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Show most common venue types per neighbourhood. 

In [15]:
# common venues per neighbourhood #########
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = city_grouped['Neighbourhood']

for ind in np.arange(city_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(city_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head(5)

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Gym,Gym / Fitness Center,Yoga Studio,Trail,Swim School
1,"Brockton, Exhibition Place, Parkdale Village",Yoga Studio,Gym,Trail,Swim School,Other Great Outdoors
2,Business Reply Mail Processing Centre 969 Eastern,Gym / Fitness Center,Yoga Studio,Trail,Swim School,Other Great Outdoors
3,Central Bay Street,Gym / Fitness Center,Yoga Studio,Trail,Swim School,Other Great Outdoors
4,Christie,Athletics & Sports,Yoga Studio,Trail,Swim School,Other Great Outdoors


Cluster neighbourhoods using K-Means algorithm

In [16]:
kclusters = 5
city_grouped_clustering = city_grouped.drop('Neighbourhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(city_grouped_clustering)

Add clustering labels 

In [17]:
neighbourhoods_venues_sorted.insert(0, 'Cluster', kmeans.labels_)
neighbourhoods_venues_sorted.head(5)

Unnamed: 0,Cluster,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,0,"Adelaide, King, Richmond",Gym,Gym / Fitness Center,Yoga Studio,Trail,Swim School
1,0,"Brockton, Exhibition Place, Parkdale Village",Yoga Studio,Gym,Trail,Swim School,Other Great Outdoors
2,2,Business Reply Mail Processing Centre 969 Eastern,Gym / Fitness Center,Yoga Studio,Trail,Swim School,Other Great Outdoors
3,2,Central Bay Street,Gym / Fitness Center,Yoga Studio,Trail,Swim School,Other Great Outdoors
4,3,Christie,Athletics & Sports,Yoga Studio,Trail,Swim School,Other Great Outdoors


Assign clusters to neighbourhoods. Neighbourhoods with no healthy venues are clustered together as Cluster -1. 

In [18]:
# merge city_grouped with city_data to add latitude/longitude for each neighbourhood
city_merged = city_data.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), 
                               on='Neighbourhood')

city_merged['Cluster'] = city_merged['Cluster'].fillna(-1)    #neighborhoods with no facilities
city_merged['Cluster'] = city_merged['Cluster'].astype(int)
city_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Trail,Other Great Outdoors,Health Food Store,Yoga Studio,Swim School
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Yoga Studio,Trail,Fruit & Vegetable Store,Swim School,Other Great Outdoors
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Gym,Yoga Studio,Trail,Swim School,Other Great Outdoors
43,M4M,East Toronto,Studio District,43.659526,-79.340923,2,Yoga Studio,Gym / Fitness Center,Trail,Swim School,Other Great Outdoors
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Swim School,Gym / Fitness Center,Yoga Studio,Trail,Other Great Outdoors


Map Clusters

In [19]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(city_merged['Latitude'], city_merged['Longitude'], city_merged['Neighbourhood'], city_merged['Cluster']):
    label = folium.Popup('Cluster ' + str(cluster) + ': ' +str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

Cluster -1: No healthy venues

In [20]:
city_merged[city_merged['Cluster'] == -1].head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,-1,,,,,
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,-1,,,,,
51,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,-1,,,,,
56,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,-1,,,,,
63,M5N,Central Toronto,Roselawn,43.711695,-79.416936,-1,,,,,


Cluster 0: Gym's

In [21]:
city_merged[city_merged['Cluster'] == 0].head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Gym,Yoga Studio,Trail,Swim School,Other Great Outdoors
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Gym,Yoga Studio,Trail,Swim School,Other Great Outdoors
47,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Gym,Yoga Studio,Trail,Swim School,Other Great Outdoors
52,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,0,Gym,Yoga Studio,Dance Studio,Trail,Swim School
54,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,0,Other Great Outdoors,Gym / Fitness Center,Gym,Yoga Studio,Trail


Cluster 1: Trails

In [22]:
city_merged[city_merged['Cluster'] == 1].head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Trail,Other Great Outdoors,Health Food Store,Yoga Studio,Swim School
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Yoga Studio,Trail,Fruit & Vegetable Store,Swim School,Other Great Outdoors
50,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,1,Trail,Yoga Studio,Swim School,Other Great Outdoors,Health Food Store
64,M5P,Central Toronto,"Forest Hill North, Forest Hill West",43.696948,-79.411307,1,Trail,Yoga Studio,Swim School,Other Great Outdoors,Health Food Store


Cluster 2: Yoga

In [23]:
city_merged[city_merged['Cluster'] == 2].head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
43,M4M,East Toronto,Studio District,43.659526,-79.340923,2,Yoga Studio,Gym / Fitness Center,Trail,Swim School,Other Great Outdoors
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Swim School,Gym / Fitness Center,Yoga Studio,Trail,Other Great Outdoors
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,2,Yoga Studio,Gym / Fitness Center,Trail,Swim School,Other Great Outdoors
53,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,2,Health Food Store,Gym / Fitness Center,Yoga Studio,Trail,Swim School
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,2,Gym / Fitness Center,Yoga Studio,Trail,Swim School,Other Great Outdoors


Cluster 3: Athletics & Sports

In [24]:
city_merged[city_merged['Cluster'] == 3].head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
75,M6G,Downtown Toronto,Christie,43.669542,-79.422564,3,Athletics & Sports,Yoga Studio,Trail,Swim School,Other Great Outdoors


Cluster 4: Dance

In [25]:
city_merged[city_merged['Cluster'] == 4].head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
59,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,4,Dance Studio,Yoga Studio,Trail,Swim School,Other Great Outdoors
