# Toronto Neighbourhoods Clusters

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

from bs4 import BeautifulSoup

### Parse a wikipedia table with BeautifulSoup to obtain postcodes, borough and neighbourhood names

In [2]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})

tdlist = table.findAll('td')

In [3]:
# first element of tdlist is the postcode, second element is the borough name and third element is the neighbourhood name

postcodes = []
bor = []
neigh = []

count = 1
for idx, l in enumerate(tdlist):    
    if count == 1:
        postcodes.append(l.get_text())
        count += 1
    elif count == 2:
        bor.append(l.get_text())
        count += 1
    elif count == 3:
        neigh.append(l.get_text().strip())
        count = 1


### Construct a dataframe with the Toronto neighbourhood information

In [4]:
toronto = pd.DataFrame()
toronto['Postcode'] = postcodes
toronto['Borough'] = bor
toronto['Neighbourhood'] = neigh

In [5]:
# remove rows where the borough is 'Not assigned' to a  postcode

toronto = toronto[toronto['Borough'] != 'Not assigned']

In [6]:
# search for not assigned neighbourhoods and name them after the borough

for idx, row in toronto.iterrows():
    if str(row['Neighbourhood']) == 'Not assigned':
        toronto.at[idx, 'Neighbourhood'] = str(row['Borough'])

In [7]:
# group neighbourhoods with the same postcode

toronto = toronto.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()

In [8]:
toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
toronto.shape 

(103, 3)

### Add latitude and longitude values for each neighbourhood

In [10]:
import geocoder

In [11]:
#iterate through the postcodes and obtain the coordinates. A while loop is used to assure getting coordinate values
'''
for idx, row in toronto.iterrows():
    pc = row['Postcode']
    coords = None
    while coords is None:
        g = geocoder.google('{}, Toronto, Ontario'.format(str(pc)))
        coords = g.latlng
    toronto.at[idx, 'Latitude'] = coords[0]
    toronto.at[idx, 'Longitude'] = coords[1]
    
'''   

"\nfor idx, row in toronto.iterrows():\n    pc = row['Postcode']\n    coords = None\n    while coords is None:\n        g = geocoder.google('{}, Toronto, Ontario'.format(str(pc)))\n        coords = g.latlng\n    toronto.at[idx, 'Latitude'] = coords[0]\n    toronto.at[idx, 'Longitude'] = coords[1]\n    \n"

Due to problems with obtaining coordinates from Geocoder the provided csv file is used to gather coordinates

In [12]:
toronto_ll = pd.read_csv('./assignment_files/Geospatial_Coordinates.csv')

In [13]:
# rename column name to 'Postcode'
toronto_ll = toronto_ll.rename(columns={'Postal Code':'Postcode'})

In [14]:
# join the dataframes on the postcode
toronto = toronto.merge(toronto_ll, how='inner', on='Postcode')

In [15]:
toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [16]:
toronto.shape

(103, 5)

## Explore venues in the neighbourhoods and cluster the neighbourhoods with k-Means clustering

In [17]:
address = 'Toronto'

geolocator = Nominatim(user_agent='Toronto-assignment')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Read in csv file with toronto values aquired with the foursquare API

In [19]:
toronto_venues = pd.read_csv('./assignment_files/toronto_venues.csv')

In [20]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",2,2,2,2,2,2
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",12,12,12,12,12,12
"Alderwood, Long Branch",10,10,10,10,10,10
"Bathurst Manor, Downsview North, Wilson Heights",16,16,16,16,16,16
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",25,25,25,25,25,25
Berczy Park,54,54,54,54,54,54
"Birch Cliff, Cliffside West",4,4,4,4,4,4


In [21]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 268 uniques categories.


In [36]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix='', prefix_sep='')

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [37]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()

In [38]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [39]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)


### Cluster neighbourhoods

In [26]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 3, 0, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3,
       3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 0, 3, 3, 0,
       3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,
       3, 0, 3, 3, 3, 0, 3, 3, 3, 3, 3, 0], dtype=int32)

In [27]:
kmeans.labels_.shape

(100,)

In [28]:
# add clustering labels
toronto_merged = toronto.merge(neighbourhoods_venues_sorted, how='inner', on='Neighbourhood')
toronto_merged['Cluster Labels'] = kmeans.labels_

toronto_merged.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Print Shop,Fast Food Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dim Sum Restaurant,Dumpling Restaurant,3
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Golf Course,Construction & Landscaping,Bar,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Yoga Studio,3
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Mexican Restaurant,Pizza Place,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Yoga Studio,Discount Store,Dog Run,Doner Restaurant,0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Coffee Shop,Korean Restaurant,Insurance Office,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,3
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Athletics & Sports,Hakka Restaurant,Lounge,Bakery,Caribbean Restaurant,Bank,Thai Restaurant,Fried Chicken Joint,Drugstore,Doner Restaurant,3


In [29]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
rainbow = ['cyan', 'green', 'red', 'blue']

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.4).add_to(map_clusters)
       
map_clusters

### Cluster 0

In [30]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
2,Scarborough,Mexican Restaurant,Pizza Place,Rental Car Location,Electronics Store,Breakfast Spot,Medical Center,Yoga Studio,Discount Store,Dog Run,Doner Restaurant,0
13,Scarborough,Pizza Place,Pharmacy,Fast Food Restaurant,Italian Restaurant,Noodle House,Shopping Mall,Fried Chicken Joint,Chinese Restaurant,Thai Restaurant,Doner Restaurant,0
16,North York,Fast Food Restaurant,Dog Run,Mediterranean Restaurant,Pool,Golf Course,Drugstore,Diner,Discount Store,Doner Restaurant,Donut Shop,0
40,East Toronto,Gym,Fast Food Restaurant,Sushi Restaurant,Steakhouse,Ice Cream Shop,Italian Restaurant,Pub,Fish & Chips Shop,Burrito Place,Sandwich Place,0
46,Central Toronto,Restaurant,Playground,Gym,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,0
58,Downtown Toronto,Coffee Shop,Café,Hotel,Restaurant,American Restaurant,Gym,Deli / Bodega,Gastropub,Italian Restaurant,Japanese Restaurant,0
62,Central Toronto,Sushi Restaurant,Bus Line,Trail,Jewelry Store,Yoga Studio,Diner,Dog Run,Doner Restaurant,Donut Shop,Drugstore,0
65,Downtown Toronto,Café,Bar,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Coffee Shop,Bakery,Mexican Restaurant,Chinese Restaurant,Dumpling Restaurant,Burger Joint,0
72,York,Park,Pharmacy,Market,Fast Food Restaurant,Women's Store,Gym,Dessert Shop,Event Space,Ethiopian Restaurant,Empanada Restaurant,0
74,West Toronto,Discount Store,Pharmacy,Supermarket,Bakery,Gym / Fitness Center,Pool,Smoke Shop,Brewery,Fast Food Restaurant,Bar,0


### Cluster 1

In [31]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
80,West Toronto,Café,Mexican Restaurant,Fried Chicken Joint,Gastropub,Bar,Diner,Bakery,Speakeasy,Furniture / Home Store,Flea Market,1


### Cluster 2

In [32]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
6,Scarborough,Discount Store,Train Station,Department Store,Bus Station,Coffee Shop,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Yoga Studio,2
26,North York,Coffee Shop,Pharmacy,Deli / Bodega,Shopping Mall,Fast Food Restaurant,Bridal Shop,Sandwich Place,Diner,Bank,Restaurant,2


### Cluster 3

In [33]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,Scarborough,Print Shop,Fast Food Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dim Sum Restaurant,Dumpling Restaurant,3
1,Scarborough,Golf Course,Construction & Landscaping,Bar,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Yoga Studio,3
3,Scarborough,Coffee Shop,Korean Restaurant,Insurance Office,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,3
4,Scarborough,Athletics & Sports,Hakka Restaurant,Lounge,Bakery,Caribbean Restaurant,Bank,Thai Restaurant,Fried Chicken Joint,Drugstore,Doner Restaurant,3
5,Scarborough,Playground,Convenience Store,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Electronics Store,3
7,Scarborough,Bus Line,Bakery,Park,Soccer Field,Metro Station,Intersection,Fast Food Restaurant,Coworking Space,Donut Shop,Farmers Market,3
8,Scarborough,Movie Theater,American Restaurant,Motel,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,3
9,Scarborough,College Stadium,Café,Skating Rink,General Entertainment,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,3
10,Scarborough,Indian Restaurant,Furniture / Home Store,Chinese Restaurant,Pet Store,Vietnamese Restaurant,Light Rail Station,Latin American Restaurant,Diner,Discount Store,Dog Run,3
11,Scarborough,Smoke Shop,Middle Eastern Restaurant,Breakfast Spot,Bakery,Yoga Studio,Dumpling Restaurant,Doner Restaurant,Donut Shop,Drugstore,Electronics Store,3
