### Segmenting and Clustering Neighborhoods in Toronto

### Task 1

In [2]:
import pandas as pd

Scrape the table from Wikipedia.

In [3]:
# read_html returns every table it can parse from the page; keep the first one.
wiki_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
toronto = wiki_tables[0]

Drop all the rows without an assigned borough. Reset the index.  

In [4]:
# Keep only the rows whose borough is assigned, then renumber them from 0.
has_borough = toronto.Borough != "Not assigned"
toronto = toronto[has_borough].reset_index(drop = True)

If the neighbourhood is not assigned, set it to be the borough.

In [5]:
# Where the neighbourhood is "Not assigned", copy the borough name into it.
# A single label-based .loc assignment writes to the frame itself; the
# previous `toronto.Neighbourhood[i] = ...` form was chained assignment, which
# pandas warns about and which does not modify the frame under Copy-on-Write.
unassigned = toronto.Neighbourhood == "Not assigned"
toronto.loc[unassigned, "Neighbourhood"] = toronto.loc[unassigned, "Borough"]

Finally, join the neighbourhoods of a particular postcode.

In [6]:
# Collapse the table to one row per postcode in a single groupby pass instead
# of re-filtering the whole frame once for every postcode.
#   - sort = False keeps postcodes in order of first appearance.
#   - Borough: the first value seen for the postcode.
#   - Neighbourhood: all values for the postcode joined with ",".
toronto_final = (
    toronto
    .groupby("Postcode", sort = False)
    .agg({"Borough": "first", "Neighbourhood": ",".join})
    .reset_index()
    .rename(columns = {"Postcode": "PostalCode"})
)
toronto_final.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


The final data set has 103 rows.

In [7]:
# (rows, columns) of the one-row-per-postcode table built above.
toronto_final.shape

(103, 3)

### Task 2

In [8]:
# Load the coordinates file and give its columns the names used by the rest
# of the notebook (the key column must be called "PostalCode" for the join).
coordinate_columns = ["PostalCode", "Latitude", "Longitude"]
location = pd.read_csv("Geospatial_Coordinates.csv")
location.columns = coordinate_columns

In [9]:
# Index the coordinates by postal code, then attach them to each row of
# toronto_final by matching on its "PostalCode" column.
coordinates_by_code = location.set_index("PostalCode")
toronto_join = toronto_final.join(coordinates_by_code, on = "PostalCode")
toronto_join.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


### Task 3

In [10]:
import numpy as np

import json

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

import requests 

from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

Supply the parameters needed to make calls to the Foursquare API.

In [11]:
import os
import warnings

# Foursquare credentials are read from the environment so that they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Any key pair that was previously committed in
# this cell should be treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
                  'Foursquare API calls will be rejected.')

Get the nearby venues for each postal code

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius = 250):
    """Collect the venues Foursquare's ``venues/explore`` endpoint returns around each point.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each query point; they are walked together
        with ``zip``, so the shortest one decides how many points are queried.
    radius : number, default 250
        Sent unchanged as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'PostalCode',
        'PostalCode Latitude', 'PostalCode Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        When no venue is returned at all the frame is empty but still has
        these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    column_names = ['PostalCode',
                    'PostalCode Latitude',
                    'PostalCode Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venues_list = []

    for name, lat, lng in zip(names, latitudes, longitudes):

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)

        # A timeout keeps one stalled request from hanging the whole loop, and
        # raise_for_status surfaces API errors (bad credentials, quota) here
        # instead of as a KeyError while digging through the JSON below.
        response = requests.get(url, timeout = 30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        for v in results:
            venue = v['venue']
            # Some venues carry an empty category list; record None for them
            # rather than failing on categories[0].
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None
            venues_list.append([name, lat, lng,
                                venue['name'], venue['location']['lat'],
                                venue['location']['lng'], category_name])

    # Passing the names to the constructor also works when venues_list is
    # empty; assigning .columns afterwards raises a length-mismatch error then.
    nearby_venues = pd.DataFrame(venues_list, columns = column_names)

    return(nearby_venues)

In [13]:
# Query the venues around every postal code's coordinates.
toronto_venues = getNearbyVenues(
    toronto_join['PostalCode'],
    toronto_join['Latitude'],
    toronto_join['Longitude'],
)
toronto_venues.head(10)

Unnamed: 0,PostalCode,PostalCode Latitude,PostalCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,M4A,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
2,M4A,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
3,M4A,43.725882,-79.315572,Eglinton Ave E & Sloane Ave/Bermondsey Rd,43.726086,-79.31362,Intersection
4,M4A,43.725882,-79.315572,Pizza Nova,43.725824,-79.31286,Pizza Place
5,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
6,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
7,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
8,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
9,M5A,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


Split the Venue Category column into multiple dummy variables

In [14]:
# One indicator column per venue category (empty prefix, so each column is
# named after the category itself), with the postal code put back in front.
category_indicators = pd.get_dummies(toronto_venues[['Venue Category']], prefix = "", prefix_sep = "")
toronto_venues_dummy = pd.concat([toronto_venues['PostalCode'], category_indicators], axis = 1)
toronto_venues_dummy.head(10)

Unnamed: 0,PostalCode,Adult Boutique,Airport Lounge,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Average the indicator columns per postal code: each value becomes the share
# of that postal code's venues falling in the category.
toronto_grouped = toronto_venues_dummy.groupby('PostalCode', as_index = False).mean()
toronto_grouped

Unnamed: 0,PostalCode,Adult Boutique,Airport Lounge,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Workshop,BBQ Joint,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1M,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,M8Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,M9C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73,M9N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
74,M9P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Obtain the most common venues for each postal code

In [16]:
def most_common(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, ignoring its first element.

    ``row`` is a Series whose first element is skipped (the caller passes a
    row that starts with the postal code). The remaining values are sorted in
    descending order and the index labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values[:num_top_venues]
    return top_labels

In [17]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: 'PostalCode' followed by '1st', '2nd', '3rd', '4th', ...
# The suffix comes from `indicators` for the first three ranks and is 'th'
# afterwards; an explicit bounds check replaces the former bare `except:`.
columns = ['PostalCode']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

PostalCode_venues_sorted = pd.DataFrame(columns = columns)
PostalCode_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']

# Fill each row with that postal code's top categories, most frequent first.
for ind in range(toronto_grouped.shape[0]):
    PostalCode_venues_sorted.iloc[ind, 1:] = most_common(toronto_grouped.iloc[ind, :], num_top_venues)

PostalCode_venues_sorted.head(10)

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1G,Korean Restaurant,Yoga Studio,Flower Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
1,M1H,Hakka Restaurant,Lounge,Bank,Thai Restaurant,Yoga Studio,Donut Shop,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant
2,M1J,Playground,Yoga Studio,Dog Run,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
3,M1M,Motel,American Restaurant,Yoga Studio,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
4,M1N,Pizza Place,Café,Japanese Restaurant,Yoga Studio,Donut Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant
5,M1P,Light Rail Station,Donut Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
6,M1T,Shopping Mall,Yoga Studio,Construction & Landscaping,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
7,M1W,Fast Food Restaurant,Chinese Restaurant,Grocery Store,Supermarket,Pharmacy,Pizza Place,Coffee Shop,Sandwich Place,Yoga Studio,Dog Run
8,M2H,Fast Food Restaurant,Golf Course,Mediterranean Restaurant,Pool,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
9,M2J,Clothing Store,Coffee Shop,Juice Bar,Cosmetics Shop,Fast Food Restaurant,Women's Store,Gift Shop,Burger Joint,Burrito Place,Restaurant


Build a K-Means clustering model with k = 5

In [18]:
k = 5

# Cluster on the category frequencies only; the postal code is an identifier,
# not a feature.
toronto_cluster = toronto_grouped.drop(columns = 'PostalCode')

# random_state is fixed so repeated runs use the same seed.
kmeans = KMeans(n_clusters = k, random_state = 0)
kmeans.fit(toronto_cluster)

kmeans.labels_[:10]

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [19]:
# Put the cluster label in front of each postal code's top-venue row.
# The explicit column check keeps the cell re-runnable without hiding other
# failures (such as a length mismatch between labels and rows), which the
# former bare `except:` reported as "already in the data frame".
if 'Cluster Labels' in PostalCode_venues_sorted.columns:
    print('The column is already in the data frame!')
else:
    PostalCode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add borough, neighbourhood and coordinates by matching on the postal code.
toronto_merged = PostalCode_venues_sorted.join(toronto_join.set_index('PostalCode'), on = 'PostalCode')

toronto_merged.head(10)

Unnamed: 0,Cluster Labels,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Borough,Neighbourhood,Latitude,Longitude
0,0,M1G,Korean Restaurant,Yoga Studio,Flower Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Scarborough,Woburn,43.770992,-79.216917
1,0,M1H,Hakka Restaurant,Lounge,Bank,Thai Restaurant,Yoga Studio,Donut Shop,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Scarborough,Cedarbrae,43.773136,-79.239476
2,2,M1J,Playground,Yoga Studio,Dog Run,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Scarborough,Scarborough Village,43.744734,-79.239476
3,0,M1M,Motel,American Restaurant,Yoga Studio,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
4,0,M1N,Pizza Place,Café,Japanese Restaurant,Yoga Studio,Donut Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
5,0,M1P,Light Rail Station,Donut Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Scarborough,"Dorset Park,Scarborough Town Centre,Wexford He...",43.75741,-79.273304
6,0,M1T,Shopping Mall,Yoga Studio,Construction & Landscaping,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Scarborough,"Clarks Corners,Sullivan,Tam O'Shanter",43.781638,-79.304302
7,0,M1W,Fast Food Restaurant,Chinese Restaurant,Grocery Store,Supermarket,Pharmacy,Pizza Place,Coffee Shop,Sandwich Place,Yoga Studio,Dog Run,Scarborough,L'Amoreaux West,43.799525,-79.318389
8,0,M2H,Fast Food Restaurant,Golf Course,Mediterranean Restaurant,Pool,Discount Store,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,North York,Hillcrest Village,43.803762,-79.363452
9,0,M2J,Clothing Store,Coffee Shop,Juice Bar,Cosmetics Shop,Fast Food Restaurant,Women's Store,Gift Shop,Burger Joint,Burrito Place,Restaurant,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556


Geocode Toronto to get the latitude and longitude used to center the map.

In [20]:
geolocator = Nominatim(user_agent = "toronto_explorer")

# Use a dedicated name for the geocoding result: `location` already holds the
# postal-code coordinates DataFrame earlier in the notebook, and rebinding it
# here would break a re-run of the join cell.
toronto_location = geolocator.geocode('Toronto')
if toronto_location is None:
    # geocode returns None when the service finds no match.
    raise ValueError("Nominatim returned no result for 'Toronto'")

latitude = toronto_location.latitude
longitude = toronto_location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Visualize the clustering

In [22]:
map_clusters = folium.Map(location = [latitude, longitude], zoom_start=11)

# One hex colour per cluster, evenly spaced along the Spectral colormap.
c = [colors.rgb2hex(rgba) for rgba in cm.Spectral(np.linspace(0, 1, k))]

# One circle marker per postal code, coloured by its cluster label.
for lat, lon, pos, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'],
                                  toronto_merged['PostalCode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(pos) + ' Cluster ' + str(cluster), parse_html = True)
    # Labels run from 0 to k - 1, so they index the colour list directly
    # (the former `cluster - 1` relied on negative-index wrap-around for 0).
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = c[cluster],
        fill = True,
        fill_color = c[cluster],
        fill_opacity = 0.7).add_to(map_clusters)

map_clusters