In [3]:
import os

import numpy as np
import pandas as pd

# Data preprocessing

The data on Toronto postal codes was retrieved from the Wikipedia entry below and saved to a `.csv` file:

https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [4]:
# Load the csv file (it has no header row) and name its three columns
codes_df = pd.read_csv('Toronto_Postal_Codes_Wikipedia.csv', header=None)
codes_df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Discard every row whose 'Borough' is 'Not assigned'
codes_df = codes_df.drop(index=codes_df.index[codes_df['Borough'] == 'Not assigned'])
# There are no 'Not assigned' entries left for 'Neighborhood'

#a function to convert a list into a string separated with commas
def listtostr(L):
    """Join the items of ``L`` into one string separated by ``', '``.

    Parameters
    ----------
    L : iterable
        Items to join. Items that are not already strings are converted
        with ``str()`` instead of raising ``TypeError``.

    Returns
    -------
    str
        The joined string; ``''`` when ``L`` is empty.
    """
    # str.join replaces the manual index loop and its repeated concatenation
    return ', '.join(str(item) for item in L)
    
# Collapse rows sharing a 'PostalCode'/'Borough' pair; the remaining values are gathered into lists
codes_df = (
    codes_df
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg(lambda group: group.tolist())
)

# Turn each list of neighborhoods into a single comma separated string
codes_df['Neighborhood'] = codes_df['Neighborhood'].map(listtostr)

# Report the final shape of the dataframe
print("Size of the dataframe:", codes_df.shape)

codes_df.head()

Size of the dataframe: (103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Geospatial Data

In [5]:
# Read the geospatial coordinates and give the columns the names used in this notebook
geo_df = pd.read_csv('Geospatial_Coordinates.csv')
geo_df.columns = [
    'PostalCode',
    'Latitude',
    'Longitude',
]
geo_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Left-join the coordinates onto the postal code table, matching rows on 'PostalCode'
new_df = codes_df.merge(geo_df, how='left', on='PostalCode')
new_df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village , Martin Grove Gardens , Ric...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens , Beaumond Heights , Humbergate...",43.739416,-79.588437
102,M9W,Etobicoke,Northwest,43.706748,-79.594054


In [16]:
# Only the last expression of a cell is rendered, so the shape is printed explicitly
print("Size of the dataframe:", new_df.shape)

# Summary statistics of the numeric columns
new_df.describe()

Unnamed: 0,Latitude,Longitude
count,103.0,103.0
mean,43.704608,-79.397153
std,0.052463,0.097146
min,43.602414,-79.615819
25%,43.660567,-79.464763
50%,43.696948,-79.38879
75%,43.74532,-79.340923
max,43.836125,-79.160497


# Clustering neighborhoods with k-means

In [33]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from sklearn.cluster import KMeans
%matplotlib inline

In [17]:
# The starting coordinates are the means of our data, computed here instead of hardcoded
map_center = [float(new_df['Latitude'].mean()), float(new_df['Longitude'].mean())]
codes_map = folium.Map(location=map_center, zoom_start=10)

# One circle marker per postal code; the popup reads "<postal code>: <borough>"
for code, borough, lat, lon in zip(new_df['PostalCode'], new_df['Borough'], new_df['Latitude'], new_df['Longitude']):
    label = folium.Popup('{}: {}'.format(code, borough), parse_html=True)
    # parse_html is set on the Popup above; it is not passed to CircleMarker here
    folium.CircleMarker([lat, lon],
                        radius=5,
                        popup=label,
                        color='green',
                        fill=True,
                        fill_color='#31c094',
                        fill_opacity=0.5,
                        ).add_to(codes_map)
codes_map

In [35]:
# Trying KMeans on the geodata
K = 5

# Explicit copy: the label column added below must not touch (or warn about) new_df
X_df = new_df[['Latitude', 'Longitude']].copy()

# random_state fixes the centroid initialisation so a re-run gives the same clusters;
# n_init is set explicitly because its default differs between scikit-learn versions
KM_model = KMeans(n_clusters=K, n_init=10, random_state=0)
KM_model.fit(X_df)

# Store each row's cluster label as the first column
X_df.insert(0, 'Labels', KM_model.labels_)

In [36]:
# Visualizing the result: the map is centered on the mean coordinates of the clustered points
cluster_center = [float(X_df['Latitude'].mean()), float(X_df['Longitude'].mean())]
cluster_map = folium.Map(location=cluster_center, zoom_start=10)

# Color palette: K evenly spaced colors from the rainbow colormap, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, K))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

for lab, lat, lon in zip(X_df['Labels'], X_df['Latitude'], X_df['Longitude']):
    # Labels run from 0 to K-1, so they index the palette directly
    marker_color = rainbow[int(lab)]
    folium.CircleMarker([lat, lon],
                        radius=5,
                        popup=None,
                        color=marker_color,
                        fill=True,
                        fill_color=marker_color,
                        fill_opacity=0.5,
                        ).add_to(cluster_map)

cluster_map

# Clustering neighborhoods with venue data

First, we collect venue data from Foursquare.

In [37]:
# Credentials for Foursquare are read from the environment so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605'

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn early instead of failing later, once per location, inside the lookup loop
    print("Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set")

In [78]:
import requests

LIMIT = 100  # sent as the `limit` query parameter of every request


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names : iterable
        Label of each location (postal codes in this notebook); copied into
        the 'PostalCode' column of the result.
    latitudes, longitudes : iterable of float
        Coordinates of each location, aligned with ``names``.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'PostalCode', 'PC Latitude',
        'PC Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. When no venue was retrieved the frame is empty but
        still has these columns.

    Notes
    -----
    A location whose request or response cannot be processed is reported on
    stdout and skipped; the remaining lookups continue. A venue without any
    category gets ``None`` as its 'Venue Category' instead of causing its
    whole location to be discarded.
    """
    column_names = ['PostalCode',
                    'PC Latitude',
                    'PC Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print("Looking up ", name)

        # Passing the query as `params` lets requests do the URL encoding
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        try:
            # A timeout keeps one stalled connection from blocking the whole loop
            response = requests.get(endpoint, params=query, timeout=30)
            response.raise_for_status()
            items = response.json()["response"]['groups'][0]['items']

            # Keep only the relevant information for each nearby venue
            location_rows = []
            for item in items:
                venue = item['venue']
                categories = venue.get('categories') or []
                location_rows.append((name, lat, lng,
                                      venue['name'],
                                      venue['location']['lat'],
                                      venue['location']['lng'],
                                      categories[0]['name'] if categories else None))
        except Exception as err:
            # `Exception` (not a bare except) so that KeyboardInterrupt still stops the loop
            print("Something went wrong at", name, "-", repr(err))
        else:
            # Rows are added only when the whole location was processed
            rows.extend(location_rows)

    # Passing the column names here keeps the result well-formed even with no rows
    return pd.DataFrame(rows, columns=column_names)

In [75]:
# Look up the venues around every postal code, using the coordinates stored in new_df
toronto_venues = getNearbyVenues(new_df['PostalCode'], new_df['Latitude'], new_df['Longitude'])

Looking up  M1B
Looking up  M1C
Looking up  M1E
Looking up  M1G
Looking up  M1H
Looking up  M1J
Looking up  M1K
Looking up  M1L
Looking up  M1M
Looking up  M1N
Looking up  M1P
Looking up  M1R
Looking up  M1S
Looking up  M1T
Looking up  M1V
Looking up  M1W
Looking up  M1X
Looking up  M2H
Looking up  M2J
Looking up  M2K
Looking up  M2L
Looking up  M2M
Looking up  M2N
Looking up  M2P
Looking up  M2R
Looking up  M3A
Looking up  M3B
Looking up  M3C
Looking up  M3H
Looking up  M3J
Looking up  M3K
Looking up  M3L
Looking up  M3M
Looking up  M3N
Looking up  M4A
Looking up  M4B
Looking up  M4C
Looking up  M4E
Looking up  M4G
Looking up  M4H
Looking up  M4J
Looking up  M4K
Looking up  M4L
Looking up  M4M
Looking up  M4N
Looking up  M4P
Looking up  M4R
Looking up  M4S
Looking up  M4T
Looking up  M4V
Looking up  M4W
Looking up  M4X
Looking up  M4Y
Looking up  M5A
Looking up  M5B
Looking up  M5C
Looking up  M5E
Looking up  M5G
Looking up  M5H
Looking up  M5J
Looking up  M5K
Looking up  M5L
Looking 

In [76]:
# (rows, columns) of the venue table returned by getNearbyVenues
toronto_venues.shape

(2222, 7)

In [77]:
# Preview the first rows of the venue table
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1E,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
3,M1E,43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa
4,M1E,43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


In [79]:
# Rename the columns of the venue table (seven names, in column order)
toronto_venues.columns = [
    'PostalCode',
    'PC Latitude',
    'PC Longitude',
    'Venue',
    'Venue Latitude',
    'Venue Longitude',
    'Venue Category',
]
toronto_venues.head(10)

Unnamed: 0,PostalCode,PC Latitude,PC Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1E,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
3,M1E,43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa
4,M1E,43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant
5,M1E,43.763573,-79.188711,Enterprise Rent-A-Car,43.764076,-79.193406,Rental Car Location
6,M1E,43.763573,-79.188711,Woburn Medical Centre,43.766631,-79.192286,Medical Center
7,M1E,43.763573,-79.188711,Lawrence Ave E & Kingston Rd,43.767704,-79.18949,Intersection
8,M1E,43.763573,-79.188711,Eggsmart,43.7678,-79.190466,Breakfast Spot
9,M1G,43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop


In [80]:
# Back up the venue table to 'toronto_venues.csv' (path relative to the working directory)
# NOTE(review): no `index` argument is passed, so presumably the row index is written
# as an extra first column - confirm before reading the file back
toronto_venues.to_csv('toronto_venues.csv')

In [81]:
# Number of distinct values in the 'Venue Category' column
len(pd.unique(toronto_venues['Venue Category']))

268

In [82]:
# One-hot encode the venue categories (the empty prefix keeps the bare category names
# as column names), then append each row's postal code as a 'PostalCode' column
toronto_venue_ones = pd.get_dummies(
    toronto_venues['Venue Category'], prefix="", prefix_sep=""
).assign(PostalCode=toronto_venues['PostalCode'])

toronto_venue_ones.tail()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,PostalCode
2217,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9V
2218,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9V
2219,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9W
2220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9W
2221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,M9W


In [83]:
# (rows, columns) of the one-hot table: the category columns plus 'PostalCode'
toronto_venue_ones.shape

(2222, 269)

In [85]:
# One row per postal code: each category column becomes the mean of its one-hot values
toronto_venue_group = toronto_venue_ones.groupby('PostalCode', as_index=False).mean()
toronto_venue_group.head()

Unnamed: 0,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
# (rows, columns) of the per-postal-code table
toronto_venue_group.shape

(99, 269)

## Clustering the postal codes by the venue data

In [105]:
Kclusters = 11

# Features are the per-category means only. 'Cluster' is dropped as well (when present)
# so that re-running this cell does not feed the previous labels back into the model.
X_toronto_venue = toronto_venue_group.drop(columns=['PostalCode', 'Cluster'], errors='ignore')

# random_state fixes the centroid initialisation so a re-run gives the same clusters;
# n_init is set explicitly because its default differs between scikit-learn versions
KMmodel = KMeans(n_clusters=Kclusters, n_init=10, random_state=0)
KMmodel.fit(X_toronto_venue)

# Attach each postal code's cluster label
toronto_venue_group['Cluster'] = KMmodel.labels_

toronto_venue_group.head()

Unnamed: 0,Cluster,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,4,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
# Left-join the coordinates onto the clustered table, matching rows on 'PostalCode'
toronto_result = toronto_venue_group.merge(geo_df, how='left', on='PostalCode')
toronto_result.head()

Unnamed: 0,Cluster,PostalCode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Latitude,Longitude
0,4,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.806686,-79.194353
1,5,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.784535,-79.160497
2,1,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.763573,-79.188711
3,1,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.770992,-79.216917
4,1,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.773136,-79.239476


# Visualizing the results

In [107]:
# The map is centered on the mean coordinates of the clustered postal codes
venue_map_center = [float(toronto_result['Latitude'].mean()), float(toronto_result['Longitude'].mean())]
cluster_venue_map = folium.Map(location=venue_map_center, zoom_start=10)

# Color palette: Kclusters evenly spaced colors from the rainbow colormap, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, Kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

for lab, lat, lon in zip(toronto_result['Cluster'], toronto_result['Latitude'], toronto_result['Longitude']):
    # Labels run from 0 to Kclusters-1, so they index the palette directly
    marker_color = rainbow[int(lab)]
    folium.CircleMarker([lat, lon],
                        radius=5,
                        popup=None,
                        color=marker_color,
                        fill=True,
                        fill_color=marker_color,
                        fill_opacity=0.7,
                        ).add_to(cluster_venue_map)

cluster_venue_map