# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# import libraries
import numpy as np
import pandas as pd

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

## 1. Explore Dataset

In [2]:
toronto = pd.read_csv('toronto_neighbourhood.csv')
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [3]:
toronto.shape

(103, 5)

#### Use geopy library to get the latitude and longitude values of Toronto

In [4]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Creat map of Toronto.

In [5]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Split Neighborhood to separate rows

In [6]:
# Split neighborhood and make a new dataframe
# refer "Split (explode) pandas dataframe string entry to separate rows" from stackoverflow
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
neighbor = pd.DataFrame(toronto.Neighborhood.str.split(',').tolist(), index = toronto.PostalCode).stack()
neighbor = neighbor.reset_index()[[0, 'PostalCode']]
neighbor.columns = ['Neighborhood', 'PostalCode']
neighbor.head()

Unnamed: 0,Neighborhood,PostalCode
0,Parkwoods,M3A
1,Victoria Village,M4A
2,Regent Park,M5A
3,Harbourfront,M5A
4,Lawrence Manor,M6A


In [7]:
# Add PostalCode, Borough and coordinates
neighbor = neighbor.merge(toronto, how='left', on='PostalCode')
neighbor.head()

Unnamed: 0,Neighborhood_x,PostalCode,Borough,Neighborhood_y,Latitude,Longitude
0,Parkwoods,M3A,North York,Parkwoods,43.753259,-79.329656
1,Victoria Village,M4A,North York,Victoria Village,43.725882,-79.315572
2,Regent Park,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,Harbourfront,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,Lawrence Manor,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763


In [8]:
neighbor.drop('Neighborhood_y', axis=1, inplace = True)
neighbor.rename(columns={'Neighborhood_x': 'Neighborhood'}, inplace=True)
neighbor.head()

Unnamed: 0,Neighborhood,PostalCode,Borough,Latitude,Longitude
0,Parkwoods,M3A,North York,43.753259,-79.329656
1,Victoria Village,M4A,North York,43.725882,-79.315572
2,Regent Park,M5A,Downtown Toronto,43.65426,-79.360636
3,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636
4,Lawrence Manor,M6A,North York,43.718518,-79.464763


In [9]:
neighbor.shape

(217, 5)

In [10]:
neighbor.to_csv('./toronto_neighbourhood_master.csv', index=False)

### Explore York  borough in Toronto
For illustration purpose, segment and cluster only the neighborhoods in York, Toronto.

#### Create york_neighbor dataframe from toronto dataframe

In [11]:
york_neighbor = neighbor[neighbor['Borough'] == 'York'].reset_index(drop=True)
york_neighbor

Unnamed: 0,Neighborhood,PostalCode,Borough,Latitude,Longitude
0,Humewood-Cedarvale,M6C,York,43.693781,-79.428191
1,Caledonia-Fairbanks,M6E,York,43.689026,-79.453512
2,Del Ray,M6M,York,43.691116,-79.476013
3,Mount Dennis,M6M,York,43.691116,-79.476013
4,Keelsdale and Silverthorn,M6M,York,43.691116,-79.476013
5,Runnymede,M6N,York,43.673185,-79.487262
6,The Junction North,M6N,York,43.673185,-79.487262
7,Weston,M9N,York,43.706876,-79.518188


#### Creat Map of York, Toronto

Get the geographical coordinates of Manhattan.

In [12]:
address = 'York, ON'

geolocator = Nominatim(user_agent="york_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of York, Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of York, Toronto are 43.6896191, -79.479188.


Create map of York, Toronto

In [13]:
# create map of York, Toronto using latitude and longitude values
map_york = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, label in zip(york_neighbor['Latitude'], york_neighbor['Longitude'], york_neighbor['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_york)  
    
map_york

#### Define Foursquare Credentials and Version

In [14]:
CLIENT_ID = 'F2HTGXC21LHSOSNRNKPHBXVKPPGA0K5RX2FBAKZ4VITN0TR3' # your Foursquare ID
CLIENT_SECRET = '1Y2RYRKZS0OOJCIDWTAPWUT2ZSWCPGWBXP11YARDYJWZZFGQ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: F2HTGXC21LHSOSNRNKPHBXVKPPGA0K5RX2FBAKZ4VITN0TR3
CLIENT_SECRET:1Y2RYRKZS0OOJCIDWTAPWUT2ZSWCPGWBXP11YARDYJWZZFGQ


#### Define function for getting nearby venues (within 500m from given location)

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Get each neighborhood in York, Toronto and create a new dataframe called york_venues

In [16]:
york_venues = getNearbyVenues(names=york_neighbor['Neighborhood'],
                                   latitudes=york_neighbor['Latitude'],
                                   longitudes=york_neighbor['Longitude']
                                  )

Humewood-Cedarvale
Caledonia-Fairbanks
Del Ray
 Mount Dennis
 Keelsdale and Silverthorn
Runnymede
 The Junction North
Weston


In [17]:
york_venues.shape

(25, 7)

In [18]:
york_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Park,43.692535,-79.428705,Field
1,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Ravine,43.690188,-79.426106,Trail
2,Humewood-Cedarvale,43.693781,-79.428191,Glen Cedar Park,43.695399,-79.429253,Playground
3,Humewood-Cedarvale,43.693781,-79.428191,Phil White Arena,43.691303,-79.431761,Hockey Arena
4,Caledonia-Fairbanks,43.689026,-79.453512,Nairn Park,43.690654,-79.4563,Park
5,Caledonia-Fairbanks,43.689026,-79.453512,Maximum Woman,43.690651,-79.456333,Women's Store
6,Caledonia-Fairbanks,43.689026,-79.453512,Fairbanks Pool,43.691959,-79.448922,Pool
7,Caledonia-Fairbanks,43.689026,-79.453512,Fairbank Memorial Park,43.692028,-79.448924,Park
8,Del Ray,43.691116,-79.476013,Subway,43.690218,-79.47405,Sandwich Place
9,Del Ray,43.691116,-79.476013,Dollar Tree,43.690296,-79.474667,Discount Store


In [19]:
# Check number of venues for each neighbor
york_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Keelsdale and Silverthorn,4,4,4,4,4,4
Mount Dennis,4,4,4,4,4,4
The Junction North,2,2,2,2,2,2
Caledonia-Fairbanks,4,4,4,4,4,4
Del Ray,4,4,4,4,4,4
Humewood-Cedarvale,4,4,4,4,4,4
Runnymede,2,2,2,2,2,2
Weston,1,1,1,1,1,1


## Analyze Each Neighborhood

In [20]:
# one hot encoding
york_onehot = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york_onehot['Neighborhood'] = york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [york_onehot.columns[-1]] + list(york_onehot.columns[:-1])
york_onehot = york_onehot[fixed_columns]

york_onehot.head()

Unnamed: 0,Neighborhood,Caribbean Restaurant,Convenience Store,Discount Store,Field,Hockey Arena,Park,Playground,Pool,Sandwich Place,Skating Rink,Trail,Turkish Restaurant,Women's Store
0,Humewood-Cedarvale,0,0,0,1,0,0,0,0,0,0,0,0,0
1,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Humewood-Cedarvale,0,0,0,0,0,0,1,0,0,0,0,0,0
3,Humewood-Cedarvale,0,0,0,0,1,0,0,0,0,0,0,0,0
4,Caledonia-Fairbanks,0,0,0,0,0,1,0,0,0,0,0,0,0


In [21]:
york_onehot.shape

(25, 14)

#### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [22]:
york_grouped = york_onehot.groupby('Neighborhood').mean().reset_index()
york_grouped

Unnamed: 0,Neighborhood,Caribbean Restaurant,Convenience Store,Discount Store,Field,Hockey Arena,Park,Playground,Pool,Sandwich Place,Skating Rink,Trail,Turkish Restaurant,Women's Store
0,Keelsdale and Silverthorn,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.25,0.0
1,Mount Dennis,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.25,0.0
2,The Junction North,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.25,0.0,0.0,0.0,0.0,0.25
4,Del Ray,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.25,0.0
5,Humewood-Cedarvale,0.0,0.0,0.0,0.25,0.25,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0
6,Runnymede,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Weston,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
york_grouped.shape

(8, 14)

#### Print each neighborhood along with the top 3 most common venues

In [24]:
num_top_venues = 3

for hood in york_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = york_grouped[york_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Keelsdale and Silverthorn----
            venue  freq
0  Discount Store  0.25
1  Sandwich Place  0.25
2    Skating Rink  0.25


---- Mount Dennis----
            venue  freq
0  Discount Store  0.25
1  Sandwich Place  0.25
2    Skating Rink  0.25


---- The Junction North----
                  venue  freq
0  Caribbean Restaurant   0.5
1     Convenience Store   0.5
2        Discount Store   0.0


----Caledonia-Fairbanks----
           venue  freq
0           Park  0.50
1           Pool  0.25
2  Women's Store  0.25


----Del Ray----
            venue  freq
0  Discount Store  0.25
1  Sandwich Place  0.25
2    Skating Rink  0.25


----Humewood-Cedarvale----
          venue  freq
0         Field  0.25
1  Hockey Arena  0.25
2    Playground  0.25


----Runnymede----
                  venue  freq
0  Caribbean Restaurant   0.5
1     Convenience Store   0.5
2        Discount Store   0.0


----Weston----
                  venue  freq
0                  Park   1.0
1  Caribbean Restaurant   0.0

#### Create dataframe of top 5 venues in descending order

Function to sort the venues in descending order.

In [25]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create the new dataframe and display the top 5 venues for each neighborhood.

In [26]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = york_grouped['Neighborhood']

for ind in np.arange(york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Keelsdale and Silverthorn,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store
1,Mount Dennis,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store
2,The Junction North,Convenience Store,Caribbean Restaurant,Women's Store,Turkish Restaurant,Trail
3,Caledonia-Fairbanks,Park,Women's Store,Pool,Turkish Restaurant,Trail
4,Del Ray,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store


## Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [27]:
# set number of clusters
kclusters = 5

york_grouped_clustering = york_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 2, 1, 0, 3, 2, 4], dtype=int32)

Create a new dataframe that includes the cluster as well as the top 5 venues for each neighborhood.

In [28]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

york_merged = york_neighbor

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

york_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,PostalCode,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Humewood-Cedarvale,M6C,York,43.693781,-79.428191,3,Trail,Playground,Hockey Arena,Field,Women's Store
1,Caledonia-Fairbanks,M6E,York,43.689026,-79.453512,1,Park,Women's Store,Pool,Turkish Restaurant,Trail
2,Del Ray,M6M,York,43.691116,-79.476013,0,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store
3,Mount Dennis,M6M,York,43.691116,-79.476013,0,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store
4,Keelsdale and Silverthorn,M6M,York,43.691116,-79.476013,0,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store


#### Visualize the resulting clusters

In [29]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], 
                                  york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters
Examine each cluster and determine the discriminating venue categories that distinguish each cluster.

#### Cluster 1

In [30]:
york_merged.loc[york_merged['Cluster Labels'] == 0, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,M6M,0,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store
3,M6M,0,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store
4,M6M,0,Turkish Restaurant,Skating Rink,Sandwich Place,Discount Store,Women's Store


#### Cluster 2

In [31]:
york_merged.loc[york_merged['Cluster Labels'] == 1, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,M6E,1,Park,Women's Store,Pool,Turkish Restaurant,Trail


#### Cluster 3

In [32]:
york_merged.loc[york_merged['Cluster Labels'] == 2, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
5,M6N,2,Convenience Store,Caribbean Restaurant,Women's Store,Turkish Restaurant,Trail
6,M6N,2,Convenience Store,Caribbean Restaurant,Women's Store,Turkish Restaurant,Trail


#### Cluster 4

In [33]:
york_merged.loc[york_merged['Cluster Labels'] == 3, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M6C,3,Trail,Playground,Hockey Arena,Field,Women's Store


#### Cluster 5

In [34]:
york_merged.loc[york_merged['Cluster Labels'] == 4, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
7,M9N,4,Park,Women's Store,Turkish Restaurant,Trail,Skating Rink
