# Clustering of Neighborhoods in Toronto Canada

#### _Read the comments at the top of each cell to understand the purpose of the code therein_

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import json
from geopy.geocoders import Nominatim
import requests as req
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
import bs4

In [2]:
# Get page html and create soup with beautiful soup
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
resource = req.get(url)
soup = bs4.BeautifulSoup(resource.text, 'lxml')

In [3]:
# SCrape table data and convert to csv file
with open('a2_data_file.csv', 'a') as csv_file:
    try:
        for row in soup.find_all('tr')[1:]:
            data = row.find_all('td')
            line = '{a}, {b}, {c}'.format(a = data[0].text, b  = data[1].text, c = data[2].text)
            csv_file.write(line)
    except:
        print('Done!')

Done!


In [90]:
# Import CSV file to dataframe
df_post_codes = pd.read_csv('a2_data_file.csv', header = None)

In [91]:
# Rename column headers
df_post_codes.rename(columns = {0: 'PostCode', 1: 'Borough', 2: 'Neighbourhood'}, inplace = True)

In [93]:
# Remove records with 'Not assigned' Boroughs
df_post_codes = df_post_codes[df_post_codes['Borough'] != df_post_codes.iloc[0]['Borough']]

In [95]:
# Take care of 'Not assigned' Neighbourhood
for i in range (2, len(df_post_codes)):
    if df_post_codes.iloc[i]['Neighbourhood'] == ' Not assigned':
        df_post_codes.iloc[i]['Neighbourhood'] = df_post_codes.iloc[i]['Borough']

In [96]:
# Merge rows with the same post_code
for i in range (2, len(df_post_codes)):
    pcode = df_post_codes.iloc[i]['PostCode']
    neigh_list = df_post_codes[df_post_codes['PostCode'] == pcode]
    num_of_neigh = len(neigh_list['Neighbourhood'])
    string_val = ''
    if num_of_neigh > 1:
        for neigh in neigh_list['Neighbourhood']:
            string_val = neigh + ',' + string_val
        string_val = string_val[:-1]
        df_post_codes.iloc[i]['Neighbourhood'] = string_val
# Eliminiate duplicates
df_post_codes = df_post_codes.groupby(df_post_codes['PostCode']).aggregate({'Borough': 'first', 'Neighbourhood': 'first'} )
# Reset Index
df_post_codes = df_post_codes.reset_index()

In [106]:
# Show data frame
df_post_codes.head(12)

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Oakridge, Golden Mile, Clairlea"
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest"
9,M1N,Scarborough,"Cliffside West, Birch Cliff"


In [104]:
# Show data frame shape
df_post_codes.shape

(103, 3)

In [108]:
# Import coordinates CSV file
df_coordinates = pd.read_csv('Geospatial_Coordinates.csv')

In [109]:
# Show coordinates dataframe
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [110]:
# Show shape of coordinates dataframe
df_coordinates.shape

(103, 3)

In [114]:
# Since PostalCode is ordered the same as in df_post_codes and have the same shape, one can just merge both in the same order
# Merge data frames
df_merged = pd.concat([df_post_codes, df_coordinates[['Latitude', 'Longitude']]], axis = 1)

In [117]:
# Show merged data frame
df_merged.head(12)

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Oakridge, Golden Mile, Clairlea",43.711112,-79.284577
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.692657,-79.264848


In [122]:
# Filter df_merged to get neighborhoods in Toronto
df_toronto = df_merged[df_merged['Borough'].str.contains('Toronto')]

In [123]:
# Show Toronto dataframe
df_toronto.head()

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"Riverdale, The Danforth West",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [124]:
# Get geographic coordinates for Toronto Canada
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronoto_exp")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toeonto, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toeonto, Canada are 43.653963, -79.387207.


In [126]:
# Visualize Toronoto Neighbourhoods
# Create Map with Folium
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Add markers
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

In [188]:
# Set  Foursquare Credentials
CLIENT_ID = 'enter your Foursquare client ID here' 
CLIENT_SECRET = 'enter your Foursquare client secret here'
VERSION = '20180605'

# Limit of number of venues returned by API
LIMIT = 100

In [133]:
# Function to get venues from each neighborhood in Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = req.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [134]:
# Call getNearbyVenues function above to get venues in each neighborhood in the df_toronto dataframe
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

In [135]:
# Shape and head of toronto_venues
print(toronto_venues.shape)
toronto_venues.head()

(1693, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"Riverdale, The Danforth West",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [137]:
# Aggregate number of venues returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
Business Reply Mail Processing Centre 969 Eastern,19,19,19,19,19,19
Central Bay Street,81,81,81,81,81,81
Christie,16,16,16,16,16,16
Church and Wellesley,83,83,83,83,83,83
Davisville,33,33,33,33,33,33
Davisville North,10,10,10,10,10,10
"Dufferin, Dovercourt Village",20,20,20,20,20,20
"Forest Hill West, Forest Hill North",5,5,5,5,5,5
"Garden District, Ryerson",100,100,100,100,100,100


In [138]:
# Print unique categories of venues in data set
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 237 uniques categories.


In [174]:
# Use one hot encoding to create a matrix of neighborhoods to venue categories
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column from index 162 to index 0
fixed_columns = [toronto_onehot.columns[162]] + list(toronto_onehot.columns[:162]) + list(toronto_onehot.columns[163:])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Riverdale, The Danforth West",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [176]:
# Print toronto_onehot.shape to get the size of the dataframe
toronto_onehot.shape

(1693, 237)

In [177]:
# Group rows by neighborhood and compute average frequency of occurency per category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
1,Business Reply Mail Processing Centre 969 Eas...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,...,0.0,0.0,0.0,0.012346,0.0,0.0,0.012346,0.0,0.0,0.012346
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.0,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,...,0.0,0.0,0.0,0.0,0.012048,0.012048,0.0,0.012048,0.0,0.012048
5,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Forest Hill West, Forest Hill North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.01,0.0,0.0,0.01,0.01,0.01,0.01,0.0,0.0,0.0


In [178]:
# Print toronto_grouped.shape to get the size of the dataframe
toronto_grouped.shape

(38, 237)

In [179]:
# Function to sort venues indescending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [184]:
# Create datafram eof neighborhoods and their top 10 venues
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Café,Pub,Farmers Market,Cheese Shop,Bakery,Steakhouse,Seafood Restaurant,Restaurant
1,Business Reply Mail Processing Centre 969 Eas...,Light Rail Station,Yoga Studio,Recording Studio,Smoke Shop,Skate Park,Brewery,Burrito Place,Butcher,Restaurant,Park
2,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Burger Joint,Bar,Bubble Tea Shop,Chinese Restaurant,Salad Place,Ice Cream Shop,Thai Restaurant
3,Christie,Grocery Store,Café,Park,Athletics & Sports,Nightclub,Diner,Italian Restaurant,Restaurant,Baby Store,Coffee Shop
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Burger Joint,Sushi Restaurant,Gay Bar,Restaurant,Fast Food Restaurant,Café,Men's Store,Bubble Tea Shop


In [185]:
# Cluster the Neighborhoods using K-means Cluster
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 3, 0])

In [186]:
# Create new dataframe that shows a neighborhoods cluster as well as its top  10 venues
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Health Food Store,Coffee Shop,Pub,Diner,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
41,M4K,East Toronto,"Riverdale, The Danforth West",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Yoga Studio,Bookstore,Brewery,Bubble Tea Shop,Café,Restaurant
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,0,Park,Liquor Store,Movie Theater,Burger Joint,Fast Food Restaurant,Burrito Place,Fish & Chips Shop,Sandwich Place,Steakhouse,Sushi Restaurant
43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Gastropub,Italian Restaurant,Bakery,American Restaurant,Yoga Studio,Park,Brewery,Seafood Restaurant
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Swim School,Bus Line,Yoga Studio,Dog Run,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


In [187]:
# Visualize resulting cluster
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Finish! 