# Segmenting and Clustering Neighborhoods in Toronto

First we install Beautiful Soup 4 and the lxml parser:

In [1]:
#!conda install -c conda-forge beautifulsoup4 -y

In [2]:
#!conda install -c conda-forge lxml -y

Next, import the necessary packages:

In [3]:
from bs4 import BeautifulSoup as bs
import requests
import csv
import pandas as pd

In [4]:
# Download the Wikipedia page of Canadian postal codes starting with "M" and keep its HTML as a string.
# The timeout stops the cell from hanging indefinitely; raise_for_status() halts the notebook on an
# HTTP error instead of letting a later cell parse an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

In the following cell we scrape the postal-code table from the webpage into a CSV file with the columns we need (postcode, borough and neighbourhood):

In [5]:
# Parse the downloaded HTML and write one CSV row per assigned postcode.
# Assumes each data row of the first table has three <td> cells:
# postcode, borough, neighbourhood (header rows use <th> and are skipped).
soup = bs(source, 'lxml')
table = soup.find('table')  # first table on the page; excludes the other unwanted html

records = []          # output rows, in the order their postcode first appears
row_by_postcode = {}  # postcode -> its [postcode, borough, neighbourhoods] entry in records

for row in table.find_all('tr'):  # loop through each row in the table
    cells = [entry.text.strip() for entry in row.find_all('td')]  # strip the trailing "\n" the cells carry
    if len(cells) < 3:
        continue  # header row (no <td>) or malformed row
    postcode, borough, neighbourhood = cells[:3]

    if borough == 'Not assigned':
        continue  # unassigned postcodes are excluded from the output
    if neighbourhood == 'Not assigned':
        neighbourhood = borough  # fall back to the borough name

    if postcode in row_by_postcode:
        # same postcode seen before: combine the neighbourhoods into one comma-separated entry
        row_by_postcode[postcode][2] += ', ' + neighbourhood
    else:
        record = [postcode, borough, neighbourhood]
        row_by_postcode[postcode] = record
        records.append(record)

# newline='' is what the csv module expects for files it writes; the with-block
# guarantees the file is closed even if writing fails. Every collected row is
# written, including the last one in the table.
with open('tps_scrape.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])  # name of the columns in our csv file
    csv_writer.writerows(records)

Assign the data to a Pandas DataFrame and find the shape:

In [6]:
# Load tps_scrape.csv (the file the scraping cell writes) into a DataFrame and preview the first rows.
tpc_df = pd.read_csv('tps_scrape.csv')
tpc_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [7]:
# (number of rows, number of columns) of the scraped postcode table
tpc_df.shape

(103, 3)

In [8]:
# Load the coordinates file; rename 'Postal Code' to 'Postcode' so it shares a key
# column with tpc_df (index=str maps the index labels to strings, as before).
df1 = pd.read_csv('Geospatial_Coordinates.csv').rename(
    index=str,
    columns={'Postal Code': 'Postcode'},
)
# Inner join: keep only postcodes present in both tables.
df = pd.merge(df1, tpc_df, how='inner', on='Postcode')
df

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae
5,M1J,43.744734,-79.239476,Scarborough,Scarborough Village
6,M1K,43.727929,-79.262029,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,43.711112,-79.284577,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,43.716316,-79.239476,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,43.692657,-79.264848,Scarborough,"Birch Cliff, Cliffside West"


In [9]:
# Preview the first rows of the merged coordinates/neighbourhood frame
df.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern"
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


First we will plot the neighborhoods on a map of Toronto to get an idea of how they might cluster:

In [63]:
import folium
# Base map; location is the [latitude, longitude] the view is initially centred on.
toronto_map = folium.Map(location=[43.6532, -79.3832], zoom_start=10.5)

# Empty feature group; nothing is added to it in this cell.
postalcodes = folium.map.FeatureGroup()
# display the map of Toronto
toronto_map

In [11]:
# Write the current state of the map to toronto_map.html (relative path).
toronto_map.save(outfile = "toronto_map.html")

Import more libraries for k-means clustering:

In [12]:
import random 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans 
# make_blobs is exported publicly from sklearn.datasets; the samples_generator
# submodule path is deprecated and absent from newer scikit-learn releases.
from sklearn.datasets import make_blobs
%matplotlib inline

In [13]:
# Feature matrix for clustering: the Latitude and Longitude columns of df as a 2-D array.
X = df.loc[:, ['Latitude', 'Longitude']].values
X

array([[ 43.8066863, -79.1943534],
       [ 43.7845351, -79.1604971],
       [ 43.7635726, -79.1887115],
       [ 43.7709921, -79.2169174],
       [ 43.773136 , -79.2394761],
       [ 43.7447342, -79.2394761],
       [ 43.7279292, -79.2620294],
       [ 43.7111117, -79.2845772],
       [ 43.716316 , -79.2394761],
       [ 43.692657 , -79.2648481],
       [ 43.7574096, -79.273304 ],
       [ 43.7500715, -79.2958491],
       [ 43.7942003, -79.2620294],
       [ 43.7816375, -79.3043021],
       [ 43.8152522, -79.2845772],
       [ 43.7995252, -79.3183887],
       [ 43.8361247, -79.2056361],
       [ 43.8037622, -79.3634517],
       [ 43.7785175, -79.3465557],
       [ 43.7869473, -79.385975 ],
       [ 43.7574902, -79.3747141],
       [ 43.789053 , -79.4084928],
       [ 43.7701199, -79.4084928],
       [ 43.7527583, -79.4000493],
       [ 43.7827364, -79.4422593],
       [ 43.7532586, -79.3296565],
       [ 43.7459058, -79.352188 ],
       [ 43.7258997, -79.340923 ],
       [ 43.7543283,

In [52]:
clusterNum = 8  # number of clusters to fit
# random_state fixes the centroid initialisation so the cluster ids (and therefore
# the colours on the map below) are the same on every run.
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12, random_state=42)
k_means.fit(X)
labels = k_means.labels_  # one cluster id per row of X
print(labels)

[2 2 2 2 6 6 6 4 6 4 6 6 6 6 6 6 2 1 1 1 1 1 1 1 1 4 1 4 1 7 3 7 7 7 4 4 4
 4 4 4 4 4 4 4 1 1 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 5 5 5 5 5 5 5 3 3 3
 3 5 3 5 5 3 3 3 3 3 3 5 0 4 0 0 0 0 0 0 0 0 7 7 7 7 7 7 7]


Add labels generated by the algorithm to the dataframe:

In [53]:
# Store the values in `labels` as a new 'Group' column (this mutates df in place), then preview.
df['Group'] = labels
df.head()

Unnamed: 0,Postcode,Latitude,Longitude,Borough,Neighbourhood,Group
0,M1B,43.806686,-79.194353,Scarborough,"Rouge, Malvern",2
1,M1C,43.784535,-79.160497,Scarborough,"Highland Creek, Rouge Hill, Port Union",2
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill",2
3,M1G,43.770992,-79.216917,Scarborough,Woburn,2
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae,6


Now generate a map showing the clusters:

In [66]:
# Fill colours indexed by cluster id (df.Group).
colors = ['green', 'blue', 'red', 'purple', 'pink', 'yellow', 'white', 'black']

# Add one circle marker per row to the `postalcodes` feature group, filled with its cluster's colour.
for lat, lng, k in zip(df.Latitude, df.Longitude, df.Group):
    postalcodes.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=6,  # define how big you want the circle markers to be
            color='grey',
            fill=True,
            # modulo keeps the lookup valid if there are more clusters than colours
            fill_color=colors[k % len(colors)],
            fill_opacity=0.6
        )
    )

latitudes = list(df.Latitude)
longitudes = list(df.Longitude)
# Kept under its own name so the cluster ids held in `labels` are not overwritten;
# re-running the cell that assigns df['Group'] = labels therefore stays correct.
neighbourhood_names = list(df.Neighbourhood)

# Add a pin with a neighbourhood-name popup for each row, directly on the map.
for lat, lng, name in zip(latitudes, longitudes, neighbourhood_names):
    popup = folium.Popup(name, parse_html=True)  # original author's note: parse_html=True prevents an issue that causes the map not to render
    folium.Marker([lat, lng], popup=popup).add_to(toronto_map)

# Attach the cluster-coloured circle layer; as the last expression, its return value is displayed.
toronto_map.add_child(postalcodes)

I set the algorithm to cluster the postal codes into 8 clusters. After running the k-means algorithm, we see somewhat natural clusters of postal codes, with a couple of exceptions. Notably, the postal code for "CFB Toronto, Downsview East" could plausibly belong to about four different clusters.