In [20]:
import pandas as pd
import numpy as np

"The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned."

In [21]:

# Scrape the first table on the Wikipedia page and tidy it into the
# three-column frame (PostalCode, Borough, Neighborhood) used below.
df = (
    pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
    # positional column labels -> descriptive names
    .rename(columns={0: 'PostalCode', 1: 'Borough', 2: 'Neighborhood'})
    # discard the row labelled 0 (presumably the header row parsed as data -- TODO confirm)
    .drop(index=0)
    # keep only postal codes that have an assigned borough
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    # renumber the rows from 0 without keeping the old index as a column
    .reset_index(drop=True)
)


"More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table."

In [22]:
# The newer Wikipedia table format separates neighborhoods with "/";
# swap every "/" for "," (matched literally, not as a regex).
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',', regex=False)


"If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough."

In [23]:
# Where the neighborhood is "Not assigned", fall back to the borough name.
no_neighborhood = df['Neighborhood'] == "Not assigned"
df.loc[no_neighborhood, 'Neighborhood'] = df.loc[no_neighborhood, 'Borough']



"Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making. In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe."

In [24]:
# Report the (rows, columns) dimensions of the cleaned dataframe.
df.shape

(103, 3)

Load the longitude and latitude data for each postal code

In [25]:
# Load the coordinates file and rename its "Postal Code" column to
# "PostalCode" so it can serve as the join key with `df`.
location = (
    pd.read_csv("Geospatial_Coordinates.csv")
    .rename(columns={"Postal Code": "PostalCode"})
)


Match the Postal Codes in both dataframes to extract the Latitude and Longitude

In [70]:
# Attach Latitude/Longitude to each postal code by joining the two frames
# on their shared PostalCode column (pandas' default inner join).
locations = pd.merge(df, location, on="PostalCode")
locations

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill , Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


Creating and Marking the map

In [71]:
import folium 
# Base map for the markers; the centre point is presumably central
# Toronto -- TODO confirm the coordinates.
toronto_center = [43.6532, -79.3832]
map_Toronto = folium.Map(location=toronto_center, zoom_start=10)

In [72]:
# Add one circle marker per postal code, using the neighborhood name(s)
# as the popup text.
marker_rows = zip(locations['Latitude'], locations['Longitude'], locations['Neighborhood'])
for lat, lon, neighborhood in marker_rows:
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        # parse_html is a Popup option; CircleMarker does not define it,
        # so it is passed to the Popup only.
        popup=folium.Popup(neighborhood, parse_html=True),
        color='maroon',
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
    ).add_to(map_Toronto)

# last expression of the cell: render the map
map_Toronto


Run k-means to group the neighborhoods into 6 clusters

In [73]:
from sklearn.cluster import KMeans


# number of clusters to fit
kclusters = 6

# Cluster on the coordinates only. Selecting the two columns by name
# (instead of dropping the text columns) keeps this cell re-runnable after
# a 'Cluster Labels' column has been inserted into `locations`; otherwise
# the previous labels would be fed back in as a third feature.
Toronto_grouped_clustering = locations[['Latitude', 'Longitude']]

# Fit k-means. random_state=0 fixes the centroid seeding; n_init=10 pins the
# number of restarts to the long-standing default, which newer scikit-learn
# releases replace with 'auto', so results stay comparable across versions.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(Toronto_grouped_clustering)

# peek at the labels assigned to the first ten rows
kmeans.labels_[0:10]

array([0, 0, 5, 4, 5, 1, 2, 3, 0, 5], dtype=int32)

Add the cluster labels to the dataframe

In [74]:
# Put the cluster label in the first column. DataFrame.insert raises
# ValueError when the column already exists, so drop any stale copy first;
# this keeps the cell safe to re-run.
if 'Cluster Labels' in locations.columns:
    locations = locations.drop(columns='Cluster Labels')
locations.insert(0, 'Cluster Labels', kmeans.labels_)

In [76]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# set color scheme for the clusters: one evenly spaced hex colour from the
# rainbow colormap per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map, coloured by cluster
for lat, lon, poi, cluster in zip(locations['Latitude'], locations['Longitude'], locations['Neighborhood'], locations['Cluster Labels']):
    # k-means labels run 0..kclusters-1, so index the palette directly
    # rather than relying on negative-index wraparound for label 0
    cluster_color = rainbow[cluster]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

# last expression of the cell: render the map
map_clusters

Explore each cluster

In [78]:
# rows assigned to cluster 0
locations[locations['Cluster Labels'].eq(0)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,0,M4A,North York,Victoria Village,43.725882,-79.315572
8,0,M4B,East York,"Parkview Hill , Woodbine Gardens",43.706397,-79.309937
13,0,M3C,North York,Don Mills,43.7259,-79.340923
14,0,M4C,East York,Woodbine Heights,43.695344,-79.318389
19,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
23,0,M4G,East York,Leaside,43.70906,-79.363452
29,0,M4H,East York,Thorncliffe Park,43.705369,-79.349372
35,0,M4J,East York,East Toronto,43.685347,-79.338106
41,0,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188


In [79]:
# rows assigned to cluster 1
locations[locations['Cluster Labels'].eq(1)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
5,1,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
11,1,M9B,Etobicoke,"West Deane Park , Princess Gardens , Martin Gr...",43.650943,-79.554724
17,1,M9C,Etobicoke,"Eringate , Bloordale Gardens , Old Burnhamthor...",43.643515,-79.577201
70,1,M9P,Etobicoke,Westmount,43.696319,-79.532242
76,1,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
77,1,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov...",43.688905,-79.554724
88,1,M8V,Etobicoke,"New Toronto , Mimico South , Humber Bay Shores",43.605647,-79.501321
89,1,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam...",43.739416,-79.588437
93,1,M8W,Etobicoke,"Alderwood , Long Branch",43.602414,-79.543484
94,1,M9W,Etobicoke,Northwest,43.706748,-79.594054


In [80]:
# rows assigned to cluster 2
locations[locations['Cluster Labels'].eq(2)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
6,2,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
12,2,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
18,2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
22,2,M1G,Scarborough,Woburn,43.770992,-79.216917
26,2,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
32,2,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
38,2,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
51,2,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
65,2,M1P,Scarborough,"Dorset Park , Wexford Heights , Scarborough To...",43.75741,-79.273304
78,2,M1S,Scarborough,Agincourt,43.7942,-79.262029


In [81]:
# rows assigned to cluster 3
locations[locations['Cluster Labels'].eq(3)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
7,3,M3B,North York,Don Mills,43.745906,-79.352188
27,3,M2H,North York,Hillcrest Village,43.803762,-79.363452
28,3,M3H,North York,"Bathurst Manor , Wilson Heights , Downsview North",43.754328,-79.442259
33,3,M2J,North York,"Fairview , Henry Farm , Oriole",43.778517,-79.346556
39,3,M2K,North York,Bayview Village,43.786947,-79.385975
45,3,M2L,North York,"York Mills , Silver Hills",43.75749,-79.374714
52,3,M2M,North York,"Willowdale , Newtonbrook",43.789053,-79.408493
55,3,M5M,North York,"Bedford Park , Lawrence Manor East",43.733283,-79.41975
59,3,M2N,North York,Willowdale,43.77012,-79.408493
61,3,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [82]:
# NOTE(review): this cell repeats the cluster 3 selection from the cell
# before it; it looks like a copy-paste leftover -- confirm and remove.
locations.loc[locations['Cluster Labels'] == 3]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
7,3,M3B,North York,Don Mills,43.745906,-79.352188
27,3,M2H,North York,Hillcrest Village,43.803762,-79.363452
28,3,M3H,North York,"Bathurst Manor , Wilson Heights , Downsview North",43.754328,-79.442259
33,3,M2J,North York,"Fairview , Henry Farm , Oriole",43.778517,-79.346556
39,3,M2K,North York,Bayview Village,43.786947,-79.385975
45,3,M2L,North York,"York Mills , Silver Hills",43.75749,-79.374714
52,3,M2M,North York,"Willowdale , Newtonbrook",43.789053,-79.408493
55,3,M5M,North York,"Bedford Park , Lawrence Manor East",43.733283,-79.41975
59,3,M2N,North York,Willowdale,43.77012,-79.408493
61,3,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [83]:
# rows assigned to cluster 4
locations[locations['Cluster Labels'].eq(4)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
3,4,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
10,4,M6B,North York,Glencairn,43.709577,-79.445073
21,4,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512
34,4,M3J,North York,"Northwood Park , York University",43.76798,-79.487262
40,4,M3K,North York,Downsview,43.737473,-79.464763
46,4,M3L,North York,Downsview,43.739015,-79.506944
49,4,M6L,North York,"North Park , Maple Leaf Park , Upwood Park",43.713756,-79.490074
50,4,M9L,North York,Humber Summit,43.756303,-79.565963
53,4,M3M,North York,Downsview,43.728496,-79.495697
56,4,M6M,York,"Del Ray , Mount Dennis , Keelsdale and Silvert...",43.691116,-79.476013


In [86]:
# rows assigned to cluster 5
locations[locations['Cluster Labels'].eq(5)]

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,5,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
4,5,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
9,5,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
16,5,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
20,5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,5,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,5,M5H,Downtown Toronto,"Richmond , Adelaide , King",43.650571,-79.384568
31,5,M6H,West Toronto,"Dufferin , Dovercourt Village",43.669005,-79.442259


The first and most obvious observation is that Cluster 5 captures the largest number of neighborhoods. This implies that neighborhoods are concentrated in Downtown and Central Toronto.

Building on the first observation, we see that in the clusters further to the west, east, and north of Toronto, the neighborhoods are more dispersed, which is a fairly common trait of suburban areas or areas outside the main hub of a city. Looking at the clusters furthest from Cluster 5, we see that Clusters 1 and 2 are even more dispersed.

With this clustering method, we see that Clusters 1 and 2 each fall almost exclusively within a single borough, Etobicoke and Scarborough respectively, with one exception in Cluster 1. In terms of area, both these clusters suggest that these two boroughs are the largest.