# Segmenting and Clustering Neighborhoods in Toronto
This notebook is a peer-graded assignment that analyzes neighborhoods in Toronto.

### Part 1) Data scraping and data cleaning :

In [1]:
# Import all the related libraries
get_ipython().system(u' pip install --upgrade pip')
get_ipython().system(u' pip install beautifulsoup4')
!pip install lxml
!pip install et_xmlfile
!pip install requests
!conda install -c conda-forge geopy --yes

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from bs4 import BeautifulSoup

# Import K-mean
from sklearn.cluster import KMeans

print('Libraries imported.')

Requirement already up-to-date: pip in /home/jupyterlab/conda/envs/python/lib/python3.6/site-packages (20.2.4)
Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


In [2]:
# Scrape the Toronto postal-code table from Wikipedia.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)

# Stop here on an HTTP error instead of parsing an error page as if it were the article.
req.raise_for_status()

soup = BeautifulSoup(req.content,'lxml')

# Only the first <table> element of the page is used.
table = soup.find_all('table')[0]

# read_html returns a list of DataFrames; a single table was passed in, so take element 0.
df = pd.read_html(str(table))

neighborhood_raw = pd.DataFrame(df[0])

In [3]:
# Inspect the table exactly as scraped, before any cleaning is applied
neighborhood_raw

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [4]:
# Turn the "Not assigned" placeholder into a real missing value (NaN)
neighborhood = neighborhood_raw.replace(to_replace="Not assigned", value=np.nan)
neighborhood

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,,
176,M6Z,,
177,M7Z,,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [5]:
# Discard every row whose "Borough" is NaN, then renumber the remaining rows from 0
neighborhood = neighborhood.dropna(subset=["Borough"]).reset_index(drop=True)

In [6]:
# Inspect the frame after rows without a borough were dropped and the index was reset
neighborhood

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
# Assign Borough name to neighborhood name for condition "If a cell has a borough but a Not assigned
# neighborhood, then the neighborhood will be the same as the borough."
# "Not assigned" was replaced with np.nan earlier, so missing entries must be detected with isna();
# comparing against the string 'NaN' never matches a real missing value.
missing_neighbourhood = neighborhood['Neighbourhood'].isna()

# A single .loc assignment avoids chained indexing (and its SettingWithCopyWarning).
neighborhood.loc[missing_neighbourhood, 'Neighbourhood'] = neighborhood.loc[missing_neighbourhood, 'Borough']

In [8]:
# Check the shape of the neighborhood data frame, reported as (rows, columns)
neighborhood.shape

(103, 3)

### Part 2) Get the latitude and the longitude coordinates of each neighborhood.

In [9]:
# Install and import geocoder; it is used further down to look up coordinates for each postal code
!conda install -c conda-forge geocoder --yes
import geocoder
print("Imported geocoder.")

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Imported geocoder.


In [10]:
# Two independent, initially empty lists that will collect the coordinates
latitude = []
longitude = []

Get latitude and longitude values for different neighborhoods.

In [11]:
# Look up the coordinates of every postal code with the ArcGIS geocoder.

# Upper bound on lookups per postal code, so an unreachable service cannot hang the notebook.
MAX_GEOCODE_ATTEMPTS = 10

# Start from empty lists so that re-running this cell does not append duplicate coordinates.
latitude, longitude = [], []

for i, postal_code in enumerate(neighborhood['Postal Code']):

    lat_lng_coords = None

    # retry until coordinates come back or the attempt budget is used up
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break
    else:
        # for/else: reached only when no attempt produced coordinates
        raise RuntimeError(
            'No coordinates returned for postal code {!r} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS))

    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])
    print(i)
print('completed')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
completed


Add Latitude and Longitude columns to the neighborhood dataframe.

In [12]:
# Attach the collected coordinates as two columns and display the result
neighborhood = neighborhood.assign(Latitude=latitude, Longitude=longitude)
neighborhood

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.65319,-79.51113
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.64869,-79.38544
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.63278,-79.48945


### Part 3) Generate maps to visualize neighborhoods in Toronto and cluster them together

Find the latitude and longitude of Toronto city.

In [13]:
# Resolve the city itself to a coordinate pair; the maps below are centred on it.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() yields None when nothing matches; report that clearly instead of
# failing with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))

trt_latitude = location.latitude
trt_longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(trt_latitude, trt_longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


Keep only the rows whose Borough contains the word "Toronto".

In [14]:
# Boolean mask of the rows whose borough name contains the literal text "Toronto".
# regex=False makes this a plain substring test; na=False treats a missing borough as no match.
is_toronto = neighborhood['Borough'].str.contains('Toronto', regex=False, na=False)

# Print the index label of every row that is kept.
for kept_label in neighborhood.index[is_toronto]:
    print(kept_label)

# Select all matching rows at once rather than dropping rows one by one inside a loop,
# which only works while the index labels equal the row positions.
# .copy() gives an independent frame so later cells can modify it safely.
neighborhood = neighborhood[is_toronto].copy()

2
4
9
15
19
20
24
25
30
31
36
37
41
42
43
47
48
54
61
62
67
68
69
73
74
75
79
80
81
83
84
86
87
91
92
96
97
99
100


In [15]:
# Renumber the remaining rows from 0 (discarding the old labels) and display the frame
neighborhood = neighborhood.reset_index(drop=True)
neighborhood

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,M4E,East Toronto,The Beaches,43.67709,-79.29547
5,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306
6,M5G,Downtown Toronto,Central Bay Street,43.65609,-79.38493
7,M6G,Downtown Toronto,Christie,43.66869,-79.42071
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.38258
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.66505,-79.43891


#### Create Toronto map using Folium

In [16]:
# Base map centred on the Toronto coordinates
map_toronto_1 = folium.Map(location=[trt_latitude, trt_longitude], zoom_start=11)

# Walk the four columns in lockstep, one tuple per row
marker_rows = zip(
    neighborhood['Latitude'],
    neighborhood['Longitude'],
    neighborhood['Borough'],
    neighborhood['Neighbourhood'],
)

# Draw one blue circle per row with a "<neighbourhood>, <borough>" popup
for lat, lng, borough, hood_name in marker_rows:
    popup_text = '{}, {}'.format(hood_name, borough)
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto_1)

map_toronto_1

In [17]:
# Feature matrix for clustering: one row per postal code, columns are (latitude, longitude)
X = neighborhood[['Latitude', 'Longitude']].values
X

array([[ 43.65512, -79.36264],
       [ 43.66253, -79.39188],
       [ 43.65739, -79.37804],
       [ 43.65215, -79.37587],
       [ 43.67709, -79.29547],
       [ 43.64536, -79.37306],
       [ 43.65609, -79.38493],
       [ 43.66869, -79.42071],
       [ 43.6497 , -79.38258],
       [ 43.66505, -79.43891],
       [ 43.64285, -79.38076],
       [ 43.64848, -79.41774],
       [ 43.68375, -79.35512],
       [ 43.6471 , -79.38153],
       [ 43.63941, -79.42676],
       [ 43.66797, -79.31467],
       [ 43.6484 , -79.37914],
       [ 43.66213, -79.33497],
       [ 43.72843, -79.38713],
       [ 43.71208, -79.41848],
       [ 43.71276, -79.38851],
       [ 43.69479, -79.4144 ],
       [ 43.65973, -79.46281],
       [ 43.71458, -79.40668],
       [ 43.67484, -79.40452],
       [ 43.64777, -79.44989],
       [ 43.7034 , -79.38659],
       [ 43.66311, -79.4018 ],
       [ 43.64982, -79.47548],
       [ 43.69048, -79.38318],
       [ 43.65351, -79.39722],
       [ 43.68568, -79.40237],
       [

In [18]:
# How many clusters to split the postal codes into
kclusters = 4

# Fit k-means on the coordinate matrix; the fixed random_state keeps the result repeatable
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(X)

# Peek at the cluster label assigned to the first ten rows
kmeans.labels_[:10]

array([3, 3, 3, 3, 0, 3, 3, 1, 3, 1], dtype=int32)

In [19]:
# Add clustering labels as the first column.
# DataFrame.insert raises ValueError if the column already exists, so on a re-run of this
# cell the existing column is overwritten in place instead of being inserted again.
if 'Cluster Labels' in neighborhood.columns:
    neighborhood['Cluster Labels'] = kmeans.labels_
else:
    neighborhood.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhood

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,3,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66253,-79.39188
2,3,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
3,3,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
4,0,M4E,East Toronto,The Beaches,43.67709,-79.29547
5,3,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306
6,3,M5G,Downtown Toronto,Central Bay Street,43.65609,-79.38493
7,1,M6G,Downtown Toronto,Christie,43.66869,-79.42071
8,3,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.38258
9,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.66505,-79.43891


Finally, let's visualize the resulting clusters.

In [20]:
# Map centred on Toronto that receives one marker per row, coloured by its cluster
map_clusters = folium.Map(location=[trt_latitude, trt_longitude], zoom_start=11)

# One evenly spaced colour from the rainbow colormap per cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(neighborhood['Latitude'], neighborhood['Longitude'], neighborhood['Neighbourhood'], neighborhood['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The cluster labels include 0, so `cluster - 1` selects the last colour for label 0
    # through negative indexing; every cluster still gets its own distinct colour.
    cluster_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters