# Scraping the Wikipedia page for the neighbourhoods in Toronto

First, install the `wikipedia` module, then use it to download the page and parse its postcode table.

In [1]:
!pip install wikipedia

from io import BytesIO

import pandas as pd
import wikipedia as wp
 
# Fetch the rendered HTML of the Wikipedia article as UTF-8 encoded bytes
html = wp.page("List of postal codes of Canada: M").html().encode("UTF-8")
# Hand pandas a file-like object: passing the raw markup itself to read_html
# is deprecated in recent pandas releases. The postcode table is the first
# table that read_html finds on the page.
df = pd.read_html(BytesIO(html))[0]
df.head()  # quick look to confirm that the table was parsed

Collecting wikipedia
  Downloading https://files.pythonhosted.org/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/87/2a/18/4e471fd96d12114d16fe4a446d00c3b38fb9efcb744bd31f4a
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Now clean the dataframe so that only the entries with an assigned borough are kept

In [2]:
# Keep only the rows whose borough has actually been assigned
has_borough = df['Borough'].ne('Not assigned')
df_assigned = df.loc[has_borough]
df_assigned.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


Combine the rows that share the same postcode and borough into a single row, joining their neighbourhood names with commas

In [3]:
# One output row per (Postcode, Borough) pair; the neighbourhood names of
# each group are concatenated into a single comma-separated string.
group_keys = ['Postcode', 'Borough']
grouped = df_assigned.groupby(group_keys, as_index=False)
df_cleaned = grouped.agg({'Neighborhood': ', '.join})
df_cleaned.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [4]:
# List the rows whose neighbourhood is still the "Not assigned" placeholder
still_unassigned = df_cleaned['Neighborhood'].eq('Not assigned')
df_cleaned.loc[still_unassigned]

Unnamed: 0,Postcode,Borough,Neighborhood
85,M7A,Queen's Park,Not assigned


In [5]:
# Give every postcode whose neighbourhood is still 'Not assigned' the name of
# its borough. Assigning through .loc on the frame itself, rather than calling
# replace(..., inplace=True) on the selected column, keeps the update working
# under pandas Copy-on-Write, where in-place changes made to a selected column
# no longer reach the parent frame.
unassigned = df_cleaned['Neighborhood'] == 'Not assigned'
df_cleaned.loc[unassigned, 'Neighborhood'] = df_cleaned.loc[unassigned, 'Borough']

# Double-check: this selection should now be empty
df_cleaned[df_cleaned.Neighborhood == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighborhood


In [6]:
# Show the dimensions of the cleaned frame as (number of rows, number of columns)
df_cleaned.shape

(103, 3)

# Adding the corresponding coordinates for the postcodes

Load the geographical coordinates from the csv file

In [7]:
# Remote CSV file that holds the geographical coordinates
geo_csv_url = 'http://cocl.us/Geospatial_data/Geospatial_coordinates.csv'
df_latlong = pd.read_csv(geo_csv_url)

In [8]:
# Print both shapes so the row counts of the two frames can be compared
shape_report = [
    ("Shape of df_cleaned: ", df_cleaned.shape),
    ("Shape of df_latlong: ", df_latlong.shape),
]
for caption, frame_shape in shape_report:
    print(caption, frame_shape)

Shape of df_cleaned:  (103, 3)
Shape of df_latlong:  (103, 3)


In [9]:
# Attach the coordinates by matching on the postcode instead of copying the
# columns by row position: equal shapes do not guarantee that both frames
# list the postcodes in the same order.
coord_cols = ['Latitude', 'Longitude']
# The coordinate file has three columns; the one that is not a coordinate
# is taken to be the postcode key.
key_cols = [col for col in df_latlong.columns if col not in coord_cols]
assert len(key_cols) == 1, "expected exactly one key column in df_latlong"
coords_by_postcode = df_latlong.set_index(key_cols[0])

for col in coord_cols:
    df_cleaned[col] = df_cleaned['Postcode'].map(coords_by_postcode[col])

# A postcode that is missing from the coordinate file would leave a NaN behind
assert df_cleaned[coord_cols].notna().all().all(), "postcode without coordinates"
df_cleaned.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Clustering neighbourhoods

Import Folium package, and all graphical packages for subsequent drawing of figures

In [10]:
# import folium and nominatim, latter to retrieve coordinates for Toronto
!pip install folium==0.5.0
import folium
from geopy.geocoders import Nominatim
import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting folium==0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/07/37/456fb3699ed23caa0011f8b90d9cad94445eddc656b601e6268090de35f5/folium-0.5.0.tar.gz (79kB)
[K     |████████████████████████████████| 81kB 3.0MB/s eta 0:00:01
[?25hCollecting branca (from folium==0.5.0)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/f8/98/ff/954791afc47740d554f0d9e5885fa09dd60c2265d42578e665
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.5.0


In [11]:
# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

Pick out the entries where the Borough contains the word "Toronto"

In [13]:
# Keep only the postcodes whose borough name contains "Toronto". The explicit
# copy makes df_toronto an independent frame, so adding columns to it later
# cannot touch (or warn about) df_cleaned.
df_toronto = df_cleaned[df_cleaned.Borough.str.contains('Toronto')].copy()

# Strip the non-numeric columns so that only Latitude and Longitude are left
# for clustering. Columns are named through the `columns=` keyword because
# pandas 2.0 no longer accepts the axis as a positional argument of drop().
df_toronto_clustering_temp = df_toronto.drop(columns=['Borough', 'Postcode'])
df_toronto_clustering = df_toronto_clustering_temp.drop(columns=['Neighborhood'])

Now set up for k-means clustering, with number of clusters set to be 5

In [14]:
# Number of clusters to look for
kclusters = 5

# Build the estimator with a fixed seed, then fit it on the clustering frame
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(df_toronto_clustering)

# Peek at the cluster labels assigned to the first ten rows
kmeans.labels_[:10]

array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)

Append clustering labels to the original dataframe

In [15]:
# Add the cluster label of every row as a new last column. assign() builds a
# new frame and overwrites the column when it already exists, so this cell can
# be re-run safely; insert() would raise on the second run and needed a
# hard-coded column position.
df_toronto = df_toronto.assign(**{'Cluster Labels': kmeans.labels_})
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,1
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,1
43,M4M,East Toronto,Studio District,43.659526,-79.340923,1
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2


Now getting ready to plot. First of all, retrieve the coordinates for Toronto so that we can centre the map there

In [16]:
# Look up the coordinates of Toronto; they are used to centre the maps
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Then add the markers for each neighbourhood in Toronto

In [17]:
# Base map centred on the Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Appearance shared by every neighbourhood marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle per row, with the neighbourhood name shown in its popup
neighbourhood_rows = zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood'])
for lat, lng, neighbourhood in neighbourhood_rows:
    folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(neighbourhood, parse_html=True),
        **marker_style
    ).add_to(map_toronto)

map_toronto

Now generate the markers again, this time coloured by the cluster each neighbourhood belongs to

In [18]:
# Base map centred on the Toronto coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# Palette: one evenly spaced colour of the rainbow colormap per cluster,
# converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per neighbourhood, coloured by its cluster
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to kclusters - 1, so they index the palette
    # directly; no reliance on negative-index wraparound for cluster 0.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters