# Segmenting and clustering neighborhoods in Toronto

In [1]:
#installing folium
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

# All requested packages already installed.



In [2]:
#importing the rest of the libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import requests

from bs4 import BeautifulSoup
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

Solving environment: done

# All requested packages already installed.



# Scraping the Wikipedia page using the BeautifulSoup library


We use the `requests.get()` function from the requests library to retrieve the Wikipedia page. The retrieved HTML is stored in the `text` attribute of the response object. We then use the BeautifulSoup library to parse the page.

In [3]:
# Download the Wikipedia page listing Toronto ("M") postal codes and parse it.
# NOTE: despite its name, `html` holds the page URL, not markup; the name is
# kept unchanged in case other cells refer to it.
html = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(html, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
page.raise_for_status()
text = page.text
soup = BeautifulSoup(text, 'html.parser')

   
Now we use the `find()` function to locate the table.
We then extract the rows from the table, and use the header row to retrieve the column names.
The table contents, along with the column names, are then loaded into a pandas dataframe.

In [4]:
def _cell_text(cell):
    """Return the whitespace-stripped text of one table cell.

    Uses the cell's single string when it has one; otherwise falls back to the
    text of the first link inside the cell, and finally to all text in the cell.
    """
    text = cell.string
    if text is None and cell.a is not None:
        # Cell has several children (e.g. a link followed by a line break).
        text = cell.a.string
    if text is None:
        text = cell.get_text()
    # str() detaches the value from the parse tree; strip() removes the
    # trailing line breaks the page leaves at the end of a cell.
    return str(text).strip()


# The first table on the page holds the postal code listing.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

rows = table.find_all('tr')
columns = rows[0].find_all('th')
no_of_cols = len(columns)
# The first row holds the column names, so it is not counted as data.
no_of_rows = len(rows) - 1

# Lower-cased header texts become the dataframe column names.
cols = [header.get_text().strip().lower() for header in columns]

# Build every record first and create the dataframe once, rather than
# assigning into a pre-allocated frame one cell at a time.
records = [
    [_cell_text(cell) for cell in row.find_all('td')[:no_of_cols]]
    for row in rows[1:]
]
df = pd.DataFrame(records, columns=cols)

print(df.shape)
df.head(10)

(289, 3)


Unnamed: 0,postcode,borough,neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


   
# Preprocessing the dataframe 

Some of the rows in the dataframe have no borough assigned to them. These rows carry no useful information, so we drop them.

In [5]:
# Count the rows whose borough is 'Not assigned'.
# Comparing the whole column yields a boolean Series; summing it counts the
# True entries. int() keeps the result a plain Python integer.
count = int((df['borough'] == 'Not assigned').sum())
count


77

There are 77 rows that are not assigned a borough. We now drop these rows. For rows that have a borough but no neighbourhood assigned, we use the borough name as the neighbourhood.

In [6]:
# Keep only the rows that have a borough. A boolean mask filters the frame in
# one pass, instead of calling df.drop() (which copies the frame) per row.
# .copy() makes the result an independent frame so the assignment below is
# not applied to a view of the unfiltered data.
df = df[df['borough'] != 'Not assigned'].copy()

# Where a borough exists but the neighbourhood is unassigned, reuse the
# borough name as the neighbourhood.
unnamed = df['neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'neighbourhood'] = df.loc[unnamed, 'borough']
df.head()

Unnamed: 0,postcode,borough,neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [7]:
# Collapse the frame to one row per postal code: neighbourhoods that share a
# code are joined into a single comma-separated string, and the borough is
# taken from the first row of each group.
postcodes = list(df['postcode'].unique())

new_df = (
    df.groupby('postcode', sort=False)   # sort=False keeps first-appearance order
      .agg({'borough': 'first', 'neighbourhood': ','.join})
      .reset_index()
)
# Fix the column order explicitly, then rename the key column.
new_df = new_df[['postcode', 'borough', 'neighbourhood']]
new_df = new_df.rename(columns={'postcode': 'postal code'})
new_df.head(10)

Unnamed: 0,postal code,borough,neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


Now let's use the `shape` attribute to find the size of the above dataframe.

In [8]:
# Report the number of rows (one per postal code) and of distinct boroughs.
summary_template = 'Number of Neighbourhoods are {} and number of boroughs are {} '
print(summary_template.format(len(new_df), new_df['borough'].unique().size))

Number of Neighbourhoods are 103 and number of boroughs are 11 



___
Now, to get the coordinates for the different postal codes, we use the data from the CSV file at http://cocl.us/Geospatial_data.

In [9]:
# Load the postal code coordinate table straight from its URL.
geo_csv_url = 'http://cocl.us/Geospatial_data'
geo_df = pd.read_csv(geo_csv_url)
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Add latitude and longitude columns to new_df by looking each postal code up
# in geo_df.
# Keep only the first geo_df row per postal code and index by the code, so
# Series.map can align on it.
coords = geo_df.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Fail with a clear message if a postal code has no coordinates at all.
unmatched = new_df.loc[~new_df['postal code'].isin(coords.index), 'postal code']
if not unmatched.empty:
    raise KeyError('No coordinates found for postal codes: {}'.format(list(unmatched)))

# map() fills each column in one vectorized step and keeps the values numeric;
# pre-filling the columns with '' would leave them as object dtype.
new_df['latitude'] = new_df['postal code'].map(coords['Latitude'])
new_df['longitude'] = new_df['postal code'].map(coords['Longitude'])

# `neighborhoods` is another name for the same dataframe object, not a copy.
neighborhoods = new_df
neighborhoods

Unnamed: 0,postal code,borough,neighbourhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7533,-79.3297
1,M4A,North York,Victoria Village,43.7259,-79.3156
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.6543,-79.3606
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.7185,-79.4648
4,M7A,Queen's Park,Queen's Park,43.6623,-79.3895
5,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
6,M1B,Scarborough,"Rouge,Malvern",43.8067,-79.1944
7,M3B,North York,Don Mills North,43.7459,-79.3522
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.7064,-79.3099
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.6572,-79.3789
