# Segmenting and Clustering Neighbourhoods in Toronto


## Part 1

In [1]:
import io

import numpy as np
import pandas as pd
import requests

Fetch the HTML of the Wikipedia page, then use `read_html` to convert its tables into a list of DataFrame objects.

Remove the rows whose Borough is "Not assigned".

In [2]:
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an immediate exception
# instead of an obscure parsing failure further down.
wiki_page = requests.get(wiki, timeout=30)
wiki_page.raise_for_status()

# read_html returns one DataFrame per table found; the first one is used here.
# The downloaded bytes are wrapped in BytesIO because passing literal HTML
# to read_html is deprecated since pandas 2.1.
wiki_raw = pd.read_html(io.BytesIO(wiki_page.content), header=0)[0]

# Which Borough values occur on the rows whose Neighbourhood is 'Not assigned'?
np.unique(wiki_raw.loc[wiki_raw['Neighbourhood'] == 'Not assigned', 'Borough'])

array(['Not assigned'], dtype=object)

It turns out that every row with a "Not assigned" Neighbourhood also has a "Not assigned" Borough.
Thus we can simply remove all rows whose Neighbourhood is "Not assigned".

In [3]:
# Keep only the rows that carry a real neighbourhood name
df = wiki_raw[wiki_raw['Neighbourhood'].ne('Not assigned')]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [4]:
# Renumber the rows from 0; drop=True discards the old index (which has gaps
# after the filtering above) instead of keeping it as an extra column
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Display only: first Borough / Neighbourhood entry for each postal code
df.groupby('Postal Code').first()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


The updated Wikipedia page already merges the neighbourhoods that share a Postal Code and Borough, so we only need to check that every postal code appears exactly once.

In [6]:
# Number of distinct postal codes; compare it with the row count of df
len(pd.unique(df['Postal Code']))

103

In [7]:
# Preview a fixed sample of postal codes, shown in the order listed here
demo_postal_codes = ['M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M', 'M1R', 'M9V', 'M9L', 'M5V', 'M1B', 'M5A']
df_mapping = pd.DataFrame({'Postal Code': demo_postal_codes})
# Lookup table: postal code -> its position in demo_postal_codes (column 'index')
sort_mapping = df_mapping.reset_index().set_index('Postal Code')

# Filter first and copy afterwards, so the helper column below is written to an
# independent frame and pandas does not raise SettingWithCopyWarning
df_demo_df = df[df['Postal Code'].isin(demo_postal_codes)].copy()
df_demo_df['Postal Mapping'] = df_demo_df['Postal Code'].map(sort_mapping['index'])

# Sort by the helper column, then hide it and renumber the rows for display
df_demo_df.sort_values(by='Postal Mapping').drop(columns=['Postal Mapping']).reset_index(drop=True)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [8]:
# Dimensions of the cleaned Wikipedia table as (rows, columns)
df.shape

(103, 3)

## Part 2

In [9]:
!pip install geocoder



Import the geocoder library (for looking up latitude and longitude) and set the URL of the CSV file that contains the coordinates.

In [10]:
import geocoder 
# Address of the CSV file that is loaded with pd.read_csv in a following cell
url = 'http://cocl.us/Geospatial_data'


In [11]:
# Load the CSV found at `url` into a DataFrame and preview its first rows
df_geo = pd.read_csv(url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Column dtypes of the frame loaded from the CSV URL
df_geo.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [13]:
# Column dtypes of the frame built from the Wikipedia table
df.dtypes

Postal Code      object
Borough          object
Neighbourhood    object
dtype: object

In [14]:
# Ensure same row size before joining the df
# df_geo was read from the CSV URL and df was built from the Wikipedia table,
# so each label is paired with the frame it actually describes.
print('URL shape: ', df_geo.shape)
print('Wikipedia shape: ', df.shape)

URL shape:  (103, 3)
Wikipedia shape:  (103, 3)


Join the two DataFrames on `Postal Code` using the `merge` method, which also gives the result a clean, freshly numbered index.

In [15]:
# Inner join on the shared 'Postal Code' column: keeps the codes present in
# both frames and appends the columns of df_geo to those of df
df_joined = pd.merge(df, df_geo, on='Postal Code', how='inner')
df_joined.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [16]:
# Dimensions of the merged table as (rows, columns)
df_joined.shape

(103, 5)

In [17]:
# Preview a fixed sample of postal codes from the merged table, shown in the order listed here
demo_postal_codes = ['M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M', 'M1R', 'M9V', 'M9L', 'M5V', 'M1B', 'M5A']
df_mapping = pd.DataFrame({'Postal Code': demo_postal_codes})
# Lookup table: postal code -> its position in demo_postal_codes (column 'index')
sort_mapping = df_mapping.reset_index().set_index('Postal Code')

# Filter first and copy afterwards, so the helper column below is written to an
# independent frame and pandas does not raise SettingWithCopyWarning
df_demo_df = df_joined[df_joined['Postal Code'].isin(demo_postal_codes)].copy()
df_demo_df['Postal Mapping'] = df_demo_df['Postal Code'].map(sort_mapping['index'])

# Sort by the helper column, then hide it and renumber the rows for display
df_demo_df.sort_values(by='Postal Mapping').drop(columns=['Postal Mapping']).reset_index(drop=True)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442
