# Segmenting and Clustering Neighborhoods in Toronto

In [26]:
# Import libraries: pandas for the tabular work, NumPy for vectorised selection
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): no category is given, so this also hides pandas warnings such as
# SettingWithCopyWarning — consider narrowing the filter.
warnings.filterwarnings("ignore")

### Read table from the html page and create the dataframe

In [12]:
# Parse the HTML tables found on the Wikipedia page, taking the first row of each as its header
wiki_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", header=0)
# The table with the postal codes of Canada is the first one on the page
canada_pc = wiki_tables[0]

In [21]:
# Discard the postal codes whose Borough is the placeholder 'Not assigned'
has_borough = canada_pc['Borough'].ne('Not assigned')
canada_pc_clean = canada_pc.loc[has_borough]

In [27]:
# Use the Borough name as the neighbourhood wherever Neighbourhood is 'Not assigned'.
# .assign() returns a new frame rather than writing a column into the filtered
# selection of canada_pc, which is the chained-assignment pattern behind pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
canada_pc_clean = canada_pc_clean.assign(
    neighbourhood_new=np.where(
        canada_pc_clean['Neighbourhood'] == 'Not assigned',
        canada_pc_clean['Borough'],
        canada_pc_clean['Neighbourhood'],
    )
)

In [41]:
# Collapse the table to one row per (postal code, borough) pair: the names of all
# neighbourhoods sharing a pair are joined into a single comma-separated string,
# and the columns are then given their final names.
neighbourhoods_by_code = canada_pc_clean.groupby(['Postcode', 'Borough'])['neighbourhood_new']
canada_pc_final = (
    neighbourhoods_by_code
    .agg(','.join)
    .reset_index()
    .rename(columns={'Postcode': 'PostalCode', 'neighbourhood_new': 'Neighbourhood'})
)

In [43]:
# Preview the top five rows of the aggregated postal-code table
canada_pc_final.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [44]:
# Report the size of the aggregated table as a (rows, columns) tuple
canada_pc_final.shape

(103, 3)

### Latitude and longitude coordinates of each neighbourhood

In [50]:
#!pip install geocoder

In [55]:
# import geocoder

# # initialize your variable to None
# lat_lng_coords = None

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

In [47]:
# The geocoder takes very long and is unreliable, so the latitudes and longitudes
# are read from a prepared CSV file instead.
geo_csv_url = 'http://cocl.us/Geospatial_data'
lat_lon = pd.read_csv(geo_csv_url)

In [68]:
# Left-join the coordinates onto the postal-code table. The key column is named
# 'PostalCode' in canada_pc_final and 'Postal Code' in lat_lon.
canada_pc_ll = pd.merge(
    canada_pc_final,
    lat_lon,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
)

In [69]:
# Drop the 'Postal Code' key column that the merge carried over from lat_lon;
# 'PostalCode' already holds the same key.
# errors='ignore' keeps the cell re-runnable: the frame is reassigned to itself, so
# on a second run the column is already gone and a plain drop would raise KeyError.
canada_pc_ll = canada_pc_ll.drop(columns=['Postal Code'], errors='ignore')

In [70]:
# Preview the top five rows of the merged table, which now has Latitude and Longitude columns
canada_pc_ll.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
