# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup


# Download the Wikipedia page listing Canadian postal codes starting with "M"
# and parse it. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() surfaces an HTTP error here instead of
# letting an error page be parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'html5lib')





In [2]:
# Every <table> element on the page; the first one is the table scraped below.
tables = soup.select('table')
# All rows (<tr>) of that first table.
table = tables[0].find_all('tr')

In [3]:
# Accumulators, one per output column.
postcodes, broughs, neighborhoods = [], [], []

for row in tables[0].find_all('tr'):
    cells = row.find_all('td')
    if not cells:
        # Rows without <td> cells carry no data; skip them.
        continue
    # Column 0: postcode, taken as-is from the cell's first child.
    postcodes.append(cells[0].contents[0])
    # Columns 1 and 2: borough and neighbourhood text with newlines removed.
    broughs.append(cells[1].contents[0].string.replace('\n', ''))
    neighborhoods.append(cells[2].contents[0].string.replace('\n', ''))

# Assemble the three columns into a DataFrame.
df = pd.DataFrame({'Postcode': postcodes, 'Broughs': broughs, 'Neighborhoods': neighborhoods})



In [4]:
# Keep only rows with an assigned borough. The explicit .copy() makes `df` an
# independent frame, so the assignment below never targets a view of the
# unfiltered data.
df = df[df.Broughs != 'Not assigned'].copy()

# Rows whose neighbourhood is unassigned fall back to the borough name.
# A single .loc assignment is required here: chained indexing
# (df[col][mask] = ...) triggers SettingWithCopyWarning and does not update
# the frame under pandas Copy-on-Write.
nh_mask = df.Neighborhoods == 'Not assigned'
df.loc[nh_mask, 'Neighborhoods'] = df.loc[nh_mask, 'Broughs']
df.shape

(211, 3)

In [5]:
# Preview the first five cleaned rows.
df.head(n=5)

Unnamed: 0,Postcode,Broughs,Neighborhoods
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [6]:
def f(x):
    """Collapse the rows of one postcode group into a summary frame.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows sharing a postcode; must have 'Postcode', 'Broughs' and
        'Neighborhoods' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'Postcode' and 'Broughs' hold the unique values found in the
        group, and 'Neighborhoods' holds all neighbourhood names joined
        with ', '.
    """
    unique_postcodes = np.unique(x['Postcode'])
    # Rows that share a postcode usually repeat the same borough.
    unique_broughs = np.unique(x['Broughs'])
    joined_neighborhoods = ', '.join(x['Neighborhoods'])
    return pd.DataFrame({
        'Postcode': unique_postcodes,
        'Broughs': unique_broughs,
        'Neighborhoods': joined_neighborhoods,
    })

In [7]:
# Collapse to one row per postcode. Each postcode is expected to belong to a
# single borough, so check that before keeping the first borough value.
assert (df.groupby('Postcode')['Broughs'].nunique() == 1).all(), "a postcode maps to more than one borough"

# Aggregate with agg() instead of groupby.apply(): apply() with a function
# that reads the grouping column is deprecated since pandas 2.2 and leaves a
# MultiIndex on the result. as_index=False keeps 'Postcode' as the first
# regular column, followed by 'Broughs' and the ', '-joined 'Neighborhoods'.
df_group = df.groupby('Postcode', as_index=False).agg({'Broughs': 'first', 'Neighborhoods': ', '.join})

In [8]:
# (rows, columns) of the frame grouped by postcode.
df_group.shape

(103, 3)

In [9]:
# Discard the index left by the grouping step and number the rows 0..n-1.
df_group = df_group.reset_index(drop=True)

In [10]:
# Preview the first five grouped rows.
df_group.head(n=5)

Unnamed: 0,Postcode,Broughs,Neighborhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [None]:
# Geocode every postcode with the `geocoder` package (Google provider).
import geocoder

latitude = []
longitude = []
for postal_code in df_group['Postcode']:
    # Retry until coordinates come back; the loop treats a None `latlng` as a
    # failed lookup. The result must be stored in the same variable the loop
    # condition tests, otherwise the loop never terminates.
    # NOTE(review): the retry is unbounded, so this cell does not finish if
    # the provider keeps failing.
    lat_lng_coords = None
    while lat_lng_coords is None:
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

In [None]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# One geocoder client is enough; build it once instead of on every retry.
geolocator = Nominatim(user_agent="To_explorer")

latitude = []
longitude = []
for postal_code in df_group['Postcode']:
    address = '{}, Toronto, Ontario'.format(postal_code)
    # Retry until the service returns a match.
    # NOTE(review): the retry is unbounded, so this cell does not finish if
    # an address never resolves.
    location = None
    while location is None:
        location = geolocator.geocode(address)
    # Append rather than rebind, so the lists keep one entry per postcode.
    latitude.append(location.latitude)
    longitude.append(location.longitude)
    print(location.latitude, location.longitude)

In [18]:
# Location of the CSV with one latitude/longitude pair per postal code.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file exists.
GEO_COORDS_CSV = r"C:\Users\Rohan Kapoor\Desktop\Geospatial_Coordinates.csv"
df_to = pd.read_csv(GEO_COORDS_CSV)


In [19]:
# Preview the first five coordinate rows.
df_to.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Attach coordinates by matching on the postal code itself rather than on row
# position, so a reordered or incomplete CSV cannot pair a postcode with the
# wrong coordinates. validate= raises if either side repeats a postal code.
# Both key columns are kept, giving: Postcode, Broughs, Neighborhoods,
# Postal Code, Latitude, Longitude.
df_toronto = df_group.merge(
    df_to,
    left_on='Postcode',
    right_on='Postal Code',
    how='inner',
    validate='one_to_one',
)
df_toronto.head()

Unnamed: 0,Postcode,Broughs,Neighborhoods,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [21]:
# The combined frame holds the postal code twice ("Postcode" and
# "Postal Code"); keep the first and discard the duplicate column.
df_toronto = df_toronto.drop(columns=["Postal Code"])

In [22]:
# Preview the first ten rows of the final frame.
df_toronto.head(n=10)

Unnamed: 0,Postcode,Broughs,Neighborhoods,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [23]:
# (rows, columns) of the final frame.
df_toronto.shape

(103, 5)