In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
soup = BeautifulSoup(url,'lxml')

In [11]:
# Scrape the data and build a pandas dataframe

prop = pd.DataFrame()

for i in soup.find_all('td'):
    try:
        postalcode = i.p.b.text
    except:
        None
    try:
        borough = i.span.text.split("(")[0]
    except:
        None
    try:
        neighborhood = i.span.text.split("(")[1].rstrip(")")
    except:
        None
    data_dict = {'PostalCode':postalcode,
                'Borough':borough,
                'Neighborhood':neighborhood}
    prop = prop.append(data_dict, ignore_index=True)
prop.head()

Unnamed: 0,Borough,Neighborhood,PostalCode
0,Not assigned,Mimico NW / The Queensway West / South of Bloo...,M1A
1,Not assigned,Mimico NW / The Queensway West / South of Bloo...,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Regent Park / Harbourfront,M5A


In [12]:
# Dropping the NaN values from the dataframe

prop.replace(['Not assigned',""],np.nan,inplace=True)
prop.dropna(inplace=True)

In [13]:
# Assigning the Borough Name for neighbourhoods that don't have a neighborhood.

prop.loc[168]['Neighborhood'] = "East Toronto"
prop.loc[6]['Neighborhood'] = "Queen's Park"
prop.loc[148]['Neighborhood'] = "Downtown Toronto"
prop.loc[114]['Neighborhood'] = "Mississauga"

In [14]:
# Cleaning the names of the neighborhoods

borough = []
for i in prop.Borough:
    if i == "MississaugaCanada Post Gateway Processing Centre":
        borough.append("Mississauga")
    elif i == "East TorontoBusiness reply mail Processing Centre969 Eastern":
        borough.append("East Toronto")
    elif i == "Downtown TorontoStn A PO Boxes25 The Esplanade":
        borough.append("Downtown Toronto")
    elif i == "Queen's Park / Ontario Provincial Government":
        borough.append("Queen's Park")
    else:
        borough.append(i)

prop['Borough'] = borough

In [15]:
# Separating multiple neighborhoods  by comma

neigh = []
for i in prop.Neighborhood:
    neigh.append(",".join(i.split("/")))
    
prop['Neighborhood'] = neigh

In [16]:
# Printing the final dataframe

prop.head()

Unnamed: 0,Borough,Neighborhood,PostalCode
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,"Regent Park , Harbourfront",M5A
5,North York,"Lawrence Manor , Lawrence Heights",M6A
6,Queen's Park,Queen's Park,M7A


In [17]:
# Printing the final shape of the dataframe
print(prop.shape)

(103, 3)


In [22]:
# getting the csv for the coordinates of the postalcode

coords = pd.read_csv('Geospatial_Coordinates_Toronto.csv')
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [27]:
# Merging dataframes to get the combined dataframe with latitudes and longitudes

final_df = pd.merge(left=prop,right=coords,left_on=prop['PostalCode'],right_on=coords['Postal Code'])
final_df.drop(['Postal Code','key_0'],axis=1,inplace=True)
final_df.head()

Unnamed: 0,Borough,Neighborhood,PostalCode,Latitude,Longitude
0,North York,Parkwoods,M3A,43.753259,-79.329656
1,North York,Victoria Village,M4A,43.725882,-79.315572
2,Downtown Toronto,"Regent Park , Harbourfront",M5A,43.65426,-79.360636
3,North York,"Lawrence Manor , Lawrence Heights",M6A,43.718518,-79.464763
4,Queen's Park,Queen's Park,M7A,43.662301,-79.389494
