<h1 style="text-align:center">Part 1: Web Scraping and Data Cleaning</h1>

In [None]:
import selenium


from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Start a Chrome session using the value returned by ChromeDriverManager().install().
# NOTE(review): the value is passed to webdriver.Chrome positionally; confirm this
# matches the constructor signature of the installed Selenium version.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(chromedriver_path)

# Wikipedia list of Canadian postal codes beginning with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
driver.get(url)

# Last expression of the cell: the page's HTML source becomes the cell output.
driver.page_source

In [97]:
# Parse the HTML held by the Selenium `driver` (created in the previous cell)
# into a three-column DataFrame: Postal Code / Borough / Neighborhood.
wikiPage = driver.page_source

from bs4 import BeautifulSoup

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(wikiPage, "html.parser")

# The data table is taken to be the first <tbody> in the document.
# NOTE(review): this relies on the page layout; confirm if the page changes.
table = soup.find("tbody")
if table is None:
    raise ValueError("No <tbody> element found in the downloaded page")
rows = table.find_all("tr")

columnNames = ["Postal Code", "Borough", "Neighborhood"]
dictToronto = {name: [] for name in columnNames}
for row in rows:
    cells = row.find_all("td")

    # Rows without exactly one <td> per column (e.g. a header row made of <th>
    # cells) are skipped so every column list keeps the same length.
    if len(cells) != len(columnNames):
        continue

    for name, cell in zip(columnNames, cells):
        # get_text() collects all text inside the cell, even when the cell
        # starts with a tag such as a link; .next_element would return that
        # tag object instead of a string.
        dictToronto[name].append(cell.get_text().strip())

import pandas as pd

dfToronto = pd.DataFrame.from_dict(dictToronto)

In [117]:
"""Data Cleaning"""

# Trim whitespace from both ends of every value, one column at a time.
def _strip_column(column):
    """Return ``column`` with surrounding whitespace removed from each value."""
    return column.str.strip()

dfToronto = dfToronto.apply(_strip_column)

In [162]:
import numpy as np

# Treat the "Not assigned" placeholder as missing, discard every row whose
# Borough is missing, and renumber the remaining rows from 0.
dfToronto = (
    dfToronto
    .replace("Not assigned", np.nan)
    .dropna(subset=["Borough"])
    .reset_index(drop=True)
)

# Count the Neighborhood values that are still missing after the drop.
missing_neighborhoods = dfToronto["Neighborhood"].isna().sum()
print("Neighborhoods with no assigned value: ", missing_neighborhoods)

Neighborhoods with no assigned value:  0


In [163]:
# Report the (rows, columns) dimensions of the cleaned table, then leave the
# frame as the last expression so the notebook renders it.
n_rows, n_cols = dfToronto.shape
print("Shape of the final DataFrame: ", (n_rows, n_cols))
dfToronto

Shape of the final DataFrame:  (103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


<h1 style="text-align: center">Part 2: Latitudes and Longitudes</h1>

In [238]:
# The Geocoder service returned nothing for any address when this was written
# (21 June 2020, 13:20 GMT-5), so the coordinates are read from a local CSV.
df_latlng = pd.read_csv("Geospatial_Coordinates.csv")

# Inner join on "Postal Code": only codes present in both tables are kept.
dfToronto_latlng = dfToronto.merge(df_latlng, how="inner", on="Postal Code")
dfToronto_latlng

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
