In [1]:
import pandas as pd
import numpy as np
import urllib.request
import json
from bs4 import BeautifulSoup

# Visible confirmation that every import above completed without raising.
print("Modules imported")

Modules imported


<h3>Retrieving data from url and using BeautifulSoup to parse the data</h3>

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the page inside a `with` block so the HTTP connection is closed as soon
# as the body has been read, even if reading or decoding raises.
with urllib.request.urlopen(url) as req:
    # Decode with the charset declared in the response headers; fall back to
    # UTF-8, which is what a bare .decode() would use.
    charset = req.headers.get_content_charset() or 'utf-8'
    article = req.read().decode(charset)

In [3]:
# Build a parse tree from the downloaded HTML using the lxml parser
soup = BeautifulSoup(markup=article, features='lxml')

# Pick the <table> whose class is 'wikitable sortable'
postal_table_html = soup.find(name='table', attrs={'class': 'wikitable sortable'})

<h3>Getting the headers for the table</h3>

<p>By using find_all, we can collect the header cells, which are marked with the 'th' tag.</p>

In [4]:
# Every <tr> in the table; the first one is used as the header row.
rows = postal_table_html.find_all('tr')

first = rows[0]
headers = first.find_all('th')

# Whitespace-stripped text of each <th> cell, in document order.
headerArray = [cell.text.strip() for cell in headers]

headerArray

['Postal Code', 'Borough', 'Neighborhood']

<h3>Using a loop to read each individual row</h3>

<p>The for loop visits each row after the header and appends its postal code, borough, and neighborhood, skipping any row whose borough is 'Not assigned'.</p>

In [5]:
# Parallel lists, one entry per kept table row.
postal = []
borough = []
neighborhood = []

# rows[0] is the header row, so start from the second <tr>.
for tr in rows[1:]:
    # Strip each cell's text once instead of on every access.
    cells = [td.text.strip() for td in tr.find_all('td')]

    # Skip rows that do not carry the three expected <td> cells; indexing
    # them would raise IndexError.
    if len(cells) < 3:
        continue

    code, boro, hood = cells[:3]

    # Keep only rows whose borough has been assigned.
    if boro != 'Not assigned':
        postal.append(code)
        borough.append(boro)
        neighborhood.append(hood)

<h3>Creating the dataframe</h3>

In [6]:
# Map each scraped header name to its column of values, in table order.
table_columns = {
    headerArray[0]: postal,
    headerArray[1]: borough,
    headerArray[2]: neighborhood,
}
postal_code = pd.DataFrame(data=table_columns)

# Preview the first ten rows.
postal_code.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
# (rows, columns) of the scraped frame — a quick check of how many rows were kept.
postal_code.shape

(103, 3)

In [8]:
import os

csv_file = 'Geospatial_Coordinates.csv'

# Sort by postal code, then renumber the rows 0..n-1. Without the reset the
# frame keeps its original index labels, and index-aligned operations (for
# example pd.concat(axis=1)) would pair rows by file order rather than by the
# sorted order.
df = (
    pd.read_csv(csv_file)
    .sort_values('Postal Code')
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Order the scraped rows by postal code and renumber them 0..n-1.
# Written as a method chain that rebinds the name rather than mutating the
# frame with inplace=True, so the cell reads as one transformation.
postal_code = postal_code.sort_values('Postal Code').reset_index(drop=True)

postal_code.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
# Attach coordinates by matching on the 'Postal Code' key rather than by row
# position/index label, so every row receives its own latitude and longitude
# even if the two frames differ in order, index, or length. A left join keeps
# every scraped row; postal codes missing from the CSV get NaN coordinates.
new_postal_code = postal_code.merge(
    df[['Postal Code', 'Latitude', 'Longitude']],
    on='Postal Code',
    how='left',
)

In [18]:
# Preview the first rows of the frame that carries both location names and coordinates.
new_postal_code.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
