# <font color=blue>I. Scrape Canadian postal codes from Wikipedia</font>
### <font color=green>Clean up table and read into proper pandas data frame format</font>

In [179]:
# Install beautifulsoup4
!pip install beautifulsoup4



In [180]:
# Import necessary libraries
import pandas as pd
import numpy as np
import requests

In [181]:
# Import BeautifulSoup
#  Access the url "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", download a copy of the html file 
# to local drive and name it "postal codes of Canada.html".
# Scrape all data from the url into soup variable and check what's in soup.

from bs4 import BeautifulSoup
from bs4 import SoupStrainer
# Parse the locally saved copy of the Wikipedia page.
# The parser is named explicitly so the resulting tree does not depend on which
# optional parsers happen to be installed (and so bs4 does not have to guess).
with open("postal codes of Canada.html", encoding="utf8") as fp:
    soup = BeautifulSoup(fp, "html.parser")
#soup

In [182]:
# Keep the first <table> element of the page in `table`.
# limit=1 stops the search after the first match; indexing [0] still raises
# IndexError when the page contains no table at all.
table = soup.find_all('table', limit=1)[0]
#table

In [183]:
# Collect every <tr> of the table, turn the <td> cells of each row into a list of
# newline-stripped strings, and load those lists into a pandas DataFrame.
# A row without any <td> cell yields an empty list, which pandas pads with
# missing values (see index 0 in the output below; presumably the header row).

table_rows = table.find_all('tr')
records = [
    [cell.text.strip('\n') for cell in tr.find_all('td')]
    for tr in table_rows
]

df = pd.DataFrame(records, columns=["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [184]:
# Sanity check: confirm that df is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [185]:
# Inspect the dtype of each column.
df.dtypes

PostalCode      object
Borough         object
Neighborhood    object
dtype: object

In [192]:
# Row and column count of the frame before any rows are filtered out.
df.shape

(104, 3)

In [193]:
# Keep only rows that carry a postal code with an assigned borough.
# Rows whose cells are all missing (the scraped frame starts with one at index 0)
# are removed by value with dropna instead of by position, so re-running this
# cell can never discard a valid row.
df = (
    df[df.Borough != "Not assigned"]
    .dropna(subset=["PostalCode"])
    .reset_index(drop=True)
)
df.shape

(103, 3)

In [194]:
# Preview the first rows after the filtering step.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [195]:
# Renumber the index to 0..n-1 and discard the old index values.
# The result is re-assigned rather than mutated with inplace=True, which gives
# no benefit and prevents method chaining.
df = df.reset_index(drop=True)
df.shape

(103, 3)

In [196]:
# Lift the pandas display limits on rows and columns (None = no limit), then
# render the frame. The options are set globally and stay in effect for later cells.
pd.set_option("display.max_rows", None, "display.max_columns", None)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [197]:
# Re-check the dimensions of the cleaned frame.
df.shape

(103, 3)

# <font color=blue>II. Transform the postal dataframe to include latitude and longitude information</font>
### <font color=green>Locate latitude and longitude information for each Postal Code</font>

In [199]:
# Install geocoder
!pip install geocoder



Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [200]:
# Import geocoder
import geocoder

In [201]:
!pip install pgeocode

Collecting pgeocode
  Downloading pgeocode-0.3.0-py3-none-any.whl (8.5 kB)
Installing collected packages: pgeocode
Successfully installed pgeocode-0.3.0


In [202]:
# import pgeocode
import pgeocode

In [219]:
# Example: look up a single postal code (M1R) with pgeocode.
# `eg` is a Nominatim instance created for country code 'ca'; the lookup result is
# shown as the cell output (a Series whose fields include latitude and longitude,
# per the output below). `eg` is reused by the next example cell.
eg = pgeocode.Nominatim('ca')
eg.query_postal_code('M1R')

postal_code                                    M1R
country_code                                    CA
place_name        Scarborough (Wexford / Maryvale)
state_name                                 Ontario
state_code                                      ON
county_name                            Scarborough
county_code                                    NaN
community_name                                 NaN
community_code                                 NaN
latitude                                   43.7507
longitude                                 -79.3003
accuracy                                         6
Name: 0, dtype: object

In [220]:
# Example: query M1R once, pull both coordinates out of the result, and print them.
pgeo = eg.query_postal_code('M1R')
lat_M1R, long_M1R = pgeo['latitude'], pgeo['longitude']
print("Latitude for postal code M1R in Canada is: ", lat_M1R)
print("Longitude for postal code M1R in Canada is: ", long_M1R)

Latitude for postal code M1R in Canada is:  43.7507
Longitude for postal code M1R in Canada is:  -79.3003


In [213]:
# Look up latitude and longitude for every PostalCode in the dataframe with a
# single vectorized pgeocode query (one call for the whole list instead of one
# call per row) and keep the two coordinate columns as the lists `lats` and `longs`.
# Postal codes without coordinates in the result come back as NaN (the output
# below contains one such entry).
nomi = pgeocode.Nominatim('ca')
# Pass a plain list of strings: the query returns one result row per input code.
geo = nomi.query_postal_code(df['PostalCode'].tolist())
# Guard the row-for-row alignment that the later column assignment relies on.
assert len(geo) == len(df), "pgeocode returned a different number of rows than postal codes"
lats = geo['latitude'].tolist()
longs = geo['longitude'].tolist()
print(lats)
print(longs)

[43.7545, 43.7276, 43.6555, 43.7223, 43.6641, 43.6662, 43.8113, 43.745, 43.7063, 43.6572, 43.7081, 43.6505, 43.7878, 43.7334, 43.6913, 43.6513, 43.6915, 43.6437, 43.7678, 43.6784, 43.6456, 43.6889, 43.7712, 43.7124, 43.6564, 43.6683, 43.7686, 43.8015, 43.7535, 43.7059, 43.6496, 43.6655, 43.7464, 43.7801, 43.7694, 43.6872, 43.62300000000001, 43.648, 43.7298, 43.7797, 43.739, 43.6803, 43.6469, 43.6383, 43.7122, 43.7547, 43.7334, 43.6693, 43.6492, 43.7137, 43.7598, 43.7247, 43.7915, 43.7319, 43.6561, 43.7335, 43.6934, 43.7366, 43.6952, 43.7673, 43.7568, 43.7301, 43.7113, 43.6748, 43.7068, 43.7612, 43.75, 43.7135, 43.6966, 43.6605, 43.6949, 43.7507, 43.7786, 43.7143, 43.6736, 43.6469, nan, 43.6898, 43.7946, 43.702, 43.6629, 43.6512, 43.7812, 43.6899, 43.6541, 43.8177, 43.6861, 43.6404, 43.6075, 43.7432, 43.8016, 43.6827, 43.6437, 43.6021, 43.7144, 43.834, 43.6684, 43.6492, 43.6518, 43.6656, 43.7804, 43.6325, 43.6256]
[-79.33, -79.3148, -79.3626, -79.4504, -79.3889, -79.5282, -79.193, -79.3

In [216]:
# Attach the pgeocode results as new Latitude and Longitude columns (values taken
# from the lists `lats` and `longs`), then preview the first 5 rows to verify
# that both columns were added.
df = df.assign(Latitude=lats, Longitude=longs)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


In [217]:
# Dimensions after adding the Latitude and Longitude columns.
df.shape

(103, 5)