# Part 1: Web scraping postal code information from Wikipedia

In [264]:
# Importing libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [265]:
# Fetch the Wikipedia page listing the "M" postal codes and parse its HTML.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests applies no timeout by default, so set one to avoid hanging the
# kernel on a stalled connection; raise_for_status() stops the notebook on an
# HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
wiki = response.text
soup = BeautifulSoup(wiki, 'lxml')

## Finding table header and data

In [266]:
# Locate the postal-code table, then grab its first row (the header) and
# every data cell it contains.
table = soup.find("table", attrs={"class": "wikitable sortable"})
table_body = table.tbody
header_data = table_body.find_all("tr")[0]
table_data = table_body.find_all("td")

# Per-column lists, populated by a later cell.
postcode, borough, neighbourhood = [], [], []

## Extract the column headings from the header row

In [267]:
# Normalise each header cell's text: newlines become spaces, surrounding
# whitespace is trimmed, and the remaining spaces turn into underscores.
headings = [
    th.text.replace('\n', ' ').strip().replace(' ', '_')
    for th in header_data.find_all("th")
]

print(headings)

['Postal_Code', 'Borough', 'Neighbourhood']


## Looping on table data and appending to lists.
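Step through the cells three at a time: the table cells come back as one flat list, so each group of three consecutive cells holds one row's postal code, borough and neighbourhood.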

In [268]:
# The cells arrive as one flat list, row after row, so every run of three
# consecutive cells is one (postal code, borough, neighbourhood) record.
N_COLUMNS = 3
cell_text = [cell.text.replace('\n', '').strip() for cell in table_data]

# Derive the extent from the data instead of hard-coding 540, so the cell keeps
# working when the page gains or loses rows; a trailing partial row is ignored.
n_complete = len(cell_text) - len(cell_text) % N_COLUMNS

# Rebuild the lists from scratch (rather than appending) so that re-running
# this cell does not duplicate every record.
postcode = cell_text[0:n_complete:N_COLUMNS]
borough = cell_text[1:n_complete:N_COLUMNS]
neighbourhood = cell_text[2:n_complete:N_COLUMNS]

Create the DataFrame and drop the rows whose borough is 'Not assigned'.

In [269]:
# Assemble one record per table row, labelled with the scraped headings.
records = list(zip(postcode, borough, neighbourhood))
df = pd.DataFrame(records, columns=headings)

# Keep only the rows with a real borough and renumber the index from zero.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,Postal_Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [270]:
# Number of postal codes per borough, most frequent first.
borough_counts = df['Borough'].value_counts()
borough_counts

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
East York            5
York                 5
Mississauga          1
Name: Borough, dtype: int64

In [271]:
# Confirm that no postal code appears more than once in the cleaned frame.
codes_unique = df['Postal_Code'].is_unique
print("Postal codes are unique:", codes_unique)

Postal codes are unique: True


In [272]:
# List any rows still carrying the 'Not assigned' neighbourhood placeholder;
# an empty result means every neighbourhood has a real name.
unassigned = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned]

Unnamed: 0,Postal_Code,Borough,Neighbourhood


In [273]:
# Dimensions of the cleaned frame as (rows, columns).
df.shape

(103, 3)