Connect to Wikipedia page and scrape the postal codes

In [76]:
import pandas as pd

First, fetch the Wikipedia list of Canadian postal codes beginning with "M" and parse the HTML with BeautifulSoup

In [77]:
#Fetch source postal codes and feed to BeautifulSoup
from bs4 import BeautifulSoup
import requests
# Download the Wikipedia page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops on an HTTP error instead of
# silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

# Parse the downloaded HTML using the lxml backend.
soup = BeautifulSoup(source, 'lxml')

Find the postal codes table and use its `<td>` cells to build a pandas DataFrame

In [78]:
#Fetch data from BeautifulSoup and store in Pandas DataFrame
# The first <tbody> on the page is used as the postal-code table.
# NOTE(review): this relies on the page layout -- confirm it is still the first table.
table = soup.find('tbody')
if table is None:
    raise ValueError("No <tbody> element found in the downloaded page")
table_rows = table.find_all('tr')

postal_codes = []
boroughs = []
neighbourhoods = []

#Find body data
# The first <tr> is skipped (presumably the header row, which has no <td> cells).
for table_row in table_rows[1:]:
    # Look the cells up once per row instead of once per column.
    cells = table_row.find_all('td')
    if len(cells) < 3:
        # Not a (postal code, borough, neighbourhood) row; skip it rather than
        # failing with an IndexError.
        continue
    # strip() removes surrounding whitespace (including a trailing newline)
    # without cutting off a real character when no newline is present.
    postal_codes.append(cells[0].text.strip())
    boroughs.append(cells[1].text.strip())
    neighbourhoods.append(cells[2].text.strip())

# Assemble one record per table row and load them into the frame in one go.
postal_codes_ls = list(zip(postal_codes, boroughs, neighbourhoods))
overall_df = pd.DataFrame(postal_codes_ls, columns = ['PostalCodes', 'Boroughs', 'Neighbourhoods'])



Remove any postal codes not assigned to any borough

In [152]:
# Keep only the rows whose borough has actually been assigned, selecting them
# with a boolean mask, then display the filtered frame.
overall_df = overall_df.loc[lambda frame: frame['Boroughs'] != 'Not assigned']
overall_df

Unnamed: 0,PostalCodes,Boroughs,Neighbourhoods
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


Combine the rows that share a postal code into a single row per postal code, joining their distinct boroughs and neighbourhoods with commas

In [149]:
def _join_unique(values):
    """Join the distinct entries of a Series, in first-seen order, with ', '."""
    return ', '.join(values.unique())


# Collapse the frame to one row per postal code in a single grouped pass.
# sort=False keeps the postal codes in the order they first appear in
# overall_df, and reset_index() turns the group key back into the leading
# 'PostalCodes' column.
final_df = (
    overall_df
    .groupby('PostalCodes', sort=False)
    .agg({'Boroughs': _join_unique, 'Neighbourhoods': _join_unique})
    .reset_index()
)

# Each postal code must now appear exactly once.
assert final_df['PostalCodes'].is_unique


Confirm the structure and data of the Pandas dataframe

In [150]:
# Preview the first five merged rows.
final_df.head(5)

Unnamed: 0,PostalCodes,Boroughs,Neighbourhoods
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


Get the shape of the Pandas data frame

In [127]:
# Report the size of the merged frame as (row count, column count).
len(final_df.index), len(final_df.columns)


(103, 3)