Import required libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Retrieve the Wikipedia page

In [2]:
# Wikipedia page listing Canadian postal codes that begin with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the notebook from hanging forever on a stalled connection.
req = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page later.
req.raise_for_status()

Parse the HTML to get the data for the DataFrame columns

In [3]:
# Build a navigable parse tree from the response body with the 'html.parser' backend.
soup = BeautifulSoup(markup=req.text, features='html.parser')

Since the data is in the first table on the page, separate that out

In [5]:
# Grab the first <table> element in the document.
info_table = soup.find('table')

Each "row" for us is a "tr" section in the table

In [6]:
# Collect every <tr> element nested inside the table (header row included).
rows = info_table.find_all(name='tr')

In [7]:
# Peek at the first three rows to see the markup structure.
rows[:3]

[<tr>
 <th>Postal code
 </th>
 <th>Borough
 </th>
 <th>Neighborhood
 </th></tr>, <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>, <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>
 </td></tr>]

We don't want the first row

In [8]:
# Drop the header row (the one made of <th> cells). Slicing a freshly queried
# list, rather than `rows` itself, keeps this cell idempotent: re-running it
# can never discard a data row.
rows = info_table.find_all('tr')[1:]

Build new list omitting entries with Borough = Not assigned

In [9]:
# Keep only the rows whose second cell (the Borough) does not contain "Not assigned".
new_rows = [
    tr for tr in rows
    if "Not assigned" not in tr.find_all('td')[1].text
]

Break apart the three columns into lists

In [10]:
# One list per output column; all three are filled in lockstep, one entry per row.
PostalCode, Borough, Neighborhood = [], [], []
for row in new_rows:
    # Look up the row's <td> cells once and reuse them for all three columns.
    cells = row.find_all('td')
    PostalCode.append(cells[0].text)
    Borough.append(cells[1].text)
    Neighborhood.append(cells[2].text)

Notice the trailing line-break character ("\n") on each entry

In [11]:
# Show the first three raw entries of each column, one list per line.
for column in (PostalCode, Borough, Neighborhood):
    print(column[0:3])

['M3A\n', 'M4A\n', 'M5A\n']
['North York\n', 'North York\n', 'Downtown Toronto\n']
['Parkwoods\n', 'Victoria Village\n', 'Regent Park / Harbourfront\n']


Eliminate the line break character

In [12]:
# Strip surrounding whitespace (including the trailing "\n") from every entry.
# Unlike slicing with [:-1], strip() only removes whitespace, so it never eats
# a real character and the cell can be re-run safely.
PostalCode = [code.strip() for code in PostalCode]
Borough = [borough.strip() for borough in Borough]
Neighborhood = [hood.strip() for hood in Neighborhood]

In [13]:
# Show the first three cleaned entries of each column, one list per line.
print(PostalCode[:3], Borough[:3], Neighborhood[:3], sep='\n')

['M3A', 'M4A', 'M5A']
['North York', 'North York', 'Downtown Toronto']
['Parkwoods', 'Victoria Village', 'Regent Park / Harbourfront']


Change " x / y " neighborhoods to "x, y"

In [14]:
# Separator used on the page between multiple neighborhoods in one cell.
slash = " / "
# Rewrite the list in place; replace() leaves entries without the separator unchanged.
Neighborhood[:] = [name.replace(slash, ', ') for name in Neighborhood]
print(Neighborhood[2:5])

['Regent Park, Harbourfront', 'Lawrence Manor, Lawrence Heights', "Queen's Park, Ontario Provincial Government"]


Set empty (not assigned) Neighborhoods to the Borough name

In [15]:
# Where a neighborhood is blank, fall back to the borough at the same position.
for idx, (hood, boro) in enumerate(zip(Neighborhood, Borough)):
    if not hood:
        Neighborhood[idx] = boro

Create the dataframe

In [16]:
# Assemble the three parallel lists into a table, one named column per list.
# The lists are filled in lockstep by the same loop, so they have equal length.
df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})

In [17]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Check the shape!

In [18]:
# Sanity check: shape is (number of rows, number of columns).
df.shape

(103, 3)