In [15]:
import requests
import pandas as pd

### Scrape the list of postal codes of Toronto from Wikipedia

In [16]:
list_of_postal_codes_of_toronto_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
list_of_postal_codes_of_toronto_page = requests.get(list_of_postal_codes_of_toronto_link)
list_of_postal_codes_of_toronto_html_text = list_of_postal_codes_of_toronto_page.text

### BeautifulSoup for reading the table of postal codes of Toronto

In [17]:
from bs4 import BeautifulSoup
beautiful_soup = BeautifulSoup(list_of_postal_codes_of_toronto_html_text, 'html.parser')
table_of_postal_codes_of_toronto = beautiful_soup.find('table', { 'class': 'wikitable sortable' })

### Convert the table of postal codes of Toronto into a pandas dataframe object

In [19]:
columns = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.read_html(str(table_of_postal_codes_of_toronto), skiprows=1)
df = pd.DataFrame.from_dict(df[0])
df.columns = columns
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M2A,Not assigned,Not assigned
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M6A,North York,Lawrence Heights


### Drop cells with a borough that is "Not assigned"

In [21]:
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


### Combine into one row when there has more than one neighborhood existing in one postal code area

In [22]:
def combine_neighborhood(series):
    return series.str.cat(sep=', ')

df_by_postal_codes = df.groupby(['PostalCode', 'Borough'])
df = df_by_postal_codes.agg({'Neighborhood': combine_neighborhood}).reset_index()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [24]:
def check_neighborhood_column_value(row):
    if row['Neighborhood'] == 'Not assigned':
        row['Neighborhood'] = row['Borough']
    return row

df = df.apply(check_neighborhood_column_value, axis=1)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Display the number of rows of data frame object df

In [27]:
df.shape[0]

103