In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import numpy as np

In [2]:
# Wikipedia page listing Canadian postal codes that begin with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the kernel indefinitely.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it
# contained the postal-code table.
page.raise_for_status()

In [3]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, features='html.parser')

In [4]:
# Collect the text of the first three <td> cells of every <tr> on the page,
# skipping the very first <tr>. Rows that contain no <td> cells contribute
# an empty list.
l = [
    [cell.text for cell in table_row.find_all('td')[:3]]
    for table_row in soup.find_all('tr')[1:]
]

In [5]:
# Column names for the three scraped cells, in the order they were collected.
labels = ['PostalCode', 'Borough', 'Neighborhood']
postcodes = pd.DataFrame.from_records(data=l, columns=labels)

### Convert dtypes to str

In [6]:
# Cast each scraped column to str, one column at a time, so the string
# comparisons and joins in later cells operate on plain strings.
for column_name in ('Neighborhood', 'PostalCode', 'Borough'):
    postcodes[column_name] = postcodes[column_name].astype(str)

### Remove '\n' from end of Neighborhood Names

In [7]:
# Strip only trailing newline characters; any other whitespace is kept.
postcodes['Neighborhood'] = postcodes['Neighborhood'].str.rstrip('\n')
postcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Drop codes not assigned a Borough
Create a new dataframe called `postcodes_valid` that includes all rows where the value in the Borough column does not equal "Not assigned".

In [8]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
postcodes_valid = postcodes.loc[postcodes['Borough'] != 'Not assigned']
postcodes_valid.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Combine rows containing the same PostalCode value

In [9]:
# One row per (PostalCode, Borough) pair, with that pair's neighborhoods
# merged into a single comma-separated string.
postcodes_valid = (
    postcodes_valid
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)

### Assign Borough Name to Neighborhoods without names

In [10]:
# Where the Neighborhood is 'Not assigned', fall back to the Borough name.
# This is done on the frame itself: assigning to the row objects yielded by
# DataFrame.iterrows() is not guaranteed to write back to the DataFrame, so
# a row-by-row loop can silently leave the placeholder values in place.
postcodes_valid['Neighborhood'] = postcodes_valid['Neighborhood'].mask(
    postcodes_valid['Neighborhood'] == 'Not assigned',
    postcodes_valid['Borough'],
)

### Cleaning for invalid Borough names

In [11]:
# Drop rows whose Borough is one of the known junk values: the string 'None'
# or the newline-terminated fragments listed here.
postcodes_valid = postcodes_valid[
    ~postcodes_valid.Borough.isin(['None', '\n', 'NL\n', 'B\n', 'NS\n'])
]

In [12]:
# Display the (row count, column count) of the cleaned postal-code frame.
postcodes_valid.shape

(103, 3)