# This part scrapes Wikipedia for Toronto borough data

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [74]:
# Download the Wikipedia list of Canadian postal codes beginning with "M"
# and parse the first <table> element on the page into a DataFrame.
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(WIKI_URL, timeout=30)
# Stop on an HTTP error status instead of parsing an error page as if it were data.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]
# read_html returns a list of DataFrames. The markup is wrapped in StringIO
# because passing a literal HTML string is deprecated in pandas >= 2.1.
df = pd.read_html(StringIO(str(table)))
pc_df = df[0]
pc_df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Change first row to header

In [75]:
# Promote the first row, which holds the column titles, to the header.
new_header = pc_df.iloc[0]
# .copy() makes the result an independent frame rather than a slice of the raw one.
pc_df = pc_df.iloc[1:].copy()
# .tolist() passes plain labels, so the column axis does not inherit the
# header row's index label (0) as its name.
pc_df.columns = new_header.tolist()
# Later cells select these columns by name; fail here with a clear message
# if the scraped table does not provide them.
missing = {'Postcode', 'Borough', 'Neighbourhood'} - set(pc_df.columns)
assert not missing, "scraped table is missing expected columns: {}".format(sorted(missing))
pc_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### Drop rows where borough is 'Not assigned'

In [76]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = pc_df['Borough'].ne('Not assigned')
pc_df = pc_df.loc[has_borough]
pc_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


### Combine rows that share a postcode, joining their neighbourhoods into one comma-separated value

In [77]:
# Collapse rows sharing the same (Postcode, Borough) pair into one row whose
# Neighbourhood is the ', '-joined list of that group's neighbourhoods.
by_postcode = pc_df.groupby(['Postcode', 'Borough'])
joined_names = by_postcode['Neighbourhood'].apply(lambda names: ', '.join(names))
# reset_index turns the group keys back into ordinary columns.
pc_df = joined_names.reset_index()
pc_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### If a borough is assigned but the neighbourhood is 'Not assigned', use the borough name as the neighbourhood

In [78]:
# Where the neighbourhood is 'Not assigned', fall back to that row's borough name.
# A single .loc assignment writes into pc_df itself. The chained form
# pc_df['Neighbourhood'][mask] = ... assigns through an intermediate Series,
# which triggers SettingWithCopyWarning and leaves pc_df unchanged when
# pandas copy-on-write is active.
unassigned = pc_df['Neighbourhood'] == 'Not assigned'
pc_df.loc[unassigned, 'Neighbourhood'] = pc_df.loc[unassigned, 'Borough']
pc_df.head()

In [81]:
# Report the size of the cleaned frame as (rows, columns).
n_rows, n_cols = pc_df.shape
(n_rows, n_cols)

(103, 3)