# Scraping a table from Wikipedia and cleaning the dataframe

In [241]:
# Scrape the postal-code table from Wikipedia into a DataFrame.

from io import StringIO

import pandas as pd
import requests

# The URL carries an `oldid` query parameter (presumably pinning a fixed page
# revision, so the table layout does not shift between runs).
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=890001695'

# Download the page exactly once. Fail loudly on an HTTP error or a hung
# connection instead of handing an error page to the HTML parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
webpage = response.text

# Parse the HTML that was already downloaded rather than fetching the URL a
# second time. It is wrapped in StringIO because recent pandas versions
# deprecate passing a literal HTML string to read_html.
webpage_table = pd.read_html(StringIO(webpage))

# The postal-code table is the first table on the page.
Canada_code = webpage_table[0]

Canada_code.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [230]:
# Build the working frame from the scraped table.
import numpy as np
# Intended column names; not referenced by the cells visible here.
columns = ['PostalCode','Borough','Neighborhood']
# Rename 'Postcode' to 'PostalCode', then keep only rows whose borough is
# assigned. The original row labels are preserved (no index reset).
df_Canada = (
    Canada_code
    .rename(columns={'Postcode': 'PostalCode'})
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
)
df_Canada.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [236]:
def _join_names(values):
    """Concatenate one group's values into a single comma-separated string."""
    return ','.join(values)


# Collapse rows that share a postal code and borough into one row, joining
# the remaining column(s) with commas.
grouped = df_Canada.groupby(['PostalCode', 'Borough'], as_index=False)
df = grouped.agg(_join_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [239]:
# A row that has a borough but a "Not assigned" neighbourhood takes the
# borough name as its neighbourhood.
mask = df['Neighbourhood'].eq("Not assigned")
df['Neighbourhood'] = df['Neighbourhood'].mask(mask, df['Borough'])


df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [240]:
# Report the size of the cleaned frame; df.shape is a (rows, columns) pair.
n_rows, n_cols = df.shape
print('There are {} rows and {} columns in this dataframe'.format(n_rows, n_cols))

There are 103 rows and 3 columns in this dataframe
