# --------------------- PART 1 -------------------------

Importing libraries

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Scraping the Wikipedia page with requests and BeautifulSoup, then displaying a first 'raw' result

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error response into an exception instead of letting an error
# page be parsed as if it were the data.
res = requests.get(url, timeout=30)
res.raise_for_status()

soup = BeautifulSoup(res.content, 'lxml')

# The postal-code table is taken to be the first <table> element on the page.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# read_html returns one DataFrame per parsed table. The HTML is wrapped in
# StringIO because passing literal HTML as a plain string is deprecated in
# recent pandas. The parsed frame is used directly: the former
# to_json -> read_json round-trip added nothing.
df_as_list = pd.read_html(StringIO(str(table)))
df = df_as_list[0]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


Where the Neighbourhood is 'Not assigned', the value of the Borough column is used as the Neighbourhood.
The affected rows are displayed before and after the replacement to verify the result.

In [3]:
# Pre-check: rows that have a Borough but whose Neighbourhood is still the
# 'Not assigned' placeholder.
df.query("Neighbourhood == 'Not assigned' and Borough != 'Not assigned'")

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M9A,Queen's Park,Not assigned


In [4]:
# Wherever Neighbourhood holds the 'Not assigned' placeholder, take the value
# from the Borough column; every other Neighbourhood is left untouched.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'].eq('Not assigned'),
    df['Borough'],
)

In [5]:
# Post-check: select rows with a Borough whose Neighbourhood is still
# 'Not assigned'; an empty result means none are left.
df.loc[df['Neighbourhood'].eq('Not assigned') & df['Borough'].ne('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood


In [6]:
# Spot-check the row(s) for postal code M9A.
df[df['Postcode'].eq('M9A')]

Unnamed: 0,Postcode,Borough,Neighbourhood
9,M9A,Queen's Park,Queen's Park


Replacing the Borough value 'Not assigned' with NaN, before dropping the corresponding rows

In [7]:
# Mark unassigned boroughs as missing. The result is assigned back to the
# column: calling replace(..., inplace=True) on df['Borough'] acts on an
# intermediate Series, which pandas warns about and which no longer updates
# df once Copy-on-Write is enabled.
df['Borough'] = df['Borough'].replace("Not assigned", np.nan)

In [8]:
# Drop every row whose Borough is missing. Rebinding df instead of using
# inplace=True keeps the step chainable; rows are already the default axis,
# so axis=0 is not needed.
df = df.dropna(subset=["Borough"])

Renaming the 'Postcode' column to 'PostalCode'

In [9]:
# Rename the 'Postcode' column to 'PostalCode'. The renamed frame is rebound
# to df rather than mutated with inplace=True.
df = df.rename(columns={"Postcode": "PostalCode"})

Finally, grouping the rows by postal code and borough, joining the Neighbourhood values of each group into a single comma-separated string.

In [10]:
# Collapse to one row per (PostalCode, Borough) pair, joining that pair's
# Neighbourhood values with ', ', then turn the group keys back into columns.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

In [11]:
# Preview the first 15 rows of the grouped frame.
df.iloc[:15]

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [12]:
# Final size of the frame as (number of rows, number of columns).
len(df.index), len(df.columns)

(103, 3)