# Installing and importing the necessary libraries

In [1]:
! pip install pandas
import pandas as pd
! pip install lxml
! pip install html5lib
! pip install beautifulsoup4
print("Libraries installed")

Libraries installed


# Obtaining the data from Wikipedia

In [2]:
# Fetch the Wikipedia page and parse every HTML table carrying the
# "wikitable" CSS class.
from pandas.io.html import read_html

page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wikitables = read_html(page, attrs={"class": "wikitable"})

# Report how many tables matched, then display the (rows, columns) of the first.
num_tables = len(wikitables)
print("Exracted {num} wikitables".format(num=num_tables))
wikitables[0].shape

Exracted 1 wikitables


(288, 3)

# Converting the Wikitable into a Pandas DataFrame

In [3]:
# read_html already returns pandas DataFrames, so no conversion is needed.
# `wiki` keeps pointing at the scraped table, while `df` gets its own copy so
# that later work on `df` cannot alter the raw table.
wiki = wikitables[0]
df = wiki.copy()

# The following cells rely on these three columns; stop here with a clear
# message if the scraped page does not provide them.
expected_columns = {"Postcode", "Borough", "Neighbourhood"}
missing_columns = expected_columns - set(df.columns)
assert not missing_columns, "Scraped table is missing columns: {}".format(sorted(missing_columns))

# Dropping the rows whose Borough is "Not assigned"

In [4]:
# Keep only the rows whose Borough is something other than the
# "Not assigned" placeholder, then display the remaining (rows, columns).
new_df = df.loc[df["Borough"] != "Not assigned"]
new_df.shape

(211, 3)

# Combining rows with the same Postcode, separating the Neighbourhoods with a comma

In [5]:
# Collapse rows that share a Postcode/Borough pair into one row whose
# Neighbourhood is the comma-separated list of the original values.
# sort=False keeps the groups in their order of first appearance.
grouped = (
    new_df
    .groupby(["Postcode", "Borough"], sort=False)["Neighbourhood"]
    .agg(", ".join)
    .reset_index()
)
# Preview the first rows of the result.
grouped.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


# Replacing all "Not assigned" Neighbourhoods with the name of the Borough

In [6]:
# Rows whose Neighbourhood is still the "Not assigned" placeholder take the
# Borough name of the same row instead.
unassigned = grouped["Neighbourhood"] == "Not assigned"
grouped.loc[unassigned, "Neighbourhood"] = grouped.loc[unassigned, "Borough"]
# Preview the first rows to confirm the replacement.
grouped.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


# What is the shape of the dataframe?

In [7]:
# Final size of the cleaned table, displayed as (rows, columns).
len(grouped.index), len(grouped.columns)

(103, 3)