In [1]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

### Scraping of table from specified URL

In [4]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list of DataFrames, one per table found on the page.
# header=0 uses each table's first row as column names; lxml does the parsing.
tables = pd.read_html(
    url,
    header=0,
    flavor=["lxml"],
)

# Preview the first table on the page.
tables[0]

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Conversion of table to dataframe

In [6]:
# Wrap the first scraped table under a short working name for the cleaning steps.
df = pd.DataFrame(data=tables[0])

In [7]:
# (rows, columns) of the scraped table, before any filtering.
df.shape

(180, 3)

In [11]:
# Peek at the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Removal of rows with no assigned borough

In [19]:
# Keep only the postal codes whose borough is something other than 'Not assigned'.
df1 = df[df["Borough"].ne("Not assigned")]

In [20]:
# (rows, columns) remaining after dropping the 'Not assigned' boroughs.
df1.shape

(103, 3)

In [22]:
# First rows of the filtered frame; the row labels are still the ones inherited from df.
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [26]:
# reset_index returns a NEW frame; without assigning it back, df1 would keep the
# gappy row labels left over from filtering (2, 3, 4, ...). drop=True discards the
# old labels instead of adding them as a column. Re-running this cell is harmless.
df1 = df1.reset_index(drop=True)
df1

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### Checking for duplicate postal codes

In [28]:
# Group the rows by postal code. NOTE: despite its name, df2 is the mapping returned
# by .groups (one entry per distinct postal code), not a DataFrame.
df2 = df1.groupby(by=["Postal Code"]).groups

In [29]:
# Number of distinct postal codes; compare it with the row count of df1.
len(df2)

103

##### The length of df2 equals the number of rows in df1, which implies that there are no duplicate postal codes.

### Checking for missing neighborhoods

In [31]:
# count() reports the number of non-null entries per column; a column with
# missing values would show fewer entries than df1 has rows.
df1.count()

Postal Code     103
Borough         103
Neighborhood    103
dtype: int64

##### The number of non-null values in each column equals the number of rows in df1, which implies that there are no missing neighborhoods.

### Size of the resulting dataframe of neighborhoods

In [32]:
# Final (rows, columns) of the cleaned neighborhood table.
df1.shape

(103, 3)

##### The dataframe contains 103 rows.