# This notebook extracts information from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M and puts it into a properly formatted DataFrame

## Import libraries.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

Get data from website.

In [2]:
# Download the Wikipedia page that lists the Canadian "M" postal codes.
# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
r = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page as data.
r.raise_for_status()
data = r.text
# Parse the HTML with the lxml parser so the table rows can be searched below.
soup = BeautifulSoup(data, 'lxml')

# Uncomment to inspect the parsed HTML:
# print(soup.prettify())

Find the table contents.

In [3]:
# For every <tr> element on the page, split its text on newlines and keep
# fields 1..3 (field 0 is skipped -- presumably the empty string before the
# row's leading newline; verify against the page markup).
results = [row.text.split('\n')[1:4] for row in soup.find_all('tr')]
print(len(results))

294


Observe the dataset.

In [4]:
# print(results)

In [5]:
# Keep only the first 289 scraped rows: one header row plus the data rows.
# NOTE(review): the rows after that presumably come from other tables on the
# page -- confirm by inspecting `results` if the page layout changes.
POSTAL_TABLE_ROW_COUNT = 289
results = results[:POSTAL_TABLE_ROW_COUNT]

In [6]:
# print(results)

In [7]:
# Convert the list of row lists into a NumPy array so the header row
# (results[0]) and the data rows (results[1:]) can be sliced off below.
# Assumes every remaining row has the same number of fields -- TODO confirm.
results = np.array(results)

Create DataFrame and put data into DataFrame.

In [8]:
# The first scraped row holds the column names; everything after it is data.
header_row = results[0]
body_rows = results[1:]
df = pd.DataFrame(data=body_rows, columns=header_row)

In [9]:
# Preview the first five rows of the freshly built frame.
df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
# Number of rows per postcode; values above 1 mean a postcode spans several rows.
postcode_counts = df['Postcode'].value_counts()
postcode_counts

M9V    8
M8Y    8
M5V    7
M8Z    5
M9B    5
M4V    5
M9R    4
M6M    4
M1V    4
M9C    4
M1T    3
M1P    3
M5R    3
M1K    3
M6K    3
M6L    3
M1M    3
M1L    3
M1C    3
M5T    3
M5J    3
M8X    3
M2J    3
M1E    3
M5H    3
M8V    3
M3H    3
M6R    2
M3C    2
M6P    2
      ..
M2W    1
M9K    1
M3X    1
M7S    1
M2N    1
M4E    1
M2T    1
M1A    1
M9G    1
M1Y    1
M1G    1
M1W    1
M6Z    1
M3A    1
M6Y    1
M9T    1
M2G    1
M7E    1
M7K    1
M9J    1
M7H    1
M2H    1
M9N    1
M3B    1
M3E    1
M8T    1
M8P    1
M8S    1
M7V    1
M6V    1
Name: Postcode, Length: 180, dtype: int64

In [11]:
# Number of rows per neighbourhood name, including the 'Not assigned' placeholder.
neighbourhood_counts = df['Neighbourhood'].value_counts()
neighbourhood_counts

Not assigned                       78
Runnymede                           2
St. James Town                      2
Woodbine Gardens                    1
Bedford Park                        1
Sunnylea                            1
Birch Cliff                         1
The Danforth West                   1
Weston                              1
West Deane Park                     1
The Beaches West                    1
Cloverdale                          1
Cliffside                           1
Stn A PO Boxes 25 The Esplanade     1
Parkdale Village                    1
Downsview Central                   1
Christie                            1
Milliken                            1
Del Ray                             1
Old Burnhamthorpe                   1
University of Toronto               1
Eringate                            1
Downsview North                     1
Morningside                         1
Humberlea                           1
The Queensway East                  1
Ionview     

In [12]:
# Number of rows per borough, including the 'Not assigned' placeholder.
borough_counts = df['Borough'].value_counts()
borough_counts

Not assigned        77
Etobicoke           45
North York          38
Downtown Toronto    37
Scarborough         37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

Replace 'Not assigned' with np.nan.

In [13]:
# Turn the 'Not assigned' placeholder into a real missing value (NaN) in every
# column so pandas' missing-data helpers can be used in the following cells.
df.replace(to_replace='Not assigned', value=np.nan, inplace=True)
print(df.shape)
df.head(n=10)

(288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,
9,M8A,,


Drop the rows whose 'Borough' is NaN.

In [14]:
# Remove every row that has no borough; the original index labels are kept,
# so the index has gaps afterwards.
df.dropna(subset=['Borough'], axis='index', inplace=True)
print(df.shape)
df.head(n=10)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Where the Neighbourhood is NaN, fill it with the row's Borough.

In [15]:
# Rows that have a borough but no neighbourhood get the borough name as their
# neighbourhood. A boolean mask does this in one vectorised assignment
# instead of a Python-level loop over every index label.
missing_neighbourhood = df['Neighbourhood'].isna()
df.loc[missing_neighbourhood, 'Neighbourhood'] = df.loc[missing_neighbourhood, 'Borough']
# Report how many rows were filled.
print(missing_neighbourhood.sum())


1


Group the rows by Postcode and Borough, joining their Neighbourhoods into one comma-separated string.

In [16]:
# One group per (Postcode, Borough) pair; the neighbourhood names inside each
# group are concatenated with "," (no space) into a single string.
grouped_neighbourhoods = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
result = grouped_neighbourhoods.apply(",".join)

In [17]:
# Wrap the grouped result in a DataFrame so it can be reshaped and exported.
result = pd.DataFrame(data=result)

In [18]:
# Move the group keys out of the index and back into ordinary columns,
# leaving a default 0..n-1 integer index.
result = result.reset_index()

In [19]:
# Preview the first five grouped rows.
result.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [20]:
# (rows, columns) of the grouped frame.
result.shape

(103, 3)

In [21]:
# Write the grouped table to disk. The integer index is written too (pandas'
# default), so the file has an unnamed first column.
result.to_csv("PostalCode.csv", index=True)