In [1]:
import numpy as np
import pandas as pd
# URL of the Wikipedia page that lists Canadian postal codes starting with "M".
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
!pip install beautifulsoup4



In [5]:
from requests import get

In [6]:
# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page in the cells below.
response = get(link, timeout=30)
response.raise_for_status()

In [7]:
from bs4 import BeautifulSoup
# Parse the downloaded HTML with Python's built-in 'html.parser' backend,
# then display the type of the resulting object.
soup = BeautifulSoup(markup=response.text, features='html.parser')
type(soup)

bs4.BeautifulSoup

Table cells are tagged `td`, so find every `td` element on the page.

In [9]:
# Collect every <td> element in the parsed page. find_all is the current
# Beautiful Soup name; findAll is its deprecated camelCase alias.
tag = soup.find_all('td')

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
</td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
</td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
<

Convert the result set of tags into a plain Python list.

In [11]:
# Copy the tags into a plain list so they can be sliced by position below.
tag_list = [*tag]

Build one list per column from the list of tags.
Note that `tag_list` is sliced with a step of 3 in each list comprehension, so each list takes every 3rd item, starting at offset 0, 1, or 2.

In [12]:
# Every third cell starting at position 0 holds the postal code text.
PostalCode = [cell.text for cell in tag_list[::3]]

In [13]:
# Every third cell starting at position 1 holds the borough text.
Borough = [cell.text for cell in tag_list[1::3]]

In [14]:
# Every third cell starting at position 2 holds the neighborhood text. These
# cells end with a newline in the page source; rstrip('\n') removes it only
# when it is present, instead of unconditionally dropping the last character
# (which would truncate a real letter if a cell had no trailing newline).
Neighborhood = [tag.text.rstrip('\n') for tag in tag_list[2::3]]

In [15]:
# Zip the three column lists into row tuples (zip stops at the shortest list)
# and load them into a DataFrame with named columns.
df = pd.DataFrame(
    data=list(zip(PostalCode, Borough, Neighborhood)),
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

In [16]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Remove the rows whose Borough is 'Not assigned'.

In [18]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
PostalCodes = df[df['Borough'].ne('Not assigned')]

In [19]:
# Show the first ten remaining rows.
PostalCodes.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


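There are some nonsensical entries at the bottom of the dataframe. They appear because every `td` on the page was collected, including cells that do not belong to the postal code table.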

In [20]:
# Inspect the last twelve rows to see the stray entries.
PostalCodes.tail(n=12)

Unnamed: 0,PostalCode,Borough,Neighborhood
286,M8Z,Etobicoke,South of Bloor
288,,\n,
289,\n\n\nNL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n...,NL\n,NS
290,PE\n,NB\n,QC
291,ON\n,MB\n,SK
292,AB\n,BC\n,NU/NT
293,YT\n,A\n,B
294,C\n,E\n,G
295,H\n,J\n,K
296,L\n,M\n,N


These can be removed by filtering out the rows whose Borough column contains '\n'.

In [21]:
# Drop every row whose Borough text contains a newline character.
PostalCodes = PostalCodes.loc[~PostalCodes['Borough'].str.contains('\n')]

In [22]:
# Check the last ten rows after the newline filter.
PostalCodes.tail(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
269,M8Y,Etobicoke,Mimico NE
270,M8Y,Etobicoke,Old Mill South
271,M8Y,Etobicoke,The Queensway East
272,M8Y,Etobicoke,Royal York South East
273,M8Y,Etobicoke,Sunnylea
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


In [23]:
# Check the first ten rows after the newline filter.
PostalCodes.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated by a comma (for example, "Harbourfront, Regent Park").

In [30]:
# Collapse rows sharing a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of the group's neighborhoods.
PostalCodes = (
    PostalCodes
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

In [31]:
# Preview the first five grouped rows.
PostalCodes.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [33]:
# Wherever Neighborhood is exactly 'Not assigned', substitute the row's
# Borough; all other Neighborhood values are kept as they are.
PostalCodes['Neighborhood'] = PostalCodes['Neighborhood'].mask(
    PostalCodes['Neighborhood'] == 'Not assigned',
    PostalCodes['Borough'],
)

In the last cell of your notebook, use the `.shape` attribute to print the number of rows of your dataframe.

In [36]:
# Display the (rows, columns) tuple of the final dataframe.
PostalCodes.shape

(103, 3)