In [40]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Scrape list of postal codes of Canada

In [41]:
# Download the Wikipedia page that lists the postal codes starting with "M".
list_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(list_url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
source = response.text

In [42]:
# The downloaded document is an HTML page, so parse it with an HTML parser;
# the 'xml' feature is Beautiful Soup's XML-only mode and requires lxml.
soup = BeautifulSoup(source, 'html.parser')

In [43]:
# Take the first <table> element of the parsed page.
table = soup.find('table')
# find() returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None.
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

In [44]:
# The dataframe consists of three columns: postal code, borough and neighborhood.
column_names = ['Postal_Code', 'Borough', 'Neighborhood']
# Start from an empty frame that only carries the column labels.
df = pd.DataFrame(columns=column_names)

In [45]:
# Collect the postal code, borough and neighborhood from every table row.
# Rows that do not contain exactly three <td> cells are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == 3:
        rows.append(row_data)

# Build the frame once from the collected rows instead of appending row by row:
# this avoids quadratic growth and makes the cell safe to re-run (no duplicates).
df = pd.DataFrame(rows, columns=column_names)

In [46]:
# Preview the first rows of the scraped dataframe.
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [47]:
# Clean the dataframe.
# Keep only rows with an assigned borough (the `!= 0` check is retained from
# the original filter) and renumber the index from zero.
df = df[(df.Borough != 'Not assigned') & (df.Borough != 0)].reset_index(drop=True)

# A neighborhood that is 'Not assigned' takes the name of its borough.
# Done as one vectorised replacement: writing through df.iloc[i][2] is chained
# assignment and may update a temporary copy instead of the dataframe.
df = df.assign(
    Neighborhood=df.Neighborhood.where(df.Neighborhood != 'Not assigned', df.Borough)
)

# Collapse to one row per (postal code, borough), joining the neighborhoods with ','.
df = df.groupby(['Postal_Code', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Cleaning the data

In [48]:
# Remove rows with missing values first.
df = df.dropna()
empty = 'Not assigned'
# Flag every row in which any of the three columns still holds the placeholder,
# then keep only the rows that are not flagged.
has_placeholder = (
    (df.Postal_Code == empty)
    | (df.Borough == empty)
    | (df.Neighborhood == empty)
)
df = df[~has_placeholder]

In [49]:
# Preview the first rows after the cleaning step.
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [50]:
def neighborhood_list(grouped):
    """Return the 'Neighborhood' values of `grouped`, sorted and joined by ', '.

    `grouped` must support `grouped['Neighborhood'].tolist()`; in this notebook
    it is the sub-frame passed in by `groupby(...).apply`.
    """
    names = grouped['Neighborhood'].tolist()
    names.sort()
    return ', '.join(names)
                    
# Group by postal code and borough, then reduce each group to a single
# neighborhood string.
grp = df.groupby(['Postal_Code', 'Borough'])
joined = grp.apply(neighborhood_list)
# Turn the group keys back into columns and name the joined values 'Neighborhood'.
df2 = joined.reset_index(name='Neighborhood')

In [55]:
# Print the (rows, columns) shape of the grouped dataframe and preview its first ten rows.
print(df2.shape)
df2.head(10)

(103, 3)


Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [52]:
# Report the final (rows, columns) shape of the grouped dataframe.
print('The DataFrame shape is', df2.shape)


The DataFrame shape is (103, 3)


The grouped dataframe has 103 rows, one per postal code. Before grouping, the data had 212 rows, because each postal code can have more than one neighborhood.