In [43]:
import numpy as np
import pandas as pd 
import bs4 as bs
import urllib.request

#### Using BeautifulSoup with the lxml parser to scrape the Wikipedia page that contains the data table

In [44]:
# Download the raw bytes of the Wikipedia page. The response is opened in a
# `with` block so the underlying connection is closed as soon as the body
# has been read, instead of being left open for the life of the kernel.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    source = response.read()

# Parse the downloaded document with the lxml parser.
soup = bs.BeautifulSoup(source, 'lxml')

#### Finding the table of Boroughs and Neighborhoods in Toronto within the HTML of the Wikipedia page

In [57]:
# Take the first <table> element of the parsed page.
table = soup.find('table')

# find() returns None when nothing matches; fail with a clear message here
# rather than with an opaque AttributeError on the find_all() call below.
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Every <tr> inside that table is one row.
table_rows = table.find_all('tr')

#### Making a list of the items in the table by looping over each row

In [58]:
# One inner list per table row, holding the whitespace-stripped text of each
# <td> cell in that row. A row without any <td> cells yields an empty list.
ls = [
    [cell.text.strip() for cell in table_row.find_all('td')]
    for table_row in table_rows
]

#### Creating the dataframe and cleaning up the data

In [59]:
# Build the frame first and slice it afterwards: `df1` has to exist before its
# own rows can be referenced, otherwise this cell fails with NameError on a
# fresh kernel (Restart & Run All).
df1 = pd.DataFrame(ls, columns=['PostalCode', 'Borough', 'Neighborhood'])

# Drop the first row because it does not have any values.
# iloc[1:] keeps the remaining index labels and is a no-op on an empty frame.
df1 = df1.iloc[1:]

# Drop rows with unassigned Boroughs.
df1 = df1[df1.Borough != 'Not assigned']

df1.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Downtown Toronto,Queen's Park
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern
14,M3B,North York,Don Mills North


In [62]:
# Second frame, indexed by PostalCode (in order of first appearance), whose
# single Neighborhood column is the comma-joined list of all neighborhoods
# sharing that code.
meta_df = (
    df1.groupby('PostalCode', sort=False)['Neighborhood']
    .agg(','.join)
    .to_frame()
)

meta_df.head(10)

Unnamed: 0_level_0,Neighborhood
PostalCode,Unnamed: 1_level_1
M3A,Parkwoods
M4A,Victoria Village
M5A,Harbourfront
M6A,"Lawrence Heights,Lawrence Manor"
M7A,Not assigned
M9A,Queen's Park
M1B,"Rouge,Malvern"
M3B,Don Mills North
M4B,"Woodbine Gardens,Parkview Hill"
M5B,"Ryerson,Garden District"


In [68]:
# Attach the combined neighborhood string to every row of df1, then reduce the
# result to one row per postal code:
#   1. left-merge on PostalCode (the clashing Neighborhood columns come back
#      suffixed as Neighborhood_x / Neighborhood_y)
#   2. keep only the first row for each PostalCode
#   3. drop the redundant per-row column created by the merge (Neighborhood_x)
#   4. rename the combined column (Neighborhood_y) back to Neighborhood
df2 = (
    df1.merge(meta_df, how='left', on='PostalCode')
    .drop_duplicates(subset='PostalCode')
    .drop(columns=['Neighborhood_x'])
    .rename(columns={'Neighborhood_y': 'Neighborhood'})
)

df2.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
5,M7A,Queen's Park,Not assigned
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,"Rouge,Malvern"
9,M3B,North York,Don Mills North
10,M4B,East York,"Woodbine Gardens,Parkview Hill"
12,M5B,Downtown Toronto,"Ryerson,Garden District"


In [69]:
# Keep every assigned Neighborhood as-is; where it is 'Not assigned', fall back
# to the Borough of the same row.
df2['Neighborhood'] = df2['Neighborhood'].where(
    df2['Neighborhood'] != 'Not assigned',
    df2['Borough'],
)

df2.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,"Rouge,Malvern"
9,M3B,North York,Don Mills North
10,M4B,East York,"Woodbine Gardens,Parkview Hill"
12,M5B,Downtown Toronto,"Ryerson,Garden District"


In [70]:
# Display the (rows, columns) dimensions of df2.
df2.shape

(103, 3)