In [1]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Wikipedia page holding the table of Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page. The context manager closes the HTTP response (and its
# socket) as soon as the body has been read, instead of leaving it open for
# the lifetime of the kernel.
with urllib.request.urlopen(url) as req:
    article = req.read().decode()

In [3]:
# Parse the downloaded HTML with the built-in parser, then pick the first
# <table> element that carries the "sortable" CSS class.
soup = BeautifulSoup(markup=article, features='html.parser')
table = soup.find(name='table', attrs={'class': 'sortable'})

In [4]:
# Header labels exactly as they appear in the table's <th> cells.
ths = table.find_all('th')
headings = [th.text.strip() for th in ths]

# Scrape the table body into a list of row records. The frame is built once
# from this list: the former per-row DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every iteration.
rows = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        continue                            # header row contains only <th> cells
    pc, b, n = [td.text.strip() for td in tds]
    if b != "Not assigned":                 # skip postcodes without a borough
        rows.append({'Postcode': pc, 'Borough': b, 'Neighbourhood': n})

# Keep the column layout the notebook has always produced: the header-derived
# columns first, followed (sorted) by any record key the headers lack.
# NOTE(review): the recorded output shows the page's labels ('Postal code',
# 'Neighborhood') differ from the record keys, so those two header columns
# stay empty -- consider dropping them once downstream use is confirmed.
row_keys = ['Postcode', 'Borough', 'Neighbourhood']
columns = headings + sorted(key for key in row_keys if key not in headings)
neighbourhoods = pd.DataFrame(rows, columns=columns)

neighbourhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Neighbourhood,Postcode
0,,North York,,Parkwoods,M3A
1,,North York,,Victoria Village,M4A
2,,Downtown Toronto,,Regent Park / Harbourfront,M5A
3,,North York,,Lawrence Manor / Lawrence Heights,M6A
4,,Downtown Toronto,,Queen's Park / Ontario Provincial Government,M7A


In [5]:

# For every postcode, join all of its neighbourhood names into a single
# comma-separated string and write it back onto each row of that postcode.
by_postcode = neighbourhoods.groupby('Postcode')['Neighbourhood']
joined_names = by_postcode.transform(lambda names: ', '.join(names))
neighbourhoods['Neighbourhood'] = joined_names.values

# Collapse rows that are now exact duplicates and renumber the index from 0.
neighbourhoods = neighbourhoods.drop_duplicates()
neighbourhoods = neighbourhoods.reset_index(drop=True)
neighbourhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Neighbourhood,Postcode
0,,North York,,Parkwoods,M3A
1,,North York,,Victoria Village,M4A
2,,Downtown Toronto,,Regent Park / Harbourfront,M5A
3,,North York,,Lawrence Manor / Lawrence Heights,M6A
4,,Downtown Toronto,,Queen's Park / Ontario Provincial Government,M7A


In [6]:

# Where a neighbourhood is "Not assigned", fall back to the borough name of
# the same row. The assignment goes through .loc on the frame itself so the
# change is guaranteed to land in `neighbourhoods`; an in-place replace on a
# selected column may act on a temporary copy and leave the frame unchanged.
unassigned = neighbourhoods['Neighbourhood'] == "Not assigned"
neighbourhoods.loc[unassigned, 'Neighbourhood'] = neighbourhoods.loc[unassigned, 'Borough']
neighbourhoods.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Neighbourhood,Postcode
0,,North York,,Parkwoods,M3A
1,,North York,,Victoria Village,M4A
2,,Downtown Toronto,,Regent Park / Harbourfront,M5A
3,,North York,,Lawrence Manor / Lawrence Heights,M6A
4,,Downtown Toronto,,Queen's Park / Ontario Provincial Government,M7A


In [7]:

# Sanity check: (rows, columns) of the cleaned postcode table.
neighbourhoods.shape

(103, 5)

In [8]:
# Load the postcode coordinate lookup from the remote CSV; the recorded
# output shows the columns 'Postal Code', 'Latitude' and 'Longitude'.
geo_df = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
# Attach latitude/longitude to every postcode row. The left join keeps all
# rows of `neighbourhoods`; validate="1:1" makes pandas raise if either side
# has a repeated key. The join key coming from geo_df duplicates 'Postcode',
# so it is dropped afterwards.
result = (
    neighbourhoods
    .merge(geo_df, how='left', left_on='Postcode', right_on='Postal Code', validate="1:1")
    .drop(columns='Postal Code')
)
result.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Neighbourhood,Postcode,Latitude,Longitude
0,,North York,,Parkwoods,M3A,43.753259,-79.329656
1,,North York,,Victoria Village,M4A,43.725882,-79.315572
2,,Downtown Toronto,,Regent Park / Harbourfront,M5A,43.65426,-79.360636
3,,North York,,Lawrence Manor / Lawrence Heights,M6A,43.718518,-79.464763
4,,Downtown Toronto,,Queen's Park / Ontario Provincial Government,M7A,43.662301,-79.389494


In [10]:
# Display the merged table: boroughs and neighbourhoods with their coordinates.
result

Unnamed: 0,Postal code,Borough,Neighborhood,Neighbourhood,Postcode,Latitude,Longitude
0,,North York,,Parkwoods,M3A,43.753259,-79.329656
1,,North York,,Victoria Village,M4A,43.725882,-79.315572
2,,Downtown Toronto,,Regent Park / Harbourfront,M5A,43.654260,-79.360636
3,,North York,,Lawrence Manor / Lawrence Heights,M6A,43.718518,-79.464763
4,,Downtown Toronto,,Queen's Park / Ontario Provincial Government,M7A,43.662301,-79.389494
5,,Etobicoke,,Islington Avenue,M9A,43.667856,-79.532242
6,,Scarborough,,Malvern / Rouge,M1B,43.806686,-79.194353
7,,North York,,Don Mills,M3B,43.745906,-79.352188
8,,East York,,Parkview Hill / Woodbine Gardens,M4B,43.706397,-79.309937
9,,Downtown Toronto,,"Garden District, Ryerson",M5B,43.657162,-79.378937


In [11]:
# Sanity check: (rows, columns) of the merged table; the left join should keep
# the same number of rows as `neighbourhoods`.
result.shape

(103, 7)