# 0. Installing the prerequisite libraries

In [1]:
!pip install beautifulsoup4 requests pandas geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 15.1MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


# 1. Data scrape

In [2]:
import json
import os
import time

import geocoder
import pandas as pd
import requests
from bs4 import BeautifulSoup

### 1.1 Scraping from Wikipedia

In [3]:
def link_or_text(elem):
    """Return the stripped text of a table cell, preferring its first link.

    If ``elem`` contains an ``<a>`` descendant, the text of the first one is
    returned; otherwise the text of ``elem`` itself is returned. Leading and
    trailing whitespace is removed in both cases.
    """
    anchor = elem.select_one('a')
    # Fall back to the element itself when no (truthy) anchor was found.
    source = anchor or elem
    return source.text.strip()

In [139]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
pc_data = []

# Fail fast instead of hanging forever or silently parsing an error page:
# requests applies no timeout by default and does not raise on HTTP errors.
response = requests.get(URL, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')
tbody = soup.select_one('table.wikitable>tbody')
if tbody is None:
    # Without this guard a changed page layout surfaces as an opaque
    # AttributeError on None in the loop below.
    raise ValueError(f"No 'table.wikitable>tbody' element found at {URL}")

# Build one record per table row that has exactly three data cells
# (postcode, borough, neighbourhood).
for tr in tbody.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) != 3:
        continue
    pc, borough, neighbourhood = [link_or_text(td) for td in tds]
    if borough == 'Not assigned':
        # Ignore rows whose borough is 'Not assigned'.
        continue
    item = {'Postcode': pc, 'Borough': borough, 'Neighbourhood': neighbourhood}
    pc_data.append(item)

In [140]:
pc_data[:3]

[{'Postcode': 'M3A', 'Borough': 'North York', 'Neighbourhood': 'Parkwoods'},
 {'Postcode': 'M4A',
  'Borough': 'North York',
  'Neighbourhood': 'Victoria Village'},
 {'Postcode': 'M5A',
  'Borough': 'Downtown Toronto',
  'Neighbourhood': 'Harbourfront'}]

In [155]:
# One DataFrame row per scraped record (i.e. per neighbourhood entry).
pc_df = pd.DataFrame(data=pc_data)

In [156]:
pc_df.head()

Unnamed: 0,Borough,Neighbourhood,Postcode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,Harbourfront,M5A
3,Downtown Toronto,Regent Park,M5A
4,North York,Lawrence Heights,M6A


In [157]:
pc_df.tail()

Unnamed: 0,Borough,Neighbourhood,Postcode
206,Etobicoke,Kingsway Park South West,M8Z
207,Etobicoke,Mimico NW,M8Z
208,Etobicoke,The Queensway West,M8Z
209,Etobicoke,Royal York South West,M8Z
210,Etobicoke,South of Bloor,M8Z


In [158]:
# Keep only rows with an assigned borough.
pc_df = pc_df.loc[pc_df['Borough'] != 'Not assigned']

In [159]:
# Put the columns in reading order (postcode, borough, neighbourhood) and
# display the result.
pc_df = pc_df.loc[:, ['Postcode', 'Borough', 'Neighbourhood']]
pc_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [160]:
# Collapse the table to one row per postcode, joining the neighbourhood
# names of each postcode with ", ".
neighbourhoods_by_postcode = pc_df.groupby('Postcode')['Neighbourhood']
pc_df_2 = neighbourhoods_by_postcode.apply(', '.join).reset_index()
pc_df_2

Unnamed: 0,Postcode,Neighbourhood
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae
5,M1J,Scarborough Village
6,M1K,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,"Clairlea, Golden Mile, Oakridge"
8,M1M,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,"Birch Cliff, Cliffside West"


In [161]:
# Postcode -> borough lookup (still one row per neighbourhood entry).
pc_df_3 = pc_df.loc[:, ['Postcode', 'Borough']]


In [162]:
# Attach each postcode's borough to its joined neighbourhood string.
# Only the 'Postcode' and 'Neighbourhood' columns of pc_df_2 are merged in, so
# re-running this cell is safe: merging an already-merged pc_df_2 would
# otherwise yield 'Borough_x'/'Borough_y' columns and make the
# drop_duplicates call below fail with a KeyError on 'Borough'.
pc_df_2 = pd.merge(pc_df_3, pc_df_2[['Postcode', 'Neighbourhood']], on='Postcode', how='inner')
# pc_df_3 has one row per neighbourhood entry, so the merge repeats each
# postcode; keep the first occurrence of every distinct row.
pc_df_2 = pc_df_2.drop_duplicates(subset=['Postcode', 'Borough', 'Neighbourhood'], keep='first')
pc_df_2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,"Rouge, Malvern"
10,M3B,North York,Don Mills North
11,M4B,East York,"Woodbine Gardens, Parkview Hill"
13,M5B,Downtown Toronto,"Ryerson, Garden District"


In [163]:
# Postcodes whose neighbourhood is 'Not assigned' take their borough's name.
# Writing to the `row` objects yielded by iterrows() is not reliable (pandas
# documents that they may be copies, in which case the write has no effect),
# so the replacement is done with a boolean mask on a fresh copy instead.
unassigned = pc_df_2['Neighbourhood'] == 'Not assigned'
pc_df_2 = pc_df_2.copy()
pc_df_2.loc[unassigned, 'Neighbourhood'] = pc_df_2.loc[unassigned, 'Borough']


In [167]:
pc_df_2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park


In [166]:
pc_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### 1.2 Retrieving location coordinates

In [16]:
# Placeholder value only; the hidden cell below reassigns GCP_API_KEY.
GCP_API_KEY = 'THIS_IS_A_SECRET'

In [17]:
#@hidden_cell
# Never commit a real credential to the notebook: read the key from the
# GCP_API_KEY environment variable and fall back to the placeholder.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked/rotated.
GCP_API_KEY = os.environ.get('GCP_API_KEY', 'THIS_IS_A_SECRET')

In [18]:
# NOTE(review): latlong_data is never populated in the cells visible here; it
# is kept in case a later cell references it.
latlong_data = []

GEOCODE_MAX_ATTEMPTS = 3     # lookups per postcode before giving up
GEOCODE_RETRY_DELAY_S = 1.5  # pause between failed attempts, in seconds

# qry_result collects [postcode, latitude, longitude] for every successful
# lookup; failed_postcodes records the ones that never returned coordinates
# so that missing rows are visible instead of being dropped silently.
qry_result = []
failed_postcodes = []
for pc in pc_df_2['Postcode']:
    qry = f'{pc}, Toronto, Ontario'
    for attempt in range(GEOCODE_MAX_ATTEMPTS):
        r = geocoder.google(qry, key=GCP_API_KEY)
        if r.latlng:
            qry_result.append([pc, r.latlng[0], r.latlng[1]])
            break
        # No point in sleeping after the final attempt.
        if attempt < GEOCODE_MAX_ATTEMPTS - 1:
            time.sleep(GEOCODE_RETRY_DELAY_S)
    else:
        # Every attempt came back without coordinates.
        failed_postcodes.append(pc)

if failed_postcodes:
    print(f'No coordinates found for {len(failed_postcodes)} postcode(s): {failed_postcodes}')
    

In [19]:
## `latlongs` is a list of dictionaries, one per geocoded postcode, with the
## keys Postcode, Latitude and Longtitude; it is merged back into the
## postcode data later.
latlongs = [
    {'Postcode': row[0], 'Latitude': row[1], 'Longtitude': row[2]}
    for row in qry_result
]

In [20]:
len(latlongs), len(pc_data)

(103, 211)

In [21]:
latlongs[:5]

[{'Postcode': 'M3A', 'Latitude': 43.7532586, 'Longtitude': -79.3296565},
 {'Postcode': 'M4A', 'Latitude': 43.72588229999999, 'Longtitude': -79.3155716},
 {'Postcode': 'M5A', 'Latitude': 43.6542599, 'Longtitude': -79.36063589999999},
 {'Postcode': 'M6A', 'Latitude': 43.718518, 'Longtitude': -79.4647633},
 {'Postcode': 'M7A', 'Latitude': 43.6623015, 'Longtitude': -79.3894938}]

In [22]:
## Merging the pc_data and latlongs
# pc_data holds one record per neighbourhood while latlongs holds one record
# per postcode, so the two lists differ in length and order. Pairing them by
# position (zip) would overwrite 'Postcode' and attach the wrong coordinates,
# so the records are joined on their 'Postcode' value instead.
latlong_by_postcode = {entry['Postcode']: entry for entry in latlongs}
for record in pc_data:
    # Records whose postcode was not geocoded are left unchanged.
    record.update(latlong_by_postcode.get(record['Postcode'], {}))

In [23]:
# Tabular form of the geocoding results: one row per geocoded postcode.
df_latlongs = pd.DataFrame(data=latlongs)

In [24]:
df_latlongs

Unnamed: 0,Latitude,Longtitude,Postcode
0,43.753259,-79.329656,M3A
1,43.725882,-79.315572,M4A
2,43.654260,-79.360636,M5A
3,43.718518,-79.464763,M6A
4,43.662301,-79.389494,M7A
5,43.667856,-79.532242,M9A
6,43.806686,-79.194353,M1B
7,43.745906,-79.352188,M3B
8,43.706397,-79.309937,M4B
9,43.657162,-79.378937,M5B


In [168]:
# Left join: every row of the per-postcode table is kept, with NaN
# coordinates where no geocoding result exists for its postcode.
df = pc_df_2.merge(df_latlongs, how='left', on='Postcode')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longtitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [169]:
# Same left join against the per-neighbourhood table, so neighbourhoods that
# share a postcode get the same coordinates.
df2 = pc_df.merge(df_latlongs, how='left', on='Postcode')
df2

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longtitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Not assigned,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353
