# 0. Installing the pre-requisted libs

In [1]:
!pip install beautifulsoup4 requests pandas geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 13.2MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


# 1. Data scrape

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import geocoder
import time
import json

### 1.1 Scraping from wikipedia

In [5]:
def link_or_text(elem):
    link = elem.select_one('a')
    if link:
        return link.text.strip()
    return elem.text.strip()

In [41]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
pc_data = []

html = requests.get(URL).content
soup = BeautifulSoup(html, 'html.parser')
tbody = soup.select_one('table.wikitable>tbody')

for tr in tbody.find_all('tr'):
    tds = tr.find_all('td')
    if len(tds) == 3 :
        td_texts = [link_or_text(td) for td in tds]
        item = dict(zip(['Postcode', 'Borough', 'Neighbourhood'], td_texts))
        pc_data.append(item) 

In [42]:
pc_data[:3]

[{'Postcode': 'M1A',
  'Borough': 'Not assigned',
  'Neighbourhood': 'Not assigned'},
 {'Postcode': 'M2A',
  'Borough': 'Not assigned',
  'Neighbourhood': 'Not assigned'},
 {'Postcode': 'M3A', 'Borough': 'North York', 'Neighbourhood': 'Parkwoods'}]

In [43]:
pc_df = pd.DataFrame(pc_data)

In [44]:
pc_df.head()

Unnamed: 0,Borough,Neighbourhood,Postcode
0,Not assigned,Not assigned,M1A
1,Not assigned,Not assigned,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Harbourfront,M5A


In [15]:
pc_df.tail()

Unnamed: 0,borough,neighbourhood,postcode
283,Etobicoke,Mimico NW,M8Z
284,Etobicoke,The Queensway West,M8Z
285,Etobicoke,Royal York South West,M8Z
286,Etobicoke,South of Bloor,M8Z
287,Not assigned,Not assigned,M9Z


### 1.2 Retriving location coordinates

In [51]:
GCP_API_KEY = 'THIS_IS_A_SECRET'

In [21]:
latlong_data = []

qry_result = []
for item in pc_data:
    pc = item.get('postcode')
    qry = f'{pc}, Toronto, Ontario'
    for i in range(3):
        r = geocoder.google(qry, key=GCP_API_KEY)
        if r.latlng:
            qry_result.append(r)
            break
        else:
            time.sleep(1.5)
    

In [45]:
## `latlongs` is just a list of dictionary with keys Latitude and Longtitude 
## and we will merge it back to the pc_data later
latlongs = [dict([('Latitude', r.latlng[0]), ('Longtitude', r.latlng[1])])  for r in  qry_result]

In [52]:
latlongs[:5]

[{'Latitude': 43.653226, 'Longtitude': -79.3831843},
 {'Latitude': 43.653226, 'Longtitude': -79.3831843},
 {'Latitude': 43.7532586, 'Longtitude': -79.3296565},
 {'Latitude': 43.72588229999999, 'Longtitude': -79.3155716},
 {'Latitude': 43.6542599, 'Longtitude': -79.36063589999999}]

In [53]:
## Merging the pc_data and latlongs
[d.update(d_latlng) for (d, d_latlng) in zip(pc_data, latlongs)]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [55]:
pc_data[:3]

[{'Postcode': 'M1A',
  'Borough': 'Not assigned',
  'Neighbourhood': 'Not assigned',
  'Latitude': 43.653226,
  'Longtitude': -79.3831843},
 {'Postcode': 'M2A',
  'Borough': 'Not assigned',
  'Neighbourhood': 'Not assigned',
  'Latitude': 43.653226,
  'Longtitude': -79.3831843},
 {'Postcode': 'M3A',
  'Borough': 'North York',
  'Neighbourhood': 'Parkwoods',
  'Latitude': 43.7532586,
  'Longtitude': -79.3296565}]

In [56]:
df_pc2 = pd.DataFrame(pc_data)

In [57]:
df_pc2.head()

Unnamed: 0,Borough,Latitude,Longtitude,Neighbourhood,Postcode
0,Not assigned,43.653226,-79.383184,Not assigned,M1A
1,Not assigned,43.653226,-79.383184,Not assigned,M2A
2,North York,43.753259,-79.329656,Parkwoods,M3A
3,North York,43.725882,-79.315572,Victoria Village,M4A
4,Downtown Toronto,43.65426,-79.360636,Harbourfront,M5A


In [58]:
df_pc2.tail()

Unnamed: 0,Borough,Latitude,Longtitude,Neighbourhood,Postcode
283,Etobicoke,43.628841,-79.520999,Mimico NW,M8Z
284,Etobicoke,43.628841,-79.520999,The Queensway West,M8Z
285,Etobicoke,43.628841,-79.520999,Royal York South West,M8Z
286,Etobicoke,43.653226,-79.383184,South of Bloor,M8Z
287,Not assigned,,,Not assigned,M9Z
