In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 4000)

## Scrape table from Webpage

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipage= requests.get(url).text

# Parse html/xml codes from webpage.
soup = BeautifulSoup(wikipage,'xml')

# get the table in wikipage
table = soup.find('table')

listPostcode = []
listBorough = []
listNeighbourhood = []

for row in table.find_all('tr'):      
    cells = row.find_all('td')
    if cells:   # if cells has elements
        Postcode_var = cells[0].find(text = True)
        Borough_var = cells[1].find(text = True)
        Neighbourhood_var = cells[2].find(text = True).strip()    # .strip() to remove new line character '\n' at end of text.
    else:
        continue    # skip to next row if no elements
    

    # skip to next row if Borough = 'Not assigned'    
    if Borough_var == 'Not assigned': 
        continue
    
    # if Neighbourhood = 'Not assigned', then Neighborhood will be the same as Borough   
    if Neighbourhood_var == 'Not assigned': 
        Neighbourhood_var = Borough_var

    
    listPostcode.append(Postcode_var)
    listBorough.append(Borough_var)
    listNeighbourhood.append(Neighbourhood_var)

## Combine Neighbourhoods that share same Postcode

In [3]:
listUniqPostcode = set(listPostcode)
print(f'Number of unique Postcode: {len(listUniqPostcode)}')
print(f'Number of all Postcode: {len(listPostcode)}')
listNewPostcode = []
listNewBorough = []
listNewNeighbourhood = []


for postcode in listUniqPostcode:
    p_var = ''; b_var = ''; n_var = ''; 
    for idx, item in enumerate(listPostcode):
        if item == postcode:
            p_var = item;
            b_var = listBorough[idx]
            if n_var == '':    # if Neighbourhood hasn't got value due to new postcode
                n_var = listNeighbourhood[idx]
            else:     # if Neighbourhood already has value due to same postcode
                n_var = n_var + ', ' + listNeighbourhood[idx]
                
    listNewPostcode.append(p_var)
    listNewBorough.append(b_var)
    listNewNeighbourhood.append(n_var)

Number of unique Postcode: 103
Number of all Postcode: 210


## Create dataframe

In [4]:
dict = {'Postalcode':listNewPostcode, 'Borough':listNewBorough, 'Neighbourhood':listNewNeighbourhood}
df = pd.DataFrame.from_dict(dict)
df.to_csv('toronto_part1.csv')
print(df.head(10))
print()
print(df.shape)

  Postalcode           Borough                                      Neighbourhood
0        M2K        North York                                    Bayview Village
1        M4C         East York                                   Woodbine Heights
2        M4V   Central Toronto  Deer Park, Forest Hill SE, Rathnelly, South Hi...
3        M4B         East York                    Woodbine Gardens, Parkview Hill
4        M5R   Central Toronto                The Annex, North Midtown, Yorkville
5        M4S   Central Toronto                                         Davisville
6        M4T   Central Toronto                        Moore Park, Summerhill East
7        M4W  Downtown Toronto                                           Rosedale
8        M5A  Downtown Toronto                                       Harbourfront
9        M1B       Scarborough                                     Rouge, Malvern

(103, 3)


## Install geocoder and run module to load lat/long data.

In [5]:
#!pip install geocoder

import geocoder

listlat = []
listlong = []

for postcode in listUniqPostcode:
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while (lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
        lat_lng_coords = g.latlng
    
    print(postcode, 'Lat/Long done.')    
    listlat.append(lat_lng_coords[0])
    listlong.append(lat_lng_coords[1])

print(f'Number of Latitude records = {len(listlat)}')
print(f'Number of Longitude records = {len(listlong)}')

## The geocoder module took too long to load lat/long data. Use csv file instead.

In [6]:
########## Load lat/long csv ##########   
url = 'https://cocl.us/Geospatial_data'
dflatlong = pd.read_csv(url, index_col = None)
# print(dflatlong)


########## Merge original dataframe with lat/long dataframe ##########   
dfnew = pd.merge(df, dflatlong, left_on = "Postalcode", right_on = "Postal Code")
dfnew.drop('Postal Code', axis = 1, inplace = True)
print(dfnew)


    Postalcode           Borough                                      Neighbourhood   Latitude  Longitude
0          M2K        North York                                    Bayview Village  43.786947 -79.385975
1          M4C         East York                                   Woodbine Heights  43.695344 -79.318389
2          M4V   Central Toronto  Deer Park, Forest Hill SE, Rathnelly, South Hi...  43.686412 -79.400049
3          M4B         East York                    Woodbine Gardens, Parkview Hill  43.706397 -79.309937
4          M5R   Central Toronto                The Annex, North Midtown, Yorkville  43.672710 -79.405678
5          M4S   Central Toronto                                         Davisville  43.704324 -79.388790
6          M4T   Central Toronto                        Moore Park, Summerhill East  43.689574 -79.383160
7          M4W  Downtown Toronto                                           Rosedale  43.679563 -79.377529
8          M5A  Downtown Toronto              