In [405]:
import numpy as np
import pandas as pd
import bs4
import folium
import requests
import geopy
import urllib3

Grabbing URL with urllib and scraping the html with Beautiful Soup

In [406]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
urlLib = urllib3.PoolManager()
r = urlLib.request("GET", url)

soup = bs4.BeautifulSoup(r.data)



Running through the first table and turning it into a pandas dataframe

In [407]:
table = soup.findChildren("table")[0]
rows = []
for row in table.find_all("tr"):
        temp = []
        elems = row.find_all("td")
        for elem in elems:
            if elems[-1] == elem:
                text = elem.text
                text = text[:-1]
                temp.append(text)
            else:
                temp.append(elem.text)
        rows.append(temp)
    
df = pd.DataFrame.from_records(rows[1:])
headers = [head.text for head in table.find_all("th")]
headers[-1] = headers[-1][:-1]
df.columns = headers

print(df.shape)
df.head()

(288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Dropping all the "Not assigned" boroughs and assign the borough names for neighbourhoods that lack a name

In [408]:
mask = df["Borough"] != "Not assigned"
df = df[mask]
print(df.shape)
df.head()

row = df[df["Neighbourhood"] == "Not assigned"]
df.at[row.index, "Neighbourhood"] = row.Borough.values[0]

(211, 3)


Gather all the unique postal code values

In [409]:
code_list = df["Postcode"].unique()
print(code_list)

['M3A' 'M4A' 'M5A' 'M6A' 'M7A' 'M9A' 'M1B' 'M3B' 'M4B' 'M5B' 'M6B' 'M9B'
 'M1C' 'M3C' 'M4C' 'M5C' 'M6C' 'M9C' 'M1E' 'M4E' 'M5E' 'M6E' 'M1G' 'M4G'
 'M5G' 'M6G' 'M1H' 'M2H' 'M3H' 'M4H' 'M5H' 'M6H' 'M1J' 'M2J' 'M3J' 'M4J'
 'M5J' 'M6J' 'M1K' 'M2K' 'M3K' 'M4K' 'M5K' 'M6K' 'M1L' 'M2L' 'M3L' 'M4L'
 'M5L' 'M6L' 'M9L' 'M1M' 'M2M' 'M3M' 'M4M' 'M5M' 'M6M' 'M9M' 'M1N' 'M2N'
 'M3N' 'M4N' 'M5N' 'M6N' 'M9N' 'M1P' 'M2P' 'M4P' 'M5P' 'M6P' 'M9P' 'M1R'
 'M2R' 'M4R' 'M5R' 'M6R' 'M7R' 'M9R' 'M1S' 'M4S' 'M5S' 'M6S' 'M1T' 'M4T'
 'M5T' 'M1V' 'M4V' 'M5V' 'M8V' 'M9V' 'M1W' 'M4W' 'M5W' 'M8W' 'M9W' 'M1X'
 'M4X' 'M5X' 'M8X' 'M4Y' 'M7Y' 'M8Y' 'M8Z']


Drop all duplicates and assign new neighbourhood column with the join neighbourhood data 

In [410]:
df_dropped = df.drop_duplicates(["Postcode"], keep = "first")
df_dropped.drop("Neighbourhood", axis = 1, inplace = True)
print("Number of Unique Postal Codes: {}".format(df_dropped.shape[0]))

neighbourhoods = []

for code in code_list:
    hoods = df["Neighbourhood"][df["Postcode"] == code].values
    neighbourhoods.append(",".join(hoods))
df_dropped["Neighbourhood"] = neighbourhoods
df_dropped.reset_index(drop = True, inplace = True)
df_dropped.head()

Number of Unique Postal Codes: 103


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


Printing shape of the dataset

In [411]:
df = df_dropped
print(df.shape)

# # df.drop(["Latitude", "Longitude"], axis = 1, inplace = True)
# df.head()

(103, 3)


In [412]:
# Tried to run geocoder, loop ran for 30 seconds with no output on a single postal code. Decided to simply use the csv provided

# import geocoder

# coords = None

# while coords is None:
#     g = geocoder.google("M2K, Toronto, Ontario")
#     coords = g.latlng
# print(coords)

Using join method to create a new dataframe of the existing data along with the latitude and longitude data.

In [413]:
latLon = pd.read_csv("Geospatial_Coordinates.csv")
latLon.head()

latLon.sort_values(by = "Postal Code", inplace = True)
df.sort_values(by = "Postcode", inplace = True)

df = df.join(latLon.set_index("Postal Code"), on = "Postcode")
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
12,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
18,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
22,M1G,Scarborough,Woburn,43.770992,-79.216917
26,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
