Importing BeautifulSoup and extracting the tables from the Wikipedia page

In [5]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on network problems: bound the wait with a timeout and raise on
# any error status instead of parsing an error page as if it were the article.
req = requests.get(url, timeout=30)
req.raise_for_status()

soup = BeautifulSoup(req.content, 'lxml')

# A list value for "class" matches tables carrying ANY of these CSS classes;
# the next cell picks the wanted table by its column headings.
table_classes = {"class": ["sortable", "plainrowheaders"]}
# find_all is the current method name (findAll is the legacy alias) and is
# what the rest of this notebook already uses.
wikitables = soup.find_all("table", table_classes)

Extracting the table and writing it into a file

In [6]:
# Locate the postal-code table by its first three column headings.
for table in wikitables:
    headings = [th.text.strip() for th in table.find_all('th')]
    if headings[:3] == ['Postcode', 'Borough', "Neighbourhood"]:
        break
else:
    # Without this guard the loop would fall through and the code below would
    # silently export whichever table came last (or hit a NameError when no
    # tables were found at all).
    raise ValueError("No table with Postcode/Borough/Neighbourhood headings was found")

# Write one 'PostalCode;Borough;Neighborhood' line per data row.
# The encoding is explicit so the file does not depend on the platform default.
with open('postal_codes_of_canada.txt', 'w', encoding='utf-8') as fo:
    for tr in table.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')[:3]]
        if len(cells) < 3:
            # Header rows contain no <td>; malformed short rows are skipped too
            # rather than crashing the three-way split.
            continue
        # Join with a bare ';' (no trailing blank) so that reading the file
        # back with sep=';' does not leave a leading space in each value.
        print(';'.join(cells), file=fo)

Reading the semicolon-delimited text file into the dataframe df

In [7]:
import pandas as pd

# The file has no header row, so the column names are supplied explicitly.
# skipinitialspace=True drops any blank that follows a ';' delimiter;
# without it such Borough and Neighborhood values would start with a space.
df = pd.read_csv(
    'postal_codes_of_canada.txt',
    sep=';',
    header=None,
    names=['PostalCode', 'Borough', 'Neighborhood'],
    skipinitialspace=True,
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Removing the cells with a borough that is 'Not assigned'.

In [8]:
import re
# True for rows with a real borough, False for the 'Not assigned' placeholder.
# The vectorised .str.contains replaces a per-row re.search loop; na=True makes
# a missing Borough count as "not assigned" (so it is dropped) instead of
# raising a TypeError.
is_assigned = ~df['Borough'].str.contains('Not assigned', regex=False, na=True)

# Plain list of the flags, kept for the quick sanity print below.
booleans = is_assigned.tolist()
print(booleans[0:5])
print(len(booleans))

# The mask shares df's index, so the selection cannot be misaligned.
Filtered = is_assigned

can_data = df[Filtered].reset_index(drop = True)

can_data.head(5)

[False, False, True, True, True]
180


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Combining the rows that share the same PostalCode and joining their neighborhoods into one comma-separated string.

In [9]:
# Build one comma-joined Neighborhood string per PostalCode.
new_cd = (
    can_data.astype(str)
    .groupby('PostalCode')['Neighborhood']
    .agg(','.join)
    .reset_index()
)

# Attach the joined string to every original row. The merge suffixes the
# original column as Neighborhood_x and the joined one as Neighborhood_y.
merged_cd = can_data.merge(new_cd, how='inner', on=['PostalCode'])
duplicate_cd = merged_cd.drop(columns=['Neighborhood_x'])

# Keep the first row of each PostalCode, renumber the index, and restore the
# plain column names.
final_cd = duplicate_cd.drop_duplicates(subset='PostalCode').reset_index(drop=True)
final_cd.columns = ['PostalCode', 'Borough', 'Neighborhood']

Replacing 'Not assigned' values in the 'Neighborhood' column with the value from the 'Borough' column

In [10]:
import numpy as np
# A Neighborhood that still reads 'Not assigned' falls back to its Borough.
# mask() blanks the flagged entries and fillna() substitutes the Borough from
# the same row. This replaces the manual re.search loop, the unused `Filter`
# series, and the drop/re-add of the column, while keeping the column order.
not_assigned = final_cd['Neighborhood'].str.contains('Not assigned', regex=False, na=False)
final_cd['Neighborhood'] = (
    final_cd['Neighborhood'].mask(not_assigned).fillna(final_cd['Borough'])
)
final_cd.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Number of rows in the dataframe

In [11]:
# (rows, columns) of the cleaned table; rows were de-duplicated on PostalCode
# earlier, so the first number is the count of distinct postal codes.
final_cd.shape

(103, 3)

Importing the geographical coordinates dataframe

In [12]:
# Remote CSV holding the coordinates for each postal code.
path = "http://cocl.us/Geospatial_data"
coordinate_columns = ['PostalCode', 'Latitude', 'Longitude']

# Load the file, then overwrite its header so the key column is named
# PostalCode like in the other dataframes.
ll_df = pd.read_csv(path)
ll_df.columns = coordinate_columns
ll_df.head()



Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the geographical coordinates with the postal code dataframe

In [13]:
# Inner join on PostalCode: only codes present in both frames are kept, each
# gaining its Latitude and Longitude.
ll_merged_cd = final_cd.merge(ll_df, how='inner', on=['PostalCode'])
ll_merged_cd.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [2]:
import numpy as np 


# Marker output confirming that this import cell ran to completion.
print('Libraries imported.')

Libraries imported.


Selecting Boroughs with 'Toronto' in their name

In [14]:
# Keep only the rows whose Borough name mentions "Toronto"
# (e.g. Downtown Toronto, East Toronto).
in_toronto = ll_merged_cd['Borough'].str.contains("Toronto")
toronto_data = ll_merged_cd.loc[in_toronto]
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


Retrieving coordinates for Toronto.

In [15]:
from geopy.geocoders import Nominatim
address = 'Toronto, Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; report that clearly
# instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6534817, -79.3839347.


Initial map for Toronto

In [None]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

# Centre the base map on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one blue circle per row of toronto_data; its popup shows the
# neighborhood name.
marker_rows = zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Neighborhood'],
)
for lat, lng, neighborhood in marker_rows:
    popup = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto