In [118]:
# import required libraries

import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes
import requests # 'requests' offers the most friendly API for opening files, including JSON support

In [119]:
# import the BeautifulSoup library so we can parse HTML and XML documents

from bs4 import BeautifulSoup

# specify which URL/web page we are going to be scraping

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# open the url and put the html into the soup variable

response = requests.get(url)
html = response.text
soup = BeautifulSoup(html,'html.parser')

In [120]:
# load the values in each row into the fields A, B and C

all_table=soup.find('table', class_='wikitable sortable')

A=[]
B=[]
C=[]

for row in all_table.findAll('tr'):
    
    cells=row.findAll('td')
    
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

**The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood**

In [134]:
dfToronto = pd.DataFrame(A,columns=['PostalCode'])
dfToronto['Borough']=B
dfToronto['Neighbourhood']=C

# had to clean the data by removing \n that has appeared at the end of each text string in each column ... don't know why?!

dfToronto['PostalCode'] = dfToronto['PostalCode'].str.rstrip('\n')
dfToronto['Borough'] = dfToronto['Borough'].str.rstrip('\n')
dfToronto['Neighbourhood'] = dfToronto['Neighbourhood'].str.rstrip('\n')
dfToronto

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


**Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.**

In [135]:
dfToronto['Borough'].replace('Not assigned', np.nan, inplace=True)
dfToronto.dropna(subset=['Borough'], inplace=True)
dfToronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


**Combine rows so that each row has a unique postal code.  Seperate multiple neighbourhoods with a comma**

In [136]:
dfToronto.groupby('PostalCode').agg({'Borough':'first','Neighbourhood': ', '.join}).reset_index()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


**If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.**

In [137]:
# print(df.loc[df['Neighbourhood'] == 'Not assigned']) ... this shows that there are no 'Not assigned' values in Neighborhood

# But in case there were, this function replaces the neighbourhood not assigned value with the bourough value

dfToronto['Neighbourhood']=dfToronto['Borough'].where(dfToronto['Neighbourhood'].eq('Not assigned'),dfToronto['Neighbourhood'])
dfToronto.head()


Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


**In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.**

In [138]:
dfToronto.shape

(103, 3)

**Load the latitude and longitude coordinates from the csv file downloaded to local machine**

In [139]:
dfGeoloc = pd.read_csv('/Users/leostepan/AnacondaProjects/Geospatial_Coordinates.csv')

# rename the postal code field in dfGeoloc to match dfToronto and enable the merge

dfGeoloc.rename(columns={'Postal Code': 'PostalCode'}, inplace=True) 
dfGeoloc

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


**Merge the two dataframes on the PostalCode field**

In [140]:
df_merge = pd.merge(dfToronto, dfGeoloc, on='PostalCode')
df_merge.head(12)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
