In [3]:
#Load BeautifulSoup library
import requests
from bs4 import BeautifulSoup
url_to_scrape = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

__Tell requests to retreive the contents our page (it'll be grabbing__
__what you see when you use the View Source feature in your browser)__

In [4]:
r = requests.get(url_to_scrape)

# We now have the source of the page, let's ask BeaultifulSoup
# to parse it for us.
soup = BeautifulSoup(r.text,'lxml')
soup.title #Verify if we get the page or not

<title>List of postal codes of Canada: M - Wikipedia</title>

In [5]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [6]:
#Load the table into a data frame
data = []
table = soup.find('table', attrs={'class':'wikitable sortable'})

__Parse the table and get a list then data frame__

In [7]:
table_body = table.find('tbody')
table_rows = table.find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df=pd.DataFrame(l, columns=["PostalCode", "Borough","Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


In [8]:
#clean up \n
df['Neighborhood'] = df['Neighborhood'].str.replace('\n','')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


__Clean up the data based on the instructions__

In [9]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df1=df.loc[(df['Borough'] != 'Not assigned') & (df['PostalCode'].notnull())].reset_index(drop=True)
#df1.head()

#More than one neighborhood can exist in one postal code area. 
df1.set_index(['PostalCode', 'Borough'],inplace =True)
df2 = df1.groupby(level=['PostalCode','Borough'], sort=False).agg( ','.join)
df3=df2.reset_index()
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [10]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 
df3.loc[df3['Neighborhood'] == 'Not assigned', ['Neighborhood']] = df3['Borough']
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [11]:
df3.shape

(103, 3)

In [16]:
#Load zip data
df_zip = pd.read_csv('ZipCode.csv')
df_zip.rename(columns={"Postal Code": "PostalCode"},inplace=True)
df_zip.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
df5 = pd.merge(df3, df_zip, on='PostalCode')
df5.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
