IBM Applied Data Science Capstone Project's Notebook by Kurniayazid

In [0]:
# importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import urllib.request

In [0]:
# Targeted website's source
# An explicit timeout keeps a stalled connection from hanging the notebook,
# and raise_for_status() stops on HTTP errors instead of parsing an error page.
response = requests.get('https://en.m.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

In [0]:
# Parse the downloaded HTML into a navigable tree using the lxml parser
soup = BeautifulSoup(markup=source, features='lxml')

In [0]:
# Collect every <table> element in the parsed page.
# NOTE(review): `table` is not referenced by the other visible cells -- confirm whether it is still needed.
table = soup.find_all('table')

In [0]:
# Finding targeted table
target_table = soup.find('table', class_='wikitable sortable')

# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError when the rows are read later.
if target_table is None:
  raise ValueError("No table with class 'wikitable sortable' was found in the page")

In [0]:
# Walk the table rows and collect each column's value into its own list
post = []
borough = []
neighborhood = []

# find_all / string= are the current Beautiful Soup names for the deprecated
# findAll / text= spellings; the values extracted are the same.
for row in target_table.find_all('tr'):
  cells = row.find_all('td')
  # Only rows with exactly three <td> cells are kept; any other row is skipped.
  if len(cells) == 3:
    # find(string=True) returns the first text node inside the cell, so a
    # trailing '\n' may still be attached to the value.
    post.append(cells[0].find(string=True))
    borough.append(cells[1].find(string=True))
    neighborhood.append(cells[2].find(string=True))

In [7]:
# Assemble the three scraped lists into one dataframe, one column per list
df = pd.DataFrame({
  'postcode': post,
  'borough': borough,
  'neighborhood': neighborhood,
})
df.head(12)

Unnamed: 0,postcode,borough,neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned\n
9,M9A,Etobicoke,Islington Avenue


In [9]:
# Adjust neighborhood if postcode is M5A
# Assign through a single .loc call on the frame itself: the chained form
# (df.neighborhood.loc[mask] = ...) can write to a temporary object and is
# not guaranteed to update df.
df.loc[df['postcode'] == 'M5A', 'neighborhood'] = 'Regent Park, Harbourfront'
df[df['postcode'] == 'M5A']

Unnamed: 0,postcode,borough,neighborhood
4,M5A,Downtown Toronto,"Regent Park, Harbourforont"


In [0]:
# Drop if borough is 'Not assigned'
# .copy() makes the filtered result an independent frame, so assigning to its
# columns afterwards does not raise SettingWithCopyWarning.
df = df[df['borough'] != 'Not assigned'].copy()

In [0]:
# Delete every newline character from the neighborhood and borough columns
for column in ('neighborhood', 'borough'):
  df[column] = df[column].str.replace('\n', '')

In [19]:
# Shape of df as a (rows, columns) tuple
df.shape

(210, 3)

In [20]:
# Preview the first 12 rows of df
df.head(12)

Unnamed: 0,postcode,borough,neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourforont"
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [22]:
# Import latitude and longitude data for the respective postal codes
# (read as CSV directly from the URL), then preview the first rows.
geocode = pd.read_csv('http://cocl.us/Geospatial_data')
geocode.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
# Rename the join key ('Postal Code' -> 'postcode') so it matches the column name used in df.
# Rebinding the result instead of using inplace=True avoids mutating the loaded frame in place.
geocode = geocode.rename(columns={'Postal Code': 'postcode'})

In [0]:
# Inner-join the coordinates onto df by postcode; rows whose postcode is
# missing from either frame are dropped.
dg = df.merge(geocode, how='inner', on='postcode')

In [25]:
# Summary statistics for every column of the merged data
# (include='all' also summarises the non-numeric columns)
dg.describe(include='all')

Unnamed: 0,postcode,borough,neighborhood,Latitude,Longitude
count,210,210,210,210.0,210.0
unique,103,10,208,,
top,M9V,Etobicoke,Runnymede,,
freq,8,45,2,,
mean,,,,43.6976,-79.409866
std,,,,0.05408,0.103862
min,,,,43.602414,-79.615819
25%,,,,43.650943,-79.494292
50%,,,,43.6893,-79.400049
75%,,,,43.739416,-79.34726


In [26]:
# Preview the first 12 rows of the merged data
dg.head(12)

Unnamed: 0,postcode,borough,neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourforont",43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
6,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
7,M1B,Scarborough,Rouge,43.806686,-79.194353
8,M1B,Scarborough,Malvern,43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188
