# Dependencies Download

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans



print('Libraries imported.')

Libraries imported.


# Dataset Scraping

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [3]:
# scraping
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html_doc = urlopen(url)

soup = BeautifulSoup(html_doc, 'lxml')


In [4]:
# get the data from the table and upload it to kanj_data
table = soup.find("table", class_="wikitable")

kanj_data = []
for items in table.find_all("tr")[:-1]:
    data = [' '.join(item.text.split()) for item in items.find_all(['th','td'])]
    
    kanj_data.append(data)
    
kanj_data

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', "Queen's Park", 'Not assigned'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B',

In [5]:
# naming columns
df = pd.DataFrame(kanj_data, columns=['Postcode', 'Borough', 'Neighbourhoud'])
df.drop(0, axis=0, inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhoud
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [6]:
#droping rows with Not assigned values in Borough
indexNames = df[ (df['Borough'] == 'Not assigned')].index
df.drop(indexNames , inplace=True)
df = df.reset_index(drop=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhoud
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [7]:
# Replacing Not assigned cells with their values in Borough
df['Neighbourhoud']=df['Borough'].where(df['Neighbourhoud'].eq('Not assigned'),df['Neighbourhoud'])
df

Unnamed: 0,Postcode,Borough,Neighbourhoud
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [8]:
#creating a new dataframe with no duplicates in Postcode
df_stack = df.groupby('Postcode')['Neighbourhoud'].apply(', '.join)
df_stack = df_stack.reset_index()

df_stack.head()

Unnamed: 0,Postcode,Neighbourhoud
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [9]:
#Merging df and df_stack
df_clean = pd.merge(df_stack, df, on='Postcode', how='right')

df_clean = df_clean.drop_duplicates()
df_clean

Unnamed: 0,Postcode,Neighbourhoud_x,Borough,Neighbourhoud_y
0,M1B,"Rouge, Malvern",Scarborough,Rouge
1,M1B,"Rouge, Malvern",Scarborough,Malvern
2,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,Highland Creek
3,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,Rouge Hill
4,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,Port Union
5,M1E,"Guildwood, Morningside, West Hill",Scarborough,Guildwood
6,M1E,"Guildwood, Morningside, West Hill",Scarborough,Morningside
7,M1E,"Guildwood, Morningside, West Hill",Scarborough,West Hill
8,M1G,Woburn,Scarborough,Woburn
9,M1H,Cedarbrae,Scarborough,Cedarbrae


In [10]:
#Deleting Neighbourhoud_y column
df_clean=df_clean.drop('Neighbourhoud_y',axis=1)
df_clean

Unnamed: 0,Postcode,Neighbourhoud_x,Borough
0,M1B,"Rouge, Malvern",Scarborough
1,M1B,"Rouge, Malvern",Scarborough
2,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
3,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
4,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough
5,M1E,"Guildwood, Morningside, West Hill",Scarborough
6,M1E,"Guildwood, Morningside, West Hill",Scarborough
7,M1E,"Guildwood, Morningside, West Hill",Scarborough
8,M1G,Woburn,Scarborough
9,M1H,Cedarbrae,Scarborough


In [11]:
#Rearrangements and dropping duplicates
cols=['Postcode', 'Borough', 'Neighbourhoud_x']
df_clean=df_clean[cols]
df_clean = df_clean.rename(columns={"Neighbourhoud_x": "Neighbourhoud"}).reset_index(drop=True)
df_clean = df_clean[['Postcode', 'Borough', 'Neighbourhoud']].drop_duplicates()
df_clean

Unnamed: 0,Postcode,Borough,Neighbourhoud
0,M1B,Scarborough,"Rouge, Malvern"
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
5,M1E,Scarborough,"Guildwood, Morningside, West Hill"
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae
10,M1J,Scarborough,Scarborough Village
11,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
14,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
17,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
20,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [12]:
df_clean.shape

(103, 3)