# Toronto Neighborhood Web Scraping

## Import required libraries

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get the URL and create Beautiful Soup Object

In [2]:
# Source page for the scrape (Wikipedia list of Canadian postal codes starting with "M").
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page further down.
page = requests.get(URL, timeout=30)
page.raise_for_status()

In [3]:
# Build the parse tree from the raw response bytes using Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features='html.parser')

## Get the Table

In [4]:
# Collect every <table> element of the page, in document order.
table = soup.find_all(name='table')

In [5]:
# Number of <table> elements found on the page.
len(table)

3

In [6]:
# Parse the first <table> of the page into a DataFrame.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases; a file-like object is
# accepted by old and new versions alike.
table_raw = pd.read_html(StringIO(str(table[0])), flavor='bs4')[0]
table_raw

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7AQueen's Park(Ontario Provincial Government),M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned
5,M1HScarborough(Cedarbrae),M2HNorth York(Hillcrest Village),M3HNorth York(Bathurst Manor / Wilson Heights ...,M4HEast York(Thorncliffe Park),M5HDowntown Toronto(Richmond / Adelaide / King),M6HWest Toronto(Dufferin / Dovercourt Village),M7HNot assigned,M8HNot assigned,M9HNot assigned
6,M1JScarborough(Scarborough Village),M2JNorth York(Fairview / Henry Farm / Oriole),M3JNorth York(Northwood Park / York University),M4JEast YorkEast Toronto(The Danforth East),M5JDowntown Toronto(Harbourfront East / Union ...,M6JWest Toronto(Little Portugal / Trinity),M7JNot assigned,M8JNot assigned,M9JNot assigned
7,M1KScarborough(Kennedy Park / Ionview / East B...,M2KNorth York(Bayview Village),M3KNorth York(Downsview)East (CFB Toronto),M4KEast Toronto(The Danforth West / Riverdale),M5KDowntown Toronto(Toronto Dominion Centre / ...,M6KWest Toronto(Brockton / Parkdale Village / ...,M7KNot assigned,M8KNot assigned,M9KNot assigned
8,M1LScarborough(Golden Mile / Clairlea / Oakridge),M2LNorth York(York Mills / Silver Hills),M3LNorth York(Downsview)West,M4LEast Toronto(India Bazaar / The Beaches West),M5LDowntown Toronto(Commerce Court / Victoria ...,M6LNorth York(North Park / Maple Leaf Park / U...,M7LNot assigned,M8LNot assigned,M9LNorth York(Humber Summit)
9,M1MScarborough(Cliffside / Cliffcrest / Scarbo...,M2MNorth York(Willowdale / Newtonbrook),M3MNorth York(Downsview)Central,M4MEast Toronto(Studio District),M5MNorth York(Bedford Park / Lawrence Manor East),M6MYork(Del Ray / Mount Dennis / Keelsdale and...,M7MNot assigned,M8MNot assigned,M9MNorth York(Humberlea / Emery)


## Testing of Preprocessing for One Column

Because the table from the Wikipedia page consists of multiple rows and columns that all need to be processed,
\
in this case we first try to extract the information from the first column only, which holds the M1... codes

In [7]:
# Work on the first column only (label 0 of the raw grid).
# Cut each cell at its first '(' : the left part keeps postal code + borough,
# the right part keeps the neighborhood text (still carrying its ')').
M1_raw = table_raw[0].str.split(pat='(', n=1, expand=True)
M1_raw

Unnamed: 0,0,1
0,M1ANot assigned,
1,M1BScarborough,Malvern / Rouge)
2,M1CScarborough,Rouge Hill / Port Union / Highland Creek)
3,M1EScarborough,Guildwood / Morningside / West Hill)
4,M1GScarborough,Woburn)
5,M1HScarborough,Cedarbrae)
6,M1JScarborough,Scarborough Village)
7,M1KScarborough,Kennedy Park / Ionview / East Birchmount Park)
8,M1LScarborough,Golden Mile / Clairlea / Oakridge)
9,M1MScarborough,Cliffside / Cliffcrest / Scarborough Village W...


In [8]:
# PostalCode = the first 3 characters of the first column
M1_raw[2] = M1_raw[0].str.slice(stop=3)

In [9]:
# Borough = everything from the 4th character onward in the first column
M1_raw[3] = M1_raw[0].str[3:]

In [10]:
# Strip the ')' characters from the neighborhood column (second column).
# regex=False makes ')' a literal: read as a regular expression, a lone ')' is
# an unbalanced parenthesis, which pandas >= 2.0 rejects with re.error.
M1_raw[1] = M1_raw[1].str.replace(')','',regex=False)

In [11]:
# Duplicate the cleaned neighborhood text (column 1) into a new column 4, so
# the three output fields sit side by side in columns 2, 3 and 4.
M1_raw[4] = M1_raw[1]

In [12]:
# Show the working frame: columns 0-1 are the raw split, columns 2-4 the extracted fields.
M1_raw

Unnamed: 0,0,1,2,3,4
0,M1ANot assigned,,M1A,Not assigned,
1,M1BScarborough,Malvern / Rouge,M1B,Scarborough,Malvern / Rouge
2,M1CScarborough,Rouge Hill / Port Union / Highland Creek,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
3,M1EScarborough,Guildwood / Morningside / West Hill,M1E,Scarborough,Guildwood / Morningside / West Hill
4,M1GScarborough,Woburn,M1G,Scarborough,Woburn
5,M1HScarborough,Cedarbrae,M1H,Scarborough,Cedarbrae
6,M1JScarborough,Scarborough Village,M1J,Scarborough,Scarborough Village
7,M1KScarborough,Kennedy Park / Ionview / East Birchmount Park,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
8,M1LScarborough,Golden Mile / Clairlea / Oakridge,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
9,M1MScarborough,Cliffside / Cliffcrest / Scarborough Village West,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West


In [13]:
# Keep only the extracted fields: discard the two raw split columns (labels 0 and 1).
M1_test = M1_raw.drop(columns=[0, 1])
M1_test

Unnamed: 0,2,3,4
0,M1A,Not assigned,
1,M1B,Scarborough,Malvern / Rouge
2,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
3,M1E,Scarborough,Guildwood / Morningside / West Hill
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,Kennedy Park / Ionview / East Birchmount Park
8,M1L,Scarborough,Golden Mile / Clairlea / Oakridge
9,M1M,Scarborough,Cliffside / Cliffcrest / Scarborough Village West


### The pre-processing for the first column seems to work properly, so we can use the same algorithm to pre-process all of the columns

## Preprocessing for all columns

In [14]:
# Apply the single-column recipe to every column of the raw grid and stack the
# results into one long frame with columns 2 (postal code), 3 (borough) and
# 4 (neighborhood).
# The per-column frames are gathered in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.
Post_frames = []

for col in table_raw:
    # Cut each cell at its first '(' -> [code + borough, neighborhood + ')']
    Post_per_row = table_raw[col].str.split('(',n = 1, expand = True)
    Post_per_row[2] = Post_per_row[0].str[0:3]
    Post_per_row[3] = Post_per_row[0].str.slice(start=3)
    # regex=False: ')' is a literal character here; as a regular expression it
    # is an unbalanced parenthesis and raises re.error on pandas >= 2.0.
    Post_per_row[1] = Post_per_row[1].str.replace(')','',regex=False)
    Post_per_row[4] = Post_per_row[1]
    Post_per_row = Post_per_row.drop([0,1], axis = 1)
    Post_frames.append(Post_per_row)

# Row labels are kept as they are, so each source column contributes its own
# run of labels. An empty grid still yields an empty DataFrame, as before.
Tor_Post = pd.concat(Post_frames) if Post_frames else pd.DataFrame()

In [15]:
# Show the stacked result; row labels still restart for every source column at this point.
Tor_Post

Unnamed: 0,2,3,4
0,M1A,Not assigned,
1,M1B,Scarborough,Malvern / Rouge
2,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
3,M1E,Scarborough,Guildwood / Morningside / West Hill
4,M1G,Scarborough,Woburn
...,...,...,...
15,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...
16,M9W,EtobicokeNorthwest,Clairville / Humberwood / Woodbine Downs / Wes...
17,M9X,Not assigned,
18,M9Y,Not assigned,


In [16]:
# The stacked per-column frames each carry their own row labels, so labels
# repeat; replace them with one fresh 0..n-1 index (old labels are discarded).
Tor_Post =  Tor_Post.reset_index(drop=True)

## Checking for the Not assigned borough

In [17]:
# Rows whose borough (column 3) is exactly the placeholder text 'Not assigned'.
Tor_Post.loc[Tor_Post[3].eq('Not assigned')]

Unnamed: 0,2,3,4
0,M1A,Not assigned,
18,M1Y,Not assigned,
19,M1Z,Not assigned,
20,M2A,Not assigned,
21,M2B,Not assigned,
...,...,...,...
173,M9S,Not assigned,
174,M9T,Not assigned,
177,M9X,Not assigned,
178,M9Y,Not assigned,


It seems that there are 77 postal codes whose borough is Not assigned

## Drop the Not assigned borough

In [18]:
# Drop, by row label, every row whose borough (column 3) is 'Not assigned'.
Tor_Post_assigned = Tor_Post.drop(index=Tor_Post.index[Tor_Post[3].eq('Not assigned')])

In [19]:
# Show the remaining rows; the row labels now have gaps where rows were dropped.
Tor_Post_assigned

Unnamed: 0,2,3,4
1,M1B,Scarborough,Malvern / Rouge
2,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
3,M1E,Scarborough,Guildwood / Morningside / West Hill
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
...,...,...,...
170,M9N,York,Weston
171,M9P,Etobicoke,Westmount
172,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
175,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


## Check for the Not assigned Neighborhood

In [20]:
# A neighborhood (column 4) counts as unassigned when it is literally
# 'Not assigned' OR when it is missing altogether: a cell without a '(' leaves
# this column empty after the split, so an equality test alone can never flag it.
Tor_Post_assigned[Tor_Post_assigned[4].isna() | (Tor_Post_assigned[4] == 'Not assigned')]

Unnamed: 0,2,3,4


It seems that there is no Not assigned Neighborhood among the rows with an assigned Borough

## Create the clean DataFrame and reset the index

In [21]:
# Final table: give the three positional columns their real names and
# renumber the rows from 0 (the gapped labels left by the drop are discarded).
Toronto_PostalCode = (
    Tor_Post_assigned
    .rename(columns={2: 'PostalCode', 3: 'Borough', 4: 'Neighborhood'})
    .reset_index(drop=True)
)

In [22]:
# Show the cleaned table with its named columns.
Toronto_PostalCode

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


In [23]:
# (number of rows, number of columns) of the cleaned table.
Toronto_PostalCode.shape

(103, 3)