## Notebook for Capstone Project - Battle of the Neighbourhoods

In [2]:
import pandas as pd
import numpy as np

Import the libraries needed for web scraping.

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

Create a function that reads the content of a link and parses it with BeautifulSoup.

In [4]:
def getHTMLContent(link):
    """Fetch a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    link : str
        URL of the page to download.

    Returns
    -------
    bs4.BeautifulSoup
        The document parsed with Python's built-in ``html.parser``.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be retrieved.
    """
    # The context manager closes the response as soon as parsing has finished,
    # instead of leaving it open until the object is garbage collected.
    with urlopen(link) as response:
        return BeautifulSoup(response, 'html.parser')

In [5]:
# Download the Wikipedia list of Canadian postal codes beginning with M.
content = getHTMLContent('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError on the next line.
postal_code_table = content.find("table", attrs={"class": "wikitable"})
if postal_code_table is None:
    raise ValueError("No table with class 'wikitable' was found on the page")

# Collect the table rows once and reuse them in the loop below.
postal_code_table_data = postal_code_table.tbody.find_all("tr")

table_data = []
for tr in postal_code_table_data:
    # Text of every <td> in the row, with newlines removed and whitespace stripped.
    t_row = [td.text.replace('\n', '').strip() for td in tr.find_all("td")]
    # Rows without any <td> cells produce an empty list and are skipped.
    if t_row:
        table_data.append(t_row)


### Convert the list to a DataFrame

In [6]:
# Build the frame from the scraped rows and name its three columns.
df = pd.DataFrame(
    data=table_data,
    columns=['Postal Code', 'Borough', 'Neighborhood'],
)
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Remove rows whose Borough is 'Not assigned'

In [7]:
# Keep only the rows whose Borough is not 'Not assigned', then renumber the
# index from 0 so it has no gaps.
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


After removing the 'Not assigned' boroughs, 103 rows remain.

In [8]:
# Show any rows whose Neighborhood is still 'Not assigned' (an empty result means none).
df[df['Neighborhood'].eq('Not assigned')]

Unnamed: 0,Postal Code,Borough,Neighborhood


No rows have 'Not assigned' in the Neighborhood column.

In [9]:
# Count the occurrences of each postal code, highest count first;
# a top count of 1 means no postal code is repeated.
postal_codes = df['Postal Code']
df_grouped = postal_codes.value_counts()
df_grouped.head(5)

M5A    1
M2L    1
M7Y    1
M4X    1
M4R    1
Name: Postal Code, dtype: int64

No two rows share the same postal code.

In [10]:
# Preview the first 12 rows of the cleaned frame.
df.iloc[:12]

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [11]:
# Display the (rows, columns) dimensions of the cleaned frame.
df.shape

(103, 3)

In [12]:
# Download the coordinates CSV with the standard library instead of shelling
# out to wget: the cell then also runs where wget is not installed, and a
# failed download raises instead of silently reaching the success message.
with urlopen('http://cocl.us/Geospatial_data') as response:
    csv_bytes = response.read()

# Write the file only after the download has succeeded.
with open('geospatial_data.csv', 'wb') as csv_file:
    csv_file.write(csv_bytes)

print('Data downloaded!')

Data downloaded!


In [13]:
# Load the coordinates file saved as 'geospatial_data.csv' and preview it.
geospatial_df = pd.read_csv(filepath_or_buffer='geospatial_data.csv')
geospatial_df.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Attach the coordinates to each postal code; the inner join keeps only the
# postal codes that appear in both frames.
final_df = df.merge(geospatial_df, on='Postal Code', how='inner')
final_df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


In [15]:
# Display the (rows, columns) dimensions of the merged frame.
final_df.shape

(103, 5)