In [1]:
!pip install beautifulsoup4



### Read Toronto neighborhood table from website using BeautifulSoup

In [6]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. Use a timeout so the cell cannot hang forever, and fail
# immediately on an HTTP error instead of parsing an error page as if it were
# the article.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parse the HTML document
soup = BeautifulSoup(page.content, 'html.parser')

# Collect every <table> element on the page
table = soup.find_all('table')
if not table:
    raise ValueError('No <table> element found at ' + URL)

# Convert the tables to DataFrames and keep the first one.
# The markup is handed to pandas through a file-like object because passing a
# literal HTML string to read_html is deprecated in newer pandas releases;
# joining the elements also avoids relying on the "[...]" repr of a ResultSet.
tables_html = ''.join(str(t) for t in table)
df_nbrhood = pd.read_html(StringIO(tables_html))[0]


In [7]:
# Replace the scraped header with the column labels used in the rest of
# the notebook
new_column_names = ['PostalCode', 'Borough', 'Neighborhood']
df_nbrhood.columns = new_column_names

print(df_nbrhood.shape)

(180, 3)


### Data wrangling

#### Exclude rows where Borough='Not assigned'

In [10]:
# Keep only the rows whose Borough is assigned, and renumber the surviving
# rows from 0 in the same expression
df_nbrhood = (
    df_nbrhood
    .loc[df_nbrhood['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)

print(df_nbrhood.shape)
df_nbrhood.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Get Toronto postal code, latitude, longitude reference table

In [11]:
# Download the postal-code reference CSV and save it as 'toronto_postal.csv'
# in the working directory (wget flags: -q = quiet, -O = output file name).
!wget -q -O 'toronto_postal.csv' http://cocl.us/Geospatial_data



In [26]:

# Load the downloaded CSV into df_postcod_loc. Any index produced while
# reading is discarded, and 'Postal Code' becomes the index so that rows can
# be looked up directly by postal code.
file_name = 'toronto_postal.csv'
df_postcod_loc = (
    pd.read_csv(file_name)
    .reset_index(drop=True)
    .set_index('Postal Code')
)

In [27]:
# Show (rows, columns) of the lookup table now that it is indexed by 'Postal Code'
df_postcod_loc.shape

(103, 2)

#### Combine data from two dataframes and save to one dataframe df_nbrhood.

In [24]:
# Attach Latitude/Longitude to every row of df_nbrhood by looking its
# PostalCode up in df_postcod_loc (which is indexed by postal code).

# Check first that every postal code has an entry in the lookup table, so a
# missing code raises KeyError before any column is written, and the error
# names all offending codes rather than only the first.
unknown_codes = df_nbrhood.loc[
    ~df_nbrhood['PostalCode'].isin(df_postcod_loc.index), 'PostalCode'
]
if not unknown_codes.empty:
    raise KeyError(
        'Postal codes missing from df_postcod_loc: '
        + str(unknown_codes.unique().tolist())
    )

# Vectorised lookup: map each PostalCode to the matching value in the lookup
# column. This replaces a Python loop that did two scalar .loc writes per row,
# and it gives the same result when the cell is re-run.
for coord in ('Latitude', 'Longitude'):
    df_nbrhood[coord] = df_nbrhood['PostalCode'].map(df_postcod_loc[coord])


In [28]:
# Report the size of the combined table, then preview its first rows
# (head(12) requests up to 12 rows)
print('shape:',df_nbrhood.shape)
df_nbrhood.head(12)

shape: (103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [31]:
# Spot-check one postal code (M5G) in the combined table
df_nbrhood[df_nbrhood['PostalCode']=='M5G']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
