In [1]:
# Beginning of Part 1 --> Webscraping for the Toronto information

In [2]:
# importing the required libraries

import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# `source` holds the raw HTML of the Wikipedia page (the `.text` attribute of the response).
# BeautifulSoup parses that HTML; the first <table> element is the one read here.
# `rows` is the list of that table's <tr> elements.

response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source,'lxml')

first_table = soup.find('table')
if first_table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Use the <tbody> when the markup has one; otherwise fall back to the table element itself,
# since the parser is not guaranteed to provide a <tbody>.
table = first_table.tbody if first_table.tbody is not None else first_table
rows = table.find_all('tr')

In [4]:
# The first <tr> of `rows` is the header row: each <th> cell gives one column name
# (newlines removed from the cell text).
# tdf ("toronto data frame") is created with those column names and no rows yet.

column_headers = []
for header_cell in rows[0].find_all('th'):
    column_headers.append(header_cell.text.replace('\n',''))

tdf = pd.DataFrame(columns=column_headers)

# displayed here: an empty frame showing only the column headers
tdf

Unnamed: 0,Postal code,Borough,Neighborhood


In [5]:
# Build tdf from the scraped table rows, leaving out rows whose borough is 'Not assigned'.
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies the
# whole frame on every iteration.

records = []
for table_row in rows[1:]:
    row = [data.text.replace('\n','') for data in table_row.find_all('td')]

    # a row without any <td> cells carries no data, so there is nothing to index into
    if not row:
        continue

    # a neighborhood that is empty or 'Not assigned' takes the name of its borough
    if row[2] in ('', 'Not assigned'):
        row[2] = row[1]

    # keep only the rows that have an assigned borough
    if row[1] != 'Not assigned':
        records.append(row)

tdf = pd.DataFrame(records, columns=column_headers)

# dropping the rows with duplicate postal codes (first occurrence is kept, index renumbered)
tdf = tdf.drop_duplicates(subset='Postal code', ignore_index=True)


In [6]:
# Shape of the cleaned Toronto data frame as (number of rows, number of columns)


tdf.shape



(103, 3)

In [7]:
# End of Part 1 --> Webscraping for the Toronto information

In [8]:
# Beginning of Part 2 --> Combining the web-scraped data frame with the latitude/longitude data frame into a single data frame

In [9]:
# Reading the latitude and longitude data into a data frame from the provided csv file.
# The file location can be overridden with the GC_CSV_PATH environment variable so the
# notebook is not tied to one machine; when it is not set, the original local path is used.

csv_path = os.environ.get(
    'GC_CSV_PATH',
    'C:\\Users\\karth\\Desktop\\Python\\Projects\\Capstone\\Toronto-Data-Project\\GC.csv',
)
df_latlng = pd.read_csv(csv_path)

In [10]:
# The scraped frame names its key column 'Postal code'; give the coordinates frame
# the same column name so the two frames can be lined up on it.

df_latlng = df_latlng.rename(columns={'Postal Code':'Postal code'})

In [11]:
# Combine the two data frames into one:
#   tdf       (part 1) -> boroughs and neighborhoods
#   df_latlng (part 2) -> latitude / longitude values

# Both frames are indexed by postal code so that concat can line their rows up on it.
df2 = tdf.set_index('Postal code')
df1 = df_latlng.set_index('Postal code')

# axis=1 places the frames side by side (columns are added, rows are matched by index).
# Afterwards the postal code index is turned back into a regular column; the rename
# covers the case where that column comes back under the generic name 'index'.
tdf_new = (
    pd.concat([df2, df1], axis=1)
    .reset_index()
    .rename(columns={'index':'Postal code'})
)

tdf_new

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,43.636258,-79.498509


In [12]:
# Showing the postal codes used in the coursera assignment example, for verification purposes.
# The matching rows are selected per code (so the result follows the order of `codes`)
# and stacked with a single pd.concat: DataFrame.append was removed in pandas 2.0, and
# this also avoids starting from an empty frame whose columns are all of object dtype.
codes = ['M5G','M2H','M4B','M1J','M4G']

df = pd.concat(
    [tdf_new[tdf_new['Postal code']==code] for code in codes],
    ignore_index=True,
)

df

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,Parkview Hill / Woodbine Gardens,43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
