# Capstone Project


In [1]:
# Import libraries

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

import geocoder # import geocoder


In [2]:
# Wikipedia revision (pinned through the oldid parameter) holding the postal-code table.
url_html='https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050'

# pd.read_html returns a list with one DataFrame per HTML table found on the page.
df = pd.read_html(url_html)

# The table with the Postcode / Borough / Neighbourhood columns is the first entry.
df_postcodes = df[0]

# Report how many non-null 'Postcode' values were read.
postcode_count = df_postcodes['Postcode'].count()
print("imported dataframe has", postcode_count, "postcodes entries")

df_postcodes.head(10)

imported dataframe has 287 postcodes entries


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [3]:
# Keep only the rows whose 'Borough' has been assigned and renumber them from 0.
# The filter and the index reset are chained into a single expression, so the
# filtered selection is never mutated in place.
df_new = (
    df_postcodes[df_postcodes['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)

df_new.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [4]:
# Group the rows by postcode so each postcode can be collapsed to a single row.
grouped = df_new.groupby(['Postcode'])

# Join every neighbourhood that shares a postcode into one comma-separated string.
Neighbourhood_grouped = grouped['Neighbourhood'].apply(', '.join)

# Take the first non-null borough of each postcode. Unlike popping from a set,
# the result is deterministic if a postcode ever appears with more than one borough.
borough_grouped = grouped['Borough'].first()

# Turn both per-postcode Series into single-column DataFrames.
borough = borough_grouped.to_frame()
Neighbourhood = Neighbourhood_grouped.to_frame()

# Combine the two frames on their shared 'Postcode' index level.
grouped_final = borough.merge(Neighbourhood, on="Postcode")

grouped_final

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [5]:
# Caption followed by the (rows, columns) tuple of the grouped table.
# NOTE(review): 'numer' is a typo for 'number'; the text is left unchanged here
# so that it still matches the recorded cell output.
shape_caption = 'The numer of rows and columns in this dataframe is:'
print(shape_caption)
grouped_final.shape

The numer of rows and columns in this dataframe is:


(103, 2)

In [6]:
import geocoder # import geocoder


In [21]:
# Columns of the flattened table, in display order (spelling matches grouped_final).
column_names = ['Postcode', 'Borough', 'Neighbourhood']

# Group on both keys and join the neighbourhood strings, giving one row per
# (Postcode, Borough) pair; reset_index() turns those keys back into columns.
tor_post_codes = (
    grouped_final
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)[column_names]

tor_post_codes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [22]:
# Load the postal-code -> latitude/longitude lookup table from the local CSV file.
coordinates_csv = 'Geospatial_Coordinates.csv'
df_coordinates = pd.read_csv(coordinates_csv)
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [26]:
# Index the coordinates by their postal code so they can be looked up per row.
coords_by_postcode = df_coordinates.set_index('Postal Code')

# Attach Latitude/Longitude to every postcode row by matching 'Postcode'
# against that index.
tot_post_coordinate = tor_post_codes.join(coords_by_postcode, on='Postcode')
tot_post_coordinate.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [29]:
# Keep only the rows whose borough name contains 'Toronto', renumbered from 0.
in_toronto = tot_post_coordinate['Borough'].str.contains('Toronto')
df_toronto = tot_post_coordinate[in_toronto].reset_index(drop=True)
df_toronto.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
