## Import Libraries

In [3]:
#! pip install beautifulsoup4  / delete # if needed
#! pip install lxml            / delete # if needed
#! pip install 'pandas==1.1.0'

from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

## Create request reponse, convert to text, soupify, and narrow down to just the table

In [4]:
#create request response and convert it to text, then turn it into soup

response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

html = response.text

soup = BeautifulSoup(html)

In [5]:
#narrow the soup down to just the table and print the contents to ensure it was read correctly

table = soup.table
table.contents

['\n',
 <tbody><tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighbourhood
 </th></tr>
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M3A
 </td>
 <td>North York
 </td>
 <td>Parkwoods
 </td></tr>
 <tr>
 <td>M4A
 </td>
 <td>North York
 </td>
 <td>Victoria Village
 </td></tr>
 <tr>
 <td>M5A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Regent Park, Harbourfront
 </td></tr>
 <tr>
 <td>M6A
 </td>
 <td>North York
 </td>
 <td>Lawrence Manor, Lawrence Heights
 </td></tr>
 <tr>
 <td>M7A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Queen's Park, Ontario Provincial Government
 </td></tr>
 <tr>
 <td>M8A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M9A
 </td>
 <td>Etobicoke
 </td>
 <td>Islington Avenue, Humber Valley Village
 </td></tr>
 <tr>
 <td>M1B
 </td>
 <td>Scarborough
 </td>
 <td>Malvern, Rouge
 </td></tr>
 <tr>
 <td>M2B
 </td>
 <td>Not assigned
 </

## Convert to DataFrame

In [6]:
#probably a better way to do this, but each row in the table has 3 <td> tags - one for postal code, one for borough,
#and one for neighborhood. Adding all the <td> tags and dividing by three then should calculate number of rows to be expected
#from the dataframe created below (180)

value_count = 0
for child in table.find_all('td'):
    value_count = value_count + 1
total_rows = value_count / 3

print(total_rows)

180.0


In [7]:
#simplest way I could find to convert table contents into df and double-checked to ensure 180 rows and three columns

table = soup.find_all('table')
df = pd.read_html(str(table))[0]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


## Clean dataframe of 'not assigned" rows, combine multiple neighborhoods with same postal code, make 'not assigned' neighborhoods equivalent to boroughs

In [8]:
#Replace all 'not assigned' values with NumPy's NaN in order to drop them with dropna function

df = df.replace({'Borough': 'Not assigned'}, value=np.nan)
df = df.dropna()
print('Boroughs: \n{}'.format(df['Borough'].value_counts()))


Boroughs: 
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64


In [9]:
# Check to see if multiple neighborborhoods in one postal code need combined

duplicateSeries = df.duplicated(subset='Postal Code')
duplicateSeries.value_counts()

# as it turns out, the table must have been updated since creating the assignment, as the the postal codes that 
# represent multiple neighborhoods have already been consolidated

False    103
dtype: int64

In [10]:
df['Neighbourhood'].value_counts()

Downsview                                          4
Don Mills                                          2
Regent Park, Harbourfront                          1
Rouge Hill, Port Union, Highland Creek             1
York Mills, Silver Hills                           1
                                                  ..
Humberlea, Emery                                   1
India Bazaar, The Beaches West                     1
Forest Hill North & West, Forest Hill Road Park    1
Runnymede, The Junction North                      1
Weston                                             1
Name: Neighbourhood, Length: 99, dtype: int64

In [11]:
# rename "not assigned" neighborhoods to their corresponding Borough

for neighborhood in df['Neighbourhood']:
    df = df.replace({'Neighbourhood': 'Not assigned'}, len(df['Borough']))

In [12]:
df.shape

(103, 3)

## Bring in CSV and append latitude/longitude to Dataframe

In [70]:
latLong_df = pd.read_csv('/Users/jimmy/Downloads/Geospatial_Coordinates.csv')
latLong_df = latLong_df.set_index('Postal Code')
latLong_df = latLong_df.sort_index()

In [73]:
new_df = df[['Postal Code']]

new_df = new_df.set_index('Postal Code')
new_df = new_df.sort_index()

# Luckily for the sake of this project, the CSV file containing lat/long is the same indexes in the same order as 
# the new dataframe, so the only thing necessary was to make the indexes the same
new_df = new_df.join(latLong_df)

    


In [79]:
df = df.set_index('Postal Code')
df = df.sort_index()

KeyError: "None of ['Postal Code'] are in the columns"

In [83]:
# notable this is a terrible way of doing this as there is no real assurance that the data matches other than 
# picking a few values and making sure they're correct, a better way would have been to implement a RaiseError 
# condition if there was any index that did not match in the two dataframes

df = df.join(new_df)

In [90]:
df.head(20)

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


(103, 4)