# Coursera Capstone Project with GeoCoder

In [10]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [11]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


# Get Wikipedia HTML Page

In [12]:
# Download the Wikipedia list of "M" postal codes and save it locally as Canada.html.
!wget -O Canada.html https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada%3A_M

--2019-06-03 20:47:38--  https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada%3A_M
Resolving en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Connecting to en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78947 (77K) [text/html]
Saving to: ‘Canada.html’


2019-06-03 20:47:39 (1.49 MB/s) - ‘Canada.html’ saved [78947/78947]



In [13]:
# Parse the saved page. The encoding is stated explicitly so the file is not decoded
# with the platform's locale default, which is not UTF-8 everywhere.
with open("Canada.html", encoding="utf-8") as fp:
    WikiCanada_bs = BeautifulSoup(fp, 'lxml')          # parser lxml or html.parser

## Get the table

In [14]:
# find() returns only the first <tbody> in the document (None if there is none).
# NOTE(review): presumably that first <tbody> is the postal-code table — confirm against the page.
tbl = WikiCanada_bs.find('tbody')

## Get all rows of the table and store them as a List

In [15]:
# Every <tr> element found beneath the table body.
rows = tbl.find_all('tr')

In [16]:
# One list of whitespace-stripped cell texts per table row.
# A row that contains no <td> cells yields an empty list.
can_Lst = [
    [cell.text.strip() for cell in tr.find_all('td')]
    for tr in rows
]


## Convert to a Pandas Dataframe and Set Column Names

In [17]:
# Build the frame from every scraped row except the first.
# NOTE(review): presumably the first row is the header, which has no <td> cells and so
# produced an empty list — confirm against the page.
can_df = pd.DataFrame( can_Lst[1:] )

In [18]:
can_df.head(3)

Unnamed: 0,0,1,2
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods


In [19]:
# Give the three positional columns (0, 1, 2) meaningful names, then preview the result.
can_df.columns = ['PostalCode','Borough','Neighborhood']
can_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Print number of rows of the original data

In [20]:
can_df.shape

(288, 3)

## Replace 'Not assigned' with NaN

In [21]:
# Treat the 'Not assigned' placeholder as a missing value in every column.
can_df = can_df.replace('Not assigned', np.nan)

## Set Neighborhood to the Borough name when Neighborhood is NaN

In [22]:
# Where Neighborhood is missing, fall back to the Borough of the same row (aligned on the index).
# The result is assigned back to the column rather than mutating a column selection in place:
# recent pandas versions warn about that pattern, and it has no effect under Copy-on-Write.
can_df['Neighborhood'] = can_df['Neighborhood'].fillna( can_df['Borough'] )

## Confirm if successful, check M7A

In [23]:
can_df[ can_df['PostalCode'] == 'M7A'  ]

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Queen's Park


## Remove rows with NaN

In [24]:
# Discard every row that still holds a missing value, then renumber the index from 0.
can_df = can_df.dropna(axis='index').reset_index(drop=True)
can_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


## Check the number of rows again after deletion

In [25]:
can_df.shape

(211, 3)

## Display Neighborhoods of M5A

In [26]:
can_df[ can_df['PostalCode'] == 'M5A'  ]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park


# The code below merges all Neighborhoods that share the same PostalCode

## Use a Dictionary to Merge Neighborhoods
###   Postal Code + '.' + Borough will be the key of the dictionary

In [28]:
# Group neighborhoods under a "PostalCode.Borough" key, preserving first-seen order.
can_dict = {}

for vPostal, vBor, vNeigh in zip( can_df.PostalCode, can_df.Borough, can_df.Neighborhood):
    vKey = vPostal + "." + vBor
    # setdefault creates the list on first sight of a key; unlike a bare `except:`,
    # it cannot hide an unrelated error raised while building the entry.
    can_dict.setdefault(vKey, []).append(vNeigh)
    

## Split the dictionary into 3 separate lists

In [29]:
# Unpack the grouped dictionary into three parallel lists:
# postal codes, boroughs, and comma-joined neighborhood names.
can_pc = []
can_bo = []
can_ng = []
for vKey, vNeighs in can_dict.items():
    # Split on the first '.' only: the postal code never contains one, but a borough
    # name might, and an unbounded split would silently truncate it.
    vPostal, vBor = vKey.split('.', 1)
    can_pc.append( vPostal )
    can_bo.append( vBor )
    can_ng.append( ','.join(vNeighs) )


## Convert the Lists to a Pandas Dataframe

In [30]:
# Assemble the three parallel lists into one frame: one row per dictionary key,
# with that key's neighborhoods joined into a single comma-separated string.
can_df2 = pd.DataFrame( {'PostalCode':can_pc , 'Borough':can_bo, 'Neighborhood':can_ng} )
can_df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


## Check the number of rows of the final result

In [31]:
can_df2.shape

(103, 3)

# GET Geospatial_Coordinates.csv from the following URL 
###   and Save it as Geospatial_Coordinates.csv

In [34]:
# Download the coordinates file via the short link (it redirects, as the log below shows)
# and save it locally as Geospatial_Coordinates.csv.
!wget -O Geospatial_Coordinates.csv https://cocl.us/Geospatial_data

--2019-06-03 20:48:44--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-03 20:48:45--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.26.197
Connecting to ibm.box.com (ibm.box.com)|107.152.26.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-03 20:48:45--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-03 

In [35]:
# Load the downloaded coordinates; the file provides 'Postal Code', 'Latitude' and 'Longitude' columns.
geo_df = pd.read_csv("Geospatial_Coordinates.csv")

In [36]:
# Rename the key column so it matches the 'PostalCode' column of the scraped frame.
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})
geo_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## MERGE our dataframe with the Geospatial coordinates
###   and Display the first 5 rows

In [37]:
# Inner join on PostalCode: only postal codes present in both frames are kept.
can_df3 = can_df2.merge(geo_df, how='inner', on='PostalCode')
can_df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [40]:
can_df3.shape

(103, 5)