# Capstone Week Three, Part 2
## Add geolocation data to Toronto postal code dataframe.

First we re-create the dataframe from part one of the assignment.

In [1]:
import os
import urllib.request
from bs4 import BeautifulSoup

target_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
webpage = urllib.request.urlopen(target_url)
soup = BeautifulSoup(webpage,'html.parser')
#print(soup.prettify())

import pandas as pd
import numpy as np
nbh_frame = pd.DataFrame(columns = ['PostalCode', 'Borough', 'Neighborhood'])

for entry in soup.find_all('tr'):
    table_row = []
    for cell in entry.find_all('td'):
        table_row.append(cell.get_text())
    # following line works for the postal code page, but is not robust
    if len(table_row)==3:
        table_row[2] = table_row[2].rstrip()
        if table_row[2] == 'Not assigned':
            table_row[2] = table_row[1]
        if table_row[1] != 'Not assigned':
            if table_row[0] in nbh_frame.PostalCode.values:
                # duplicate postal zone, so append neighborhood
                idx = nbh_frame[nbh_frame['PostalCode']==table_row[0]].index.values
                nbh_frame.iloc[idx,2] += (', ' + table_row[2])
            else:
                nbh_frame.loc[len(nbh_frame)] = table_row


## Attempt to geocode postal codes
The following code showed that the Geopy service has trouble resolving the addresses of the  the postal codes, repeatedly failing, or returning the coordinates 43.653963, -79.387207.

Changes to the address format did not improve the performance of the geocoding.

```python
import time
from geopy.geocoders import Nominatim 

for postal_code in nbh_frame['PostalCode']:
    address = '{}, Toronto, Ontario'.format(postal_code)
    geolocator = Nominatim(user_agent="foursquare_agent_js")
    count = 0
    location = None
    while(location is None):
        location = geolocator.geocode(address)
        #time.sleep(1)
        count += 1
        if count > 200:
            break
    if location == None:
        print('{} not found'.format(postal_code))
    else:
        latitude = location.latitude
        longitude = location.longitude
```

Completing the geocoding with the help of the provided data.

In [2]:
target_url = 'http://cocl.us/Geospatial_data'
geodata_df = pd.read_csv(target_url)
geodata_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [3]:
merged_df = pd.merge(nbh_frame, geodata_df, left_on='PostalCode', right_on='Postal Code')
merged_df.drop(['Postal Code'], axis = 1)
merged_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


In [4]:
merged_df.shape

(103, 6)