# Toronto Geo Data

#### Author : Sumit Chhabra

# Convert HTML to pandas dataframe

Combine all the steps from the previous assignment into a single cell.

In [20]:
#Read from wikipedia
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Placeholder text the Wikipedia table uses for missing boroughs / neighbourhoods
NA = 'Not assigned'

res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
# Fail loudly on an HTTP error instead of trying to parse an error page as the table
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')

#extract the first table from the html
table = soup.find_all('table')[0]

#convert to dataframe
df = pd.read_html(str(table))[0]
print("Old:", df.shape)

#promote the first row to column headers and drop it from the data
headers = df.iloc[0]
new_df = pd.DataFrame(df.values[1:], columns=headers)
new_df = new_df.rename(columns={'Postcode': 'PostalCode'})

#drop rows whose Borough is "Not assigned"
# .copy() makes the filtered frame independent, so the assignment below
# writes to new_df itself rather than to a view of the unfiltered frame
new_df = new_df[~new_df['Borough'].isin([NA])].copy()
print("New:", new_df.shape)

#replace "Not assigned" Neighbourhood with the Borough name
unnamed = new_df['Neighbourhood'] == NA
for borough in new_df.loc[unnamed, 'Borough']:
    print ('Found ', borough, ' - replace it')
# one mask-based assignment instead of mutating the frame row-by-row while iterating it
new_df.loc[unnamed, 'Neighbourhood'] = new_df.loc[unnamed, 'Borough']

#group by postal code and borough, joining neighbourhoods into a comma separated string
grouped = new_df.groupby(['PostalCode', 'Borough'])['Neighbourhood'].agg(','.join).reset_index()

#spot-check two postal codes that have more than one neighbourhood
print(grouped.loc[grouped['PostalCode'] == 'M4B'])
print(grouped.loc[grouped['PostalCode'] == 'M5A'])

Old: (290, 3)
New: (212, 3)
Found  Queen's Park  - replace it
   PostalCode    Borough                   Neighbourhood
35        M4B  East York  Woodbine Gardens,Parkview Hill
   PostalCode           Borough             Neighbourhood
53        M5A  Downtown Toronto  Harbourfront,Regent Park


In [5]:
# Install the geocoder package from the conda-forge channel (written for IBM Watson Studio).
# `--yes` answers conda's confirmation prompt automatically so the cell does not wait for input.
# Skip this step if geocoder is already installed.
!conda install -c conda-forge geocoder --yes

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0-py35_0   conda-forge
    ratelim:    0.1.6-py35_0 conda-forge

orderedset-2.0 100% |################################| Time: 0:00:00  54.17 MB/s
ratelim-0.1.6- 100% |################################| Time: 0:00:00  12.97 MB/s
geocoder-1.38. 100% |################################| Time: 0:00:00  43.57 MB/s


### Define a function to get geo coordinates using the geocoder package

In [11]:
import geocoder # import geocoder

def get_coordinates (postal_code, max_attempts=10, retry_delay=1.0):
    """Look up the coordinates of a Toronto postal code with geocoder.

    The lookup is retried while the geocoder result's ``latlng`` is None, but
    only up to ``max_attempts`` times, so a lookup that never succeeds raises
    instead of looping forever.

    Parameters
    ----------
    postal_code : value formatted into the query '<postal_code>, Toronto, Ontario'.
    max_attempts : int, maximum number of geocoder calls (must be >= 1).
    retry_delay : float, seconds to pause between failed attempts.

    Returns
    -------
    The ``latlng`` value of the first successful geocoder result.

    Raises
    ------
    ValueError : if ``max_attempts`` is smaller than 1.
    RuntimeError : if every attempt returned no coordinates.
    """
    import time  # local import so the function does not rely on another cell

    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    # the query string does not change between attempts, so build it once
    query = '{}, Toronto, Ontario'.format(postal_code)

    for attempt in range(1, max_attempts + 1):
        g = geocoder.google(query)
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # pause before retrying, but not after the final attempt
        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        'No coordinates for {!r} after {} attempts'.format(postal_code, max_attempts)
    )

## Note: Skip the next cell, as the geocoder package takes a very long time to run

In [None]:
# Geocode every postal code once, in row order, then attach each coordinate column
# with a single assignment instead of growing the frame cell-by-cell inside iterrows().
# NOTE: running this cell adds Latitude/Longitude columns to `grouped`; merging it
# afterwards with another frame that has the same columns yields suffixed duplicates.
coords = [get_coordinates(postal_code) for postal_code in grouped['PostalCode']]
grouped['Latitude'] = [lat_lng[0] for lat_lng in coords]
grouped['Longitude'] = [lat_lng[1] for lat_lng in coords]

grouped.head(11)

## Use this backup method to get the geo coordinates

In [21]:
#backup: read the coordinates from a CSV file and give the
#postal code column the same name that `grouped` uses
url = "http://cocl.us/Geospatial_data"
geodata = pd.read_csv(url).rename(columns={'Postal Code': 'PostalCode'})
geodata.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the two dataframes on matching postal code

In [22]:
# inner join: keep only the postal codes present in both frames
merged_df = pd.merge(grouped, geodata, on='PostalCode', how='inner')
merged_df.head(11)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [23]:
# spot-check two postal codes in the merged result
for code in ('M5G', 'M5V'):
    print(merged_df[merged_df['PostalCode'] == code])

   PostalCode           Borough       Neighbourhood   Latitude  Longitude
57        M5G  Downtown Toronto  Central Bay Street  43.657952 -79.387383
   PostalCode           Borough  \
68        M5V  Downtown Toronto   

                                        Neighbourhood   Latitude  Longitude  
68  CN Tower,Bathurst Quay,Island airport,Harbourf...  43.628947  -79.39442  
