In [None]:
# The code was removed by Watson Studio for sharing.

# Geographical Coordinates of the Neighborhoods of Toronto

The data are in a Wikipedia table at https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

Web scraping --> I will use the Beautiful Soup library to get the data from the table.

_NOTE: the first part is the same as the "Segmentation and Clustering of the Neighborhoods of Toronto" Notebook_

In [1]:
# import the request library
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() fails loudly on an HTTP error instead of
# silently parsing an error page in the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()
# NOTE: despite its name, website_url holds the HTML text of the page, not a URL
website_url = response.text

# Parse the HTML with the lxml parser so the table can be located by tag and class
soup = BeautifulSoup(website_url, 'lxml')

### Extract the table using `soup.find`, then extract all the `<td> ... </td>` cells, which contain the postcodes, boroughs and neighborhoods

In [2]:
# Locate the first table whose class attribute is exactly 'wikitable sortable'
mytable = soup.find('table', {'class': 'wikitable sortable'})
if mytable is None:
    # soup.find returns None when nothing matches; fail with a clear message here
    # rather than with an AttributeError on the find_all call below
    raise ValueError("No <table class='wikitable sortable'> found in the page")
# Collect every <td> cell of that table into one flat list (cells, not rows)
tdALL = mytable.find_all('td')

### Loop over `tdALL` and keep only the rows that do not have 'Not assigned' in the Borough column

In [46]:
postcode = []
borough = []
neighborhood = []
# tdALL is consumed as consecutive triples of cells: (postcode, borough, neighborhood).
# The stop value is len(tdALL) - 2 so that the LAST complete triple is visited too
# (a stop of len(tdALL) - 3 silently skips it), while a trailing incomplete triple
# can never cause an IndexError.
for ii in range(0, len(tdALL) - 2, 3):
    borough_text = tdALL[ii + 1].text
    # skip rows whose borough is flagged as not assigned
    if "Not" in borough_text:
        continue
    postcode.append(tdALL[ii].text)
    borough.append(borough_text)
    neighborhood.append(tdALL[ii + 2].text)

### Use the lists just found to create a dataframe

In [47]:
# Assemble the three parallel lists into one frame: one row per scraped table row
import pandas as pd

df = pd.DataFrame(
    {'PostalCode': postcode, 'Borough': borough, 'Neighborhood': neighborhood},
    # spell the column order out so it does not depend on dict ordering
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

# Neighborhood cells end with a newline character; drop it
df['Neighborhood'] = df['Neighborhood'].str.rstrip('\n')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Shape of the dataframe

In [48]:
# (rows, columns) of df: one row per scraped table row kept by the filter loop
df.shape

(212, 3)

### Create a new dataframe, grouping by PostalCode

In [50]:
# group by PostalCode
df2 = df[['PostalCode', 'Borough', 'Neighborhood']].groupby('PostalCode')
# For each postal code, collect the array of distinct neighborhoods / boroughs.
# Selecting the column and calling .unique() on the grouped Series avoids running a
# Python lambda over every group frame (and the pandas >= 2.2 deprecation warning
# that groupby.apply emits when the group frame still contains the grouping column).
l1 = df2['Neighborhood'].unique()
l2 = df2['Borough'].unique()
# create a dictionary with the 2 Series (both indexed by PostalCode)
d = {'Borough': l2, 'Neighborhood': l1}
# Turn the PostalCode index back into a regular column. The rename is defensive:
# if the index name were lost, reset_index() would call the new column 'index'.
dfnew = (
    pd.DataFrame(d)
    .reset_index()
    .rename(columns={'index': 'PostalCode'})
)
dfnew.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,[Scarborough],"[Rouge, Malvern]"
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]"
3,M1G,[Scarborough],[Woburn]
4,M1H,[Scarborough],[Cedarbrae]


### Import Geocoder: library used to extract the longitude and latitude for any given postal code

In [7]:
# Install the library using Anaconda:
#!conda install -c conda-forge geocoder 

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geocoder:   1.38.1-py_0  conda-forge
    orderedset: 2.0-py35_0   conda-forge
    ratelim:    0.1.6-py35_0 conda-forge

orderedset-2.0 100% |################################| Time: 0:00:00  37.30 MB/s
ratelim-0.1.6- 100% |################################| Time: 0:00:00   6.69 MB/s
geocoder-1.38. 100% |################################| Time: 0:00:00  22.49 MB/s


In [8]:
# import the library:
import geocoder 

### Use geocoder to get the coordinates of the postal codes

In [15]:
# Add empty (NaN) placeholder columns to dfnew; they are meant to receive the
# latitude and longitude of each postal code later on
import numpy as np

for coord_col in ("Latitude", "Longitude"):
    dfnew[coord_col] = np.nan

dfnew.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,[Scarborough],"[Rouge, Malvern]",,
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]",,
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]",,
3,M1G,[Scarborough],[Woburn],,
4,M1H,[Scarborough],[Cedarbrae],,


#### I tried to use the geocoder in the following loop and save the values in the new columns of the dfnew dataframe, but it was taking too long, so I am importing the coordinates from the csv file provided (see below)
# Look up the coordinates of every postal code with geocoder and store them in dfnew.
# The lookup may not return coordinates on the first try, so it is retried, but only a
# bounded number of times: an unbounded retry loop never terminates when the service
# keeps returning nothing.
MAX_GEOCODE_ATTEMPTS = 5

for postal_code in dfnew['PostalCode']:
    # initialize the variable to None
    lat_lng_coords = None

    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # a non-empty result means the lookup succeeded; treating an empty value as a
        # failure too avoids an IndexError on the [0]/[1] accesses below
        if lat_lng_coords:
            break

    if not lat_lng_coords:
        # give up on this postal code and leave its coordinates untouched
        print('No coordinates found for {} after {} attempts'.format(postal_code, MAX_GEOCODE_ATTEMPTS))
        continue

    # insert the coordinates at the postal_code row (mask computed once)
    row_mask = dfnew['PostalCode'] == postal_code
    dfnew.loc[row_mask, 'Latitude'] = lat_lng_coords[0]
    dfnew.loc[row_mask, 'Longitude'] = lat_lng_coords[1]


In [52]:
# The code was removed by Watson Studio for sharing.

In [35]:
# Use the provided table that already has the coordinates for each postal code.
# The file is read through a project_lib Project handle (see the get_file call in
# the next cell).
# NOTE(review): projectID and projectToken are not defined in any visible cell;
# presumably they are set in the hidden cell above -- confirm before running.
from project_lib import Project
project = Project(None,projectID,projectToken)

In [53]:
# Fetch the coordinates file through the project handle, then parse it as CSV
coords_file = project.get_file('Geospatial_Coordinates.csv')
df_coords = pd.read_csv(coords_file)
df_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [54]:
# Show the (rows, columns) of both frames side by side before combining them
coords_shape = df_coords.shape
dfnew_shape = dfnew.shape
print('Shape of df_coords: \t', coords_shape)
print('Shape of dfnew: \t', dfnew_shape)

Shape of df_coords: 	 (103, 3)
Shape of dfnew: 	 (103, 3)


#### Fill the Lat and Lon columns in dfnew with values from df_coords, using the postal code as a key 

In [56]:
# Fill Latitude / Longitude in dfnew by looking each postal code up in df_coords.
#
# The lookup is keyed on the postal code itself. Assigning a Series through .loc
# aligns on the ROW INDEX, so a per-code loop that assigns df_coords slices only
# yields correct values when both frames happen to list the postal codes in the same
# row order; Series.map has no such dependency and avoids rescanning both frames for
# every postal code.
# drop_duplicates keeps the first entry per code so the lookup index is unique,
# which Series.map requires.
coords_lookup = df_coords.drop_duplicates(subset='Postal Code').set_index('Postal Code')
# postal codes missing from df_coords end up as NaN
dfnew['Latitude'] = dfnew['PostalCode'].map(coords_lookup['Latitude'])
dfnew['Longitude'] = dfnew['PostalCode'].map(coords_lookup['Longitude'])

In [57]:
# Preview the first rows to check that Latitude and Longitude are now filled in
dfnew.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,[Scarborough],"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,[Scarborough],[Woburn],43.770992,-79.216917
4,M1H,[Scarborough],[Cedarbrae],43.773136,-79.239476
