# Coursera Peer-graded Assignment
##### Part 2: Getting geo location data for segmenting and clustering neighborhoods in Toronto

Import necessary libraries.

In [1]:
import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

Get the wikipedia page containing the neighborhood data.

In [2]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(wiki_url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) rather than parsing an error page later.
response.raise_for_status()

Get the html table containing the neighborhood data.

In [3]:
# Parse the downloaded HTML and keep the first table whose class attribute
# is "wikitable sortable".
wikipg = BeautifulSoup(response.text, 'html.parser')
sortable_tables = wikipg.body.find_all('table', class_='wikitable sortable')
table = sortable_tables[0]

Convert html table to pandas dataframe.

In [4]:
# Walk the table row by row: a row containing <th> cells (re)creates the
# column -> values mapping, every other row appends one stripped cell text
# per column.
data = None
for table_row in table.tbody.find_all('tr'):
    header_cells = table_row.find_all('th')
    if header_cells:
        data = {cell.text.strip(): [] for cell in header_cells}
        continue
    for column, cell in zip(data, table_row.find_all('td')):
        data[column].append(cell.text.strip())
df = pd.DataFrame(data=data, columns=list(data))
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Remove rows from the dataframe where Borough is unassigned. Where only the Neighbourhood is unassigned, use the Borough name as the Neighbourhood.

In [5]:
def set_unassigned_neigh(row):
    """Return a copy of ``row`` with a missing neighbourhood filled in.

    The row's labels and values are copied into a new mapping. If the
    'Neighbourhood' entry equals 'Not assigned', it is replaced with the
    row's 'Borough' entry. The input row is not modified.

    Parameters
    ----------
    row : pd.Series
        Must contain the labels 'Borough' and 'Neighbourhood'.

    Returns
    -------
    pd.Series
        New series built from the (possibly updated) label -> value mapping.
    """
    fields = {label: value for label, value in zip(row.index.values, row.values)}
    neighbourhood_missing = fields['Neighbourhood'] == 'Not assigned'
    if neighbourhood_missing:
        fields['Neighbourhood'] = fields['Borough']
    return pd.Series(fields)

# Keep only rows whose borough is known, then back-fill neighbourhoods.
# Each print is a sanity check that should show False.
borough_known = df['Borough'] != 'Not assigned'
df_assigned = df.loc[borough_known]
print('Not assigned' in df_assigned['Borough'].unique())
df_assigned = df_assigned.apply(set_unassigned_neigh, axis='columns')
print('Not assigned' in df_assigned['Neighbourhood'].unique())

False
False


In [6]:
# Preview the first five cleaned rows.
df_assigned.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Combine the neighborhoods that share a postal code into a single row, joined as a comma-separated list.

In [7]:
def join_neighborhoods(row):
    """Collapse a group of rows into one Borough / Neighbourhood pair.

    Despite its name, ``row`` is used as a table: the function reads whole
    'Borough' and 'Neighbourhood' columns from it.

    Parameters
    ----------
    row : pd.DataFrame
        Non-empty frame with 'Borough' and 'Neighbourhood' columns.

    Returns
    -------
    pd.Series
        'Borough' holds the first borough value in the group;
        'Neighbourhood' holds all neighbourhood values joined with ', '.
    """
    first_borough = row['Borough'].tolist()[0]
    joined_names = ', '.join(row['Neighbourhood'])
    return pd.Series({'Borough': first_borough, 'Neighbourhood': joined_names})

# One output row per postal code. The two columns join_neighborhoods reads are
# selected explicitly so the grouping key is not passed into the applied
# function (pandas 2.2 deprecates that and emits a warning); the result is the
# same: 'Postcode' comes back as a column via reset_index().
df_joined = (
    df_assigned
    .groupby('Postcode')[['Borough', 'Neighbourhood']]
    .apply(join_neighborhoods)
    .reset_index()
)

In [8]:
# Preview the first five combined rows.
df_joined.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Display the shape of the dataframe.

In [9]:
# Report how many rows the combined dataframe has.
row_count = len(df_joined)
print('Number of rows in the dataframe: %d' % row_count)

Number of rows in the dataframe: 103


Import geocoder library needed for getting coordinates.

In [16]:
import geocoder

Since geocoder is not working, load the geospatial data from a CSV file instead.

In [17]:
# Read the coordinates table from the CSV file next to the notebook.
geodata_csv_path = './Geospatial_Coordinates.csv'
geodata = pd.read_csv(geodata_csv_path)
geodata.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Define function to get coordinates for a particular postal code.

In [20]:
def get_coordinates(row):
    """Return a copy of ``row`` extended with 'Latitude' and 'Longitude'.

    The coordinates are looked up in the module-level ``geodata`` frame by
    matching ``row['Postcode']`` against its 'Postal Code' column. If several
    rows match, the first one is used.

    Parameters
    ----------
    row : pd.Series
        Must contain the label 'Postcode'.

    Returns
    -------
    pd.Series
        All original labels and values plus 'Latitude' and 'Longitude'.

    Raises
    ------
    IndexError
        If ``geodata`` has no entry for the postal code. The message names
        the missing code.
    """
    rowdata = dict(zip(row.index.values, row.values))
    postcode = rowdata['Postcode']
    # Evaluate the match once and reuse it for both coordinates.
    matches = geodata.loc[geodata['Postal Code'] == postcode, ['Latitude', 'Longitude']]
    if matches.empty:
        # Same exception type an empty ``.values[0]`` lookup raises, but with
        # a message that identifies the offending postal code.
        raise IndexError('No coordinates found for postal code %r' % (postcode,))
    rowdata['Latitude'] = matches['Latitude'].values[0]
    rowdata['Longitude'] = matches['Longitude'].values[0]
    return pd.Series(rowdata)

In [21]:
# Attach coordinates to every postal-code row, then preview the result.
df_geo = df_joined.apply(get_coordinates, axis='columns')
df_geo.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Rename headers properly.

In [23]:
# Rebind to the renamed copy rather than mutating with inplace=True, so the
# frame an earlier cell displayed is not changed underneath the reader.
# Re-running this cell is harmless: labels that are no longer present are ignored.
df_geo = df_geo.rename(columns={
    'Postcode': 'PostalCode',
    'Neighbourhood': 'Neighborhood'
})
df_geo.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
