# Segmenting and Clustering Neighborhoods in Toronto
### Our goal is to visually represent clusters of neighborhoods in Toronto using data available on Wikipedia.

## Step 1 - Scrape Wikipedia page to build initial dataframe of postal codes and dependent attributes

In [2]:
# Scrape the Wikipedia webpage to get a dataframe of postal codes in Toronto
# Use Beautifulsoup

# The HTML file was saved to a local file on the filesystem

from bs4 import BeautifulSoup
import pandas as pd

# Parse the locally saved Wikipedia page.
# The file is opened in binary mode so BeautifulSoup detects the document's
# encoding itself instead of relying on the platform's default text encoding.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
# NOTE(review): the path is absolute and machine-specific - adjust before running elsewhere.
with open('C:/Public/IBM Data Science/List of postal codes of Canada_ M - Wikipedia.html', 'rb') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# The first <table> on the page is the one that gets parsed.
table = soup.find('table')
table_rows = table.find_all('tr')

# One list of cell texts per <tr>, with newlines stripped.
# A row that has no <td> cells yields an empty list, which pandas turns into a
# (None, None, None) row; it is kept here on purpose because the cleaning step
# that follows removes it by its index label.
rows = [
    [cell.text.replace('\n', '') for cell in tr.find_all('td')]
    for tr in table_rows
]
postal_code_df = pd.DataFrame(rows, columns=["PostalCode", "Borough", "Neighborhood"])

In [3]:
# Clean dataframe

# Remove first row (None, None, None), which sits at index label 0.
# errors='ignore' keeps this cell re-runnable: once the row is gone a second
# run is a no-op instead of raising KeyError.
postal_code_df = postal_code_df.drop(index=0, errors='ignore')

# Remove rows whose Borough is 'Not assigned'.
# Rows are dropped by label, so the surviving rows keep their original index
# labels (later cells display and reference those labels).
not_assigned_rows = postal_code_df.index[postal_code_df['Borough'] == 'Not assigned']
postal_code_df = postal_code_df.drop(index=not_assigned_rows)

In [4]:
# Sanity check: list every row whose Neighborhood is still 'Not assigned'.
# An empty result means there is nothing left to fix.
postal_code_df.loc[postal_code_df['Neighborhood'].eq('Not assigned')]


Unnamed: 0,PostalCode,Borough,Neighborhood


In [5]:
# Display the first 20 rows of the cleaned frame.
# The index labels are the original row positions from the scraped table;
# they were not renumbered when rows were dropped.
postal_code_df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
10,M1B,Scarborough,"Malvern, Rouge"
12,M3B,North York,Don Mills
13,M4B,East York,"Parkview Hill, Woodbine Gardens"
14,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Report the (rows, columns) shape of the cleaned frame, as the assignment
# instructions ask for.
postal_code_df.shape

(103, 3)

## Step 2 - Get latitude and longitude for each row using Nominatim service
### This process is iterative in nature. The work is to get a valid lookup address from the information provided on Wikipedia. This involves some manual adjustment to the address logic.

In [7]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# define a function to get the latitude and longitude as a tuple for the input location
geolocator = Nominatim(user_agent="toronto_explorer")

def geolookup(address):
    """Geocode ``address`` and return its coordinates as a ``'lat,lon'`` string.

    The lookup is best-effort: when the geocoder finds no match, or the lookup
    fails with an ordinary error, the sentinel string ``'0.0,0.0'`` is returned
    so callers can filter unresolved addresses afterwards.

    Parameters
    ----------
    address : str
        Free-form address passed to the module-level ``geolocator``.

    Returns
    -------
    str
        ``'<latitude>,<longitude>'``, or ``'0.0,0.0'`` when no location is found.
    """
    no_result = '0.0,0.0'
    try:
        location = geolocator.geocode(address)
        # A missing match comes back as None rather than as an exception,
        # so it is handled explicitly instead of via an AttributeError.
        if location is None:
            return no_result
        return str(location.latitude) + ',' + str(location.longitude)
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt and SystemExit
        # still propagate so a long batch of lookups can be interrupted.
        return no_result

In [8]:
# Create a lookup address for the lat/lon lookup processing.
# A Neighborhood value may list several names separated by commas; only the
# text before the first comma is used for the lookup.
first_neighborhood = postal_code_df['Neighborhood'].str.split(',', n=1).str[0]
postal_code_df['lookup address'] = first_neighborhood + ', Toronto, Ontario'

# Look up the lat/lon for every address.
# geolookup() only needs the address string, so it is mapped over that one
# column instead of applying a lambda row-by-row over the whole frame.
# NOTE(review): this issues one geocoder request per row - confirm this stays
# within the Nominatim service's request-rate limits.
postal_code_df['location result'] = postal_code_df['lookup address'].map(geolookup)


In [9]:
# Show the rows whose geocoding came back as the '0.0,0.0' placeholder,
# i.e. lookup addresses that produced no lat/lon.
postal_code_df.loc[postal_code_df['location result'].eq('0.0,0.0')]

Unnamed: 0,PostalCode,Borough,Neighborhood,lookup address,location result
24,M6C,York,Humewood-Cedarvale,"Humewood-Cedarvale, Toronto, Ontario","0.0,0.0"
33,M6E,York,Caledonia-Fairbanks,"Caledonia-Fairbanks, Toronto, Ontario","0.0,0.0"
41,M5G,Downtown Toronto,Central Bay Street,"Central Bay Street, Toronto, Ontario","0.0,0.0"
87,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn","Del Ray, Toronto, Ontario","0.0,0.0"
115,M7R,Mississauga,Canada Post Gateway Processing Centre,"Canada Post Gateway Processing Centre, Toronto...","0.0,0.0"
149,M5W,Downtown Toronto,Stn A PO Boxes,"Stn A PO Boxes, Toronto, Ontario","0.0,0.0"
169,M7Y,East Toronto,"Business reply mail Processing Centre, South C...","Business reply mail Processing Centre, Toronto...","0.0,0.0"


In [10]:
# Test a candidate address with Nominatim to decide which manual overrides of
# the lookup address will geocode successfully.
# geolookup() uses the geolocator that was created when it was defined, so the
# geolocator is not rebuilt here.
address = 'East Toronto, Toronto, Ontario'

output = geolookup(address)
print(output)

43.6247901,-79.3934918


In [11]:
# Manually override the lookup address for the postal codes whose first
# neighborhood name did not geocode.
# The overrides are keyed on PostalCode rather than on row index labels: index
# labels can change from execution to execution, and assigning through a
# label that no longer exists would silently append a new, mostly empty row.
lookup_overrides = {
    'M6C': 'Humewood, Toronto, Ontario',
    'M6E': 'Caledonia, Toronto, Ontario',
    # NOTE(review): 'MG5' looks like a transposition of the postal code 'M5G';
    # confirm the intended string before changing it, since a different string
    # changes the geocoding result.
    'M5G': 'MG5, Toronto, Ontario',
    'M6M': 'Silverthorn, Toronto, Ontario',
    'M7R': 'Mississauga, Toronto, Ontario',
    'M5W': 'Downtown Toronto, Toronto, Ontario',
    'M7Y': 'East Toronto, Toronto, Ontario',
}

override_rows = postal_code_df['PostalCode'].isin(list(lookup_overrides))
postal_code_df.loc[override_rows, 'lookup address'] = (
    postal_code_df.loc[override_rows, 'PostalCode'].map(lookup_overrides)
)

# Lookup the lat/lon using Nominatim, but only for the overridden rows; the
# other rows already hold a result and do not need another request.
postal_code_df.loc[override_rows, 'location result'] = (
    postal_code_df.loc[override_rows, 'lookup address'].map(geolookup)
)

In [12]:
# Re-run the placeholder check: any row still holding '0.0,0.0' has no lat/lon.
# An empty result means every postal code is geocoded and we can move forward.
postal_code_df.loc[postal_code_df['location result'].eq('0.0,0.0')]

Unnamed: 0,PostalCode,Borough,Neighborhood,lookup address,location result


In [13]:
# Update the lat/lon columns.
# 'location result' holds text of the form 'lat,lon'; split it once at the
# comma into two string columns (0 = latitude, 1 = longitude).
lat_lon_parts = postal_code_df['location result'].str.split(',', n=1, expand=True)

# Store both halves as floats. The columns are assigned directly, so no
# placeholder initialisation is needed beforehand.
postal_code_df['latitude'] = lat_lon_parts[0].astype(float)
postal_code_df['longitude'] = lat_lon_parts[1].astype(float)

In [14]:
# Preview the first rows, now including the numeric latitude/longitude columns.
postal_code_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,lookup address,location result,latitude,longitude
3,M3A,North York,Parkwoods,"Parkwoods, Toronto, Ontario","43.7587999,-79.3201966",43.7588,-79.320197
4,M4A,North York,Victoria Village,"Victoria Village, Toronto, Ontario","43.732658,-79.3111892",43.732658,-79.311189
5,M5A,Downtown Toronto,"Regent Park, Harbourfront","Regent Park, Toronto, Ontario","43.6607056,-79.3604569",43.660706,-79.360457
6,M6A,North York,"Lawrence Manor, Lawrence Heights","Lawrence Manor, Toronto, Ontario","43.7220788,-79.4375067",43.722079,-79.437507
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government","Queen's Park, Toronto, Ontario","43.659659,-79.3903399",43.659659,-79.39034


## Step 3 - Visualize the postal codes

In [15]:
import folium # map rendering library

# Create a map of Toronto centred on fixed latitude and longitude values.
latitude = 43.6487
longitude = -79.38544
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add a title to the map by injecting a heading into the page HTML.
loc = 'Toronto Postal Codes'
title_html = '''
             <h3 align="center" style="font-size:16px"><b>{}</b></h3>
             '''.format(loc)

map_toronto.get_root().html.add_child(folium.Element(title_html))

# Add one circle marker per postal code; its popup shows "neighborhood, borough".
marker_columns = zip(
    postal_code_df['latitude'],
    postal_code_df['longitude'],
    postal_code_df['Borough'],
    postal_code_df['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_columns:
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is an option of folium.Popup; CircleMarker has no such
    # parameter, so it is passed to the popup only.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto