<a href="https://colab.research.google.com/github/heyl-steve/Coursera_Capstone/blob/main/Capstone_Week_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Heyl - Capstone - Week 3 - Clustering Neighborhoods in Toronto

Get neighborhoods list from: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M


In [123]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


<h1>Step 1: Get neighborhood data</h1>

In [124]:
# code to scrape table and get it into a dataframe

# based on https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059
import lxml.html as lh

# find table on the page
# note - new version of table is organized very differently, use the old version
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

# check table rows length - sanity check that we have the table
[len(T) for T in tr_elements[:12]]

# get column names from table (we will change these in the dataframe)
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    # remove backslash N
    col.append((name.replace('\n',''),[]))


#Since out first row is the header, data is stored on the second row onwards
# last four rows don't contain postal code info; discovered this by trial and error
for j in range(1,len(tr_elements)-4):
    #T is our j'th row
    T=tr_elements[j]
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        # remove backslash n
        col[i][1].append(data.replace('\n',''))
        #Increment i for the next column
        i+=1

Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [125]:
# verify that we got the entire table
df.tail()


Unnamed: 0,Postal Code,Borough,Neighbourhood
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."
179,M9Z,Not assigned,Not assigned


In [126]:
# check for duplicates
duplicates = df[df['Postal Code'].duplicated()]
# first row is headers
print("number of duplicate records %2d" %len(duplicates))

number of duplicate records  0


In [127]:
print('intial df rows = %2d' %len(df))
# remove records where Borough = "Not assigned"
df_toronto_nb = df[df['Borough'] != 'Not assigned']

print('toronto neighborhoods df rows = %2d' %len(df_toronto_nb))

df_toronto_nb.head()

intial df rows = 180
toronto neighborhoods df rows = 103


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [128]:
# do we have any records with Neighborhood = "Not assigned"?
df_unassigned_toronto_neighborhoods = df_toronto_nb[df_toronto_nb['Neighbourhood'] == 'Not assigned']

print('unassigned toronto neighbhoroods = %2d' %len(df_unassigned_toronto_neighborhoods))

unassigned toronto neighbhoroods =  0


<h1>Step 1: Get neighborhood data - Completed</h1>
<h3>Dataframe 'df_toronto_nb' contains the postal code and neighborhood data</h3>

<h1>Step 2: Add latitude and longitude</h1>

In [129]:
#get latitude & longitude from csv
!wget -q -O 'longandlat_data.csv' https://cocl.us/Geospatial_data
df_toronto_long_lat=pd.read_csv('longandlat_data.csv')
print(df_toronto_long_lat.shape)
df_toronto_long_lat.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [130]:
# join neighborhood and latitude-longitude data using Postal Code
df_toronto_neighborhoods = pd.merge(df_toronto_nb, df_toronto_long_lat, on=['Postal Code'], how='inner')
print(df_toronto_neighborhoods.shape)
df_toronto_neighborhoods.head()

(103, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


<h1>Step 2: Add latitude and longitude - Completed</h1>
<h3>Dataframe 'df_toronto_neighborhoods' contains the postal code, neighborhood, and location data</h3>

<H1>Step 3: Explore Neighborhoods</H1>

In [132]:
# create map of Toronto using latitude and longitude values - see M5A above 
latitude = 43.65
longitude = -79.38
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto_neighborhoods['Latitude'], df_toronto_neighborhoods['Longitude'], df_toronto_neighborhoods['Borough'], df_toronto_neighborhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

    
map_toronto

Use FourSquare to find gay bars

In [133]:
#define FourSquare info
CLIENT_ID = 'OYH3I15E0DNFHKA003BYXMSTBHDAEVAK0N14YMI4AIHPC55K' # your Foursquare ID
CLIENT_SECRET = 'QGVECOVC25B5TBBJW0RCKOUZUMNXHNOMVEHGMPYKM1YGGJUZ' # your Foursquare Secret
ACCESS_TOKEN = '' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: OYH3I15E0DNFHKA003BYXMSTBHDAEVAK0N14YMI4AIHPC55K
CLIENT_SECRET:QGVECOVC25B5TBBJW0RCKOUZUMNXHNOMVEHGMPYKM1YGGJUZ


In [134]:
search_query = 'Gay Bar'
radius = 1500
#url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, search_query, radius, LIMIT)
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&query={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, search_query)
url

'https://api.foursquare.com/v2/venues/search?client_id=OYH3I15E0DNFHKA003BYXMSTBHDAEVAK0N14YMI4AIHPC55K&client_secret=QGVECOVC25B5TBBJW0RCKOUZUMNXHNOMVEHGMPYKM1YGGJUZ&ll=43.65,-79.38&oauth_token=&v=20180604&query=Gay Bar'

In [135]:
results = requests.get(url).json()
# assign relavent part of JSON to venues
venues = results['response']['venues']

#put venues in a dataframe    
df_bars = json_normalize(venues) 
print(df_bars.shape)
df_bars.head()



(30, 19)


  


Unnamed: 0,id,name,categories,referralId,hasPerk,location.address,location.crossStreet,location.lat,location.lng,location.labeledLatLngs,location.distance,location.cc,location.city,location.state,location.country,location.formattedAddress,location.postalCode,venuePage.id,location.neighborhood
0,4ee4278e9adf3982ff455f88,Miro's Gay Bar,"[{'id': '4bf58dd8d48988d1d8941735', 'name': 'G...",v-1615866741,False,69 Church St,Queen St,43.652755,-79.387072,"[{'label': 'display', 'lat': 43.65275543496965...",646,CA,Toronto,ON,Canada,"[69 Church St (Queen St), Toronto ON, Canada]",,,
1,5a7bc713c3658814224ad10c,Bar Adelaide,"[{'id': '4bf58dd8d48988d1d5941735', 'name': 'H...",v-1615866741,False,325 Bay Street,,43.6498,-79.38016,"[{'label': 'display', 'lat': 43.6498, 'lng': -...",25,CA,Toronto,ON,Canada,"[325 Bay Street, Toronto ON M5H, Canada]",M5H,,
2,4ad69511f964a520e40721e3,The Keg Steakhouse + Bar - York Street,"[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R...",v-1615866741,False,165 York St,btwn Richmond St. & Adelaide St.,43.649987,-79.384103,"[{'label': 'display', 'lat': 43.64998659318569...",330,CA,Toronto,ON,Canada,[165 York St (btwn Richmond St. & Adelaide St....,M5H 3R8,1359966175.0,
3,4ae73054f964a5203ca921e3,Ki Modern Japanese + Bar,"[{'id': '4bf58dd8d48988d111941735', 'name': 'J...",v-1615866741,False,181 Bay St,at Wellington St. W,43.647223,-79.379374,"[{'label': 'display', 'lat': 43.647223, 'lng':...",313,CA,Toronto,ON,Canada,"[181 Bay St (at Wellington St. W), Toronto ON ...",M5J 2T3,,
4,4ba192cef964a52079c137e3,Consort Bar,"[{'id': '4bf58dd8d48988d1d5941735', 'name': 'H...",v-1615866741,False,37 King St East,Le Royal Meridien King Edward Hotel,43.649566,-79.376359,"[{'label': 'display', 'lat': 43.64956578342230...",297,CA,Toronto,ON,Canada,[37 King St East (Le Royal Meridien King Edwar...,M5C 1E9,500209987.0,


In [136]:
# create map of Toronto using latitude and longitude values 
latitude = 43.65
longitude = -79.38
map_toronto_lgbtq = folium.Map(location=[latitude, longitude], zoom_start=14)

# add markers to map
for lat, lng, bar_name, bar_addr in zip(df_bars['location.lat'], df_bars['location.lng'], df_bars['name'], df_bars['location.formattedAddress']):
    label = '{}, {}'.format(bar_name, bar_addr)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_lgbtq)  

    
map_toronto_lgbtq

<H1>Step 3: Explore Neighborhoods - Completed</H1>