This is the notebook for the capstone project. It explores the neighborhoods of Toronto.

Import the necessary packages:

In [1]:
import os
import urllib.request

import folium
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# transforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

%matplotlib inline

We will be exploring the gyms in the city of Toronto.

In [2]:
# Reference point for the whole analysis: the coordinates used as the
# centre of the venue search and of the maps below.
latitude, longitude = 43.653908, -79.384293
print(
    'The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude)
)

The geograpical coordinate of Toronto are 43.653908, -79.384293.


We will look for gyms within a radius of 1 km (1,000 m) of the Toronto city center.

In [3]:
# Foursquare credentials are read from the environment so that secrets are
# never stored in (or committed with) the notebook. Export
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be regenerated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')  # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this cell.'
    )

VERSION = '20180604'
LIMIT = 50  # the venues/search endpoint returns at most 50 results per request
search_query = 'Gym'
radius = 1000  # search radius in metres (1000 m = 1 km)

# Let requests build and URL-encode the query string instead of formatting it
# by hand; this also keeps the secret out of the `url` variable.
url = 'https://api.foursquare.com/v2/venues/search'
params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'll': '{},{}'.format(latitude, longitude),
    'v': VERSION,
    'query': search_query,
    'radius': radius,
    'limit': LIMIT,
}
response = requests.get(url, params=params, timeout=30)
# Fail loudly on HTTP errors (bad credentials, quota exceeded, ...) rather
# than hitting a confusing KeyError on the JSON payload below.
response.raise_for_status()
results = response.json()

# assign relevant part of JSON to venues
venues = results['response']['venues']

In [4]:
# Flatten the list of venue records into a DataFrame (one row per venue;
# nested fields become dotted column names such as 'location.lat'),
# then preview the first five rows.
df = json_normalize(data=venues)
df.head(n=5)

Unnamed: 0,categories,hasPerk,id,location.address,location.cc,location.city,location.country,location.crossStreet,location.distance,location.formattedAddress,location.labeledLatLngs,location.lat,location.lng,location.postalCode,location.state,name,referralId
0,"[{'id': '4bf58dd8d48988d176941735', 'name': 'G...",False,5133f680e4b02e871367c60c,,CA,,Canada,,598,[Canada],"[{'label': 'display', 'lat': 43.64877382613179...",43.648774,-79.386517,,,The Gym at the Shangri-La,v-1555353442
1,"[{'id': '4bf58dd8d48988d175941735', 'name': 'G...",False,4cd044c29d87224bf129543b,,CA,,Canada,,219,[Canada],"[{'label': 'display', 'lat': 43.653571, 'lng':...",43.653571,-79.386979,,,University Centre Gym,v-1555353442
2,"[{'id': '4bf58dd8d48988d176941735', 'name': 'G...",False,4f61e092e4b0d7325fb1f8a3,145 Richmond Street W,CA,Toronto,Canada,at University Ave.,450,"[145 Richmond Street W (at University Ave.), T...","[{'label': 'display', 'lat': 43.64999445230568...",43.649994,-79.38573,,ON,Hilton Gym,v-1555353442
3,"[{'id': '4bf58dd8d48988d175941735', 'name': 'G...",False,506a1889e4b05fc962888176,Hilton Garden Inn,CA,Toronto,Canada,,917,"[Hilton Garden Inn, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65739316527909...",43.657393,-79.373963,,ON,Gym,v-1555353442
4,"[{'id': '4f4528bc4b90abdf24c9de85', 'name': 'A...",False,514898c3e4b0f2687d7c083a,CBC,CA,Toronto,Canada,,1006,"[CBC, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.64512029038736...",43.64512,-79.387259,,ON,Gym,v-1555353442


In [5]:
# (number of venues, number of columns) in the raw venue table
len(df.index), len(df.columns)

(50, 17)

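The search returned 50 venues. Note that 50 is also the maximum number of results the Foursquare search endpoint returns per request, so there may be more gyms in this area than are listed here.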

Clean the dataframe:

In [6]:
# Keep only the venue name and its coordinates, under friendlier column names.
# The explicit .copy() makes the result an independent DataFrame, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
df = (
    df.rename(columns={'location.lat': 'Latitude',
                       'location.lng': 'Longitude',
                       'name': 'Name'})
      .loc[:, ['Name', 'Latitude', 'Longitude']]
      .copy()
)
df.head()

Unnamed: 0,Name,Latitude,Longitude
0,The Gym at the Shangri-La,43.648774,-79.386517
1,University Centre Gym,43.653571,-79.386979
2,Hilton Gym,43.649994,-79.38573
3,Gym,43.657393,-79.373963
4,Gym,43.64512,-79.387259


Visualise the gyms on the map of Toronto

In [7]:
# Base map centred on the Toronto reference point. The map object is named
# `gym_map` rather than `map` so that Python's built-in map() is not shadowed
# for the rest of the kernel session.
gym_map = folium.Map(location=[latitude, longitude], zoom_start=13.8)

# One circle marker per gym; clicking a marker shows the venue name.
for lat, lon, poi in zip(df['Latitude'], df['Longitude'], df['Name']):
    label = folium.Popup(str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7,
    ).add_to(gym_map)

# Last expression of the cell: renders the interactive map.
gym_map

Density-based clustering of the gyms using DBSCAN:

In [8]:
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler

# The former call `sklearn.utils.check_random_state(1000)` was dropped: it only
# returns a RandomState object, and because the result was discarded it seeded
# nothing. DBSCAN takes no random_state argument in any case.

# Standardise both coordinates to zero mean / unit variance before clustering.
# Consequently `eps` below is expressed in standard deviations of the
# coordinates, not in metres or degrees.
Clus_dataSet = df[['Latitude', 'Longitude']]
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)

# Compute DBSCAN
db = DBSCAN(eps=.5, min_samples=5).fit(Clus_dataSet)

# Boolean mask flagging the core samples of the fitted model.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Cluster label per gym; DBSCAN uses -1 for noise points.
labels = db.labels_
# assign() returns a new frame instead of writing a column into a possibly
# sliced one, which avoids SettingWithCopyWarning.
df = df.assign(Clus_Db=labels)

# Number of real clusters (noise label excluded) and of distinct labels.
realClusterNum = len(set(labels)) - (1 if -1 in labels else 0)
clusterNum = len(set(labels))

# A sample of clusters
df.head(10)

Unnamed: 0,Name,Latitude,Longitude,Clus_Db
0,The Gym at the Shangri-La,43.648774,-79.386517,0
1,University Centre Gym,43.653571,-79.386979,-1
2,Hilton Gym,43.649994,-79.38573,0
3,Gym,43.657393,-79.373963,2
4,Gym,43.64512,-79.387259,-1
5,1 King West Gym,43.649176,-79.378005,-1
6,The Gym,43.645418,-79.387059,-1
7,Neill-Wycik (Gym),43.660581,-79.37756,-1
8,The Gym @ Motion,43.655452,-79.384114,-1
9,Eaton Chelsea Gym,43.658326,-79.383075,1


In [9]:
# Distinct cluster labels, in order of first appearance
pd.unique(df['Clus_Db'])

array([ 0, -1,  2,  1])

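DBSCAN identified 3 clusters (labels 0, 1 and 2); the label -1 marks points that were classified as noise (outliers) and is not a cluster.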

Visualize the clusters:

In [10]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [11]:
# Distinct labels actually present in the data (-1 is the DBSCAN noise label).
# Deriving them here keeps the colour table in sync with the clustering result
# instead of relying on a hard-coded count.
cluster_labels = sorted(df['Clus_Db'].unique())
kclusters = len(cluster_labels)

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=14)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# label, looked up by the label itself rather than by list position (the old
# `rainbow[cluster-1]` only worked through negative-index wraparound).
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
color_by_label = dict(zip(cluster_labels, rainbow))

# add markers to the map
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Name'], df['Clus_Db']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    marker_color = color_by_label[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Last expression of the cell: renders the interactive map.
map_clusters

Calculate the centerpoints of the 3 clusters (outliers are excluded)

In [12]:
# Drop the DBSCAN noise points (label -1); `df` keeps only clustered gyms
# from here on.
df = df[df['Clus_Db'] != -1]

# Average the coordinates per cluster. The numeric columns are selected
# explicitly because 'Name' is text: recent pandas versions raise a TypeError
# when mean() meets a non-numeric column instead of silently dropping it.
df.groupby('Clus_Db')[['Latitude', 'Longitude']].mean()

Unnamed: 0_level_0,Latitude,Longitude
Clus_Db,Unnamed: 1_level_1,Unnamed: 2_level_1
0,43.649297,-79.386696
1,43.660967,-79.383353
2,43.655706,-79.376297


Therefore the recommendation is to look for locations close to these 3 centerpoints.