# Segmenting and Clustering Neighborhoods in Toronto

## Week 3 Portion

### 1. Get data and clean

In [107]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#!conda install -c conda-forge geocoder --yes
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2020.4.5.2 |       hecda079_0         147 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    certifi-2020.4.5.2         |   py36h9f0ad1d_0         152 KB  conda-forge
    ------------------------------------------------------------
                       

In [108]:
import numpy as np
import pandas as pd
import json # library to handle JSON files

import requests
from bs4 import BeautifulSoup

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
#import geocoder # import geocoder

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

In [19]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

results = pd.read_html(url)
df = results[0]
df_toronto = df[df['Neighborhood']!="Not assigned"].reset_index(drop=True)
print(df_toronto.shape)

(103, 3)


### 2. Get latitude and longitude

In [None]:
# Try obtaining coordinates using geocoder-Google
#latitude = np.zeros(df_toronto.shape[0])
#longitude = np.zeros(df_toronto.shape[0])
#for i, postal_code in enumerate(df_toronto['Postal Code']):
#    print(i, postal_code)
#    # initialize your variable to None
#    lat_lng_coords = None

#    # loop until you get the coordinates
#    while(lat_lng_coords is None):
#        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#        print(g)
#        lat_lng_coords = g.latlng

#    latitude[i] = lat_lng_coords[0]
#    longitude[i] = lat_lng_coords[1]

In [20]:
# Obtain coordinates from the csv file
!wget -O Geospatial_Coordinates.csv http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv
df_geo = pd.read_csv("Geospatial_Coordinates.csv")
df_toronto.sort_values(by=['Postal Code'],inplace=True)
df_geo.sort_values(by=['Postal Code'],inplace=True)
df_toronto['Latitude'] = df_geo['Latitude']
df_toronto['Longitude'] = df_geo['Longitude']
df_toronto.reset_index(inplace=True)
df_toronto.drop(columns=['index'],inplace=True)
df_toronto

--2020-06-17 20:46:31--  http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv
Resolving cocl.us (cocl.us)... 158.85.108.86, 169.48.113.194, 158.85.108.83
Connecting to cocl.us (cocl.us)|158.85.108.86|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv [following]
--2020-06-17 20:46:32--  https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv
Connecting to cocl.us (cocl.us)|158.85.108.86|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-06-17 20:46:32--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.29.197
Connecting to ibm.box.com (ibm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjj

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.727929,-79.262029
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.794200,-79.262029
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.778517,-79.346556
3,M1G,Scarborough,Woburn,43.770120,-79.408493
4,M1H,Scarborough,Cedarbrae,43.745906,-79.352188
5,M1J,Scarborough,Scarborough Village,43.728496,-79.495697
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.709060,-79.363452
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.728020,-79.388790
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.667967,-79.367675
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.650571,-79.384568


## 3. Cluster postal codes into Boroughs

##### Here, let's try something very simple - applying K-means to Borough encoding. It is just to check if K-means indeed successfully clusters by Boroughs. Also, it would be nice to visualize where these Boroughs are within Toronto.

In [21]:
nborough = len(df_toronto['Borough'].unique())
print('There are {} Boroughs.'.format(nborough))

# do one hot encoding on Boroughs
df_onehot = pd.get_dummies(df_toronto[['Borough']], prefix="", prefix_sep="")

# add the Postal Code column back to dataframe
df_onehot['Postal Code'] = df_toronto['Postal Code'] 
df_onehot.sort_index

# move the Postal Code column to the first column
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
df_onehot = df_onehot[fixed_columns]
#df_grouped = df_onehot.groupby('Postal Code').mean().reset_index()
#df_grouped.head()

There are 10 Boroughs.


In [22]:
# set number of clusters
kclusters = len(df_toronto['Borough'].unique()) # cluster into Boroughs

df_cluster = df_onehot.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

# add clustering labels
df_cluster.insert(0, 'Cluster Label', kmeans.labels_)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
#manhattan_merged = manhattan_data
#manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
#manhattan_merged.head() # check the last columns!

In [5]:
# create map
map_clusters = folium.Map(location=[df_toronto['Latitude'].mean(), df_toronto['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_cluster['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

NameError: name 'folium' is not defined

### 4. Learn Borough classification model using coordinates

In [106]:
# X = feature matrix, y = response data                                                                                                                                                                          
X = np.asarray(df_toronto[['Latitude','Longitude']])
y = np.asarray(df_toronto['Borough'].astype('category').cat.codes)

# normalize dataset                                                                                                                                                                                              
from sklearn import preprocessing
X = preprocessing.MinMaxScaler().fit(X).transform(X)
#StandardScaler().fit(X).transform(X)

# train-test split                                                                                                                                                                                               
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4 )
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

# Logistic Regression                                                                                                                                                                                           
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
LR = LogisticRegression(solver='newton-cg', max_iter=100, multi_class='multinomial', penalty='l2', C=0.1).fit(X_train,y_train)
yhat = LR.predict(X_test)
yhat_prob = LR.predict_proba(X_test)
print(y_test == yhat)
print(np.sum(y_test == yhat)/len(y_test == yhat))

# Support Vector Machine                                                                                                                                                                                           
from sklearn import svm
clf = svm.SVC(kernel='sigmoid',gamma='auto', C=1)
clf.fit(X_train, y_train)                                                                                                                                                                                        
yhat = clf.predict(X_test)
print(y_test == yhat)
print(np.sum(y_test == yhat)/len(y_test == yhat))

# run k-means clustering
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
# check cluster labels generated for each row in the dataframe
kmeans.labels_

Train set: (82, 2) (82,)
Test set: (21, 2) (21,)
[False False False  True False False False False  True False  True False
 False False False  True False False False False  True]
0.23809523809523808
[False False False  True False False False False  True False  True False
 False False False  True False False False False  True]
0.23809523809523808


array([4, 6, 6, 1, 4, 7, 5, 5, 0, 0, 0, 5, 0, 3, 0, 3, 9, 4, 7, 8, 5, 0,
       0, 0, 5, 2, 4, 6, 1, 4, 8, 5, 0, 0, 2, 4, 6, 1, 1, 1, 8, 8, 5, 0,
       0, 0, 5, 7, 3, 9, 3, 7, 7, 2, 8, 6, 1, 1, 7, 8, 8, 5, 0, 5, 0, 5,
       3, 3, 8, 9, 7, 2, 4, 2, 1, 4, 7, 8, 0, 5, 0, 5, 0, 0, 3, 2, 3, 3,
       9, 3, 7, 7, 7, 4, 4, 6, 0, 0, 5, 0, 0, 9, 9], dtype=int32)

In [None]:
# create map
map_clusters = folium.Map(location=[df_toronto['Latitude'].mean(), df_toronto['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### 5. Use Foursquare to scrape venue data

In [23]:
CLIENT_ID = 'M0AMS10V3MXAFMKQGK3JY3TE0SDODCNBRLBVDAUQCJPTVS5X' # your Foursquare ID
CLIENT_SECRET = 'C1PYOJOMKZEDOJN143T3IVFRH5QNMYAK5QYMWBK2BDQPYD5G' # your Foursquare Secret
VERSION = '20180604'
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: M0AMS10V3MXAFMKQGK3JY3TE0SDODCNBRLBVDAUQCJPTVS5X
CLIENT_SECRET:C1PYOJOMKZEDOJN143T3IVFRH5QNMYAK5QYMWBK2BDQPYD5G


In [46]:
search_query = 'coffee'
radius = 5000 # in meters
LIMIT = 1000  # Maximum number of results to obtain
print('Querying for',search_query)

url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, df_toronto['Latitude'].mean(), df_toronto['Longitude'].mean(), VERSION, search_query, radius, LIMIT)
results = requests.get(url).json()

# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
df_venues = json_normalize(venues)
df_venues = df_venues[df_venues['location.city']=='Toronto']
print(len(df_venues),'venues obtained')
df_venues.head(10)

Querying for coffee
48 venues obtained


Unnamed: 0,categories,hasPerk,id,location.address,location.cc,location.city,location.country,location.crossStreet,location.distance,location.formattedAddress,location.labeledLatLngs,location.lat,location.lng,location.neighborhood,location.postalCode,location.state,name,referralId
0,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4ae33eb5f964a5203a9221e3,700 mt pleasant ave,CA,Toronto,Canada,,646,"[700 mt pleasant ave, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.70602710190901...",43.706027,-79.389369,,,ON,Timothy's World Coffee,v-1592429354
3,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,5aff3c8b89e490002ca035e3,1860 Bayview Ave,CA,Toronto,Canada,,1933,"[1860 Bayview Ave, Toronto ON M4G 0C3, Canada]","[{'label': 'display', 'lat': 43.71489, 'lng': ...",43.71489,-79.37778,,M4G 0C3,ON,WFM Coffee Bar,v-1592429354
4,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4aeed74cf964a5205dd421e3,1 Pleasant Blvd,CA,Toronto,Canada,at Yonge St.,1972,"[1 Pleasant Blvd (at Yonge St.), Toronto ON, C...","[{'label': 'display', 'lat': 43.687086, 'lng':...",43.687086,-79.393541,,,ON,Timothy's World Coffee,v-1592429354
5,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4c8633a8d92ea09359ed6b72,2300 Yonge Street,CA,Toronto,Canada,,268,"[2300 Yonge Street, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.70671799326533...",43.706718,-79.398766,,,ON,Timothy's World Coffee,v-1592429354
6,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4bec3c4662c0c928c61ae3d4,90 Eglinton Ave E,CA,Toronto,Canada,btwn. Yonge St. and Redpath Ave.,323,[90 Eglinton Ave E (btwn. Yonge St. and Redpat...,"[{'label': 'display', 'lat': 43.70733914888304...",43.707339,-79.39577,,M4P 1A6,ON,Timothy's World Coffee,v-1592429354
7,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4c5783a28fe2ef3b466b303a,233 Eglinton Ave West,CA,Toronto,Canada,Oriole Parkway and Eglinton,709,[233 Eglinton Ave West (Oriole Parkway and Egl...,"[{'label': 'display', 'lat': 43.703966, 'lng':...",43.703966,-79.40592,,,ON,Coffee Time,v-1592429354
8,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4ad79243f964a5204c0c21e3,519 Parliament St.,CA,Toronto,Canada,btwn Carlton & Winchester,4953,[519 Parliament St. (btwn Carlton & Winchester...,"[{'label': 'display', 'lat': 43.66529519392083...",43.665295,-79.368335,,M4X 1P3,ON,Jetfuel Coffee,v-1592429354
9,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",False,4ff75b7ee4b09036684f84cb,2011 Yonge St.,CA,Toronto,Canada,,399,"[2011 Yonge St., Toronto ON, Canada]","[{'label': 'display', 'lat': 43.70102127431561...",43.701021,-79.397027,,,ON,Rachel's Coffee House,v-1592429354
10,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4ea88fc7f5b9df9610511b8d,1471 Eglinton Avenue W.,CA,Toronto,Canada,at Winona Dr.,3459,"[1471 Eglinton Avenue W. (at Winona Dr.), Toro...","[{'label': 'display', 'lat': 43.69185299317205...",43.691853,-79.436344,,M6E 2G6,ON,Coffee Time Donuts,v-1592429354
11,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4c3b41985810a593cc55ba3c,10 Scrivener Square,CA,Toronto,Canada,,2708,"[10 Scrivener Square, Toronto ON, Canada]","[{'label': 'display', 'lat': 43.68070416529024...",43.680704,-79.390905,,,ON,Timothy's World Coffee,v-1592429354


In [29]:
# create map
map_clusters = folium.Map(location=[df_toronto['Latitude'].mean(), df_toronto['Longitude'].mean()], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, name in zip(df_venues['location.lat'], df_venues['location.lng'], df_venues['name']):
    label = folium.Popup(str(name), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label).add_to(map_clusters)
       
map_clusters

NameError: name 'folium' is not defined

In [45]:
for venue_id, lat, lon, name in zip(df_venues['id'],df_venues['location.lat'],df_venues['location.lng'],df_venues['name']):
    url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(venue_id, CLIENT_ID, CLIENT_SECRET, VERSION)

    result = requests.get(url).json()
    print(result)
    try:
        print(name, result['response']['venue']['rating'])
    except:
        print(name, '- This venue has not been rated yet.')

{'meta': {'code': 429, 'errorType': 'quota_exceeded', 'errorDetail': 'Quota exceeded', 'requestId': '5eea8339b1cac0001be422d4'}, 'response': {}}
Timothy's World Coffee - This venue has not been rated yet.
{'meta': {'code': 429, 'errorType': 'quota_exceeded', 'errorDetail': 'Quota exceeded', 'requestId': '5eea82f8fb34b5001b01e415'}, 'response': {}}
WFM Coffee Bar - This venue has not been rated yet.
{'meta': {'code': 429, 'errorType': 'quota_exceeded', 'errorDetail': 'Quota exceeded', 'requestId': '5eea828fa2e538001be57472'}, 'response': {}}
Timothy's World Coffee - This venue has not been rated yet.
{'meta': {'code': 429, 'errorType': 'quota_exceeded', 'errorDetail': 'Quota exceeded', 'requestId': '5eea8390542890001bd83ae4'}, 'response': {}}
Timothy's World Coffee - This venue has not been rated yet.
{'meta': {'code': 429, 'errorType': 'quota_exceeded', 'errorDetail': 'Quota exceeded', 'requestId': '5eea838e216785001b03097f'}, 'response': {}}
Timothy's World Coffee - This venue has not