Code to segment and cluster Toronto neighborhoods based on data scraped from the web

In [258]:
from getpass import getpass

import pandas as pd
import numpy as np
import requests
#import json
#from pandas.io.json import json_normalize

Scrape Toronto neighborhood data from Wikipedia

In [259]:
# Wikipedia page listing the Toronto ("M") postal codes with their boroughs and neighbourhoods
wiki_url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_postal_codes_of_Canada:_M"
)

In [260]:
# Parse the HTML tables found at wiki_url; despite the name, df_raw is indexed like a list below
df_raw = pd.read_html(io=wiki_url)

In [261]:
# Take the first parsed table as the neighbourhood data
# (assumption: the postal-code table is the first table on the page)
neighborhoods = pd.DataFrame(data=df_raw[0])

In [262]:
# Keep only postal codes whose borough is assigned, then renumber the rows from 0
neighborhoods = (
    neighborhoods
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)

In [263]:
# Sanity check: (row count, column count) once unassigned postal codes are dropped
neighborhoods.shape

(103, 3)

Get neighborhood coordinates from Google

In [264]:
!pip install geocoder
import geocoder # import geocoder



In [265]:
# Prompt for the Google API key without echoing it: input() would leave the
# secret readable in the notebook's saved cell output.
API_key = getpass("Google API key: ")

# Number of times a postal code is looked up before the cell gives up on it
MAX_GEOCODE_ATTEMPTS = 3

latitude = []
longitude = []

# Look up coordinates for each postal code in row order, so both lists line up
# with the rows of `neighborhoods`
for postal_code in neighborhoods['Postal Code']:
    query = '{}, Toronto, Ontario'.format(postal_code)
    lat_lng_coords = None
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google(query, key=API_key)
        lat_lng_coords = g.latlng
        # NOTE(review): g.latlng is expected to be falsy (None or empty) when
        # the lookup fails - confirm against the installed geocoder version
        if lat_lng_coords:
            break
    if not lat_lng_coords:
        # Name the offending postal code instead of failing on an opaque
        # "object is not subscriptable" TypeError at lat_lng_coords[0]
        raise RuntimeError(
            'Could not geocode postal code {} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS))
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])
 

Google API key: [REDACTED]


In [266]:
# Append the geocoded coordinates as two new columns (one value per row, in row order)
neighborhoods = neighborhoods.assign(Latitude=latitude, Longitude=longitude)

In [267]:
# Preview the first five rows, now including Latitude and Longitude
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Import libraries to cluster Toronto neighborhoods and display the results on a map

In [268]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



Define Foursquare Credentials and Version

In [269]:
# Read the Foursquare credentials without echoing them: input() would leave
# both values readable in the notebook's saved cell output.
CLIENT_ID = getpass('Foursquare client ID: ') # your Foursquare ID
CLIENT_SECRET = getpass('Foursquare client secret: ') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

Foursquare client ID: [REDACTED]
Foursquare client secret: [REDACTED]


Define function that gets the top 100 venues within 500 meters of a given neighborhood

In [270]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect Foursquare "explore" venues around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood label and coordinates; consumed in parallel with zip().
    radius : int, default 500
        Search radius passed to the API as the ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        A neighbourhood with no venues contributes a single placeholder row
        ("No venues nearby" / "None") located at its own coordinates.
        Empty input yields an empty frame that still has these columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the API does not answer within the timeout.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # Let requests build and encode the query string instead of formatting
        # the credentials into the URL by hand
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # never hang the notebook on a stalled connection
        )
        # Surface API errors here rather than as a KeyError on the payload below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        if len(results) == 0:
            # if no venues were found within the radius specified
            rows.append((name, lat, lng, "No venues nearby", lat, lng, "None"))
            continue

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # A venue can come back without any category; label it instead of
            # failing on categories[0]
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else 'Uncategorized'
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the columns to the constructor keeps the schema even with no rows
    return pd.DataFrame(rows, columns=columns)

Create new data frame that includes venue data for each neighborhood in Toronto

In [271]:
# Collect the venues around every neighbourhood's geocoded coordinates
Toronto_venues = getNearbyVenues(
    neighborhoods['Neighbourhood'],
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
)

Group neighborhoods by venue category

In [272]:
# One-hot encode the venue categories; the empty prefix keeps the bare category names as column labels
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the neighbourhood label (it is appended as the last column) ...
Toronto_onehot['Neighbourhood'] = Toronto_venues['Neighbourhood']

# ... and rotate that last column to the front
*category_columns, neighbourhood_column = Toronto_onehot.columns
Toronto_onehot = Toronto_onehot[[neighbourhood_column, *category_columns]]

# Average the indicators per neighbourhood: each value becomes the share of
# that neighbourhood's venues falling in the category
Toronto_grouped = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()

Define a function that sorts venues in descending order, and then create a dataframe that shows the top 10 venues for each neighborhood

In [273]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of `row` is skipped (callers pass rows whose first field
    is the neighbourhood name); the remaining entries are ordered by value,
    largest first, and the index labels of the first `num_top_venues` of them
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [274]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:` used as control flow,
    # which would also have hidden unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Rank the categories of every neighbourhood, then build the frame in one go
# rather than filling it row by row
top_venues = [
    return_most_common_venues(Toronto_grouped.iloc[row_pos, :], num_top_venues)
    for row_pos in range(Toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Neighbourhood', Toronto_grouped['Neighbourhood'])

Clean the data set where a neighbourhood has fewer than the num_top_venues number of venues

In [275]:
# Work on a copy, and blank out every "top venue" entry whose category has a
# frequency of zero for that neighbourhood (i.e. it only filled a slot)
neighborhoods_venues_sorted_clean = neighborhoods_venues_sorted.copy()
row_count = neighborhoods_venues_sorted_clean.shape[0]
for row_pos in range(row_count):
    # column 0 holds the neighbourhood name, the ranked venues start at column 1
    for col_pos in range(1, num_top_venues + 1):
        category = neighborhoods_venues_sorted_clean.iloc[row_pos, col_pos]
        frequency = Toronto_grouped.loc[row_pos, category]
        if frequency == 0.0:
            neighborhoods_venues_sorted_clean.iloc[row_pos, col_pos] = "None"

Cluster the neighborhoods into 5 clusters and then create new dataframe that includes the cluster and top 10 venues per neighborhood

In [279]:
# set number of clusters
kclusters = 5

# Feature matrix: the per-neighbourhood category frequencies without the label
# column. `columns=` replaces the positional axis argument, which pandas 2.0
# no longer accepts.
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is pinned to 10 so the result does not depend
# on which default the installed scikit-learn version uses
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(Toronto_grouped_clustering)

# add clustering labels; drop a column left over from an earlier run first so
# that re-running this cell does not fail with "already exists"
if 'Cluster Labels' in neighborhoods_venues_sorted_clean.columns:
    neighborhoods_venues_sorted_clean = neighborhoods_venues_sorted_clean.drop(columns='Cluster Labels')
neighborhoods_venues_sorted_clean.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and top venues onto the neighbourhood table, which
# carries the latitude/longitude of each neighbourhood
Toronto_merged = neighborhoods.join(
    neighborhoods_venues_sorted_clean.set_index('Neighbourhood'),
    on='Neighbourhood')


Visualize resulting clusters

In [280]:
#Set coordinates for neighborhood that centers map display
address = 'Leaside, East York' #set Toronto coordinates to neighborhood that centers map display

g = geocoder.google(address,key=API_key)
location = g.latlng
if not location:
    # The lookup returned nothing usable: centre the map on the mean of the
    # neighbourhood coordinates instead of failing on location[0]
    location = [neighborhoods['Latitude'].mean(), neighborhoods['Longitude'].mean()]
latitude_TO = location[0]
longitude_TO = location[1]

In [281]:
# create map
map_Toronto = folium.Map(location=[latitude_TO, longitude_TO], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Cluster Labels']):
    cluster_int = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster_int), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly; the
    # previous `cluster_int-1` made cluster 0 wrap around to the last colour
    marker_color = rainbow[cluster_int]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_Toronto)

map_Toronto