# Week 3 - Segmenting and Clustering Neighborhoods in Toronto

Irene Bonati

### Import the necessary packages

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import geocoder
from geopy.geocoders import Nominatim # Converts an address into latitude and longitude values
import folium
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## Part 1

### Scrape Wikipedia website containing the list of postal codes of Canada

In [2]:
# Define URL and scrape through website
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = requests.get(url) 
soup = BeautifulSoup(req.content,'lxml') 

tor_neighb=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned': # Skip if no borough is assigned
        pass
    else:
        # Define three columns: PostalCode, Borough, and Neighborhood
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        tor_neighb.append(cell) # Append name if more there is more than one neighborhood in a given borough

# Create dataframe of Toronto neighbourhoods        
df=pd.DataFrame(tor_neighb)   

# Replace name of specific cells within the dataframe
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [3]:
# Print the shape of the dataframe
df.shape

(103, 3)

## Part 2

### Get the latitude and the longitude coordinates of each neighborhood and add them to the dataframe

In [4]:
def lat_long(pc):
    lati_long_coords = None
    while(lati_long_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(pc))
        lati_long_coords = g.latlng
    return lati_long_coords

# Retrieve the postal code coordinates
postal_codes = df['PostalCode']    
coords = [lat_long(postal_code) for postal_code in postal_codes.tolist()]

# Adding columns (latitude and longitude)
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188


## Part 3

### Explore and cluster the neighborhoods in Toronto

Let's first get the geographical coordinates of Toronto:

In [5]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Let's only consider the boroughs that have the word 'Toronto' in them.

In [6]:
neighb_tor= df[df['Borough'].str.contains('Toronto', na = False)].reset_index(drop=True)
neighb_tor.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804
2,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587
3,M4E,East Toronto,The Beaches,43.67709,-79.29547
4,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306


Let's visualize these neighborhoods in the Toronto map using Folium

In [7]:
# Create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add markers to map
for lat, lng, label in zip(neighb_tor['Latitude'], neighb_tor['Longitude'], neighb_tor['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Define the Foursquare credentials

In [8]:
CLIENT_ID = 'WUNQXWP4CDMN5ZXSSTACR3QNNAOVA2GLKPD2HNKOXOPPRPQ3' # your Foursquare ID
CLIENT_SECRET = 'ZEOLBKODZBJ1QWY0C55KLJVZPYPRULXAY55F033INKHM0R54' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WUNQXWP4CDMN5ZXSSTACR3QNNAOVA2GLKPD2HNKOXOPPRPQ3
CLIENT_SECRET:ZEOLBKODZBJ1QWY0C55KLJVZPYPRULXAY55F033INKHM0R54


Let's first explore the first neighborhood(s). First, we will define its geographical coordinates.

In [9]:
print (neighb_tor.loc[0, 'Neighborhood'])

neighborhood_latitude = neighb_tor.loc[0, 'Latitude']   # neighborhood latitude value
neighborhood_longitude = neighb_tor.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = neighb_tor.loc[0, 'Neighborhood']   # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Regent Park, Harbourfront
Latitude and longitude values of Regent Park, Harbourfront are 43.65512000000007, -79.36263999999994.


The first neighborhoods are Regent Park and Harbourfront. Now let's look at the top 100 venues within a radius of 500 meters. To do this I created a GET request URL.

In [10]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # Define radius (here it is 500 m)


url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

Define function that gets the category of a given venue.

In [11]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Let's send the GET request. The data will be collecte in a JSON file which will be then transformed into a pandas data frame.

In [12]:
results = requests.get(url).json()
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# Filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# Filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Figs Breakfast & Lunch,Breakfast Spot,43.655675,-79.364503
3,The Yoga Lounge,Yoga Studio,43.655515,-79.364955
4,Body Blitz Spa East,Spa,43.654735,-79.359874


Check how many venues were returned by Foursquare

In [13]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

23 venues were returned by Foursquare.


For the first neighborhood Foursquare found 23 venues. Let's define a function to get nearby venues for all the neighborhoods.

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Run the above function on each neighborhood.

In [15]:
tor_venues = getNearbyVenues(names=neighb_tor['Neighborhood'],
                                   latitudes=neighb_tor['Latitude'],
                                   longitudes=neighb_tor['Longitude']
                                  )

Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Enclave of M5E
St. James Town, Cabbagetown
First Canadi

Display venues dataframe and calculate its shape.

In [16]:
print(tor_venues.shape)
tor_venues.head()

(1703, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65512,-79.36264,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65512,-79.36264,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65512,-79.36264,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,"Regent Park, Harbourfront",43.65512,-79.36264,The Yoga Lounge,43.655515,-79.364955,Yoga Studio
4,"Regent Park, Harbourfront",43.65512,-79.36264,Body Blitz Spa East,43.654735,-79.359874,Spa


There are a total of 1703 venues. Let's check how many venues were found for each neighborhood. This is done by grouping the venues by the different neighborhoods.

In [17]:
tor_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,63,63,63,63,63,63
"Brockton, Parkdale Village, Exhibition Place",83,83,83,83,83,83
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",81,81,81,81,81,81
Central Bay Street,64,64,64,64,64,64
Christie,10,10,10,10,10,10
Church and Wellesley,83,83,83,83,83,83
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,26,26,26,26,26,26
Davisville North,11,11,11,11,11,11
"Dufferin, Dovercourt Village",20,20,20,20,20,20


How many unique categories of venues are there?

In [18]:
print('There are {} uniques categories.'.format(len(tor_venues['Venue Category'].unique())))

There are 222 uniques categories.


Let's analyze each neighborhood.

In [19]:
# one hot encoding
tor_onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = tor_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
tor_onehot.shape

(1703, 222)

Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each venue category.

In [21]:
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()
tor_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,Berczy Park,0.015873,0.0,0.0,0.015873,0.0,0.015873,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.012048,0.012048,0.0,0.0,0.0,0.0,0.0,0.024096,0.0,...,0.012048,0.0,0.0,0.0,0.012048,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.012346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,...,0.0,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.015625,0.015625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.015625,0.015625,0.015625,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.012048,0.0,0.012048,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Commerce Court, Victoria Hotel",0.01,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Check the shape of the data frame (number of rows should be the same as the number of neighborhoods, 39 if only the ones with 'Toronto' are considered.

In [22]:
tor_grouped.shape

(39, 222)

For each neighborhood let's show the top 5 most common venues

In [23]:
num_top_venues = 5

for hood in tor_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = tor_grouped[tor_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                venue  freq
0        Cocktail Bar  0.05
1         Coffee Shop  0.05
2  Seafood Restaurant  0.05
3              Bakery  0.05
4          Restaurant  0.03


----Brockton, Parkdale Village, Exhibition Place----
         venue  freq
0  Coffee Shop  0.06
1          Bar  0.06
2         Café  0.06
3   Restaurant  0.05
4    Gift Shop  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0  Italian Restaurant  0.07
1         Coffee Shop  0.06
2                Café  0.05
3   French Restaurant  0.04
4       Grocery Store  0.04


----Central Bay Street----
                       venue  freq
0                Coffee Shop  0.11
1             Clothing Store  0.06
2  Middle Eastern Restaurant  0.03
3                      Hotel  0.03
4           Sushi Restaurant  0.03


----Christie----
           venue  freq
0           Café   0.3
1  Grocery Store   0.2
2     Baby Store   

It seems like coffee shops are the most common type of venue in most of the neighborhoods explored here. Let's define a function that sorts the venues in descending order. We will use it to put the results into a dataframe.

In [24]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Let's create the new dataframe and display the top 10 venues for each neighborhood.

In [25]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Bakery,Cocktail Bar,Coffee Shop,Seafood Restaurant,Pharmacy,Restaurant,Cheese Shop,Pub,Breakfast Spot,Italian Restaurant
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Bar,Restaurant,Sandwich Place,Gift Shop,Lounge,Japanese Restaurant,Breakfast Spot,Nightclub
2,"CN Tower, King and Spadina, Railway Lands, Har...",Italian Restaurant,Coffee Shop,Café,Bar,French Restaurant,Park,Grocery Store,Gym / Fitness Center,Speakeasy,Bakery
3,Central Bay Street,Coffee Shop,Clothing Store,Cosmetics Shop,Hotel,Sushi Restaurant,Sandwich Place,Restaurant,Middle Eastern Restaurant,Bubble Tea Shop,Plaza
4,Christie,Café,Grocery Store,Coffee Shop,Italian Restaurant,Baby Store,Playground,Candy Store,Ethiopian Restaurant,Electronics Store,Escape Room


Let's run the k-means algorithm to cluster the neighborhoods into 4 clusters.

In [26]:
# set number of clusters
kclusters = 3

tor_grouped_clustering = tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster index and the top 10 venues for every neighborhood.

In [27]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

tor_merged = neighb_tor

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
tor_merged = tor_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

tor_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264,0,Coffee Shop,Restaurant,Breakfast Spot,Yoga Studio,Bakery,Wine Shop,Italian Restaurant,Food Truck,Event Space,Electronics Store
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.65739,-79.37804,0,Coffee Shop,Clothing Store,Café,Japanese Restaurant,Italian Restaurant,Middle Eastern Restaurant,Cosmetics Shop,Hotel,Bubble Tea Shop,Ramen Restaurant
2,M5C,Downtown Toronto,St. James Town,43.65215,-79.37587,0,Coffee Shop,Italian Restaurant,Café,Cosmetics Shop,Cocktail Bar,Lingerie Store,Hotel,Restaurant,Japanese Restaurant,Beer Bar
3,M4E,East Toronto,The Beaches,43.67709,-79.29547,0,Health Food Store,Trail,Pub,Donut Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space
4,M5E,Downtown Toronto,Berczy Park,43.64536,-79.37306,0,Bakery,Cocktail Bar,Coffee Shop,Seafood Restaurant,Pharmacy,Restaurant,Cheese Shop,Pub,Breakfast Spot,Italian Restaurant


Let's visualize the clusters in the map using Folium.

In [28]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighborhood'], tor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Let's examine each cluster and show the venue categories that distinguish each cluster.

Cluster 1: coffee shops and food stores

In [29]:
tor_merged.loc[tor_merged['Cluster Labels'] == 0, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Restaurant,Breakfast Spot,Yoga Studio,Bakery,Wine Shop,Italian Restaurant,Food Truck,Event Space,Electronics Store
1,Downtown Toronto,0,Coffee Shop,Clothing Store,Café,Japanese Restaurant,Italian Restaurant,Middle Eastern Restaurant,Cosmetics Shop,Hotel,Bubble Tea Shop,Ramen Restaurant
2,Downtown Toronto,0,Coffee Shop,Italian Restaurant,Café,Cosmetics Shop,Cocktail Bar,Lingerie Store,Hotel,Restaurant,Japanese Restaurant,Beer Bar
3,East Toronto,0,Health Food Store,Trail,Pub,Donut Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space
4,Downtown Toronto,0,Bakery,Cocktail Bar,Coffee Shop,Seafood Restaurant,Pharmacy,Restaurant,Cheese Shop,Pub,Breakfast Spot,Italian Restaurant
5,Downtown Toronto,0,Coffee Shop,Clothing Store,Cosmetics Shop,Hotel,Sushi Restaurant,Sandwich Place,Restaurant,Middle Eastern Restaurant,Bubble Tea Shop,Plaza
6,Downtown Toronto,0,Café,Grocery Store,Coffee Shop,Italian Restaurant,Baby Store,Playground,Candy Store,Ethiopian Restaurant,Electronics Store,Escape Room
7,Downtown Toronto,0,Hotel,Café,Coffee Shop,Gym,Restaurant,Japanese Restaurant,Salad Place,Steakhouse,Asian Restaurant,Bookstore
8,West Toronto,0,Grocery Store,Park,Pet Store,Bank,Furniture / Home Store,Middle Eastern Restaurant,Pharmacy,Pizza Place,Post Office,Café
10,Downtown Toronto,0,Coffee Shop,Hotel,Japanese Restaurant,Park,Plaza,Boat or Ferry,Sandwich Place,Sports Bar,Sporting Goods Shop,IT Services


Cluster 2: Fast food restaurants

In [30]:
tor_merged.loc[tor_merged['Cluster Labels'] == 1, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,Central Toronto,1,Fast Food Restaurant,Wings Joint,Donut Shop,Fish & Chips Shop,Farmers Market,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant,Escape Room


Cluster 3: Parks and gyms

In [31]:
tor_merged.loc[tor_merged['Cluster Labels'] == 2, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,East York/East Toronto,2,Park,Intersection,Wings Joint,Donut Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space
21,Central Toronto,2,Park,Business Service,Wings Joint,Dumpling Restaurant,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space
22,West Toronto,2,Park,Sandwich Place,Wings Joint,Donut Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant
23,Central Toronto,2,Gym Pool,Photography Studio,Park,Playground,Dog Run,Farmers Market,Farm,Falafel Restaurant,Event Space,Ethiopian Restaurant
29,Central Toronto,2,Gym,Park,Playground,Tennis Court,Wings Joint,Dog Run,Farmers Market,Farm,Falafel Restaurant,Event Space
33,Downtown Toronto,2,Park,Japanese Restaurant,Playground,Bike Trail,Donut Shop,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space
