# Coursera Capstone - Segmenting and Clustering Neighborhoods in Toronto

## Part 1 - Scrape the Toronto neighbourhoods list from Wikipedia using BeautifulSoup

#### 1/ Imports

In [2]:
import pandas as pd
import numpy as np
!conda install -c ulmo beautifulsoup4      #Uncomment this if BeautifulSoup is not installed on your system
!conda install -c ulmo urllib3             #Uncomment this if urllib3 is not installed on your system
from bs4 import BeautifulSoup
import urllib3

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.8.1       |           py36_0         153 KB

The following packages will be UPDATED:

    beautifulsoup4: 4.7.1-py36_1                  --> 4.8.1-py36_0     
    openssl:        1.1.1d-h516909a_0 conda-forge --> 1.1.1d-h7b6447c_3


Downloading and Extracting Packages
beautifulsoup4-4.8.1 | 153 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - urllib3


The following packages will be downloaded:

    package                    |            build
    -------------------

#### 2/ Consts and globals

In [3]:
# To scrap the data
CONST_wikipediaLink = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Key for the dataframe we will use
CONST_dfColumns = ['PostalCode', 'Borough', 'Neighborhood']

# Size of the radius to retrieve venues from FoursquareAPI, and limit of venues per neighbourhood
CONST_venuesRadiusScan = 1100
CONST_venuesLimit = 100

#### 3/ Scrape the raw data using BeautifulSoup

In [4]:
http = urllib3.PoolManager()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Scrape the raw xml data from Wikipedia
response = http.request('GET', CONST_wikipediaLink)
soup = BeautifulSoup(response.data, "lxml")

#soup

#### 4/ Parse the raw data

In [5]:
df_neighbourhoods = pd.DataFrame(columns=CONST_dfColumns)
fullList = []
# Retrieve all neighbourhoods, in the <tr> tags
neighbourhoodsList = soup.body.table.find_all('tr')

# Loop through each neighbourhood, in the <td> tags
for neighbourhood in neighbourhoodsList:
    neighbourElts = neighbourhood.find_all('td')
    
    # Loop through each attritube of the current neighbourhood : name, title, and wikipedia url
    tmpList = []
    for elt in neighbourElts:
        # Remove the tags & newlines
        tmpList.append(str(elt.get_text().strip()))
        
    fullList.append(tmpList)

# Add the scraped nneighbourhoods into the dataframe
df_neighbourhoods=pd.DataFrame(fullList,columns=CONST_dfColumns)
df_neighbourhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


#### 5/ Data preparation

In [6]:
# Filter all unassigned neighbourhoods
df_neighbourhoods = df_neighbourhoods[(df_neighbourhoods.Borough.notnull())]
df_neighbourhoods = df_neighbourhoods[(df_neighbourhoods.Borough != "Not assigned")]

# Group by PostalCode/Borough
df_neighbourhoods = pd.DataFrame(df_neighbourhoods.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(list)).reset_index()
df_neighbourhoods['Neighborhood'] = df_neighbourhoods['Neighborhood'].apply(lambda x: ', '.join(x))

# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
df_neighbourhoods.loc[ (df_neighbourhoods.Neighborhood.isnull() == True) |
               (df_neighbourhoods.Neighborhood == "Not assigned")
               , 'Neighborhood'] = df_neighbourhoods.Borough

df_neighbourhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
df_neighbourhoods.shape

(103, 3)

## Part 2 - Add the latitude and the longitude coordinates of each neighbourhood

#### 1/ Imports

In [8]:
!conda install -c conda-forge geocoder             #Uncomment this if geocoder is not installed on your system
import geocoder # import geocoder

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be DOWNGRADED:

    openssl: 1.1.1d-h7b6447c_3 --> 1.1.1d-h516909a_0 conda-forge

Preparing transaction: done
Verifying transaction: done
Executing transaction: done


#### 2/ Get all coordinates by postal code, using geocoder with arcgis

In [9]:
# This function returns the latitude and longitude of the given postal code, in Toronto
def getCoordsByPostalCode(postal_code):
    # initialize to None : this variable will allow us to loop until geocoder responds with the coordinates
    lat_lng_coords = None

    # loop until we get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    return latitude, longitude

In [10]:
postalCodesWithCoordsList = []

# Loop through each postal code from our dataframe
for postalCode in df_neighbourhoods['PostalCode']:
    # Fill the temp list with the coordinates from geocoder
    latitude, longitude = getCoordsByPostalCode(postalCode)
    postalCodesWithCoordsList.append([postalCode, latitude, longitude])
    
# Transform the temp list into a dataframe
df_coords = pd.DataFrame(postalCodesWithCoordsList)
df_coords.columns = ['PostalCode', 'Latitude', 'Longitude']

# Merge the coordinates dataframe with the original neighbourhoods dataframe (key : postal code)
df_neighbourhoods = pd.merge(df_neighbourhoods, df_coords, on='PostalCode')

df_neighbourhoods_toronto = df_neighbourhoods

df_neighbourhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


## Part 3 - Neighbourhoods clustering using venues from FoursquareAPI

#### 1/ Imports

In [None]:
!conda install -c conda-forge folium                #Uncomment this if folium is not installed on your system
import folium
import requests

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: \ 

#### 2/ Generate the map of Toronto with the neighbourhoods

Retrieve the coordinates of the city of Toronto so we can draw it using Folium

In [11]:
address = 'Toronto, ON'

# initialize to None
lat_lng_coords = None

# loop until we get the coordinates
while(lat_lng_coords is None):
    g = geocoder.arcgis('Toronto, Ontario')
    lat_lng_coords = g.latlng

latitude_toronto = lat_lng_coords[0]
longitude_toronto = lat_lng_coords[1]

print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude_toronto, longitude_toronto))

The geograpical coordinate of Toronto are 43.648690000000045, -79.38543999999996.


We can draw the city of Toronto with the neighbourhoods from the dataframe

In [12]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood, postalCode in zip(df_neighbourhoods_toronto['Latitude'], df_neighbourhoods_toronto['Longitude'], df_neighbourhoods_toronto['Borough'], df_neighbourhoods_toronto['Neighborhood'], df_neighbourhoods_toronto['PostalCode']):
    label = '{}'.format(postalCode)
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto) 
    
map_toronto

We draw the same map, but with each neighbourhood represented by a circle having a radius equals to the radius size **CONST_venuesRadiusScan**, so we can see how far FoursquareAPI will retrieve venues for each neighbourhood.  
We can see that the city is widely covered by the circles, so the radius size is adequate.

In [13]:
# create map of Toronto using latitude and longitude values
map_toronto_radius = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood, postalCode in zip(df_neighbourhoods_toronto['Latitude'], df_neighbourhoods_toronto['Longitude'], df_neighbourhoods_toronto['Borough'], df_neighbourhoods_toronto['Neighborhood'], df_neighbourhoods_toronto['PostalCode']):
    label = '{}'.format(postalCode)
    label = folium.Popup(label, parse_html=True)

    folium.Circle(
        [lat, lng],
        radius=CONST_venuesRadiusScan,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3).add_to(map_toronto_radius) 
    
map_toronto_radius

#### 3/ Get the venues in Toronto, using FoursquareAPI

FoursquareAPI credentials

In [None]:
##Utilizing the Foursquare API to explore and segment neighborhoods
CLIENT_ID = 2GFKIZFPHZMTSMVADOZIQK2SHKOMAFGOBFL03UPJAFI1VU4C # your Foursquare ID
CLIENT_SECRET = HTHH43W3AKEFZH5XDJKZC0YXMQ0DH34HB5CJPT0IJZLEAIWF # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

This function will retrieve the FoursquareAPI venues for all the neighbourhoods given in parameters

In [15]:
def getNearbyVenues(postalCodes, boroughs, neighbourhoods, latitudes, longitudes):
    
    venues_list=[]
    # Loop through each neighbourhood given in parameters
    for postalCode, borough, neighbourhood, lat, lng in zip(postalCodes, boroughs, neighbourhoods, latitudes, longitudes):
            
        # create the API request URL to explore the neighbourhood using FoursquareAPI
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            CONST_venuesRadiusScan, 
            CONST_venuesLimit)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue : name, latitude, longitude, and the categories' names
        venues_list.append([(
            postalCode,
            borough,
            neighbourhood, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    # add the venues in the dataframe
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = [
                        'PostalCode',
                        'Borough',
                        'Neighborhood', 
                        'Neighborhood Latitude', 
                        'Neighborhood Longitude', 
                        'Venue', 
                        'Venue Latitude', 
                        'Venue Longitude', 
                        'Venue Category'
    ]
    
    return(nearby_venues)

Loop through each Toronto neighbourhood and retrieve its venues using the previous function

In [16]:
# Get the venues for each neighbourhood
toronto_venues = getNearbyVenues(  
                                    postalCodes=df_neighbourhoods_toronto['PostalCode'],
                                    boroughs=df_neighbourhoods_toronto['Borough'],
                                    neighbourhoods=df_neighbourhoods_toronto['Neighborhood'],
                                    latitudes=df_neighbourhoods_toronto['Latitude'],
                                    longitudes=df_neighbourhoods_toronto['Longitude']
                                  )

toronto_venues.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561,Canadiana exhibit,43.817962,-79.193374,Zoo Exhibit
1,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561,Lion Exhibit,43.819228,-79.186977,Zoo Exhibit
2,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561,Images Salon & Spa,43.802283,-79.198565,Spa
3,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561,Wendy's,43.807448,-79.199056,Fast Food Restaurant
4,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561,Wendy's,43.802008,-79.19808,Fast Food Restaurant


Analyse each neighborhood : each type of venue will be one hot encoded so we will be able to perform a K-means clustering on the dataframe

In [17]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add postalCode, borough, and neighborhood column back to dataframe
toronto_onehot['PostalCode'] = toronto_venues['PostalCode'] 
toronto_onehot['Borough'] = toronto_venues['Borough'] 

toronto_onehot.pop('Neighborhood')
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move postalCode, borough, neighborhood column to the first column
for i in range(0, 3):
    toronto_onehot = toronto_onehot[[toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])]

toronto_onehot.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,M1B,Scarborough,"Rouge, Malvern",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,M1B,Scarborough,"Rouge, Malvern",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,M1B,Scarborough,"Rouge, Malvern",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1B,Scarborough,"Rouge, Malvern",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1B,Scarborough,"Rouge, Malvern",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group rows by postal code, borough and neighbourhood, using means

In [18]:
toronto_grouped = toronto_onehot.groupby(['PostalCode','Borough', 'Neighborhood']).mean().reset_index()
toronto_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,M1B,Scarborough,"Rouge, Malvern",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615385
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.035714,0.0,0.035714,0.0


Print each neighborhood along with the top 5 most common venues

In [19]:
num_top_venues = 5

# Iterate through all the grouped dataframe
for index, row in toronto_grouped.iterrows():
    tempPostalCode = row['PostalCode']
    tempBorough = row['Borough']
    tempNeighborhood = row['Neighborhood']
    
    print("----"+tempPostalCode + " / " + tempBorough + " / " + tempNeighborhood +"----")
    
    # Create a temp df filtered on the current neighbourhood (key : postal code x borouhg x neighbourhood)
    temp = toronto_grouped[
        (toronto_grouped.PostalCode == tempPostalCode) &
        (toronto_grouped.Borough == tempBorough) &
        (toronto_grouped.Neighborhood == tempNeighborhood)
    ].T.reset_index()
    
    temp.columns = ['venue','freq']

    # We skip the key PostalCode x Borough x Neighbourhood : length = 3, iloc[3:]
    temp = temp.iloc[len(CONST_dfColumns):]
    temp['freq'] = temp['freq'].astype(float)
    
    # Round the frequency with two digits
    temp = temp.round({'freq': 2})
    
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M1B / Scarborough / Rouge, Malvern----
                  venue  freq
0           Zoo Exhibit  0.62
1  Fast Food Restaurant  0.15
2           Coffee Shop  0.08
3                   Spa  0.08
4            Hobby Shop  0.08


----M1C / Scarborough / Highland Creek, Rouge Hill, Port Union----
                venue  freq
0      Breakfast Spot  0.25
1          Playground  0.25
2  Italian Restaurant  0.25
3        Burger Joint  0.25
4   Accessories Store  0.00


----M1E / Scarborough / Guildwood, Morningside, West Hill----
                  venue  freq
0           Pizza Place  0.06
1              Pharmacy  0.06
2         Grocery Store  0.06
3           Coffee Shop  0.06
4  Fast Food Restaurant  0.06


----M1G / Scarborough / Woburn----
                  venue  freq
0     Indian Restaurant  0.09
1           Coffee Shop  0.09
2  Fast Food Restaurant  0.09
3           Pizza Place  0.09
4   Filipino Restaurant  0.05


----M1H / Scarborough / Cedarbrae----
               venue  freq
0  Indian Re

#### 4/ Put the results into a Pandas dataframe

Function to sort the venues in descending order.

In [20]:
def return_most_common_venues(row, num_top_venues):
    # Remove the key PostalCode x Borough x Neighbourhood from the row
    row_categories = row.iloc[len(CONST_dfColumns):]
    
    # Sort ascending
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    # Return the top num_top_venues
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [21]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostalCode', 'Borough', 'Neighborhood']

for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe, and set it with the columns names
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)

# add the keys from the grouped dataframe (Postal code x Borough x Neighborhood)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_grouped['Borough']
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# loop through each rows
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, len(CONST_dfColumns):] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",Zoo Exhibit,Fast Food Restaurant,Coffee Shop,Spa,Hobby Shop,Food,Falafel Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",Italian Restaurant,Playground,Breakfast Spot,Burger Joint,Farmers Market,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space,Exhibit
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",Pharmacy,Fast Food Restaurant,Pizza Place,Coffee Shop,Grocery Store,Sports Bar,Discount Store,Bank,Bar,Supermarket
3,M1G,Scarborough,Woburn,Pizza Place,Fast Food Restaurant,Coffee Shop,Indian Restaurant,Bank,Sandwich Place,Supermarket,Juice Bar,Park,Thrift / Vintage Store
4,M1H,Scarborough,Cedarbrae,Indian Restaurant,Bakery,Coffee Shop,Fried Chicken Joint,Hakka Restaurant,Supplement Shop,Sandwich Place,Caribbean Restaurant,Fish & Chips Shop,Athletics & Sports


#### 5/ Cluster neighbourhoods

In [22]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
toronto_grouped_clustering = toronto_grouped_clustering.drop('PostalCode', 1)
toronto_grouped_clustering = toronto_grouped_clustering.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 1, 4, 4, 1, 4, 4, 4, 4, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [23]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [24]:
toronto_merged = df_neighbourhoods_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index(['PostalCode','Borough', 'Neighborhood']), on=['PostalCode','Borough', 'Neighborhood'])

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561,2,Zoo Exhibit,Fast Food Restaurant,Coffee Shop,Spa,Hobby Shop,Food,Falafel Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785605,-79.158701,1,Italian Restaurant,Playground,Breakfast Spot,Burger Joint,Farmers Market,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space,Exhibit
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175299,4,Pharmacy,Fast Food Restaurant,Pizza Place,Coffee Shop,Grocery Store,Sports Bar,Discount Store,Bank,Bar,Supermarket
3,M1G,Scarborough,Woburn,43.768216,-79.21761,4,Pizza Place,Fast Food Restaurant,Coffee Shop,Indian Restaurant,Bank,Sandwich Place,Supermarket,Juice Bar,Park,Thrift / Vintage Store
4,M1H,Scarborough,Cedarbrae,43.769608,-79.23944,1,Indian Restaurant,Bakery,Coffee Shop,Fried Chicken Joint,Hakka Restaurant,Supplement Shop,Sandwich Place,Caribbean Restaurant,Fish & Chips Shop,Athletics & Sports


Visualise the clusters

In [25]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, postalCode, borough, neighborhood, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(postalCode) + ' - Cluster ' + str(cluster), parse_html=True)
    cluster = int(cluster)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### 6/ Examine clusters

**Cluster 0** focuses on Parks, Gyms, and places to practice sports and/or chillout outside.  
We can see that **most of the neighbourhoods categorised in this cluster are near several parks** on the map.

In [26]:
toronto_merged[toronto_merged['Cluster Labels'] == 0].head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69669,-79.260069,0,Park,American Restaurant,Skating Rink,General Entertainment,Café,Gym,Gym Pool,College Stadium,Cuban Restaurant,Dumpling Restaurant
19,M2K,North York,Bayview Village,43.781015,-79.380542,0,Park,Café,Bank,Chinese Restaurant,Trail,Japanese Restaurant,Zoo Exhibit,Eastern European Restaurant,Electronics Store,Empanada Restaurant
20,M2L,North York,"Silver Hills, York Mills",43.757177,-79.37994,0,Park,Gym,Intersection,Gym / Fitness Center,Bus Stop,Furniture / Home Store,Candy Store,Farmers Market,Electronics Store,Donut Shop
23,M2P,North York,York Mills West,43.74781,-79.400062,0,Coffee Shop,Park,Gym,Tennis Court,French Restaurant,Restaurant,Seafood Restaurant,Intersection,Thai Restaurant,Gym / Fitness Center
31,M3L,North York,Downsview West,43.740945,-79.505004,0,Park,Spa,Coffee Shop,Vietnamese Restaurant,Pizza Place,Grocery Store,Bank,Exhibit,Dumpling Restaurant,Eastern European Restaurant
32,M3M,North York,Downsview Central,43.73361,-79.49675,0,Restaurant,Food Truck,Playground,Park,Baseball Field,Falafel Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant
44,M4N,Central Toronto,Lawrence Park,43.728135,-79.38709,0,Park,Coffee Shop,Café,Gym / Fitness Center,Trail,Bookstore,College Gym,College Quad,Flea Market,Falafel Restaurant
50,M4W,Downtown Toronto,Rosedale,43.68196,-79.378445,0,Park,Coffee Shop,Bank,Italian Restaurant,Trail,Grocery Store,Spa,Pizza Place,Pub,Sporting Goods Shop
94,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",43.649738,-79.554055,0,Bank,Pizza Place,Park,Hotel,Mexican Restaurant,Fish & Chips Shop,Theater,Grocery Store,American Restaurant,Pharmacy
97,M9M,North York,"Emery, Humberlea",43.733665,-79.537477,0,Coffee Shop,Golf Course,Latin American Restaurant,Café,Park,Grocery Store,Discount Store,Nightclub,Ethiopian Restaurant,Dumpling Restaurant


**Cluster 1** focuses on the diversity of restaurants, coffee shops, and bars.  
We can see that the **downtown of Toronto is highly categorised in this cluster**.

In [27]:
toronto_merged[toronto_merged['Cluster Labels'] == 1].head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785605,-79.158701,1,Italian Restaurant,Playground,Breakfast Spot,Burger Joint,Farmers Market,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space,Exhibit
4,M1H,Scarborough,Cedarbrae,43.769608,-79.23944,1,Indian Restaurant,Bakery,Coffee Shop,Fried Chicken Joint,Hakka Restaurant,Supplement Shop,Sandwich Place,Caribbean Restaurant,Fish & Chips Shop,Athletics & Sports
18,M2J,North York,"Fairview, Henry Farm, Oriole",43.78081,-79.347782,1,Clothing Store,Coffee Shop,Japanese Restaurant,Sandwich Place,Bakery,Electronics Store,Caribbean Restaurant,Juice Bar,Supermarket,Bank
21,M2M,North York,"Newtonbrook, Willowdale",43.7913,-79.413546,1,Korean Restaurant,Coffee Shop,Café,Middle Eastern Restaurant,Pizza Place,Sandwich Place,Shopping Mall,Grocery Store,Park,Dessert Shop
22,M2N,North York,Willowdale South,43.768165,-79.40742,1,Japanese Restaurant,Coffee Shop,Ramen Restaurant,Fast Food Restaurant,Korean Restaurant,Pizza Place,Sandwich Place,Café,Pharmacy,Sushi Restaurant
26,M3B,North York,Don Mills North,43.749055,-79.362212,1,Coffee Shop,Japanese Restaurant,Restaurant,Electronics Store,Bank,Dim Sum Restaurant,Fast Food Restaurant,Diner,Bagel Shop,Discount Store
27,M3C,North York,"Flemingdon Park, Don Mills South",43.721375,-79.343415,1,Japanese Restaurant,Grocery Store,Gym,Coffee Shop,Asian Restaurant,Chinese Restaurant,Sandwich Place,Beer Store,Middle Eastern Restaurant,Fast Food Restaurant
30,M3K,North York,"CFB Toronto, Downsview East",43.738931,-79.46732,1,Athletics & Sports,Racetrack,Coffee Shop,Turkish Restaurant,Gym / Fitness Center,Climbing Gym,Latin American Restaurant,Sandwich Place,Business Service,Skating Rink
36,M4C,East York,Woodbine Heights,43.689645,-79.307165,1,Skating Rink,Coffee Shop,Park,Ice Cream Shop,Café,Pizza Place,Gastropub,Sushi Restaurant,Bus Line,Grocery Store
37,M4E,East Toronto,The Beaches,43.676531,-79.29541,1,Pub,Coffee Shop,Bakery,Beach,Park,Pizza Place,Breakfast Spot,Bar,Sandwich Place,Gastropub


**Cluster 2** is an outlier, its 1st common Venue is Zoo Exhibit, it also features Hobby shops, Electronics Stores ...

In [28]:
toronto_merged[toronto_merged['Cluster Labels'] == 2].head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561,2,Zoo Exhibit,Fast Food Restaurant,Coffee Shop,Spa,Hobby Shop,Food,Falafel Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant


**Cluster 3** is an outlier, its 1st commont venue is Bakery, and it also features Farms, Event spaces ...

In [29]:
toronto_merged[toronto_merged['Cluster Labels'] == 3].head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,M1X,Scarborough,Upper Rouge,43.834215,-79.216701,3,Bakery,Zoo Exhibit,Farm,Eastern European Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Space,Exhibit,Falafel Restaurant


**Cluster 4** focuses on Fast foods, pizzas and restaurants in general.  
We can see that **this cluster is predominant outside of the downtown**.

In [30]:
toronto_merged[toronto_merged['Cluster Labels'] == 4].head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175299,4,Pharmacy,Fast Food Restaurant,Pizza Place,Coffee Shop,Grocery Store,Sports Bar,Discount Store,Bank,Bar,Supermarket
3,M1G,Scarborough,Woburn,43.768216,-79.21761,4,Pizza Place,Fast Food Restaurant,Coffee Shop,Indian Restaurant,Bank,Sandwich Place,Supermarket,Juice Bar,Park,Thrift / Vintage Store
5,M1J,Scarborough,Scarborough Village,43.743085,-79.232172,4,Fast Food Restaurant,Sandwich Place,Big Box Store,Restaurant,Train Station,Indian Restaurant,Chinese Restaurant,Coffee Shop,Pizza Place,Convenience Store
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.72626,-79.26367,4,Discount Store,Coffee Shop,Chinese Restaurant,Grocery Store,Metro Station,Department Store,Sandwich Place,Asian Restaurant,Bus Line,Light Rail Station
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713213,-79.28491,4,Convenience Store,Bakery,Coffee Shop,Fast Food Restaurant,Bus Station,Bus Line,Intersection,Soccer Field,Grocery Store,Bank
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.723575,-79.234976,4,Fast Food Restaurant,Discount Store,Bistro,Park,Sporting Goods Shop,Furniture / Home Store,Burger Joint,Liquor Store,Bank,Flower Shop
10,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.759975,-79.268974,4,Pizza Place,Fast Food Restaurant,Chinese Restaurant,Coffee Shop,Light Rail Station,Automotive Shop,Indian Restaurant,Park,Electronics Store,Wings Joint
11,M1R,Scarborough,"Maryvale, Wexford",43.750803,-79.30056,4,Middle Eastern Restaurant,Pizza Place,Grocery Store,Convenience Store,Furniture / Home Store,Restaurant,Korean Restaurant,Supermarket,Bar,Steakhouse
12,M1S,Scarborough,Agincourt,43.79394,-79.267976,4,Chinese Restaurant,Shopping Mall,Asian Restaurant,Restaurant,Park,Supermarket,Sandwich Place,Coffee Shop,Cantonese Restaurant,Bakery
13,M1T,Scarborough,"Clarks Corners, Sullivan, Tam O'Shanter",43.784725,-79.299244,4,Pharmacy,Fast Food Restaurant,Pizza Place,Coffee Shop,Chinese Restaurant,Vietnamese Restaurant,Sandwich Place,Burrito Place,Liquor Store,Beer Store
