# Segmenting and clustering Toronto Neighborhood data

In [1]:
# Standard library
import os
from io import StringIO

# Third-party
import folium
import geocoder
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim

# json_normalize lives at the pandas top level in current releases; the
# pandas.io.json location is kept as a fallback for old installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported')

Libraries imported


## Reading Toronto neighborhoods data from the Wikipedia page into a dataframe

In [2]:
# Download the Wikipedia page listing Toronto ("M") postal codes and parse its
# HTML tables; the first table holds Postal Code / Borough / Neighborhood.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(wiki_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
# read_html expects a file-like object; passing the raw HTML string is deprecated.
html_tables = pd.read_html(StringIO(response.text))

df_neigh = html_tables[0]
print('Neighborhoods as obtained from wikipedia', df_neigh.shape)

Neighborhoods as obtained from wikipedia (180, 3)


## Drop rows where Borough is Not assigned

In [3]:
# Keep only the rows whose Borough has actually been assigned.
has_borough = df_neigh['Borough'].ne('Not assigned')
df_neigh = df_neigh[has_borough]
print('Updated Neighborhoods after excluding not assigned Boroughs', df_neigh.shape)

Updated Neighborhoods after excluding not assigned Boroughs (103, 3)


## Merge neighborhoods with same postal code

In [4]:
# Verify (rather than assume) that every postal code appears exactly once, so
# no merging of neighborhoods that share a postal code is needed.
row_count = len(df_neigh)
distinct_postal_codes = df_neigh['Postal Code'].nunique()
assert row_count == distinct_postal_codes, (
    'Duplicate postal codes found: {} rows vs {} distinct codes'.format(row_count, distinct_postal_codes))

print('There are no duplicate postal codes as row count of neighborhoods and distinct postal codes is same {}'.format(row_count))

There are no duplicate postal codes as row count of neighborhoods and distinct postal codes is same 103


## Update Neighborhoods to be same as Borough when not assigned

In [5]:
# Find rows where Neighborhood is not assigned
df_neigh[df_neigh['Neighborhood'] == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighborhood


## There are no rows where Neighborhood is Not assigned, hence skipping

In [6]:
# Renumber the rows 0..n-1 after filtering. Rebinding the name (instead of
# inplace=True) avoids mutating a filtered slice of the original frame.
df_neigh = df_neigh.reset_index(drop=True)

print('Final shape of Neighborhood data after cleaning', df_neigh.shape)


Final shape of Neighborhood data after cleaning (103, 3)


# Get latitude and longitude for Neighborhoods

In [7]:
def get_cords(postal_code, max_attempts=5):
    """Look up the [latitude, longitude] of a Toronto postal code.

    The geocoder is queried up to ``max_attempts`` times because individual
    requests can come back empty; the first successful answer is returned
    immediately instead of issuing further requests.

    Parameters
    ----------
    postal_code : str
        Postal code prefix, e.g. ``'M5A'``.
    max_attempts : int, default 5
        Maximum number of geocoder requests to make.

    Returns
    -------
    list or None
        ``[lat, lng]`` as reported by the geocoder, or ``None`` if every
        attempt came back without coordinates.
    """
    lat_lng_coords = None
    for _ in range(max_attempts):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            # Success: stop retrying.
            break
    return lat_lng_coords

print('Function defined to get coords from postal code')     


Function defined to get coords from postal code


### The geocoder package could not find coordinates for the given postal codes, so they are read from the provided CSV file instead


In [8]:
coords_file = pd.read_csv('Geospatial_Coordinates.csv')
coords_file.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge lat and long with neighborhood data

In [9]:
df_neigh_coords = pd.merge(df_neigh, coords_file, on='Postal Code')
df_neigh_coords.shape

(103, 5)

In [10]:
df_neigh_coords.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Use geopy to get the latitude and longitude of Toronto

In [12]:
# Geocode the city itself; these coordinates are used to centre the city map.
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="my_app")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match.
    raise RuntimeError('Could not geocode {!r}'.format(address))
lat = location.latitude
long = location.longitude
print('Coords for Toronto are: Lat={}, Long={}'.format(lat, long))

Coords for Toronto are: Lat=43.6534817, Long=-79.3839347


## Create a map of Toronto with neighborhoods superimposed on top

In [13]:
# Create map centered around Toronto
map_toronto = folium.Map(location=[lat, long], zoom_start=10)

# Add Neighborhoods as markers.
# The loop variables are deliberately NOT called lat/long: those names hold the
# city-centre coordinates, and rebinding them here would make a re-run of this
# cell centre the map on the last neighborhood instead of on Toronto.
for hood_lat, hood_lng, borough, neighborhood in zip(
        df_neigh_coords['Latitude'],
        df_neigh_coords['Longitude'],
        df_neigh_coords['Borough'],
        df_neigh_coords['Neighborhood']):

    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)

    folium.CircleMarker(
        [hood_lat, hood_lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

In [21]:
df_neigh_coords['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

Let's simplify the above map to show only the neighborhoods in Central Toronto, by slicing the dataframe down to the rows for that borough.

In [14]:
df_neigh_central = df_neigh_coords[df_neigh_coords['Borough'] == 'Central Toronto'].reset_index(drop=True)
df_neigh_central.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M5N,Central Toronto,Roselawn,43.711695,-79.416936
2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
3,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307
4,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
5,M5R,Central Toronto,"The Annex, North Midtown, Yorkville",43.67271,-79.405678
6,M4S,Central Toronto,Davisville,43.704324,-79.38879
7,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
8,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


Let's get the coordinates for Central Toronto

In [15]:
# Geocode the borough to get a centre point for the Central Toronto map.
loc_central_toronto = geolocator.geocode('Central Toronto, Toronto')
if loc_central_toronto is not None:
    lat_ct = loc_central_toronto.latitude
    lon_ct = loc_central_toronto.longitude
else:
    # geocode() returns None when nothing matches; fall back to the centroid
    # of the borough's neighborhoods so the map can still be drawn.
    lat_ct = df_neigh_central['Latitude'].mean()
    lon_ct = df_neigh_central['Longitude'].mean()
print("Central Toronoto coords,lat={}, lon={}".format(lat_ct, lon_ct))

Central Toronoto coords,lat=43.6534817, lon=-79.3839347


Let's visualise only the neighborhoods in Central Toronto

In [16]:
# Create map centered around Central Toronto
map_ct = folium.Map(location=[lat_ct, lon_ct], zoom_start=10)

# Add Neighborhoods as markers.
# Loop variables avoid the names lat/long so the notebook-level city-centre
# coordinates are not overwritten as a side effect of drawing this map.
for hood_lat, hood_lng, borough, neighborhood in zip(
        df_neigh_central['Latitude'],
        df_neigh_central['Longitude'],
        df_neigh_central['Borough'],
        df_neigh_central['Neighborhood']):

    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)

    folium.CircleMarker(
        [hood_lat, hood_lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_ct)

map_ct

Define Foursquare credentials

In [17]:
# Foursquare credentials are read from the environment so that they are never
# stored in the notebook source or in its saved outputs. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError here rather than failing later mid-request.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Confirm the credentials were found without echoing their values.
print('Foursquare credentials loaded from environment')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


Let's explore the first neighborhood in Central Toronto

In [18]:
print(df_neigh_central.loc[0, 'Neighborhood'])

Lawrence Park


In [19]:
neigh_lat = df_neigh_central.loc[0, 'Latitude']
neigh_lon = df_neigh_central.loc[0, 'Longitude']
neigh_name = df_neigh_central.loc[0, 'Neighborhood']
print('Coords of neighborhood {} are {}, {}'.format(neigh_name, neigh_lat, neigh_lon))

Coords of neighborhood Lawrence Park are 43.7280205, -79.3887901


#### Now, let's get the top 100 venues that are in Lawrence Park within a radius of 500 meters.

In [20]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL for the Foursquare "explore" endpoint around the neighborhood
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neigh_lat,
    neigh_lon,
    radius,
    LIMIT)

# display URL with the credentials masked so they do not leak into saved output
url.replace(CLIENT_SECRET, '<hidden>').replace(CLIENT_ID, '<hidden>')

'https://api.foursquare.com/v2/venues/explore?&client_id=<redacted>&client_secret=<redacted>&v=20180605&ll=43.7280205,-79.3887901&radius=500&limit=100'

In [24]:
# Query Foursquare for venues around the neighborhood.
venues_response = requests.get(url, timeout=30)
# Surface HTTP errors (bad credentials, quota exceeded) here instead of as a
# confusing KeyError when the payload is unpacked later.
venues_response.raise_for_status()
results = venues_response.json()
results

{'meta': {'code': 200, 'requestId': '5ec670d09388d7001b3afa0f'},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.7325205045, 'lng': -79.3825744605273},
   'sw': {'lat': 43.7235204955, 'lng': -79.3950057394727}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '50e6da19e4b0d8a78a0e9794',
       'name': 'Lawrence Park Ravine',
       'location': {'address': '3055 Yonge Street',
        'crossStreet': 'Lawrence Avenue East',
        'lat': 43.72696303913755,
        'lng': -79.39438246708775,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72696303913755,
          'lng': -79.39438246708775}],
        'distance': 465,
        'cc': 'CA',
  

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of a venue's first listed category, or None.

    Parameters
    ----------
    row : mapping-like (dict, pandas Series, ...)
        Must expose the venue's category list under the key 'categories' or,
        if that key is absent, under 'venue.categories'. Each category is a
        mapping with a 'name' entry.

    Returns
    -------
    The 'name' of the first category, or None when the list is empty or
    missing (None).

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Flattened (json_normalize) rows use the dotted column name instead.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

In [26]:
# The recommended venues live in the first group of the explore response.
venues = results['response']['groups'][0]['items']

# flatten JSON (top-level pd.json_normalize replaces the deprecated
# pandas.io.json.json_normalize)
nearby_venues = pd.json_normalize(venues)

# filter columns; .copy() gives an independent frame so the column assignment
# below does not operate on a slice of the flattened data
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Lawrence Park Ravine,Park,43.726963,-79.394382
1,Zodiac Swim School,Swim School,43.728532,-79.38286
2,TTC Bus #162 - Lawrence-Donway,Bus Line,43.728026,-79.382805


## Explore Neighborhoods in Central Toronto

### Let's create a function to repeat the same process for all neighborhoods in Central Toronto

In [27]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Fetch Foursquare venues around each neighborhood and return them as one table.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighborhood name and coordinates, consumed in parallel.
    radius : int, default 500
        Search radius passed to the API.
    limit : int, optional
        Maximum number of venues requested per neighborhood. Defaults to the
        notebook-level ``LIMIT`` constant.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty (but has these columns) when no venue was returned.
        'Venue Category' is None for venues that list no category.

    Notes
    -----
    Relies on the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET`` and
    ``VERSION``. A neighborhood whose response lacks the expected structure is
    reported on stdout and skipped rather than aborting the whole run.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    if limit is None:
        limit = LIMIT

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # make the GET request; passing params keeps the secret out of any
        # hand-built URL string and lets requests handle the encoding
        results = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30).json()

        try:
            items = results["response"]['groups'][0]['items']
        except (KeyError, IndexError, TypeError):
            # Error payloads have no 'response'/'groups'; report and move on.
            print('Following error occured while fetching venues for {}'.format(name))
            print(results)
            continue

        # keep only relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            records.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing columns= here keeps the schema intact even with zero records.
    return pd.DataFrame(records, columns=columns)

### Get venues in all neighborhoods in Central Toronto

In [28]:
ct_venues = getNearbyVenues(df_neigh_central['Neighborhood'],
                           df_neigh_central['Latitude'],
                           df_neigh_central['Longitude'])

Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
North Toronto West
The Annex, North Midtown, Yorkville
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park


In [29]:
ct_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Lawrence Park,43.72802,-79.38879,Zodiac Swim School,43.728532,-79.38286,Swim School
2,Lawrence Park,43.72802,-79.38879,TTC Bus #162 - Lawrence-Donway,43.728026,-79.382805,Bus Line
3,Roselawn,43.711695,-79.416936,Dr.Paul Hodges MIP,43.710634,-79.41581,Health & Beauty Service
4,Roselawn,43.711695,-79.416936,Ceiling Champions,43.713891,-79.420702,Home Service


In [24]:
ct_venues.shape

(112, 7)

## Analyze each Neighborhood

In [32]:
#one hot encoding (dtype=int keeps 0/1 integers regardless of pandas version)
ct_venues_onehotenc = pd.get_dummies(ct_venues['Venue Category'], prefix="", prefix_sep="", dtype=int)

# A venue category that is itself called 'Neighborhood' would collide with the
# label column added below and be silently overwritten; rename it out of the way.
if 'Neighborhood' in ct_venues_onehotenc.columns:
    ct_venues_onehotenc = ct_venues_onehotenc.rename(
        columns={'Neighborhood': 'Neighborhood (Venue Category)'})

#Add Neighborhood column directly in first position
ct_venues_onehotenc.insert(0, 'Neighborhood', ct_venues['Neighborhood'])

ct_venues_onehotenc.head()

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Bar,Breakfast Spot,Brewery,Burger Joint,Bus Line,...,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Lawrence Park,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Roselawn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Roselawn,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Average the one-hot rows per neighborhood: each cell becomes the share of
# that neighborhood's venues falling into the category.
ct_venues_grouped = ct_venues_onehotenc.groupby('Neighborhood', as_index=False).mean()
ct_venues_grouped

Unnamed: 0,Neighborhood,American Restaurant,BBQ Joint,Bagel Shop,Bank,Bar,Breakfast Spot,Brewery,Burger Joint,Bus Line,...,Sports Bar,Supermarket,Sushi Restaurant,Swim School,Thai Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Yoga Studio
0,Davisville,0.0,0.0,0.0,0.0,0.029412,0.0,0.029412,0.0,0.0,...,0.0,0.0,0.058824,0.0,0.029412,0.029412,0.0,0.0,0.0,0.0
1,Davisville North,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Forest Hill North & West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0
3,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
4,"Moore Park, Summerhill East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
5,North Toronto West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455
6,Roselawn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Summerhill West, Rathnelly, South Hill, Forest...",0.0625,0.0,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,...,0.0625,0.0625,0.0625,0.0,0.0,0.0,0.0,0.0,0.0625,0.0
8,"The Annex, North Midtown, Yorkville",0.038462,0.038462,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0


## Let's print the top 5 venues for each neighborhood

In [34]:
num_top_venues = 5

# For every neighborhood, print its most frequent venue categories.
for hood in ct_venues_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Turn the neighborhood's single row into a (venue, freq) column pair.
    hood_row = ct_venues_grouped[ct_venues_grouped['Neighborhood'] == hood]
    temp = hood_row.T.reset_index()
    temp.columns = ['venue','freq']
    # The first transposed row is the neighborhood name, not a frequency.
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Davisville----
                venue  freq
0      Sandwich Place  0.09
1        Dessert Shop  0.09
2                 Gym  0.06
3  Italian Restaurant  0.06
4         Pizza Place  0.06


----Davisville North----
               venue  freq
0              Hotel  0.11
1  Convenience Store  0.11
2                Gym  0.11
3               Park  0.11
4  Food & Drink Shop  0.11


----Forest Hill North & West----
              venue  freq
0  Sushi Restaurant   0.2
1             Trail   0.2
2     Jewelry Store   0.2
3              Park   0.2
4          Bus Line   0.2


----Lawrence Park----
                 venue  freq
0                 Park  0.33
1          Swim School  0.33
2             Bus Line  0.33
3  American Restaurant  0.00
4   Mexican Restaurant  0.00


----Moore Park, Summerhill East----
                 venue  freq
0           Playground   0.5
1                Trail   0.5
2  American Restaurant   0.0
3   Mexican Restaurant   0.0
4          Pizza Place   0.0


----North Toronto Wes

In [35]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped (the caller passes rows whose
    first entry is the neighborhood name); the remaining values are ranked
    in descending order and the index labels of the leading ones returned
    as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Let's create a new dataframe and display the top 10 venues for each neighborhood

In [36]:
num_top_venues = 10

# ordinal suffixes for ranks 1-3; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of catching the IndexError with a bare except.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ct_venues_grouped['Neighborhood']

# fill each row with that neighborhood's most frequent venue categories
for ind in np.arange(ct_venues_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ct_venues_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Davisville,Sandwich Place,Dessert Shop,Italian Restaurant,Sushi Restaurant,Café,Coffee Shop,Pizza Place,Gym,Indoor Play Area,Indian Restaurant
1,Davisville North,Hotel,Gym,Park,Department Store,Breakfast Spot,Convenience Store,Sandwich Place,Dog Run,Food & Drink Shop,Yoga Studio
2,Forest Hill North & West,Jewelry Store,Bus Line,Park,Trail,Sushi Restaurant,Donut Shop,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop
3,Lawrence Park,Bus Line,Park,Swim School,Yoga Studio,Donut Shop,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop,Fried Chicken Joint
4,"Moore Park, Summerhill East",Trail,Playground,Yoga Studio,Fried Chicken Joint,Dog Run,Donut Shop,Farmers Market,Fast Food Restaurant,Flower Shop,Food & Drink Shop


## Let's cluster the neighborhoods

In [29]:
# set number of clusters
kclusters = 5

# features only: drop the label column (keyword form; the positional axis
# argument of DataFrame.drop is no longer accepted by current pandas)
ct_venues_grouped_clustering = ct_venues_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so results do not depend on the
# library's changing default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(ct_venues_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 4, 0, 3, 1, 2, 1, 1])

Let's create a new DataFrame which includes the cluster label as well as the top 10 venues for each neighborhood

In [34]:
# add clustering labels; overwrite the column if this cell has already run,
# because DataFrame.insert raises when the column exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach cluster label and top venues to the Central Toronto neighborhoods,
# which carry the latitude/longitude needed for mapping
df_central_merged = df_neigh_central.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Neighborhoods for which no venues were fetched have no label after the join;
# the nullable integer dtype keeps the labels integral instead of float/NaN.
df_central_merged['Cluster Labels'] = df_central_merged['Cluster Labels'].astype('Int64')

df_central_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Park,Swim School,Bus Line,Yoga Studio,Fast Food Restaurant,Dessert Shop,Diner,Donut Shop,Farmers Market,Flower Shop
1,M5N,Central Toronto,Roselawn,43.711695,-79.416936,2,Garden,Yoga Studio,Dance Studio,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop,Flower Shop,Fast Food Restaurant
2,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Gym,Breakfast Spot,Park,Sandwich Place,Department Store,Food & Drink Shop,Dance Studio,Hotel,Fried Chicken Joint,Garden
3,M5P,Central Toronto,Forest Hill North & West,43.696948,-79.411307,4,Jewelry Store,Park,Trail,Sushi Restaurant,Yoga Studio,Dessert Shop,Diner,Donut Shop,Farmers Market,Flower Shop
4,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,1,Clothing Store,Coffee Shop,Sporting Goods Shop,Yoga Studio,Salon / Barbershop,Café,Chinese Restaurant,Dessert Shop,Diner,Fast Food Restaurant


### Finally, let's visualise the resulting clusters

In [43]:
# create map
map_clusters = folium.Map(location=[lat_ct, lon_ct], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for hood_lat, hood_lon, poi, cluster in zip(
        df_central_merged['Latitude'],
        df_central_merged['Longitude'],
        df_central_merged['Neighborhood'],
        df_central_merged['Cluster Labels']):
    if pd.isna(cluster):
        # No cluster was assigned (e.g. no venues were found); nothing to colour.
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run 0..kclusters-1, so they index the palette directly.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [hood_lat, hood_lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Examine Clusters

Cluster 1 (cluster label 0)

In [45]:
df_central_merged.loc[df_central_merged['Cluster Labels'] == 0, df_central_merged.columns[[1] + list(range(5, df_central_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,0,Park,Swim School,Bus Line,Yoga Studio,Fast Food Restaurant,Dessert Shop,Diner,Donut Shop,Farmers Market,Flower Shop


Cluster 2 (cluster label 1)

In [46]:
df_central_merged.loc[df_central_merged['Cluster Labels'] == 1, df_central_merged.columns[[1] + list(range(5, df_central_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Central Toronto,1,Gym,Breakfast Spot,Park,Sandwich Place,Department Store,Food & Drink Shop,Dance Studio,Hotel,Fried Chicken Joint,Garden
4,Central Toronto,1,Clothing Store,Coffee Shop,Sporting Goods Shop,Yoga Studio,Salon / Barbershop,Café,Chinese Restaurant,Dessert Shop,Diner,Fast Food Restaurant
5,Central Toronto,1,Café,Sandwich Place,Coffee Shop,American Restaurant,Grocery Store,History Museum,Donut Shop,Indian Restaurant,Liquor Store,Cosmetics Shop
6,Central Toronto,1,Sandwich Place,Dessert Shop,Italian Restaurant,Coffee Shop,Sushi Restaurant,Pizza Place,Café,Gym,Indian Restaurant,Park
8,Central Toronto,1,Coffee Shop,Pub,American Restaurant,Sports Bar,Fried Chicken Joint,Vietnamese Restaurant,Light Rail Station,Liquor Store,Restaurant,Pizza Place


Cluster 3 (cluster label 2)

In [47]:
df_central_merged.loc[df_central_merged['Cluster Labels'] == 2, df_central_merged.columns[[1] + list(range(5, df_central_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Central Toronto,2,Garden,Yoga Studio,Dance Studio,Greek Restaurant,Gourmet Shop,Gas Station,Fried Chicken Joint,Food & Drink Shop,Flower Shop,Fast Food Restaurant


Cluster 4 (cluster label 3)

In [48]:
df_central_merged.loc[df_central_merged['Cluster Labels'] == 3, df_central_merged.columns[[1] + list(range(5, df_central_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Central Toronto,3,Park,Yoga Studio,Gym,Greek Restaurant,Gourmet Shop,Gas Station,Garden,Fried Chicken Joint,Food & Drink Shop,Flower Shop


Cluster 5 (cluster label 4)

In [49]:
df_central_merged.loc[df_central_merged['Cluster Labels'] == 4, df_central_merged.columns[[1] + list(range(5, df_central_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Central Toronto,4,Jewelry Store,Park,Trail,Sushi Restaurant,Yoga Studio,Dessert Shop,Diner,Donut Shop,Farmers Market,Flower Shop
