Applied Data Science - Week 3 by Heston Miles

Import packages first

In [1]:
#import packages
%matplotlib inline
import pandas as pd
import numpy as np # library to handle data in a vectorized manner

import requests
import io
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

!python3 -m pip install folium
import folium # map rendering library

import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors



In [2]:
#dont need this anymore - switched to folium
#!conda install -c conda-forge geopandas

Scrape data, use BeautifulSoup. Then loop through data to organize it, remove rows that are not assigned (postal codes, boroughs).

In [3]:
#scrape data
data_url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data_page= requests.get(data_url).text
data = BeautifulSoup(data_page,'xml')

#extract data
table = data.find('table')

postalCode = []
borough = []
neighborhood = []
mainCount=0

# organize the table
for tr_cell in table.find_all('tr'):
    count = 1
    postalCode_var = -1
    borough_var = -1
    neighborhood_var = -1
    
    for td_cell in tr_cell.find_all('td'):
        if count == 1: 
            postalCode_var = td_cell.text
            postalCode_var = postalCode_var[:-1]
        if count == 2: 
            borough_var = td_cell.text
            borough_var = borough_var[:-1]
        if count == 3: 
            neighborhood_var = str(td_cell.text).strip() 
        if (postalCode_var != 'Not Assigned' and postalCode_var != -1 and borough_var != 'Not assigned' and borough_var != -1 and neighborhood_var != -1):

            postalCode.append(postalCode_var)
            borough.append(borough_var)
            neighborhood.append(neighborhood_var)
            mainCount+=1
        count +=1

In [4]:
#create dataframe
toronto_data={'Postcode':postalCode, 'Borough':borough, 'Neighborhood':neighborhood}
df_toronto = pd.DataFrame.from_dict(toronto_data)
df_toronto.to_csv('toronto_part1.csv')

THIS IS WHERE PART 2 STARTS: Import Lat/Long Package

In [5]:
df_long = pd.read_csv (r'http://cocl.us/Geospatial_data')
df_long.set_index('Postal Code')
df_toronto.set_index('Postcode')

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"
M3B,North York,Don Mills
M4B,East York,"Parkview Hill, Woodbine Gardens"
M5B,Downtown Toronto,"Garden District, Ryerson"


Match lat/long with dataset - sorting the lat/long list alphanumerically to match the df_toronto list

In [6]:
#loop through and match it up
latitude=[]
longitude=[]
latLongCount=0
df_toronto.reset_index(drop=True, inplace=True)
#sort each list by postcode
df_long.sort_values(by='Postal Code', inplace=True)
df_toronto.sort_values(by='Postcode', inplace=True)
df_toronto['Latitude']=df_long['Latitude']
df_toronto['Longitude']=df_long['Longitude']

df_toronto.sort_index()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.744734,-79.239476
6,M1B,Scarborough,"Malvern, Rouge",43.727929,-79.262029
7,M3B,North York,Don Mills,43.711112,-79.284577
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.716316,-79.239476
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.692657,-79.264848


THIS IS WHERE PART 3  STARTS: Clusters / Maps
    Cluster by boroughs with the word 'Toronto'

In [7]:
# create map of toronto using latitude and longitude values
latitude=43.6532
longitude=-79.3832
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

FourSquare Creds

In [8]:
CLIENT_ID = '14O12ED3ZS0ZQJ3G1DAOB0B4RAOHOH4FZPGDVZFC0UU1NX35' # your Foursquare ID
CLIENT_SECRET = 'V0K0HTYKF1SY1BLBTPDGHJLBCIFSP1XJSFUHNNRBWE42ZHVD' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

Perform Same Analysis as we did for NYC

In [9]:
df_toronto.loc[0, 'Neighborhood']

'Parkwoods'

In [10]:
neighborhood_latitude = df_toronto.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_toronto.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_toronto.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.806686299999996, -79.19435340000001.


Now, let's get the top 100 venues that are in Parkwoods within a radius of 500 meters.

In [11]:
# type your answer here
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=14O12ED3ZS0ZQJ3G1DAOB0B4RAOHOH4FZPGDVZFC0UU1NX35&client_secret=V0K0HTYKF1SY1BLBTPDGHJLBCIFSP1XJSFUHNNRBWE42ZHVD&v=20180605&ll=43.6532,-79.3832&radius=500&limit=100'

In [12]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ed8391b9da7ee001b46aeb5'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Bay Street Corridor',
  'headerFullLocation': 'Bay Street Corridor, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 83,
  'suggestedBounds': {'ne': {'lat': 43.6577000045, 'lng': -79.37699210971401},
   'sw': {'lat': 43.648699995499996, 'lng': -79.389407890286}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5227bb01498e17bf485e6202',
       'name': 'Downtown Toronto',
       'location': {'lat': 43.65323167517444,
        'lng': -79.38529600606677,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.65323167517444,
          'l

Only Pull Neighborhoods with 'Toronto' in the Name

In [13]:
df_toronto= df_toronto[df_toronto['Borough'].str.contains('Toronto', na = False)].reset_index(drop=True)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.786947,-79.385975
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.704324,-79.38879
3,M4M,East Toronto,Studio District,43.657162,-79.378937
4,M4N,Central Toronto,Lawrence Park,43.648198,-79.379817


In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West,  Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High

In [15]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Brockton, Parkdale Village, Exhibition Place",40,40,40,40,40,40
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",3,3,3,3,3,3
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,6,6,6,6,6,6
Christie,3,3,3,3,3,3
Church and Wellesley,8,8,8,8,8,8
"Commerce Court, Victoria Hotel",4,4,4,4,4,4
Davisville,4,4,4,4,4,4
Davisville North,58,58,58,58,58,58
"Dufferin, Dovercourt Village",6,6,6,6,6,6


Analyze the Neighborhoods

In [16]:
# one hot encode
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
cols=list(toronto_onehot.columns.values)
cols.pop(cols.index('Neighborhood'))
toronto_onehot=toronto_onehot[['Neighborhood']+cols]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,...,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [17]:
toronto_onehot.shape

(741, 197)

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [18]:
gr_toronto = toronto_onehot.groupby('Neighborhood').mean().reset_index()
gr_toronto

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,...,Toy / Game Store,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.025
1,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,...,0.0,0.0,0.0,0.051724,0.0,0.051724,0.017241,0.0,0.0,0.0
9,"Dufferin, Dovercourt Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's confirm the new size

In [19]:
gr_toronto.shape

(38, 197)

Let's print each neighborhood along with the top 5 most common venues

In [20]:
num_top_venues = 10

for hood in gr_toronto['Neighborhood']:
    print("----"+hood+"----")
    temp = gr_toronto[gr_toronto['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Brockton, Parkdale Village, Exhibition Place----
                       venue  freq
0                       Café  0.10
1                Coffee Shop  0.08
2                    Brewery  0.05
3        American Restaurant  0.05
4                     Bakery  0.05
5                  Gastropub  0.05
6                Yoga Studio  0.02
7  Middle Eastern Restaurant  0.02
8                  Bookstore  0.02
9  Latin American Restaurant  0.02


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                       venue  freq
0          Mobile Phone Shop  0.33
1             Sandwich Place  0.33
2                       Park  0.33
3               Optical Shop  0.00
4                     Office  0.00
5                       Lake  0.00
6  Latin American Restaurant  0.00
7             Lingerie Store  0.00
8               Liquor Store  0.00
9                     Lounge  0.00


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay,

                venue  freq
0       Grocery Store  0.24
1                Café  0.18
2                Park  0.12
3         Candy Store  0.06
4          Restaurant  0.06
5          Baby Store  0.06
6  Italian Restaurant  0.06
7         Coffee Shop  0.06
8               Diner  0.06
9           Nightclub  0.06


----Queen's Park, Ontario Provincial Government----
                  venue  freq
0                Bakery  0.11
1   Fried Chicken Joint  0.11
2  Caribbean Restaurant  0.11
3       Thai Restaurant  0.11
4           Gas Station  0.11
5    Athletics & Sports  0.11
6                  Bank  0.11
7                Lounge  0.11
8      Hakka Restaurant  0.11
9    Mexican Restaurant  0.00


----Regent Park, Harbourfront----
                 venue  freq
0       Breakfast Spot  0.14
1       Medical Center  0.14
2    Electronics Store  0.14
3         Intersection  0.14
4   Mexican Restaurant  0.14
5                 Bank  0.14
6  Rental Car Location  0.14
7              Airport  0.00
8    Mobile

Let's put that into a pandas dataframe

First, let's write a function to sort the venues in descending order.

In [21]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [22]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = gr_toronto['Neighborhood']

for ind in np.arange(gr_toronto.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(gr_toronto.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Gastropub,Brewery,Bakery,American Restaurant,Comfort Food Restaurant,Bookstore,Cheese Shop,Pet Store
1,"Business reply mail Processing Centre, South C...",Park,Sandwich Place,Mobile Phone Shop,Department Store,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center
2,"CN Tower, King and Spadina, Railway Lands, Har...",Yoga Studio,Auto Workshop,Park,Pizza Place,Butcher,Burrito Place,Restaurant,Brewery,Comic Shop,Farmers Market
3,Central Bay Street,Butcher,Grocery Store,Coffee Shop,Pharmacy,Pizza Place,Bank,Yoga Studio,Electronics Store,Eastern European Restaurant,Donut Shop
4,Christie,Park,Food & Drink Shop,Pool,Department Store,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center


Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [28]:
# set number of clusters
kclusters = 5

gr_toronto_clustering = gr_toronto.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(gr_toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 2, 2, 0, 2, 2, 2, 2, 2], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [66]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.dropna(inplace=True) 
toronto_merged['Cluster Labels']=toronto_merged['Cluster Labels'].astype('int')
toronto_merged # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.786947,-79.385975,2,Japanese Restaurant,Chinese Restaurant,Bank,Café,Coworking Space,Distribution Center,Falafel Restaurant,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,2,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Restaurant,Furniture / Home Store,Spa,Caribbean Restaurant,Café,Juice Bar
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.704324,-79.38879,2,Sandwich Place,Dessert Shop,Italian Restaurant,Sushi Restaurant,Coffee Shop,Café,Gym,Pizza Place,Indoor Play Area,Seafood Restaurant
3,M4M,East Toronto,Studio District,43.657162,-79.378937,2,Clothing Store,Coffee Shop,Middle Eastern Restaurant,Café,Bubble Tea Shop,Italian Restaurant,Cosmetics Shop,Japanese Restaurant,Ramen Restaurant,Diner
4,M4N,Central Toronto,Lawrence Park,43.648198,-79.379817,2,Coffee Shop,Café,Restaurant,Hotel,Gym,American Restaurant,Japanese Restaurant,Seafood Restaurant,Deli / Bodega,Italian Restaurant
5,M4P,Central Toronto,Davisville North,43.653206,-79.400049,2,Café,Coffee Shop,Mexican Restaurant,Bakery,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Park,Bar,Grocery Store,Dessert Shop
6,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.693781,-79.428191,2,Field,Trail,Tennis Court,Hockey Arena,Convenience Store,Cosmetics Shop,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop
7,M4S,Central Toronto,Davisville,43.713756,-79.490074,2,Park,Construction & Landscaping,Basketball Court,Bakery,Discount Store,Falafel Restaurant,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.64896,-79.456325,2,Gift Shop,Breakfast Spot,Bookstore,Dog Run,Italian Restaurant,Coffee Shop,Movie Theater,Cuban Restaurant,Eastern European Restaurant,Dessert Shop
9,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.636966,-79.615819,2,Coffee Shop,Hotel,Middle Eastern Restaurant,Mediterranean Restaurant,Intersection,Sandwich Place,American Restaurant,Gym,Burrito Place,Fried Chicken Joint


Finally, let's visualize the resulting clusters

In [67]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters