# Toronto segmenting and clustering

## Phase 1 - scraping and formatting postal code data to neighbourhood dataframe

In [1]:
#importing necessary libraries
import pandas as pd
import numpy as np

In [2]:
# the method to process the list is quite straightforward:
# 1. need to read the original html into pandas dataframe, it will create a list of 'table' elements from the html
# 2. need to take the first element (the table with the codes)
# 3. need to filter out items where the borough is not set, I'll use simple dataframe filter for that
# 4. nedd to update neighbourhood to the borough value where the latter is set, I'll use numpy.where for that. this gives a warning, but is sufficient for now
# 5. need to combine rows with same postcode, using groupby with 2 columns (postcode, borough) for that and apply.join

In [3]:
orig_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [4]:
#the source html has 3 tables, we need only the first one
orig_table = orig_tables[0]
print(orig_table.shape)
orig_table.head()

(287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# no borough is not needed
df = orig_table
df = df[df["Borough"] != "Not assigned"]
print(df.head())
df.shape

  Postcode           Borough     Neighbourhood
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
5      M6A        North York  Lawrence Heights
6      M6A        North York    Lawrence Manor


(210, 3)

In [6]:
#updating not set neighborhood values
df["Neighbourhood"] = np.where(df["Neighbourhood"] == "Not assigned", df["Borough"],df["Neighbourhood"])
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [7]:
#grouping by postcode
print(df.shape)
df2 = df
df_clean = pd.DataFrame( df2.groupby(['Postcode','Borough'])["Neighbourhood"].apply(','.join))
print(df_clean.head())

(210, 3)
                                             Neighbourhood
Postcode Borough                                          
M1B      Scarborough                         Rouge,Malvern
M1C      Scarborough  Highland Creek,Rouge Hill,Port Union
M1E      Scarborough       Guildwood,Morningside,West Hill
M1G      Scarborough                                Woburn
M1H      Scarborough                             Cedarbrae


In [8]:
df_clean = df_clean.reset_index()
print('the resulting dataframe size is {}'.format(df_clean.shape))
df_clean.head()

the resulting dataframe size is (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Phase 2 - reading location data for neighbourhoods

In [9]:
# due to the unreliability of the api to get the data use the below flag to govern the source 
readAPI = True #set to false to read the csv directly from project's url
numRetries = 3 #the geocoder API tends to end up in some eternal loop, we need to break after a given number of retries

#may need to install geocoder - see https://anaconda.org/conda-forge/geocoder
#!conda install -c conda-forge geocoder 
import geocoder # import geocoder

In [10]:
#the below tries to call the API, but admittedly, even the demo/ simple examples from https://geocoder.readthedocs.io/api.html don't work. 
#this whole library seems to be broken 
#creating a custom exception so we can handle that the API does not return anything
class GeocoderError(Exception):
    """Raised when the geocoder API returns nothing"""
    pass

In [11]:
#dataframe for coordinates -> 3 columns, Postal code, latitude, longitude. will be filled by either API or csv read

In [12]:
#TODO: remove
readAPI = False

if (readAPI == True):

    # initialize your variable to None
    lat_lng_coords = None
    try:
        
        #TODO: the below code would not work for the whole dataframe, but the API simply does not work... below is just a stub to test it
        
        tries = 1
        # loop until you get the coordinates or need to exit the loop, as this API does not seem to work
        while(lat_lng_coords is None and tries<= numRetries):
            g = geocoder.google('{}, Toronto, Ontario'.format("M3A"))
            lat_lng_coords = g.latlng
            tries = tries+1

        if (lat_lng_coords is None):
           raise GeocoderError

    except GeocoderError:
        print("Geocoder API returned nothing, falling back to using csv")
        readAPI = False

if (readAPI == False):
    #either we did not try or we did not suceed using the geocoder
    #use csv file - download, process
    
    #!wget -q -O 'geospatial_coordinates.csv' https://cocl.us/Geospatial_data
    #print('Data downloaded!')

    df_coordinates = pd.read_csv('https://cocl.us/Geospatial_data/geospatial_coordinates.csv')
    print(df_coordinates.head())
    print(df_coordinates.shape)
    print('Data downloaded!')

    

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
(103, 3)
Data downloaded!


In [13]:
#combining the cleaned neighbourhood frame with the coordinates
#renaming Postal Code to Postcode to use as an id in merging
df_coordinates.rename(columns = {"Postal Code" : "Postcode"}, inplace=True)
df_coordinates.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
#merging them together
df_merged = pd.merge(df_clean, df_coordinates, on='Postcode')
print(df_merged.shape)
df_merged

(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Phase 3 - neighbourhood clustering

In [15]:
# this part will essentially reuse the new york sample in the following way:
# 1. explore the foursquare API and get the top10 features for lat / long
# 2. do a hot one and explode items
# 3. cluster 'hoods
# 4. display clusters on map

#importing libraries
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [16]:
#creating a map of greater toronto and visualizing our boroughs and 'hoods
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [17]:
# create map of Torono using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [18]:
# The code was removed by Watson Studio for sharing.

In [19]:
# The code was removed by Watson Studio for sharing.

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # make the GET request#foursquare api connection
        # the following variables are used to connect (already defined in function)
        # CLIENT_ID
        # CLIENT_SECRET
        # VERSION
        # uses the predefined getFoursquareExplore function, which does nothing else but calls the foursquare explore with the credentials. 
        # the function is only hidden for privacy
        results = getFoursquareExplore(lat,lng,radius,limit)["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:

#we have 103 neighborhoods - let's get the top 100 venues for the boroughs having 'Toronto' in their name in an area of 500 meters
df_foursquared = df_merged[df_merged["Borough"].str.contains("Toronto")]

limit = 100
radius = 500

toronto_venues = getNearbyVenues(names=df_foursquared['Neighbourhood'],
                                   latitudes=df_foursquared['Latitude'],
                                   longitudes=df_foursquared['Longitude']
                                  )

toronto_venues

The Beaches
The Danforth West,Riverdale
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park,Summerhill East
Deer Park,Forest Hill SE,Rathnelly,South Hill,Summerhill West
Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide,King,Richmond
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
The Annex,North Midtown,Yorkville
Harbord,University of Toronto
Chinatown,Grange Park,Kensington Market
CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place,Underground city
Christie
Dovercourt Village,Dufferin
Little Portugal,Trinity
Brockton,Exhibition Place,Parkdale Village
High Park,The Junction South
Parkdale,Roncesvalles
Runnymede

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.676300,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Domino's Pizza,43.679058,-79.297382,Pizza Place
5,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
6,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
7,"The Danforth West,Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop
8,"The Danforth West,Riverdale",43.679557,-79.352188,MenEssentials,43.677820,-79.351265,Cosmetics Shop
9,"The Danforth West,Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant


In [22]:
print(toronto_venues.shape)
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

(1686, 7)
There are 235 uniques categories.


In [23]:
print("Unique categories are: {}", format(toronto_venues['Venue Category'].unique() ))

Unique categories are: {} ['Trail' 'Health Food Store' 'Pub' 'Other Great Outdoors' 'Pizza Place'
 'Neighborhood' 'Greek Restaurant' 'Ice Cream Shop' 'Cosmetics Shop'
 'Italian Restaurant' 'Brewery' 'Yoga Studio' 'Fruit & Vegetable Store'
 'Juice Bar' 'Bookstore' 'Restaurant' 'Dessert Shop' 'Bubble Tea Shop'
 'Spa' 'Furniture / Home Store' 'Diner' 'Grocery Store' 'Coffee Shop'
 'Indian Restaurant' 'Caribbean Restaurant' 'Bakery' 'American Restaurant'
 'Liquor Store' 'Fish & Chips Shop' 'Gym' 'Burger Joint' 'Park'
 'Sushi Restaurant' 'Steakhouse' 'Pet Store' 'Fast Food Restaurant'
 'Burrito Place' 'Movie Theater' 'Sandwich Place' 'Light Rail Station'
 'Fish Market' 'Café' 'Seafood Restaurant' 'Gay Bar' 'Thai Restaurant'
 'Middle Eastern Restaurant' 'Comfort Food Restaurant' 'Cheese Shop'
 'Stationery Store' 'Coworking Space' 'Bar' 'Gastropub'
 'Gym / Fitness Center' 'Bank' 'Convenience Store'
 'Latin American Restaurant' 'Clothing Store' 'Swim School' 'Bus Line'
 'Food & Drink Shop' 'Br

In [24]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
#check the shape
toronto_onehot.shape

(1686, 235)

In [26]:
#similarly to the NY example, need to group by neighborhood and take a mean frequency
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
2,"Brockton,Exhibition Place,Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0.0,0.0,0.066667,0.066667,0.066667,0.133333,0.133333,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.012195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012195,...,0.0,0.0,0.0,0.0,0.012195,0.0,0.0,0.012195,0.0,0.0
7,"Chinatown,Grange Park,Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01087,0.0,0.0,0.0,0.021739,0.0,0.043478,0.01087,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011905,0.011905,0.0,0.0,0.0,0.0,0.0,0.0,0.011905,...,0.0,0.0,0.0,0.0,0.0,0.0,0.011905,0.0,0.0,0.011905


### Let's print each neighborhood along with the top 5 most common venue

In [27]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
             venue  freq
0      Coffee Shop  0.07
1             Café  0.05
2       Steakhouse  0.04
3              Bar  0.04
4  Thai Restaurant  0.04


----Berczy Park----
            venue  freq
0     Coffee Shop  0.07
1          Bakery  0.05
2      Steakhouse  0.04
3  Farmers Market  0.04
4            Café  0.04


----Brockton,Exhibition Place,Parkdale Village----
                   venue  freq
0         Breakfast Spot  0.09
1            Coffee Shop  0.09
2                   Café  0.09
3  Performing Arts Venue  0.09
4                 Bakery  0.09


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.05
2                 Spa  0.05
3       Garden Center  0.05
4              Garden  0.05


----CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara----
              venue  freq
0    Airport Lounge  0.13
1   Airport Service  0.

In [28]:
#sorting values
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [29]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Steakhouse,Thai Restaurant,Bakery,Burger Joint,Restaurant,Cosmetics Shop,Sushi Restaurant
1,Berczy Park,Coffee Shop,Bakery,Cocktail Bar,Café,Cheese Shop,Seafood Restaurant,Steakhouse,Beer Bar,Farmers Market,Creperie
2,"Brockton,Exhibition Place,Parkdale Village",Performing Arts Venue,Coffee Shop,Café,Breakfast Spot,Bakery,Gym,Intersection,Pet Store,Grocery Store,Climbing Gym
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Recording Studio,Smoke Shop,Skate Park,Brewery,Burrito Place,Butcher,Restaurant,Park
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Terminal,Airport Lounge,Airport Service,Plane,Harbor / Marina,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Airport Gate


### Clustering neighborhoods

running k-means to cluster 'hoods into 5 clusters

In [30]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 0, 3, 3, 3, 3,
       2, 3, 0, 3, 3, 4, 1, 3, 3, 3, 3, 3, 3, 0, 3, 3], dtype=int32)

In [31]:
#merge the dataframes together with the 10 most frequent and the cluster value
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_foursquared
#correcting the good old bour vs bor issues
toronto_merged.rename(columns={"Neighbourhood" : "Neighborhood"}, inplace = True)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pizza Place,Trail,Pub,Other Great Outdoors,Health Food Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Department Store,Donut Shop
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,3,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Restaurant,Pizza Place,Brewery,Bubble Tea Shop
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,3,Park,Pizza Place,Pub,Liquor Store,Light Rail Station,Burger Joint,Sandwich Place,Fast Food Restaurant,Burrito Place,Fish & Chips Shop
43,M4M,East Toronto,Studio District,43.659526,-79.340923,3,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Brewery,Stationery Store,Bar,Fish Market,Coworking Space
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Park,Swim School,Bus Line,Wings Joint,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


Finally, let's visualize the resulting clusters

In [32]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine clusters

#### Cluster 1

In [33]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,East Toronto,0,Pizza Place,Trail,Pub,Other Great Outdoors,Health Food Store,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Department Store,Donut Shop
45,Central Toronto,0,Gym,Clothing Store,Sandwich Place,Asian Restaurant,Food & Drink Shop,Hotel,Breakfast Spot,Park,Electronics Store,Eastern European Restaurant
48,Central Toronto,0,Gym,Intersection,Trail,Tennis Court,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
64,Central Toronto,0,Trail,Mexican Restaurant,Jewelry Store,Sushi Restaurant,Wings Joint,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


#### Cluster 2

In [34]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
63,Central Toronto,1,Garden,Wings Joint,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop


#### Cluster 3

In [35]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,Central Toronto,2,Park,Swim School,Bus Line,Wings Joint,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


#### Cluster 4

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
41,East Toronto,3,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Furniture / Home Store,Restaurant,Pizza Place,Brewery,Bubble Tea Shop
42,East Toronto,3,Park,Pizza Place,Pub,Liquor Store,Light Rail Station,Burger Joint,Sandwich Place,Fast Food Restaurant,Burrito Place,Fish & Chips Shop
43,East Toronto,3,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery,Brewery,Stationery Store,Bar,Fish Market,Coworking Space
46,Central Toronto,3,Clothing Store,Coffee Shop,Sporting Goods Shop,Gym / Fitness Center,Metro Station,Mexican Restaurant,Diner,Dessert Shop,Park,Chinese Restaurant
47,Central Toronto,3,Sandwich Place,Pizza Place,Dessert Shop,Café,Coffee Shop,Gym,Italian Restaurant,Sushi Restaurant,Flower Shop,Japanese Restaurant
49,Central Toronto,3,Pub,Coffee Shop,Pizza Place,Light Rail Station,Sports Bar,Bagel Shop,Restaurant,Supermarket,Sushi Restaurant,Fried Chicken Joint
51,Downtown Toronto,3,Coffee Shop,Park,Pizza Place,Restaurant,Café,Pub,Italian Restaurant,Bakery,Diner,Indian Restaurant
52,Downtown Toronto,3,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Pub,Men's Store,Gastropub,Gym,Hotel
53,Downtown Toronto,3,Coffee Shop,Pub,Park,Bakery,Café,Breakfast Spot,Mexican Restaurant,Theater,Spa,Electronics Store
54,Downtown Toronto,3,Coffee Shop,Clothing Store,Café,Fast Food Restaurant,Middle Eastern Restaurant,Cosmetics Shop,Bakery,Japanese Restaurant,Italian Restaurant,Bubble Tea Shop


#### Cluster 5

In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
50,Downtown Toronto,4,Park,Playground,Trail,Wings Joint,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
