## Coursera 'applied data science' capstone week 3

### Exercise part A - creation of Toronto neighborhoods data frame

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


#### Import data from website using BeautifulSoup and create raw dataframe 'df'

In [2]:
res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))[0]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [3]:
df.shape

(287, 3)

#### Remove rows with no value for 'Borough' and create 'df_borough' dataframe

In [4]:
df_borough = df[(df['Borough'] != 'Not assigned')]
df_borough.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [5]:
df_borough.shape

(210, 3)

#### For neighborhoods without a name, assign borough name as neigborhood name

In [6]:
df_borough['Neighbourhood'].loc[df_borough['Neighbourhood'] == 'Not assigned'] = df_borough['Borough'].loc[df_borough['Neighbourhood'] == 'Not assigned']
df_borough.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [7]:
df_borough.shape

(210, 3)

#### Join rows that have a common postcode, including all neigborhoods separated by commas, and create 'toronto_postcodes' dataframe

In [8]:
toronto_postcodes = pd.DataFrame(df_borough.groupby(['Postcode', 'Borough'])['Neighbourhood'].agg(lambda x : ', '.join(set(x)))).reset_index()
toronto_postcodes.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"West Hill, Morningside, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Ionview, East Birchmount Park, Kennedy Park"
7,M1L,Scarborough,"Oakridge, Clairlea, Golden Mile"
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest"
9,M1N,Scarborough,"Cliffside West, Birch Cliff"


In [9]:
toronto_postcodes.shape

(103, 3)

### Exercise part B - assigment of latitude and longitude to postal code dataframe

#### Read latitude/longitude data into separate 'df_latlong' dataframe

In [10]:
latlong_url = 'https://cocl.us/Geospatial_data'
latlong_url

'https://cocl.us/Geospatial_data'

In [11]:
df_latlong = pd.read_csv(latlong_url)
df_latlong.rename(columns = {'Postal Code' : 'Postcode'}, inplace=True)
df_latlong.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge latitude/longitude data into existing postal code data

In [12]:
toronto_latlong = pd.merge(toronto_postcodes, df_latlong, on='Postcode')
toronto_latlong.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Ionview, East Birchmount Park, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Oakridge, Clairlea, Golden Mile",43.711112,-79.284577
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West, Birch Cliff",43.692657,-79.264848


In [13]:
toronto_latlong.shape

(103, 5)

In [14]:
print('The dataframe has {} postal codes and {} boroughs.'.format(toronto_latlong.shape[0], len(toronto_latlong['Borough'].unique())))

The dataframe has 103 postal codes and 11 boroughs.


### Exercise part C - neighborhood exploration using top-level data

#### Extract Toronto latitude and longitude and produce map of Toronto area

In [15]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [16]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, postcode, neighborhood in zip(toronto_latlong['Latitude'], toronto_latlong['Longitude'], toronto_latlong['Borough'], toronto_latlong['Postcode'], toronto_latlong['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Focus on Toronto proper, by extracting postcodes with 'Toronto' in the borough name

In [17]:
toronto_proper = toronto_latlong[toronto_latlong['Borough'].str.contains('Toronto')].reset_index()
toronto_proper.head(10)

Unnamed: 0,index,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,47,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,49,M4V,Central Toronto,"South Hill, Summerhill West, Forest Hill SE, R...",43.686412,-79.400049


In [18]:
toronto_proper.shape

(39, 6)

In [19]:
map_toronto_proper = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, postcode, neighborhood in zip(toronto_proper['Latitude'], toronto_proper['Longitude'], toronto_proper['Borough'], toronto_proper['Postcode'], toronto_proper['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_proper)  
    
map_toronto_proper

#### Exploration of first neighborhood grouping on list

In [20]:
CLIENT_ID = 'O2QLX2DPST4TZUAUT55P5N2ZNMKHRRQNXGOJ3VSB5GLRLGWB' # your Foursquare ID
CLIENT_SECRET = 'KUBR2PZ2A2FIWC0XPCJDZT2ARPPNPSHNBHQE54KYHTESCDG1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: O2QLX2DPST4TZUAUT55P5N2ZNMKHRRQNXGOJ3VSB5GLRLGWB
CLIENT_SECRET:KUBR2PZ2A2FIWC0XPCJDZT2ARPPNPSHNBHQE54KYHTESCDG1


In [21]:
toronto_proper.loc[0, 'Neighbourhood']

'The Beaches'

In [22]:
neighborhood_latitude = toronto_proper.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_proper.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_proper.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


#### Search for top 100 venues within 500 meters of "The Beaches" neighborhood center - returned four venues

In [23]:
radius = 500
latitude = neighborhood_latitude
longitude = neighborhood_longitude
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
url


'https://api.foursquare.com/v2/venues/explore?client_id=O2QLX2DPST4TZUAUT55P5N2ZNMKHRRQNXGOJ3VSB5GLRLGWB&client_secret=KUBR2PZ2A2FIWC0XPCJDZT2ARPPNPSHNBHQE54KYHTESCDG1&ll=43.67635739999999,-79.2930312&v=20180605&radius=500&limit=100'

In [24]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e34cf261d67cb001bad36d9'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'labe

In [25]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [26]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(20)

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [27]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


#### Explore all venues within all neighborhood groupings in Toronto 

In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [29]:
toronto_venues = getNearbyVenues(names=toronto_proper['Neighbourhood'],
                                   latitudes=toronto_proper['Latitude'],
                                   longitudes=toronto_proper['Longitude']
                                  )

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
South Hill, Summerhill West, Forest Hill SE, Rathnelly, Deer Park
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
King, Richmond, Adelaide
Harbourfront East, Toronto Islands, Union Station
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill West, Forest Hill North
Yorkville, North Midtown, The Annex
University of Toronto, Harbord
Kensington Market, Grange Park, Chinatown
Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, CN Tower, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Trinity, Little Portugal
Parkdale Village, Exhibition Place, Brockton
The Junction South, High Par

In [30]:
print(toronto_venues.shape)
toronto_venues.head(10)

(1708, 7)


Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
5,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop
6,"The Danforth West, Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
7,"The Danforth West, Riverdale",43.679557,-79.352188,Mezes,43.677962,-79.350196,Greek Restaurant
8,"The Danforth West, Riverdale",43.679557,-79.352188,Messini Authentic Gyros,43.677827,-79.350569,Greek Restaurant
9,"The Danforth West, Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


#### Create dataframe that replaces old values of 'venue category' with Foursquare top-level categories 

Idea is that broader groupings may facilitate effective clustering by highlighting neighborhood differences at a more functional level than would be feasible with more granular groupings.  Foursquare appears to have four levels of venue categories, and the values that are returned initially appears to be extracted from the second, third and fourth categories.

In [31]:
# Venue categories are described at:  https://developer.foursquare.com/docs/resources/categories
# This code extracts multi-layer data of the venue category levels 

urltl = 'https://api.foursquare.com/v2/venues/categories?client_id={}&client_secret={}&v={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION)
resultstl = requests.get(urltl).json()
results_tl2 = resultstl['response']['categories']
flatt = json_normalize(results_tl2)
level1 = pd.DataFrame(flatt)

In [32]:
# Create empty dataframe to receive the venue level information 

column_names = ['TL', 'ML', 'LL', 'LLL'] 
MultiLevels = pd.DataFrame(columns=column_names)
MultiLevels

Unnamed: 0,TL,ML,LL,LLL


In [33]:
# Extract venue levels and place into new dataframe.  
# For venues that do not have third-level ('LL') or fourth-level ('LLL') identifiers, the second-level ('ML')
# labels are downstreamed to the lower level. 

for w in range(10): 
    TL = level1.iloc[w, 4]
    level2 = pd.DataFrame(level1.iloc[w,0])
    len2 = level2.shape[0]
    for x in range(len2):
        ML = level2.iloc[x, 3]
        if level2.iloc[x, 0] == []:
            LL = ML
            LLL = ML
            temp = pd.DataFrame([TL, ML, LL, LLL]).T
            temp.columns = column_names
            MultiLevels = MultiLevels.append(temp)
        else:
            level3 = pd.DataFrame(level2.iloc[x, 0])
            len3 = level3.shape[0]
            for y in range(len3):
                LL = level3.iloc[y, 3]
                if level3.iloc[y, 0] == []:
                    LLL = LL
                    temp = pd.DataFrame([TL, ML, LL, LLL]).T
                    temp.columns = column_names
                    MultiLevels = MultiLevels.append(temp)
                else: 
                    level4 = pd.DataFrame(level3.iloc[y, 0])
                    len4 = level4.shape[0]
                    for z in range(len4):
                        LLL = level4.iloc[z, 3]
                        temp = pd.DataFrame([TL, ML, LL, LLL]).T
                        temp.columns = column_names
                        MultiLevels = MultiLevels.append(temp)
                    
MultiLevels = pd.DataFrame(MultiLevels)


In [34]:
# Examples of data with two levels
MultiLevels.iloc[1:10, ]

Unnamed: 0,TL,ML,LL,LLL
0,Arts & Entertainment,Aquarium,Aquarium,Aquarium
0,Arts & Entertainment,Arcade,Arcade,Arcade
0,Arts & Entertainment,Art Gallery,Art Gallery,Art Gallery
0,Arts & Entertainment,Bowling Alley,Bowling Alley,Bowling Alley
0,Arts & Entertainment,Casino,Casino,Casino
0,Arts & Entertainment,Circus,Circus,Circus
0,Arts & Entertainment,Comedy Club,Comedy Club,Comedy Club
0,Arts & Entertainment,Concert Hall,Concert Hall,Concert Hall
0,Arts & Entertainment,Country Dance Club,Country Dance Club,Country Dance Club


In [35]:
# Examples of data with four levels
MultiLevels.iloc[150:159, ]

Unnamed: 0,TL,ML,LL,LLL
0,Food,Asian Restaurant,Indonesian Restaurant,Manadonese Restaurant
0,Food,Asian Restaurant,Indonesian Restaurant,Padangnese Restaurant
0,Food,Asian Restaurant,Indonesian Restaurant,Sundanese Restaurant
0,Food,Asian Restaurant,Japanese Restaurant,Donburi Restaurant
0,Food,Asian Restaurant,Japanese Restaurant,Japanese Curry Restaurant
0,Food,Asian Restaurant,Japanese Restaurant,Kaiseki Restaurant
0,Food,Asian Restaurant,Japanese Restaurant,Kushikatsu Restaurant
0,Food,Asian Restaurant,Japanese Restaurant,Monjayaki Restaurant
0,Food,Asian Restaurant,Japanese Restaurant,Nabe Restaurant


In [36]:
# Creation of dictonary that maps the 'venue category' to the top level ('TL').

# The 'TL_ML', 'TL_LL' and 'TL_LLL' subsets will be used to generate the top-level dictionary.

TL_ML = MultiLevels[['TL', 'ML']].copy()
TL_ML.rename(columns={'ML':'zzz'}, inplace=True)

TL_LL = MultiLevels[['TL', 'LL']].copy()
TL_LL.rename(columns={'LL':'zzz'}, inplace=True)

TL_LLL = MultiLevels[['TL', 'LLL']].copy()
TL_LLL.rename(columns={'LLL':'zzz'}, inplace=True)

In [37]:
# Source data for top-level dictionary

TL_ML_LL_LLL = pd.concat([TL_ML, TL_LL, TL_LLL]).drop_duplicates().reset_index(drop=True)
TL_ML_LL_LLL.rename(columns={'zzz':'Venue Category'}, inplace=True)
TL_ML_LL_LLL.head()

Unnamed: 0,TL,Venue Category
0,Arts & Entertainment,Amphitheater
1,Arts & Entertainment,Aquarium
2,Arts & Entertainment,Arcade
3,Arts & Entertainment,Art Gallery
4,Arts & Entertainment,Bowling Alley


In [38]:
# Top level dictionary
dict_TL = TL_ML_LL_LLL.set_index('Venue Category')['TL'].to_dict()
dict_TL

{'Amphitheater': 'Arts & Entertainment',
 'Aquarium': 'Arts & Entertainment',
 'Arcade': 'Arts & Entertainment',
 'Art Gallery': 'Arts & Entertainment',
 'Bowling Alley': 'Arts & Entertainment',
 'Casino': 'Arts & Entertainment',
 'Circus': 'Arts & Entertainment',
 'Comedy Club': 'Arts & Entertainment',
 'Concert Hall': 'Arts & Entertainment',
 'Country Dance Club': 'Arts & Entertainment',
 'Disc Golf': 'Arts & Entertainment',
 'Exhibit': 'Arts & Entertainment',
 'General Entertainment': 'Arts & Entertainment',
 'Go Kart Track': 'Arts & Entertainment',
 'Historic Site': 'Arts & Entertainment',
 'Karaoke Box': 'Arts & Entertainment',
 'Laser Tag': 'Arts & Entertainment',
 'Memorial Site': 'Arts & Entertainment',
 'Mini Golf': 'Arts & Entertainment',
 'Movie Theater': 'Arts & Entertainment',
 'Museum': 'Arts & Entertainment',
 'Music Venue': 'Arts & Entertainment',
 'Pachinko Parlor': 'Arts & Entertainment',
 'Performing Arts Venue': 'Arts & Entertainment',
 'Pool Hall': 'Arts & Entertai

#### Incorporation of top-level data into the analysis

In [39]:
# 'Old venue category' serves as a check.    
toronto_venues_top = toronto_venues.copy()
toronto_venues_top[['Old venue category']] = toronto_venues_top[['Venue Category']]
toronto_venues_top.head(10)

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Old venue category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant,Greek Restaurant
5,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop,Cosmetics Shop
6,"The Danforth West, Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant,Italian Restaurant
7,"The Danforth West, Riverdale",43.679557,-79.352188,Mezes,43.677962,-79.350196,Greek Restaurant,Greek Restaurant
8,"The Danforth West, Riverdale",43.679557,-79.352188,Messini Authentic Gyros,43.677827,-79.350569,Greek Restaurant,Greek Restaurant
9,"The Danforth West, Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop,Ice Cream Shop


In [40]:
# 'Venue category' is replaced with the top level category using the dictionary.
toronto_venues_top['Venue Category']= toronto_venues_top['Venue Category'].map(dict_TL)
toronto_venues_top.head(10)

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Old venue category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Outdoors & Recreation,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Shop & Service,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Nightlife Spot,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Outdoors & Recreation,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Food,Greek Restaurant
5,"The Danforth West, Riverdale",43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Shop & Service,Cosmetics Shop
6,"The Danforth West, Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Food,Italian Restaurant
7,"The Danforth West, Riverdale",43.679557,-79.352188,Mezes,43.677962,-79.350196,Food,Greek Restaurant
8,"The Danforth West, Riverdale",43.679557,-79.352188,Messini Authentic Gyros,43.677827,-79.350569,Food,Greek Restaurant
9,"The Danforth West, Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Food,Ice Cream Shop


Take care of the single NaN by assiging 'Food' to venue category for that row

In [41]:
nullcheck = toronto_venues_top[toronto_venues_top['Venue Category'].isnull()]
nullcheck.head(30)

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Old venue category
1614,"Runnymede, Swansea",43.651571,-79.48445,The Good Fork,43.649565,-79.484023,,Food


In [42]:
toronto_venues_top.iat[1614, 6] = 'Food'
toronto_venues_top.iloc[1612:1616, ]

Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Old venue category
1612,"Roncesvalles, Parkdale",43.64896,-79.456325,A Good Read,43.64947,-79.450339,Shop & Service,Bookstore
1613,"Roncesvalles, Parkdale",43.64896,-79.456325,Butler's Pantry,43.650087,-79.450458,Food,Breakfast Spot
1614,"Runnymede, Swansea",43.651571,-79.48445,The Good Fork,43.649565,-79.484023,Food,Food
1615,"Runnymede, Swansea",43.651571,-79.48445,The One That Got Away,43.649842,-79.482615,Food,Fish & Chips Shop


#### Run analysis using top-level data

In [43]:
toronto_venues_top.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Old venue category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, CN Tower, South Niagara",18,18,18,18,18,18,18
Berczy Park,55,55,55,55,55,55,55
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17,17
"Cabbagetown, St. James Town",44,44,44,44,44,44,44
Central Bay Street,79,79,79,79,79,79,79
Christie,18,18,18,18,18,18,18
Church and Wellesley,83,83,83,83,83,83,83
"Commerce Court, Victoria Hotel",100,100,100,100,100,100,100
Davisville,33,33,33,33,33,33,33
Davisville North,7,7,7,7,7,7,7


In [44]:
print('There are {} uniques categories.'.format(len(toronto_venues_top['Venue Category'].unique())))

There are 9 uniques categories.


In [45]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues_top[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
toronto_onehot.insert(0, 'Neighbourhood', toronto_venues_top['Neighbourhood'])
toronto_onehot.head(10)

Unnamed: 0,Neighbourhood,Arts & Entertainment,College & University,Event,Food,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Shop & Service,Travel & Transport
0,The Beaches,0,0,0,0,0,1,0,0,0
1,The Beaches,0,0,0,0,0,0,0,1,0
2,The Beaches,0,0,0,0,1,0,0,0,0
3,The Beaches,0,0,0,0,0,1,0,0,0
4,"The Danforth West, Riverdale",0,0,0,1,0,0,0,0,0
5,"The Danforth West, Riverdale",0,0,0,0,0,0,0,1,0
6,"The Danforth West, Riverdale",0,0,0,1,0,0,0,0,0
7,"The Danforth West, Riverdale",0,0,0,1,0,0,0,0,0
8,"The Danforth West, Riverdale",0,0,0,1,0,0,0,0,0
9,"The Danforth West, Riverdale",0,0,0,1,0,0,0,0,0


In [46]:
toronto_onehot.shape

(1708, 10)

#### Analysis across neighborhood groupings

In [47]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Arts & Entertainment,College & University,Event,Food,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Shop & Service,Travel & Transport
0,"Bathurst Quay, Island airport, Harbourfront We...",0.0,0.0,0.0,0.055556,0.055556,0.111111,0.0,0.055556,0.722222
1,Berczy Park,0.090909,0.0,0.0,0.509091,0.109091,0.054545,0.0,0.218182,0.018182
2,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.235294,0.058824,0.235294,0.0,0.352941,0.117647
3,"Cabbagetown, St. James Town",0.045455,0.0,0.0,0.568182,0.045455,0.090909,0.0,0.25,0.0
4,Central Bay Street,0.0,0.0,0.0,0.772152,0.037975,0.050633,0.012658,0.113924,0.012658
5,Christie,0.0,0.0,0.0,0.388889,0.055556,0.166667,0.0,0.388889,0.0
6,Church and Wellesley,0.036145,0.0,0.0,0.590361,0.120482,0.072289,0.0,0.156627,0.024096
7,"Commerce Court, Victoria Hotel",0.04,0.0,0.0,0.68,0.08,0.06,0.01,0.07,0.06
8,Davisville,0.0,0.0,0.0,0.69697,0.030303,0.090909,0.0,0.181818,0.0
9,Davisville North,0.0,0.0,0.0,0.285714,0.0,0.285714,0.0,0.285714,0.142857


In [48]:
toronto_grouped.shape

(39, 10)

#### Sort venues and place into data frame, identifying the top six venues within each neighborhood grouping

In [49]:
num_top_venues = 6

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, CN Tower, South Niagara----
                   venue  freq
0     Travel & Transport  0.72
1  Outdoors & Recreation  0.11
2                   Food  0.06
3         Nightlife Spot  0.06
4         Shop & Service  0.06
5   Arts & Entertainment  0.00


----Berczy Park----
                   venue  freq
0                   Food  0.51
1         Shop & Service  0.22
2         Nightlife Spot  0.11
3   Arts & Entertainment  0.09
4  Outdoors & Recreation  0.05
5     Travel & Transport  0.02


----Business Reply Mail Processing Centre 969 Eastern----
                   venue  freq
0         Shop & Service  0.35
1                   Food  0.24
2  Outdoors & Recreation  0.24
3     Travel & Transport  0.12
4         Nightlife Spot  0.06
5   Arts & Entertainment  0.00


----Cabbagetown, St. James Town----
                   venue  freq
0                   Food  0.57
1         Shop & Service  0.25
2  Outdoors & Recreat

                   venue  freq
0                   Food  0.69
1         Shop & Service  0.19
2         Nightlife Spot  0.07
3  Outdoors & Recreation  0.05
4   Arts & Entertainment  0.00
5   College & University  0.00


----The Junction South, High Park----
                   venue  freq
0                   Food  0.54
1         Shop & Service  0.25
2         Nightlife Spot  0.12
3   Arts & Entertainment  0.04
4  Outdoors & Recreation  0.04
5   College & University  0.00


----Toronto Dominion Centre, Design Exchange----
                   venue  freq
0                   Food  0.66
1         Nightlife Spot  0.11
2     Travel & Transport  0.09
3         Shop & Service  0.06
4   Arts & Entertainment  0.05
5  Outdoors & Recreation  0.03


----Trinity, Little Portugal----
                   venue  freq
0                   Food  0.61
1         Nightlife Spot  0.17
2         Shop & Service  0.13
3   Arts & Entertainment  0.06
4  Outdoors & Recreation  0.04
5   College & University  0.00


----

In [50]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [51]:
num_top_venues = 6

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
0,"Bathurst Quay, Island airport, Harbourfront We...",Travel & Transport,Outdoors & Recreation,Shop & Service,Nightlife Spot,Food,Professional & Other Places
1,Berczy Park,Food,Shop & Service,Nightlife Spot,Arts & Entertainment,Outdoors & Recreation,Travel & Transport
2,Business Reply Mail Processing Centre 969 Eastern,Shop & Service,Outdoors & Recreation,Food,Travel & Transport,Nightlife Spot,Professional & Other Places
3,"Cabbagetown, St. James Town",Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment,Travel & Transport
4,Central Bay Street,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Travel & Transport,Professional & Other Places
5,Christie,Shop & Service,Food,Outdoors & Recreation,Nightlife Spot,Travel & Transport,Professional & Other Places
6,Church and Wellesley,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Arts & Entertainment,Travel & Transport
7,"Commerce Court, Victoria Hotel",Food,Nightlife Spot,Shop & Service,Travel & Transport,Outdoors & Recreation,Arts & Entertainment
8,Davisville,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Travel & Transport,Professional & Other Places
9,Davisville North,Shop & Service,Outdoors & Recreation,Food,Travel & Transport,Professional & Other Places,Nightlife Spot


#### K-means clustering of each neighborhood grouping, using five clusters

In [52]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 3, 0, 0, 3, 0, 0, 0, 3], dtype=int32)

#### Incorporate clusters into new dataframe, and map the clusters

In [53]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_proper.copy()

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head(10)

Unnamed: 0,index,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031,4,Outdoors & Recreation,Shop & Service,Nightlife Spot,Travel & Transport,Professional & Other Places,Food
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Travel & Transport,Professional & Other Places
2,42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment,Travel & Transport
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Travel & Transport
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Travel & Transport,Professional & Other Places,Outdoors & Recreation,Shop & Service,Nightlife Spot,Food
5,45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,3,Shop & Service,Outdoors & Recreation,Food,Travel & Transport,Professional & Other Places,Nightlife Spot
6,46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,3,Food,Shop & Service,Outdoors & Recreation,Travel & Transport,Professional & Other Places,Nightlife Spot
7,47,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Travel & Transport,Professional & Other Places
8,48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,1,Outdoors & Recreation,Travel & Transport,Shop & Service,Professional & Other Places,Nightlife Spot,Food
9,49,M4V,Central Toronto,"South Hill, Summerhill West, Forest Hill SE, R...",43.686412,-79.400049,0,Food,Nightlife Spot,Shop & Service,Travel & Transport,Professional & Other Places,Outdoors & Recreation


In [54]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### First cluster 

In [55]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[3] + list(range(7, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
1,"The Danforth West, Riverdale",Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Travel & Transport,Professional & Other Places
2,"The Beaches West, India Bazaar",Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment,Travel & Transport
3,Studio District,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Travel & Transport
7,Davisville,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Travel & Transport,Professional & Other Places
9,"South Hill, Summerhill West, Forest Hill SE, R...",Food,Nightlife Spot,Shop & Service,Travel & Transport,Professional & Other Places,Outdoors & Recreation
11,"Cabbagetown, St. James Town",Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment,Travel & Transport
12,Church and Wellesley,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Arts & Entertainment,Travel & Transport
13,Harbourfront,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment,Travel & Transport
14,"Garden District, Ryerson",Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment,Travel & Transport
15,St. James Town,Food,Shop & Service,Nightlife Spot,Arts & Entertainment,Travel & Transport,Outdoors & Recreation


#### Second cluster

In [56]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[3] + list(range(7, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
8,"Moore Park, Summerhill East",Outdoors & Recreation,Travel & Transport,Shop & Service,Professional & Other Places,Nightlife Spot,Food
10,Rosedale,Outdoors & Recreation,Travel & Transport,Shop & Service,Professional & Other Places,Nightlife Spot,Food
22,Roselawn,Outdoors & Recreation,Travel & Transport,Shop & Service,Professional & Other Places,Nightlife Spot,Food


#### Third cluster 

In [57]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[3] + list(range(7, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
4,Lawrence Park,Travel & Transport,Professional & Other Places,Outdoors & Recreation,Shop & Service,Nightlife Spot,Food
27,"Bathurst Quay, Island airport, Harbourfront We...",Travel & Transport,Outdoors & Recreation,Shop & Service,Nightlife Spot,Food,Professional & Other Places


#### Fourth cluster 

In [58]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[3] + list(range(7, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
5,Davisville North,Shop & Service,Outdoors & Recreation,Food,Travel & Transport,Professional & Other Places,Nightlife Spot
6,North Toronto West,Food,Shop & Service,Outdoors & Recreation,Travel & Transport,Professional & Other Places,Nightlife Spot
23,"Forest Hill West, Forest Hill North",Food,Shop & Service,Outdoors & Recreation,Travel & Transport,Professional & Other Places,Nightlife Spot
30,Christie,Shop & Service,Food,Outdoors & Recreation,Nightlife Spot,Travel & Transport,Professional & Other Places
31,"Dufferin, Dovercourt Village",Shop & Service,Food,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment,Travel & Transport
38,Business Reply Mail Processing Centre 969 Eastern,Shop & Service,Outdoors & Recreation,Food,Travel & Transport,Nightlife Spot,Professional & Other Places


#### Fifth cluster 

In [59]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[3] + list(range(7, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
0,The Beaches,Outdoors & Recreation,Shop & Service,Nightlife Spot,Travel & Transport,Professional & Other Places,Food


### Redoing clustering, after adding the normalized number of venues as an input
#### The above clustering exercise does not lead to interesting conclusions, so the exercise is repeated after adding the normalized number of venues to the mix.

#### Extracting number of venues for each neighborhood group, with min-max normalizing

In [60]:
neighborhood_count = pd.DataFrame(toronto_venues_top.groupby('Neighbourhood').count())
neighborhood_count = neighborhood_count.iloc[:, 1]
neighborhood_count = pd.DataFrame((neighborhood_count - neighborhood_count.min())/(neighborhood_count.max() - neighborhood_count.min())).reset_index()
neighborhood_count.rename(columns={'Neighborhood Longitude':'values_norm'}, inplace=True)
neighborhood_count

Unnamed: 0,Neighbourhood,values_norm
0,"Bathurst Quay, Island airport, Harbourfront We...",0.163265
1,Berczy Park,0.540816
2,Business Reply Mail Processing Centre 969 Eastern,0.153061
3,"Cabbagetown, St. James Town",0.428571
4,Central Bay Street,0.785714
5,Christie,0.163265
6,Church and Wellesley,0.826531
7,"Commerce Court, Victoria Hotel",1.0
8,Davisville,0.316327
9,Davisville North,0.05102


In [61]:
neighborhood_count.shape

(39, 2)

#### Incorporation of normalized values as an input into the clustering exercise

In [62]:
toronto_grouped2 = toronto_grouped.copy()
toronto_grouped2 = pd.concat([toronto_grouped2, neighborhood_count.iloc[:, 1]], axis=1)
toronto_grouped2.head(10)

Unnamed: 0,Neighbourhood,Arts & Entertainment,College & University,Event,Food,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Shop & Service,Travel & Transport,values_norm
0,"Bathurst Quay, Island airport, Harbourfront We...",0.0,0.0,0.0,0.055556,0.055556,0.111111,0.0,0.055556,0.722222,0.163265
1,Berczy Park,0.090909,0.0,0.0,0.509091,0.109091,0.054545,0.0,0.218182,0.018182,0.540816
2,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.235294,0.058824,0.235294,0.0,0.352941,0.117647,0.153061
3,"Cabbagetown, St. James Town",0.045455,0.0,0.0,0.568182,0.045455,0.090909,0.0,0.25,0.0,0.428571
4,Central Bay Street,0.0,0.0,0.0,0.772152,0.037975,0.050633,0.012658,0.113924,0.012658,0.785714
5,Christie,0.0,0.0,0.0,0.388889,0.055556,0.166667,0.0,0.388889,0.0,0.163265
6,Church and Wellesley,0.036145,0.0,0.0,0.590361,0.120482,0.072289,0.0,0.156627,0.024096,0.826531
7,"Commerce Court, Victoria Hotel",0.04,0.0,0.0,0.68,0.08,0.06,0.01,0.07,0.06,1.0
8,Davisville,0.0,0.0,0.0,0.69697,0.030303,0.090909,0.0,0.181818,0.0,0.316327
9,Davisville North,0.0,0.0,0.0,0.285714,0.0,0.285714,0.0,0.285714,0.142857,0.05102


In [63]:
# set number of clusters
kclusters2 = 5

toronto_grouped_clustering2 = toronto_grouped2.drop('Neighbourhood', 1)

# run k-means clustering
kmeans2 = KMeans(n_clusters=kclusters2, random_state=0).fit(toronto_grouped_clustering2)

# check cluster labels generated for each row in the dataframe
kmeans2.labels_[0:10] 
#toronto_grouped_clustering2.head()

array([3, 1, 4, 1, 0, 4, 0, 0, 1, 4], dtype=int32)

In [64]:
# add clustering labels
neighborhoods_venues_sorted2 = neighborhoods_venues_sorted.copy()
neighborhoods_venues_sorted2.drop(['Cluster Labels'], axis=1, inplace=True)
neighborhoods_venues_sorted2.insert(0, 'Cluster Labels', kmeans2.labels_)

toronto_merged2 = toronto_proper.copy()
toronto_merged2 = pd.concat([toronto_merged2, neighborhood_count.iloc[:, 1]], axis=1)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged2 = toronto_merged2.join(neighborhoods_venues_sorted2.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged2.head(10)

Unnamed: 0,index,Postcode,Borough,Neighbourhood,Latitude,Longitude,values_norm,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue
0,37,M4E,East Toronto,The Beaches,43.676357,-79.293031,0.163265,4,Outdoors & Recreation,Shop & Service,Nightlife Spot,Travel & Transport,Professional & Other Places,Food
1,41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0.540816,1,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Travel & Transport,Professional & Other Places
2,42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0.153061,1,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment,Travel & Transport
3,43,M4M,East Toronto,Studio District,43.659526,-79.340923,0.428571,1,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Travel & Transport
4,44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0.785714,3,Travel & Transport,Professional & Other Places,Outdoors & Recreation,Shop & Service,Nightlife Spot,Food
5,45,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0.163265,4,Shop & Service,Outdoors & Recreation,Food,Travel & Transport,Professional & Other Places,Nightlife Spot
6,46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0.826531,4,Food,Shop & Service,Outdoors & Recreation,Travel & Transport,Professional & Other Places,Nightlife Spot
7,47,M4S,Central Toronto,Davisville,43.704324,-79.38879,1.0,1,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Travel & Transport,Professional & Other Places
8,48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,0.316327,2,Outdoors & Recreation,Travel & Transport,Shop & Service,Professional & Other Places,Nightlife Spot,Food
9,49,M4V,Central Toronto,"South Hill, Summerhill West, Forest Hill SE, R...",43.686412,-79.400049,0.05102,1,Food,Nightlife Spot,Shop & Service,Travel & Transport,Professional & Other Places,Outdoors & Recreation


In [65]:
# create map
map_clusters2 = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters2)
ys = [i + x + (i*x)**2 for i in range(kclusters2)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged2['Latitude'], toronto_merged2['Longitude'], toronto_merged2['Neighbourhood'], toronto_merged2['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### First cluster

In [66]:
toronto_merged2.loc[toronto_merged2['Cluster Labels'] == 0, toronto_merged2.columns[[3, 6] + list(range(8, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,values_norm,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
12,Church and Wellesley,0.020408,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Arts & Entertainment
14,"Garden District, Ryerson",0.469388,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment
15,St. James Town,1.0,Food,Shop & Service,Nightlife Spot,Arts & Entertainment,Travel & Transport
17,Central Bay Street,1.0,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Travel & Transport
18,"King, Richmond, Adelaide",0.010204,Food,Shop & Service,Nightlife Spot,Arts & Entertainment,Outdoors & Recreation
19,"Harbourfront East, Toronto Islands, Union Station",0.0,Food,Arts & Entertainment,Outdoors & Recreation,Nightlife Spot,Travel & Transport
20,"Toronto Dominion Centre, Design Exchange",0.204082,Food,Nightlife Spot,Travel & Transport,Shop & Service,Arts & Entertainment
21,"Commerce Court, Victoria Hotel",0.204082,Food,Nightlife Spot,Shop & Service,Travel & Transport,Outdoors & Recreation
26,"Kensington Market, Grange Park, Chinatown",0.367347,Food,Shop & Service,Nightlife Spot,Travel & Transport,Professional & Other Places
28,Stn A PO Boxes 25 The Esplanade,1.0,Food,Shop & Service,Nightlife Spot,Arts & Entertainment,Outdoors & Recreation


#### Second cluster

In [67]:
toronto_merged2.loc[toronto_merged2['Cluster Labels'] == 1, toronto_merged2.columns[[3, 6] + list(range(8, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,values_norm,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,"The Danforth West, Riverdale",0.540816,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Travel & Transport
2,"The Beaches West, India Bazaar",0.153061,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment
3,Studio District,0.428571,Food,Shop & Service,Nightlife Spot,Outdoors & Recreation,Professional & Other Places
7,Davisville,1.0,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Travel & Transport
9,"South Hill, Summerhill West, Forest Hill SE, R...",0.05102,Food,Nightlife Spot,Shop & Service,Travel & Transport,Professional & Other Places
11,"Cabbagetown, St. James Town",1.0,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment
13,Harbourfront,1.0,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment
16,Berczy Park,0.836735,Food,Shop & Service,Nightlife Spot,Arts & Entertainment,Outdoors & Recreation
24,"Yorkville, North Midtown, The Annex",0.020408,Food,Shop & Service,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment
25,"University of Toronto, Harbord",0.0,Food,Shop & Service,Nightlife Spot,College & University,Outdoors & Recreation


#### Third cluster

In [68]:
toronto_merged2.loc[toronto_merged2['Cluster Labels'] == 2, toronto_merged2.columns[[3, 6] + list(range(8, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,values_norm,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
8,"Moore Park, Summerhill East",0.316327,Outdoors & Recreation,Travel & Transport,Shop & Service,Professional & Other Places,Nightlife Spot
10,Rosedale,0.153061,Outdoors & Recreation,Travel & Transport,Shop & Service,Professional & Other Places,Nightlife Spot
22,Roselawn,0.377551,Outdoors & Recreation,Travel & Transport,Shop & Service,Professional & Other Places,Nightlife Spot


#### Fourth cluster

In [69]:
toronto_merged2.loc[toronto_merged2['Cluster Labels'] == 3, toronto_merged2.columns[[3, 6] + list(range(8, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,values_norm,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
4,Lawrence Park,0.785714,Travel & Transport,Professional & Other Places,Outdoors & Recreation,Shop & Service,Nightlife Spot
27,"Bathurst Quay, Island airport, Harbourfront We...",0.122449,Travel & Transport,Outdoors & Recreation,Shop & Service,Nightlife Spot,Food


#### Fifth cluster

In [70]:
toronto_merged2.loc[toronto_merged2['Cluster Labels'] == 4, toronto_merged2.columns[[3, 6] + list(range(8, toronto_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,values_norm,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,The Beaches,0.163265,Outdoors & Recreation,Shop & Service,Nightlife Spot,Travel & Transport,Professional & Other Places
5,Davisville North,0.163265,Shop & Service,Outdoors & Recreation,Food,Travel & Transport,Professional & Other Places
6,North Toronto West,0.826531,Food,Shop & Service,Outdoors & Recreation,Travel & Transport,Professional & Other Places
23,"Forest Hill West, Forest Hill North",0.112245,Food,Shop & Service,Outdoors & Recreation,Travel & Transport,Professional & Other Places
30,Christie,0.408163,Shop & Service,Food,Outdoors & Recreation,Nightlife Spot,Travel & Transport
31,"Dufferin, Dovercourt Village",0.020408,Shop & Service,Food,Outdoors & Recreation,Nightlife Spot,Arts & Entertainment
38,Business Reply Mail Processing Centre 969 Eastern,0.204082,Shop & Service,Outdoors & Recreation,Food,Travel & Transport,Nightlife Spot


#### Bottom line
Adding the number of venues per heighborhood improves the balancing of the clusters, but it remains difficult to draw broad conclusions