# Neighborhood Segmentation

## Webscrape wikipedia page for Toronto neighbourhood data

### Data required are Postal Code, Borough and Neighborhood

In [5]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from urllib.request import urlopen
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = urlopen(url) 
soup = BeautifulSoup(html, 'html.parser')

In [6]:
post_codes = []
boroughs = []
neighborhoods = []
tables = soup.find_all('table')

for table in tables:
    #each row is defined by the 'tr' tag
    rows = table.find_all('tr')
    for row in rows[1:]:
            #the data in each cell of each row uses the 'tr' tag
            cells = row.find_all('td')
            
            if len(cells) > 1:
                #append the data into their respectives lists, text.strip removes html tags
                post_code = cells[0]
                post_codes.append(post_code.text.strip())
                
                borough = cells[1]
                boroughs.append(borough.text.strip())
                
                neighborhood = cells[2]
                neighborhoods.append(neighborhood.text.strip())
        

### Now that the data is extracted, we put the data into a DataFrame

In [7]:
df_toronto = pd.DataFrame(list(zip(post_codes, boroughs, neighborhoods)), columns = ['PostalCode', 'Borough', 'Neighborhood'])

In [8]:
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
179,M9Z,Not assigned,Not assigned
180,NL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n\nSK\n...,NL,NS
181,NL,NS,PE
182,A,B,C


#### Clean the dataframe
1. Remove any irrelevant data
2. Remove rows with borough that is 'Not assigned'
3. replace 'not assigned' niehgborhood with the name of the Borough


In [9]:
df_toronto.drop([180,181,182,183], axis=0, inplace=True)
df_toronto.drop(df_toronto.loc[df_toronto['Borough']=='Not assigned'].index, inplace=True)

In [10]:
df_toronto.reset_index(inplace=True)
del df_toronto['index']

In [11]:
#replace unassigned neighborhood with borough name
mask = df_toronto['Neighborhood'] == 'Not assigned'
column_name = 'Neighborhood'
df_toronto.loc[mask, column_name] = df_toronto['Borough']
#df_toronto['Neighborhood'][df_toronto['Neighborhood'] == 'Not assigned'] = df_toronto['Borough']

In [12]:
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [13]:
df_toronto.shape

(103, 3)

## Getting the latitude and longitude cooridnates of each neighborhood

### Install geocoder and Import the module

### I found another geocoder called pgeocode that supports Canada
https://pgeocode.readthedocs.io/en/latest/overview.html#supported-countries


In [14]:
!pip install pgeocode #install pgeocode

Collecting pgeocode
  Downloading pgeocode-0.3.0-py3-none-any.whl (8.5 kB)
Installing collected packages: pgeocode
Successfully installed pgeocode-0.3.0


In [15]:
import pgeocode

In [16]:
nomi_ca = pgeocode.Nominatim('ca') #create pgeocode object for canada

post_codes = list(df_toronto['PostalCode']) #creating an updated list of postal codes
lats = []
lngs = []

for code in post_codes: #looping through each postcode and getting their respective latitudes and longitudes
    lat = nomi_ca.query_postal_code(code).latitude
    lng = nomi_ca.query_postal_code(code).longitude
    #appending each coordinate into a list
    lats.append(lat)
    lngs.append(lng)


In [17]:
lats[:5]

[43.7545, 43.7276, 43.6555, 43.7223, 43.6641]

In [18]:
lngs[:5]

[-79.33, -79.3148, -79.3626, -79.4504, -79.3889]

#### Merge the latitudes and longitudes into the toronto DataFrame

In [19]:
df_toronto['Latitude'] = lats
df_toronto['Longitude'] = lngs

In [20]:
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


#### Check if there are any NaN values

In [21]:
df_toronto[df_toronto.isnull().any(axis=1)]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
76,M7R,Mississauga,Canada Post Gateway Processing Centre,,


#### It seems there is one entry('M7R') with NaN values, I will replace with data found in the csv file

In [22]:
# lat and lng value in csv file
# 43.6369656, -79.615819

df_toronto["Latitude"].replace(np.nan, 43.6370, inplace=True)
df_toronto["Longitude"].replace(np.nan, -79.6158, inplace=True)

In [23]:
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889


In [24]:
df_toronto.shape

(103, 5)

## Cluster the Neighborhood

#### Import Necessary libraries

In [None]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [29]:
!pip install folium

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 4.2 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [30]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#### Use geopy library to get the latitude and longitude values of Toronto

In [78]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [79]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Segment the Toronto Data to just Downtown Toronto

In [80]:
toronto_data = df_toronto[df_toronto['Borough']=='Downtown Toronto'].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783
3,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756
4,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754


#### Use geopy library to get the latitude and longitude values of Downtown Toronto

In [81]:
address = 'Downtown Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6563221, -79.3809161.


In [82]:
# create map of downtown Toronto using latitude and longitude values
map_toronto_dt = folium.Map(location=[latitude, longitude], zoom_start=14)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_dt)  
    
map_toronto_dt

### Define Foursquare Credentials and Version

In [83]:
CLIENT_ID = '13FGS0OA0ELDEBO1B15XH0H2C3VWFX1XAI1KX5GSRZ4LX3WL' # your Foursquare ID
CLIENT_SECRET = 'GTCPAXZKHUYWPX1WLPZAPCCUPN13M0ULXLTIODQHP1EOMICN' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value


#### Start with exploring 1 neighborhood of the toronto downtown dataframe

In [84]:
toronto_data.loc[0, 'Neighborhood']

'Regent Park, Harbourfront'

#### Get the cooridnates

In [85]:
neighborhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6555, -79.3626.


### Get the top 100 venues within a 500m radius of Regent Park
#### Define the url for FourSquare

In [86]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=13FGS0OA0ELDEBO1B15XH0H2C3VWFX1XAI1KX5GSRZ4LX3WL&client_secret=GTCPAXZKHUYWPX1WLPZAPCCUPN13M0ULXLTIODQHP1EOMICN&v=20180605&ll=43.6555,-79.3626&radius=500&limit=100'

#### Send the get request and examine the results

In [87]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fd1bd7ea06d855f1b087eed'},
 'response': {'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 22,
  'suggestedBounds': {'ne': {'lat': 43.660000004500006,
    'lng': -79.3563918719477},
   'sw': {'lat': 43.6509999955, 'lng': -79.36880812805231}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.653446723052674,
          'lng': -79.3620167174383}],
        'distance': 233

#### Extract the categories of each venue

In [88]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#### Compile the json data into a dataframe

In [89]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  app.launch_new_instance()


Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Figs Breakfast & Lunch,Breakfast Spot,43.655675,-79.364503
3,Berkeley Church,Event Space,43.655123,-79.365873
4,The Yoga Lounge,Yoga Studio,43.655515,-79.364955


In [90]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

22 venues were returned by Foursquare.


### Explore the Neighborhoods in Downtown Toronto

#### Function to explore each neighborhood in Downtown Toronto

In [91]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [92]:
toronto_dt_venues = getNearbyVenues(names=toronto_data['Neighborhood'], latitudes=toronto_data['Latitude'], longitudes=toronto_data['Longitude'], radius=500)
toronto_dt_venues.head()

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.6555,-79.3626,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.6555,-79.3626,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.6555,-79.3626,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
3,"Regent Park, Harbourfront",43.6555,-79.3626,Berkeley Church,43.655123,-79.365873,Event Space
4,"Regent Park, Harbourfront",43.6555,-79.3626,The Yoga Lounge,43.655515,-79.364955,Yoga Studio


In [93]:
toronto_dt_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,90,90,90,90,90,90
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",56,56,56,56,56,56
Central Bay Street,74,74,74,74,74,74
Christie,12,12,12,12,12,12
Church and Wellesley,74,74,74,74,74,74
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",4,4,4,4,4,4
"Kensington Market, Chinatown, Grange Park",64,64,64,64,64,64


In [94]:
print('There are {} uniques categories.'.format(len(toronto_dt_venues['Venue Category'].unique())))

There are 183 uniques categories.


### Analyze each neighborhood

In [95]:
# one hot encoding
toronto_dt_onehot = pd.get_dummies(toronto_dt_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_dt_onehot['Neighborhood'] = toronto_dt_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_dt_onehot.columns[-1]] + list(toronto_dt_onehot.columns[:-1])
toronto_dt_onehot = toronto_dt_onehot[fixed_columns]

toronto_dt_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Theater,Theme Restaurant,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
toronto_dt_onehot.shape

(1191, 183)

#### Group the rows by neighborhood and by taking the mean of the frequency of occurence of each category

In [97]:
toronto_dt_grouped = toronto_dt_onehot.groupby('Neighborhood').mean().reset_index()
toronto_dt_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Theater,Theme Restaurant,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint
0,Berczy Park,0.011111,0.0,0.011111,0.022222,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.011111,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.017857,0.0,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017857
2,Central Bay Street,0.0,0.0,0.0,0.0,0.013514,0.0,0.0,0.0,0.0,...,0.0,0.0,0.013514,0.0,0.0,0.013514,0.013514,0.0,0.013514,0.0
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.027027,0.013514,0.013514,0.0,0.0,0.0,0.0,0.0,0.0,...,0.013514,0.013514,0.0,0.0,0.0,0.0,0.013514,0.0,0.0,0.0
5,"Commerce Court, Victoria Hotel",0.0,0.0,0.03,0.01,0.0,0.0,0.03,0.0,0.0,...,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0
6,"First Canadian Place, Underground city",0.0,0.0,0.03,0.01,0.0,0.0,0.03,0.0,0.0,...,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0
7,"Garden District, Ryerson",0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,...,0.02,0.0,0.01,0.0,0.0,0.01,0.01,0.0,0.01,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.015625,0.0,0.015625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0625,0.0,0.03125,0.0,0.015625,0.0


In [98]:
toronto_dt_grouped.shape

(19, 183)

#### Print each neighborhood along with the top 5 most common venues

In [99]:
num_top_venues = 5

for hood in toronto_dt_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_dt_grouped[toronto_dt_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
         venue  freq
0  Coffee Shop  0.10
1        Hotel  0.06
2         Café  0.04
3       Bakery  0.04
4   Restaurant  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
                venue  freq
0  Italian Restaurant  0.07
1         Coffee Shop  0.07
2                Café  0.05
3                 Bar  0.05
4                Park  0.04


----Central Bay Street----
                       venue  freq
0                Coffee Shop  0.19
1                       Café  0.04
2                 Restaurant  0.03
3           Sushi Restaurant  0.03
4  Middle Eastern Restaurant  0.03


----Christie----
           venue  freq
0  Grocery Store  0.25
1           Café  0.25
2     Playground  0.08
3    Candy Store  0.08
4           Park  0.08


----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.08
1  Japanese Restaurant  0.05
2              Gay Bar  0.05
3           Restaurant  0.

In [100]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [101]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_dt_grouped['Neighborhood']

for ind in np.arange(toronto_dt_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_dt_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Hotel,Café,Bakery,Restaurant,Seafood Restaurant,Beer Bar,Japanese Restaurant,Pub,Cocktail Bar
1,"CN Tower, King and Spadina, Railway Lands, Har...",Italian Restaurant,Coffee Shop,Bar,Café,Speakeasy,Bakery,Bank,Park,Gym / Fitness Center,Restaurant
2,Central Bay Street,Coffee Shop,Café,Hotel,Bubble Tea Shop,Sandwich Place,Italian Restaurant,Sushi Restaurant,Middle Eastern Restaurant,Restaurant,Breakfast Spot
3,Christie,Café,Grocery Store,Park,Candy Store,Playground,Athletics & Sports,Baby Store,Coffee Shop,Donut Shop,Event Space
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Café,Hotel,Yoga Studio,Bubble Tea Shop,Mediterranean Restaurant


In [102]:
toronto_dt_grouped_clustering.shape

(19, 182)

### Cluster the neighborhood

In [103]:
# set number of clusters
kclusters = 5

toronto_dt_grouped_clustering = toronto_dt_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_dt_grouped_clustering)


In [104]:

# check cluster labels generated for each row in the dataframe
kmeans.labels_[:] 

array([0, 4, 0, 2, 0, 0, 0, 0, 3, 4, 4, 0, 0, 1, 0, 4, 0, 0, 4],
      dtype=int32)

In [108]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_dt_merged = toronto_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_dt_merged = toronto_dt_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

In [109]:
toronto_dt_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626,0,Coffee Shop,Breakfast Spot,Yoga Studio,Bakery,Gym / Fitness Center,Distribution Center,Pub,Restaurant,Electronics Store,Event Space
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6641,-79.3889,4,Gym,Coffee Shop,Hobby Shop,Dance Studio,Chinese Restaurant,Restaurant,Ramen Restaurant,Portuguese Restaurant,College Cafeteria,College Theater
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3783,0,Coffee Shop,Clothing Store,Café,Cosmetics Shop,Japanese Restaurant,Pizza Place,Hotel,Ramen Restaurant,Bubble Tea Shop,Middle Eastern Restaurant
3,M5C,Downtown Toronto,St. James Town,43.6513,-79.3756,0,Coffee Shop,Café,Seafood Restaurant,Cocktail Bar,Restaurant,American Restaurant,Gastropub,Beer Bar,Gym,Cosmetics Shop
4,M5E,Downtown Toronto,Berczy Park,43.6456,-79.3754,0,Coffee Shop,Hotel,Café,Bakery,Restaurant,Seafood Restaurant,Beer Bar,Japanese Restaurant,Pub,Cocktail Bar
5,M5G,Downtown Toronto,Central Bay Street,43.6564,-79.386,0,Coffee Shop,Café,Hotel,Bubble Tea Shop,Sandwich Place,Italian Restaurant,Sushi Restaurant,Middle Eastern Restaurant,Restaurant,Breakfast Spot
6,M6G,Downtown Toronto,Christie,43.6683,-79.4205,2,Café,Grocery Store,Park,Candy Store,Playground,Athletics & Sports,Baby Store,Coffee Shop,Donut Shop,Event Space
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6496,-79.3833,0,Café,Coffee Shop,Gym,Hotel,Asian Restaurant,Restaurant,Salad Place,Thai Restaurant,Steakhouse,American Restaurant
8,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.623,-79.3936,3,Harbor / Marina,Café,Music Venue,Park,Grocery Store,Dog Run,Ethiopian Restaurant,Gym,Escape Room,Electronics Store
9,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.6469,-79.3823,0,Coffee Shop,Hotel,Café,Restaurant,Salad Place,American Restaurant,Seafood Restaurant,Japanese Restaurant,Beer Bar,Bar


#### Visualize the clusters

In [110]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=14)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_dt_merged['Latitude'], toronto_dt_merged['Longitude'], toronto_dt_merged['Neighborhood'], toronto_dt_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters
       