# Segmenting and Clustering Neighbourhoods in Toronto

## Import Libraries

In [1]:
pip install geopy 

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install geopandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 

from bs4 import BeautifulSoup

## Webscraping of Wikipedia Page on Neighborhoods in Toronto

In [4]:
#Webscraping
html_data=requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup=BeautifulSoup(html_data, "html5lib")

In [5]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell={}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode']=row.p.text[:3]
        cell['Borough']=(row.span.text).split('(')[0]
        cell['Neigbourhood']=(((((row.span.text).split('(')[1]).strip(')')).replace('/',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

In [6]:
print(table_contents)

[{'PostalCode': 'M3A', 'Borough': 'North York', 'Neigbourhood': 'Parkwoods'}, {'PostalCode': 'M4A', 'Borough': 'North York', 'Neigbourhood': 'Victoria Village'}, {'PostalCode': 'M5A', 'Borough': 'Downtown Toronto', 'Neigbourhood': 'Regent Park , Harbourfront'}, {'PostalCode': 'M6A', 'Borough': 'North York', 'Neigbourhood': 'Lawrence Manor , Lawrence Heights'}, {'PostalCode': 'M7A', 'Borough': "Queen's Park", 'Neigbourhood': 'Ontario Provincial Government'}, {'PostalCode': 'M9A', 'Borough': 'Etobicoke', 'Neigbourhood': 'Islington Avenue'}, {'PostalCode': 'M1B', 'Borough': 'Scarborough', 'Neigbourhood': 'Malvern , Rouge'}, {'PostalCode': 'M3B', 'Borough': 'North York', 'Neigbourhood': 'Don Mills North'}, {'PostalCode': 'M4B', 'Borough': 'East York', 'Neigbourhood': 'Parkview Hill , Woodbine Gardens'}, {'PostalCode': 'M5B', 'Borough': 'Downtown Toronto', 'Neigbourhood': 'Garden District, Ryerson'}, {'PostalCode': 'M6B', 'Borough': 'North York', 'Neigbourhood': 'Glencairn'}, {'PostalCode':

In [7]:
df=pd.DataFrame(table_contents)

In [8]:
df.head(110)

Unnamed: 0,PostalCode,Borough,Neigbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


## Data Cleaning

In [9]:
#Rename misspelt column
df.rename(columns={'Neigbourhood':'Neighborhood'}, inplace=True)

In [10]:
#Widen column width of Borough column, in order to view complete data
pd.set_option('max_colwidth',None)
df


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Centre969 Eastern,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea , Humber Bay , Mimico NE , The Queensway East , Royal York South East , Kingsway Park South East"


In [11]:
#Rename Borough names which are too long
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                    'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                    'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [12]:
#Expand display to view all rows
pd.set_option('display.max_rows',200)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


# SECTION 1

### Drop Boroughs Which Are Not Assigned

In [13]:
#Drop cells with a Borough which is not assigned. However, there does not seem to have any Borough cells where it is "Not Assigned".
#Nonetheless, will still run the code.

df2=df[df.Borough!='Not Assigned'].reset_index(drop=True)

In [14]:
df2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [15]:
#Group Neighborhoods in the same Borough which are not assigned
df_group=df2.groupby(['PostalCode','Borough'], as_index =False).agg(lambda x: ", ".join(x))



In [16]:
df_group.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


### Check for duplication of M5A listed twice

In [17]:
#Check for duplication of M5A listed twice. There is no duplication in this dataset
df_group[df_group['PostalCode']=='M5A']


Unnamed: 0,PostalCode,Borough,Neighborhood
53,M5A,Downtown Toronto,"Regent Park , Harbourfront"


### Assign Neighborhood to be same as the Borough for Neighborhoods with term "Enclave"

In [18]:
#There are some Neighborhoods with the words "Enclave.." 
#eg. Enclave M5E/M4L/L4W which do not explicitly state which neighborhood. 
#Hence will assign the Neigborhood to be the same as the Borough for these cells.

for index, row in df_group.iterrows():
    if 'Enclave' in row['Neighborhood']:
        row['Neighborhood'] = row['Borough']

df_group.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern , Rouge"
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park"
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge"
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


## Use Shape Method To Display Rows

In [19]:
#Using .shape method to print number of rows
df_group.shape

(103, 3)

# SECTION 2

## Creating the Geospatial Dataset

In [20]:
#Loading the geospatial data
coordinates=pd.read_csv('Geospatial_Coordinates.csv')
coordinates

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [21]:
coordinates.rename(columns={'Postal Code':'PostalCode'}, inplace=True)

In [22]:
#Merging the coordinates with the borough, neighborhood and postal code.
df_coor=pd.merge(df_group,coordinates, on='PostalCode', how='left')

In [23]:
df_coor

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern , Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill , Port Union , Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park , Ionview , East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile , Clairlea , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside , Cliffcrest , Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


# SECTION 3

## Exploring Neighborhoods in Toronto

In [24]:
address='Toronto'

geolocator=Nominatim(user_agent="toronto_explorer")
location=geolocator.geocode(address)
latitude=location.latitude
longitude=location.longitude
print('The geographical coordinate of Toronto are {},{}.'.format(latitude,longitude))

The geographical coordinate of Toronto are 43.6534817,-79.3839347.


## Map of Neighborhoods in Toronto

In [25]:
#create map of toronto
map_toronto=folium.Map(location=[latitude,longitude], zoom_start=11)

#add markers to map
for lat, lng, label in zip(df_coor['Latitude'], df_coor['Longitude'], df_coor['Neighborhood']):
    label=folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='#185af8',
    fill=True,
    fill_color='#18caf8',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)

map_toronto

In [26]:
df_coor.Borough.unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'East YorkEast Toronto', 'Central Toronto', 'Downtown Toronto',
       'Downtown Toronto Stn A', 'York', 'West Toronto', "Queen's Park",
       'Mississauga', 'East Toronto Business', 'Etobicoke',
       'EtobicokeNorthwest'], dtype=object)

## Locating Only Boroughs that Contain the Word 'Toronto'

In [27]:
#Creating a list of borough names that contain the word 'Toronto'
borough_names = list(df_coor.Borough.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
borough_with_toronto

['East Toronto',
 'East YorkEast Toronto',
 'Central Toronto',
 'Downtown Toronto',
 'Downtown Toronto Stn A',
 'West Toronto',
 'East Toronto Business']

In [28]:
#Creating DataFrame of locations that contain the word 'Toronto' in Borough
df_coor_toronto = df_coor[df_coor['Borough'].isin(borough_with_toronto)].reset_index(drop=True)
print(df_coor_toronto .shape)
df_coor_toronto.head(100)

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106
2,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
3,M4L,East Toronto,"India Bazaar , The Beaches West",43.668999,-79.315572
4,M4M,East Toronto,Studio District,43.659526,-79.340923
5,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
6,M4P,Central Toronto,Davisville North,43.712751,-79.390197
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
8,M4S,Central Toronto,Davisville,43.704324,-79.38879
9,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316


## Map of Boroughs in Toronto that Contain Word 'Toronto'

In [29]:
#create map of toronto
map_toronto=folium.Map(location=[latitude,longitude], zoom_start=11)

#add markers to map
for lat, lng, label in zip(df_coor_toronto['Latitude'], df_coor_toronto['Longitude'], df_coor_toronto['Neighborhood']):
    label=folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='#1a486e',
    fill=True,
    fill_color='#bed9f0',
    fill_opacity=0.7,
    parse_html=False).add_to(map_toronto)

map_toronto

## Exploring the First Neighborhood 'The Beaches'

In [30]:
df_coor_toronto.loc[0,'Neighborhood']

'The Beaches'

In [31]:
#Getting the Neigborhood The Beaches's latitude and longitude values.
neighborhood_latitude=df_coor_toronto.loc[0,'Latitude']
neighborhood_longitude=df_coor_toronto.loc[0,'Longitude']
neighborhood_name=df_coor_toronto.loc[0,'Neighborhood']
print('Latitude and Longitude values of {} are {},{}.'.format(neighborhood_name,neighborhood_latitude, neighborhood_longitude))

Latitude and Longitude values of The Beaches are 43.67635739999999,-79.2930312.


In [32]:
# @hidden_cell
#Defining Foursquare credentials and version
CLIENT_ID= 'XXXXXX'
CLIENT_SECRET='XXXXXX'
VERSION='20180605'
LIMIT=100

## Getting the top 100 venues that are in The Beaches within a radius of 500 meters

In [33]:
#Creating the get Request URL.

LIMIT=100
radius=500

url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
CLIENT_ID,
CLIENT_SECRET,
VERSION,
neighborhood_latitude,
neighborhood_longitude,
radius,
LIMIT)

url



'https://api.foursquare.com/v2/venues/explore?&client_id=5GYCGVHIUVEFMT10VNQK1EHYW4XAJIILTGVWUATISPOTWQ3K&client_secret=UXNQUS20PDIZSMSXW1NMHZXC55XGYYG5IRUCYXHSAD4TXVSR&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

In [34]:
#Sending the GET request and examining the results
results=requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60e012db981d040fd4c4faa8'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ad4c062f964a52011f820e3',
       'name': 'The Big Carrot Natural Food Market',
       'location': {'address': '125 Southwood Dr',
        'lat': 43.678879,
        'lng': -79.297734,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.678879,
          'lng': -79.297734}],
        'distance': 471,
        'postalCode': 'M4E 0B8',
   

## Creating DataFrame of top 100 venues and categories in The Beaches within 500 meters radius

In [35]:
#function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list=row['categories']
    except:
        categories_list=row['venue.categories']
    
    if len(categories_list)==0:
        return None
    else:
        return categories_list[0]['name']
    

In [36]:
#Cleaning the json and structuring it into a Pandas DataFrame
venues = results['response']['groups'][0]['items']

#flatten json
nearby_venues = json_normalize(venues)
                       
#filter columns
filtered_columns=['venue.name','venue.categories','venue.location.lat','venue.location.lng']
nearby_venues=nearby_venues.loc[:,filtered_columns]

#filter category for each row
nearby_venues['venue.categories']=nearby_venues.apply(get_category_type, axis=1)
                           
#clean columns
nearby_venues.columns=[col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(100)

  """


Unnamed: 0,name,categories,lat,lng
0,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
1,Glen Manor Ravine,Trail,43.676821,-79.293942
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


## Exploring Top 100 venues of Boroughs (that contain 'Toronto') within 500 meters radius

In [37]:

def getNearbyVenues(names, latitudes, longitudes, radius=500):

    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        #create the API request URL
        url='https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        
        #make the GET request
        results=requests.get(url).json()["response"]["groups"][0]['items']
        
        #return only relevant information for each nearby venue
        
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name'])for v in results])
        
    nearby_venues=pd.DataFrame([item for venue_list in venues_list for item in venue_list])

    nearby_venues.columns=['Neighborhood',
                          'Neighborhood Latitude',
                          'Neighborhood Longitude',
                           'Venue Name',
                          'Venue Latitude',
                          'Venue Longitude',
                          'Venue Category']
 
    return(nearby_venues)

In [38]:
toronto_venues=getNearbyVenues(names=df_coor_toronto['Neighborhood'],
                              latitudes=df_coor_toronto['Latitude'],
                              longitudes=df_coor_toronto['Longitude']
                              )

In [39]:
print(toronto_venues.shape)
toronto_venues.head(100)


(1587, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Danforth East,43.685347,-79.338106,The Path,43.683923,-79.335007,Park
5,The Danforth East,43.685347,-79.338106,Sammon Convenience,43.686951,-79.335007,Convenience Store
6,"The Danforth West , Riverdale",43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
7,"The Danforth West , Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
8,"The Danforth West , Riverdale",43.679557,-79.352188,La Diperie,43.677702,-79.352265,Ice Cream Shop
9,"The Danforth West , Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


## Displaying the Number of Venues Returned for Each Neighborhood

In [40]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,58,58,58,58,58,58
"Brockton , Parkdale Village , Exhibition Place",24,24,24,24,24,24
"CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport",16,16,16,16,16,16
Central Bay Street,65,65,65,65,65,65
Christie,16,16,16,16,16,16
Church and Wellesley,79,79,79,79,79,79
"Commerce Court , Victoria Hotel",100,100,100,100,100,100
Davisville,37,37,37,37,37,37
Davisville North,7,7,7,7,7,7
Downtown Toronto Stn A,99,99,99,99,99,99


### Finding out Number of Unique Categories from All Returned Venues

In [41]:
print('There are {} uniques categoroes.'.format(len(toronto_venues['Venue Category'].unique())))

There are 231 uniques categoroes.


## Analyzing Each Neighborhood

In [42]:
#one hot encoding
toronto_onehot=pd.get_dummies(toronto_venues[['Venue Category']],prefix="", prefix_sep="")

#add neighborhood column back to DataFrame
toronto_onehot['Neighborhood']=toronto_venues['Neighborhood']

#move neighborhood column to the first column
fixed_columns=[toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
               
toronto_onehot=toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
toronto_onehot.shape

(1587, 231)

## Grouping Rows By Neighborhood and Taking the Mean of the Frequency of Occurrence of Each Category

In [44]:
toronto_grouped=toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0
1,"Brockton , Parkdale Village , Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport",0.0,0.0,0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.015385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.015385
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.025316,0.012658,0.0,0.0,0.0,0.0,0.0,0.012658,0.0,...,0.012658,0.012658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Commerce Court , Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Downtown Toronto Stn A,0.010101,0.0,0.0,0.0,0.0,0.0,0.0,0.010101,0.010101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.010101,0.0,0.0,0.0


In [45]:
#Confirming the new size
toronto_grouped.shape

(39, 231)

## Displaying Each Neighborhood Along With Top 5 Most Common Venues

In [46]:
#Print each neighborhood along with top 5 most common venues

num_top_venues=5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp=toronto_grouped[toronto_grouped['Neighborhood']==hood].T.reset_index()
    
    temp.columns=['venue', 'freq']
    temp=temp.iloc[1:]
    temp['freq']=temp['freq'].astype(float)
    temp=temp.round({'freq':2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
          venue  freq
0   Coffee Shop  0.10
1  Cocktail Bar  0.05
2        Bakery  0.05
3    Restaurant  0.03
4   Cheese Shop  0.03


----Brockton , Parkdale Village , Exhibition Place----
            venue  freq
0            Café  0.12
1  Breakfast Spot  0.08
2          Bakery  0.08
3     Coffee Shop  0.08
4       Nightclub  0.04


----CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport----
              venue  freq
0    Airport Lounge  0.12
1   Airport Service  0.12
2  Airport Terminal  0.12
3          Boutique  0.06
4             Plane  0.06


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0.17
1   Italian Restaurant  0.05
2       Sandwich Place  0.05
3                 Café  0.05
4  Japanese Restaurant  0.03


----Christie----
           venue  freq
0  Grocery Store  0.25
1           Café  0.19
2           Park  0.12
3      Nightclub  0.06
4     Baby Store  0.06


----Ch

## Creating a New DataFrame of the Top 10 Venues for Each Neighborhood

In [47]:
#Function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories=row.iloc[1:]
    row_categories_sorted=row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [48]:
#Creating a new DataFrame and displaying the top 10 venues for each neighborhood

num_top_venues=10

indicators =['st','nd','rd']

#create columns according to number of top venues
columns=['Neighborhood']


for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
        
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

        
#create a new DataFrame
neighborhoods_venues_sorted=pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood']=toronto_grouped['Neighborhood']


for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:]=return_most_common_venues(toronto_grouped.iloc[ind, :],num_top_venues)
    
neighborhoods_venues_sorted.head(100)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Cheese Shop,Beer Bar,Seafood Restaurant,Farmers Market,Restaurant,Hotel,Basketball Stadium
1,"Brockton , Parkdale Village , Exhibition Place",Café,Coffee Shop,Breakfast Spot,Bakery,Intersection,Italian Restaurant,Convenience Store,Nightclub,Office,Stadium
2,"CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst Quay , South Niagara , Island airport",Airport Lounge,Airport Service,Airport Terminal,Coffee Shop,Harbor / Marina,Plane,Rental Car Location,Boutique,Sculpture Garden,Bar
3,Central Bay Street,Coffee Shop,Sandwich Place,Café,Italian Restaurant,Bubble Tea Shop,Salad Place,Restaurant,Japanese Restaurant,Burger Joint,Middle Eastern Restaurant
4,Christie,Grocery Store,Café,Park,Athletics & Sports,Coffee Shop,Candy Store,Restaurant,Italian Restaurant,Baby Store,Nightclub
5,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Hotel,Pub,Bubble Tea Shop,Fast Food Restaurant,Yoga Studio
6,"Commerce Court , Victoria Hotel",Coffee Shop,Hotel,Café,Restaurant,Gym,Italian Restaurant,Japanese Restaurant,Cocktail Bar,Asian Restaurant,Deli / Bodega
7,Davisville,Pizza Place,Dessert Shop,Sandwich Place,Gym,Coffee Shop,Sushi Restaurant,Café,Italian Restaurant,Restaurant,Indoor Play Area
8,Davisville North,Hotel,Breakfast Spot,Food & Drink Shop,Pizza Place,Sandwich Place,Park,Department Store,Eastern European Restaurant,Electronics Store,Dessert Shop
9,Downtown Toronto Stn A,Coffee Shop,Italian Restaurant,Seafood Restaurant,Cocktail Bar,Pub,Restaurant,Japanese Restaurant,Bakery,Beer Bar,Hotel


## Clustering Neighborhoods

#### Running k-Means to Cluster the Neighborhoods into 5 Clusters

In [49]:
from sklearn.cluster import KMeans

In [50]:
#Set Number of Clusters
kclusters=5

toronto_grouped_clustering=toronto_grouped.drop('Neighborhood',1)

#run k-means clustering
kmeans=KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

#check cluster labels generated for each row in the DataFrame
kmeans.labels_[0:10]


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

### Display DataFrame that includes Cluster as well as top 10 venues for each Neighborhood


In [51]:
#add clustering labels
neighborhoods_venues_sorted.insert(0,'Cluster Labels', kmeans.labels_)

toronto_merged=df_coor_toronto

#merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged=toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head(100)  #check the last columns!


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pub,Trail,Health Food Store,Deli / Bodega,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant
1,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106,2,Park,Convenience Store,Dessert Shop,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant
2,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Restaurant,Ice Cream Shop,Cosmetics Shop,Brewery,Bubble Tea Shop,Pub
3,M4L,East Toronto,"India Bazaar , The Beaches West",43.668999,-79.315572,0,Park,Fast Food Restaurant,Gym,Restaurant,Fish & Chips Shop,Ice Cream Shop,Italian Restaurant,Sandwich Place,Brewery,Light Rail Station
4,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Bakery,Gastropub,Brewery,Café,American Restaurant,Convenience Store,Bookstore,Cheese Shop,Pet Store
5,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Park,Swim School,Bus Line,Wine Bar,Dessert Shop,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant
6,M4P,Central Toronto,Davisville North,43.712751,-79.390197,0,Hotel,Breakfast Spot,Food & Drink Shop,Pizza Place,Sandwich Place,Park,Department Store,Eastern European Restaurant,Electronics Store,Dessert Shop
7,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,0,Coffee Shop,Clothing Store,Yoga Studio,Sporting Goods Shop,Spa,Bagel Shop,Café,Rental Car Location,Fast Food Restaurant,Mexican Restaurant
8,M4S,Central Toronto,Davisville,43.704324,-79.38879,0,Pizza Place,Dessert Shop,Sandwich Place,Gym,Coffee Shop,Sushi Restaurant,Café,Italian Restaurant,Restaurant,Indoor Play Area
9,M4T,Central Toronto,"Moore Park , Summerhill East",43.689574,-79.38316,1,Lawyer,Playground,Wine Bar,Dessert Shop,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Donut Shop


### Displaying a Map that Visualizes the Neighborhoods and How They Cluster Together 

In [52]:
#create map

map_clusters=folium.Map(location=[latitude, longitude], zoom_start=11)

#set colour scheme for the clusters
x=np.arange(kclusters)
ys=[i+x+(i*x)**2 for i in range(kclusters)]
colors_array=cm.rainbow(np.linspace(0,1,len(ys)))
rainbow=[colors.rgb2hex(i)for i in colors_array]

#add markers to the map
markers_colors=[]
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged ['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label=folium.Popup(str(poi)+' Cluster '+str(cluster+1),parse_html=True)
    folium.CircleMarker(
        [lat,lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters


### Cluster One
###### Cluster Label 0

#### Observations:
### The '1st Most Common Venue' in Cluster One, which is the largest cluster, is mainly centred on coffee shops and restaurants. In Downtown Toronto specifically (which is the main central district) the '1st Most Common Venue' are coffee shops (around 76%) and coffee shops/cafes (around 88%). <br> <br> This could an appropriate area to target for online coffee delivery or coffee/beverage promotions.

In [53]:
toronto_merged.loc[toronto_merged['Cluster Labels']==0, toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Pub,Trail,Health Food Store,Deli / Bodega,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant
2,East Toronto,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Restaurant,Ice Cream Shop,Cosmetics Shop,Brewery,Bubble Tea Shop,Pub
3,East Toronto,0,Park,Fast Food Restaurant,Gym,Restaurant,Fish & Chips Shop,Ice Cream Shop,Italian Restaurant,Sandwich Place,Brewery,Light Rail Station
4,East Toronto,0,Coffee Shop,Bakery,Gastropub,Brewery,Café,American Restaurant,Convenience Store,Bookstore,Cheese Shop,Pet Store
5,Central Toronto,0,Park,Swim School,Bus Line,Wine Bar,Dessert Shop,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant
6,Central Toronto,0,Hotel,Breakfast Spot,Food & Drink Shop,Pizza Place,Sandwich Place,Park,Department Store,Eastern European Restaurant,Electronics Store,Dessert Shop
7,Central Toronto,0,Coffee Shop,Clothing Store,Yoga Studio,Sporting Goods Shop,Spa,Bagel Shop,Café,Rental Car Location,Fast Food Restaurant,Mexican Restaurant
8,Central Toronto,0,Pizza Place,Dessert Shop,Sandwich Place,Gym,Coffee Shop,Sushi Restaurant,Café,Italian Restaurant,Restaurant,Indoor Play Area
10,Central Toronto,0,Coffee Shop,Liquor Store,Restaurant,Fried Chicken Joint,Bank,Supermarket,Sushi Restaurant,Pizza Place,Pub,American Restaurant
12,Downtown Toronto,0,Coffee Shop,Bakery,Café,Restaurant,Italian Restaurant,Pub,Pizza Place,Playground,Indian Restaurant,Sandwich Place


### Cluster Two
###### Cluster Label 1

#### Observations:
### The '1st Most Common Venue' in Cluster Two (Moore Park , Summerhill) is centred around legal services.


In [54]:
toronto_merged.loc[toronto_merged['Cluster Labels']==1, toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Central Toronto,1,Lawyer,Playground,Wine Bar,Dessert Shop,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Donut Shop


### Cluster Three
###### Cluster Label 2

#### Observations:
### The '1st Most Common Venue' in Cluster Three (The Danforth East) is centred on the Park. This would be an appropriate area to promote exercise, family activities, elderly activities like walking and to engage pet owners. 

In [55]:
toronto_merged.loc[toronto_merged['Cluster Labels']==2, toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East YorkEast Toronto,2,Park,Convenience Store,Dessert Shop,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant


### Cluster Four
###### Cluster Label 3

#### Observations:
### Cluster Four (Forest Hill & Rosdale) constains one of Toronto's more affluent neighborhoods. These two neighbourhoods share similar type common venues such as Department Store, Ethiopian Restaurant, Escape Room, Electronics Store, Eastern European Restaurant and Donut Shop which could reflect the lifestyle needs of this community. Rosdale's '1st Most Common Venue' is Sushi Restaurant while Forest Hill's is Park.

In [56]:
toronto_merged.loc[toronto_merged['Cluster Labels']==3, toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,Downtown Toronto,3,Park,Trail,Playground,Wine Bar,Department Store,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Donut Shop
24,Central Toronto,3,Sushi Restaurant,Jewelry Store,Park,Trail,Department Store,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Donut Shop


### Cluster Five
###### Cluster Label 4

#### Observations:
### The '1st Most Common Venue' in Cluster Five (Roselawn) is Music Venue. This would be a good area to introduce music events or orchestra and band performances. 

In [57]:
toronto_merged.loc[toronto_merged['Cluster Labels']==4, toronto_merged.columns[[1]+list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Central Toronto,4,Music Venue,Home Service,Garden,Wine Bar,Dessert Shop,Event Space,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant
