# Peer Graded Assignment   
## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Import necessary modules
import requests 
import lxml.html as lh
from bs4 import BeautifulSoup
import pandas as pd

## Extract table from website using BeautifulSoup and cast content to DataFrame.  
### Comments are annotated in body of code how dataframe is cleaned and formatted in line with assignment instructions.

In [2]:
# Extract table content from webscraping url using BeautifulSoup:
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
res = requests.get(website_url)
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0] 

# Fetch table content into a dataframe:
df = pd.read_html(str(table))[0]
# Format the dataframe with the required column headings and reindex the rows:
df.columns = df.iloc[0]
df = df.reindex(df.index.drop(0)).reset_index(drop=True)
df.columns.name = None

# Remove rows with 'Not assigned' Borough:
df = df[df.Borough != 'Not assigned']
# Replace 'Not assigned' Neighbourhood to 'Borough name'
df.loc[df['Neighbourhood'] =='Not assigned', 'Neighbourhood'] = df['Borough']
df.reset_index( drop=True, inplace=True)

# Combine neighbourhoods for same postalcode
df2 = df.groupby('Postcode').agg({'Borough':'first', 
                             'Neighbourhood': ', '.join 
                              }).reset_index()

# Rename df2 columns to what is described in Assignment Instructions
df2.rename(columns={'Postcode':'PostalCode',
                          'Borough':'Borough',
                          'Neighbourhood':'Neighborhood'}, 
                inplace=True)

# Reorder the columns to the correct sequence since the use of dictionary above changed the order
df2_1 = df2[['PostalCode','Borough','Neighborhood']]

df2_1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [3]:
# Get the shape of df2_1
df2_1.shape

(103, 3)

In [4]:
# import the csv file with the latitude and longitude coordinates corresponding to postal code as dataframe df3
url = 'https://cocl.us/Geospatial_data'
df3 = pd.read_csv(url)
df3.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [5]:
#rename df3 column name 'Postal_Code'to 'PostalCode'
df3.columns= ['PostalCode', 'Latitude', 'Longitude']

# merge df2_1 and df3 on PostalCode
df_merge = pd.merge(df2_1, df3, on='PostalCode')
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [6]:
df_merge.shape

(103, 5)

## Slice df_merge for Boroughs containing 'Toronto' for Exploring and Clustering

In [7]:
# Get slice of df_merge for which column['Borough'] contains string 'Toronto'
df_toronto = df_merge[df_merge['Borough'].str.contains('Toronto')].reset_index(drop=True)
df_toronto.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [8]:
df_toronto.shape

(38, 5)

## Get latitude and longitude of Toronto using geopy and folium to create map of Toronto, Ontario

In [9]:
# Get geographical coordinates of Toronto
from geopy.geocoders import Nominatim 
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent = 'ont_explorer')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print ('The geographical coordinates of Toronto are {},{}.'.format(latitude,longitude))

The geographical coordinates of Toronto are 43.653963,-79.387207.


In [10]:
# Create Map of Toronto and add the 38 neighborhood markers of DataFrame df_toronto 
!pip install folium
import folium
map_toronto = folium.Map( location=[latitude,longitude], zoom_start =11)
map_toronto

[31mtensorflow 1.3.0 requires tensorflow-tensorboard<0.2.0,>=0.1.0, which is not installed.[0m


## Add markers of selected neighborhoods to the Toronto Map

In [11]:
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'],df_toronto['Neighborhood']):
    label= folium.Popup(label, parse_html = True)
    folium.CircleMarker([lat,lng],
                       radius =5,
                       popup = label,
                       color='blue',
                       fill =True,
                       fill_color = '#3186cc',
                       fill_opacity = 0.7,
                       parse_html = False).add_to(map_toronto)
map_toronto    

In [12]:
# Count the number of boroughs and neighborhoods
number_borough = df_toronto['Borough'].nunique()
number_neighborhood = len(df_toronto['Neighborhood'])
print('There are {} boroughs and {} Neighborhoods in Metro Toronto area.'. format((number_borough),number_neighborhood))

There are 4 boroughs and 38 Neighborhoods in Metro Toronto area.


## Use Foursquare API to explore neighborhoods and segment them

In [13]:
# @hidden_cell
CLIENT_ID ='U5BMNK0AWH3GX3HECO5FWTKV4RYYOKPYWMILJNL3LYGMCX2V'
CLIENT_SECRET = '40SPFPTH2VO0XS40IDJ4LJPEBZJ4I31CZGADB2GNV4OK0A00'
VERSION ='20180605'

print('Your Credentials: ')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + CLIENT_SECRET)

Your Credentials: 
CLIENT_ID: U5BMNK0AWH3GX3HECO5FWTKV4RYYOKPYWMILJNL3LYGMCX2V
CLIENT_SECRET: 40SPFPTH2VO0XS40IDJ4LJPEBZJ4I31CZGADB2GNV4OK0A00


## Explore first neighborhood

In [14]:
# get neighborhood name
df_toronto.loc[1, 'Neighborhood']

'The Danforth West, Riverdale'

In [15]:
# get neighbourhood latitude and longitude
neighborhood_latitude = df_toronto.loc[1,'Latitude']
neighborhood_longitude = df_toronto.loc[1,'Longitude']
neighborhood_name = df_toronto.loc[1,'Neighborhood']
print('Latitude and Longitude values of {} are {},{}.'.format (neighborhood_name, neighborhood_latitude, neighborhood_longitude ))

Latitude and Longitude values of The Danforth West, Riverdale are 43.6795571,-79.352188.


In [16]:
# create get results url
LIMIT =100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
       CLIENT_ID,
       CLIENT_SECRET,
       VERSION,
       neighborhood_latitude,
       neighborhood_longitude,
       radius,
       LIMIT )
url


'https://api.foursquare.com/v2/venues/explore?client_id=U5BMNK0AWH3GX3HECO5FWTKV4RYYOKPYWMILJNL3LYGMCX2V&client_secret=40SPFPTH2VO0XS40IDJ4LJPEBZJ4I31CZGADB2GNV4OK0A00&v=20180605&ll=43.6795571,-79.352188&radius=500&limit=100'

In [17]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ce9c10b4c1f6753b60dc9ad'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4bce4183ef10952197da8386-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/greek_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d10e941735',
         'name': 'Greek Restaurant',
         'pluralName': 'Greek Restaurants',
         'primary': True,
         'shortName': 'Greek'}],
       'id': '4bce4183ef10952197da8386',
       'location': {'address': '407 Danforth Ave.',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'at Chester Ave.',
        'distance': 223,
        'formattedAddress': ['407 Danforth Ave. (at Chester Ave.)',
         'Toronto ON M4K 1P1',
         'Canada'],
        'label

In [18]:
#create function that extracts category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    if len(categories_list)==0:
        return None
    else:
        return categories_list[0]['name']
        
        

In [19]:
# clean json and structure to DataFrame
from pandas.io.json import json_normalize
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)
#filter columns
filtered_columns = ['venue.name', 'venue.categories','venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[ : , filtered_columns]
#filter category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
#clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Pantheon,Greek Restaurant,43.677621,-79.351434
1,Dolce Gelato,Ice Cream Shop,43.677773,-79.351187
2,MenEssentials,Cosmetics Shop,43.67782,-79.351265
3,Cafe Fiorentina,Italian Restaurant,43.677743,-79.350115
4,Mezes,Greek Restaurant,43.677962,-79.350196


In [20]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

42 venues were returned by Foursquare.


In [21]:
#Define function to get top 100 Nearby Venues (within 500 metres radius) for all neighborhoods in metro Toronto

def getNearbyVenues( names, latitudes, longitudes, radius=500 ):
    
    venues_list = []
    
    for name, lat, lng in zip(names,latitudes, longitudes):
        print(name)
   
    #create the API request URL:
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(   
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
          
    # make the get requests
        results = requests.get(url).json()["response"]['groups'][0]['items']
    # return only relevant information for each nearby venue
        venues_list.append([(
                        name,
                        lat,
                        lng,
                        v['venue']['name'],
                        v['venue']['location']['lat'],
                        v['venue']['location']['lng'],
                        v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return(nearby_venues)

In [22]:
# create a new dataframe by running function above for each neighborhood and called it toronto_venues
toronto_venues = getNearbyVenues( names = df_merge['Neighborhood'],latitudes = df_merge['Latitude'],longitudes = df_merge['Longitude'])


Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

In [23]:
print(toronto_venues.shape)
toronto_venues.head()

(2246, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [24]:
# check how many venues were returned for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,5,5,5,5,5,5
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",3,3,3,3,3,3
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",9,9,9,9,9,9
"Alderwood, Long Branch",9,9,9,9,9,9
"Bathurst Manor, Downsview North, Wilson Heights",18,18,18,18,18,18
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",25,25,25,25,25,25
Berczy Park,55,55,55,55,55,55
"Birch Cliff, Cliffside West",4,4,4,4,4,4


In [25]:
# find how many unique categories are there returned from all the venues
print('There are {} unique categories found.'.format(len(toronto_venues['Venue Category'].unique())))

There are 281 unique categories found.


## Analyze each neighborhood

In [26]:
# use one hot encoding to apply venue category as a feature for each venue
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix ="", prefix_sep = "")
# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']
# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[ : -1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# find out size of toronto_onehot dataframe
toronto_onehot.shape

(2246, 281)

In [28]:
# group rows by neighborhood and take mean frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,"Adelaide, King, Richmond",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.010000,0.000000,0.000000,0.000000,0.000000,0.010000,0.000000,0.000000
1,Agincourt,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Alderwood, Long Branch",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,"Bathurst Manor, Downsview North, Wilson Heights",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.000000,0.000000,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000
6,Bayview Village,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,"Bedford Park, Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Berczy Park,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.018182,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,"Birch Cliff, Cliffside West",0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [29]:
# check the size of toronto_grouped
toronto_grouped.shape

(100, 281)

In [30]:
# print out each neighborhood and its top 5 most common venues
num_top_venues = 5
for hood in toronto_grouped['Neighborhood']:
    print("---" + hood + "---")
    temp = toronto_grouped[toronto_grouped['Neighborhood']==hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq':2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---Adelaide, King, Richmond---
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2                  Bar  0.04
3      Thai Restaurant  0.04
4  American Restaurant  0.04


---Agincourt---
                venue  freq
0      Clothing Store   0.2
1      Breakfast Spot   0.2
2      Sandwich Place   0.2
3  Chinese Restaurant   0.2
4              Lounge   0.2


---Agincourt North, L'Amoreaux East, Milliken, Steeles East---
                venue  freq
0                Park  0.67
1          Playground  0.33
2         Yoga Studio  0.00
3  Miscellaneous Shop  0.00
4       Movie Theater  0.00


---Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown---
                  venue  freq
0         Grocery Store  0.22
1           Pizza Place  0.11
2              Pharmacy  0.11
3  Fast Food Restaurant  0.11
4            Beer Store  0.11


---Alderwood, Long Branch---
            venue  freq
0     Pizza Place  0.22

## Put this into a DataFrame

In [31]:
# create function to sort venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


In [32]:
#create dataframe and display the top ten venues for each neighborhood
import numpy as np
num_top_venues = 10
indicators = ['st','nd','rd']
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
#create new DataFrame
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()
    
    

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,American Restaurant,Thai Restaurant,Steakhouse,Burger Joint,Bakery,Cosmetics Shop,Hotel
1,Agincourt,Chinese Restaurant,Lounge,Breakfast Spot,Clothing Store,Sandwich Place,Women's Store,Discount Store,Dog Run,Doner Restaurant,Donut Shop
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Women's Store,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Sandwich Place,Pharmacy,Pizza Place,Fast Food Restaurant,Coffee Shop,Beer Store,Fried Chicken Joint,Women's Store,Dessert Shop
4,"Alderwood, Long Branch",Pizza Place,Pharmacy,Sandwich Place,Pub,Pool,Skating Rink,Gym,Coffee Shop,Colombian Restaurant,Deli / Bodega


## Cluster neighborhoods

In [33]:
# Apply KMeans Clustering algorithm to cluster neighborhoods
# in view of sample size, try three clusters

from sklearn.cluster import KMeans

kclusters = 3 
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(toronto_grouped_clustering)
#check cluster labels
kmeans.labels_[0:10]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [34]:
# create a new Dataframe to include the cluster as well as the top ten venues for each neighborhood
# add clustering labels
neighborhoods_venues_sorted.insert(0,'Cluster Labels', kmeans.labels_)
toronto_merged = df_toronto
# merge dfs to add lat and long data
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Pub,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Yoga Studio,Pizza Place,Bookstore,Brewery,Bubble Tea Shop
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Pizza Place,Gym,Movie Theater,Brewery,Sandwich Place,Liquor Store,Burger Joint,Fast Food Restaurant,Burrito Place,Fish & Chips Shop
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Gastropub,Italian Restaurant,American Restaurant,Stationery Store,Bookstore,Seafood Restaurant,Brewery
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Lake,Swim School,Park,Bus Line,Drugstore,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Women's Store


## Create Map to show clusters

In [35]:
#create map

import matplotlib.colors as colors
from matplotlib.colors import rgb2hex
import matplotlib.cm as cm
map_cluster = folium.Map(location=[latitude,longitude], zoom_start =11)
# set color scheme for clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
#add markers to map
markers_color=[]
for lat,lon,poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'],toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + 'Cluster' + str(cluster), parse_html =True)
    folium.CircleMarker(
            [lat,lon],
            radius =5,
            popup=label,
            color = rainbow[cluster -1],
            fill = True,
            fill_color = rainbow[cluster-1],
            fill_opacity = 0.7). add_to (map_cluster)
map_cluster
    

## examine the clusters

In [36]:
# look at cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels']==0,toronto_merged.columns[[1]+ list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Health Food Store,Pub,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore
1,East Toronto,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Yoga Studio,Pizza Place,Bookstore,Brewery,Bubble Tea Shop
2,East Toronto,0,Pizza Place,Gym,Movie Theater,Brewery,Sandwich Place,Liquor Store,Burger Joint,Fast Food Restaurant,Burrito Place,Fish & Chips Shop
3,East Toronto,0,Café,Coffee Shop,Bakery,Gastropub,Italian Restaurant,American Restaurant,Stationery Store,Bookstore,Seafood Restaurant,Brewery
5,Central Toronto,0,Hotel,Gym,Park,Breakfast Spot,Clothing Store,Food & Drink Shop,Sandwich Place,Donut Shop,Diner,Discount Store
6,Central Toronto,0,Coffee Shop,Yoga Studio,Bagel Shop,Fast Food Restaurant,Mexican Restaurant,Diner,Miscellaneous Shop,Dessert Shop,Park,Clothing Store
7,Central Toronto,0,Dessert Shop,Sandwich Place,Pizza Place,Restaurant,Café,Sushi Restaurant,Italian Restaurant,Coffee Shop,Park,Farmers Market
8,Central Toronto,0,Restaurant,Gym,Playground,Trail,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
9,Central Toronto,0,Coffee Shop,Pub,Sushi Restaurant,American Restaurant,Fried Chicken Joint,Sports Bar,Athletics & Sports,Bagel Shop,Supermarket,Pizza Place
11,Downtown Toronto,0,Coffee Shop,Park,Restaurant,Bakery,Italian Restaurant,Pub,Café,Chinese Restaurant,Pizza Place,Pharmacy


In [37]:
# look at cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels']==1, toronto_merged.columns[[1] + list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Central Toronto,1,Lake,Swim School,Park,Bus Line,Drugstore,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Women's Store
10,Downtown Toronto,1,Park,Trail,Playground,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
23,Central Toronto,1,Trail,Sushi Restaurant,Park,Jewelry Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop


In [38]:
# look at cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels']==2, toronto_merged.columns[[1] + list(range(5,toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


# My deduction is that metro Toronto is quite homogeneous: virtually one big cluster.  

## The other small cluster is distinguished by park, lake and trail. 