# Capstone Project - The battle of neighborhoods (Week 5)

# 1. Install required libraries

In [36]:
!pip install BeautifulSoup4
!pip install lxml
!pip install tabulate




# Import required libraries

In [37]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate


# Scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [38]:
# Fetch the Wikipedia page and parse every <table> on it into DataFrames.
from io import StringIO  # local import: read_html expects a file-like object for literal HTML

res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()  # stop on an HTTP error instead of parsing an error page
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')
# Passing a raw HTML string to read_html is deprecated in recent pandas; wrap it in StringIO.
df = pd.read_html(StringIO(str(table)))  # list of DataFrames, one per table found


# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [39]:
# Keep only the rows of the first scraped table whose borough has been assigned.
postal_codes_raw = df[0]
has_borough = postal_codes_raw['Borough'] != 'Not assigned'
df2 = postal_codes_raw[has_borough]


# The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [40]:
# Rename the three columns by position to the names used in the rest of the notebook.
df2.columns = ['PostalCode', 'Borough', 'Neighborhood']


# Combine into one row with the neighborhoods separated with a comma

In [41]:
# Collapse to one row per postal code: keep the first borough seen and
# comma-join every neighborhood that shares the code.
aggregations = {'Borough': 'first', 'Neighborhood': ', '.join}
grouped_by_code = df2.groupby('PostalCode')
df2 = grouped_by_code.agg(aggregations).reset_index()


# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [42]:
# Fall back to the borough name wherever the neighborhood is 'Not assigned'.
# Vectorised and column-name based, so it no longer depends on the column order
# that the positional row[-2] / row[-1] lookups relied on.
df2['Neighborhood'] = df2['Neighborhood'].where(df2['Neighborhood'] != 'Not assigned', df2['Borough'])


# Print the dataframe

In [43]:
print(tabulate(df2, headers='keys', tablefmt='psql') )


+-----+--------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------+
|     | PostalCode   | Borough          | Neighborhood                                                                                                                           |
|-----+--------------+------------------+----------------------------------------------------------------------------------------------------------------------------------------|
|   0 | M1B          | Scarborough      | Rouge, Malvern                                                                                                                         |
|   1 | M1C          | Scarborough      | Highland Creek, Rouge Hill, Port Union                                                                                                 |
|   2 | M1E          | Scarborough      | Guildwood, Morningside, West Hill                              

# Use the .shape method to print the number of rows

In [44]:
df2.shape

(103, 3)

# 2. Use the csv file to create the dataframe with Latitude and Longitude

In [98]:
# Load the latitude/longitude table and attach both columns to df2.
df3 = pd.read_csv("http://cocl.us/Geospatial_data")
# NOTE(review): .values copies by position, not by postal code, so this assumes
# df3 has the same row order as df2 — TODO confirm (a merge on the postal code would be safer).
df2['Latitude'] = df3['Latitude'].values
df2['Longitude'] = df3['Longitude'].values
df2

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# 3. Explore and cluster the neighborhoods in Toronto

In [46]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### Use geopy library to get the latitude and longitude values of Toronto.

In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent <em>ny_explorer</em>, as shown below.

In [47]:
address = 'Toronto, ON'

# Nominatim is constructed with a user_agent string identifying this client.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [48]:
# create map of Toronto using latitude and longitude values
# NOTE: despite its name, map_newyork is centred on the Toronto coordinates computed earlier.
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row of df2, with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

#### Work with only boroughs that contain the word Toronto

In [49]:
# Keep only the boroughs whose name contains "Toronto".
# reset_index is chained (not inplace) so the result is a fresh frame rather than
# an in-place edit of a slice of df2, which triggers SettingWithCopyWarning.
toronto_data = df2[df2['Borough'].str.contains("Toronto")].reset_index(drop=True)
toronto_data.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Let's map the neighborhoods of the boroughs that contain the word Toronto.

In [99]:
# create map of Toronto using latitude and longitude values
map_tornoto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of toronto_data, with the neighborhood name as popup
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tornoto)  
    
map_tornoto

#### Define Foursquare Credentials and Version

In [68]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook source or echoed into its saved output.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment '
        'variables before running this notebook.')

VERSION = '20180605' # Foursquare API version

# Confirm the credentials were found without printing them.
print('Foursquare credentials loaded for API version ' + VERSION)

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


Get the neighborhood's name.

In [69]:
toronto_data.loc[0, 'Neighborhood']

'The Beaches'

Get the neighborhood's latitude and longitude values.

In [72]:
# Read the name and coordinates stored under row label 0 of toronto_data.
neighborhood_name = toronto_data.at[0, 'Neighborhood']
neighborhood_latitude = toronto_data.at[0, 'Latitude']
neighborhood_longitude = toronto_data.at[0, 'Longitude']

summary = 'Latitude and longitude values of {} are {}, {}.'.format(
    neighborhood_name, neighborhood_latitude, neighborhood_longitude)
print(summary)

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [73]:
# Testing
#55 | M5C          | Downtown Toronto | St. James Town                                                                                                                         |  43.651494 |  -79.375418 |

#neighborhood_latitude =  43.651494 # neighborhood latitude value
#neighborhood_longitude = -79.375418 # neighborhood longitude value

#neighborhood_name =  'St. James Town' # neighborhood name

First, let's create the GET request URL. 

In [74]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
# Display the URL with the secret masked: echoing it verbatim would store the
# client secret in the notebook's saved output. `url` itself is left intact.
url.replace(CLIENT_SECRET, '<CLIENT_SECRET>') if CLIENT_SECRET else url


'https://api.foursquare.com/v2/venues/explore?&client_id=<redacted>&client_secret=<redacted>&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

Send the GET request and examine the results.

In [75]:
# Request the explore URL built in the previous cell and decode the JSON body.
results = requests.get(url).json()
#results

Let's borrow the **get_category_type** function from the Foursquare lab.

In [76]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of a venue's first category, or None if it has none.

    Parameters
    ----------
    row : mapping or pandas.Series
        Holds the category list under 'categories' or, failing that, under
        'venue.categories'.

    Returns
    -------
    str or None
        The 'name' of the first entry in the category list, or None when the
        list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key should trigger the fallback; a bare except would
        # also hide unrelated failures.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

Clean the json and structure it into a *pandas* dataframe.

In [77]:
venues = results['response']['groups'][0]['items']

# pd.json_normalize is the supported entry point; importing json_normalize from
# pandas.io.json has been deprecated since pandas 1.0.
nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the part after the last dot ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues


Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,Glen Stewart Ravine,Other Great Outdoors,43.6763,-79.294784
2,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
3,Grover Pub and Grub,Pub,43.679181,-79.297215
4,Domino's Pizza,Pizza Place,43.679058,-79.297382
5,Upper Beaches,Neighborhood,43.680563,-79.292869


And how many venues were returned by Foursquare?

In [78]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.


#### Let's create a function to repeat the same process to all the neighborhoods 

In [79]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues found around each neighborhood.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values
    and issues one request to the venues/explore endpoint per neighborhood.
    Each neighborhood name is printed as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood names and their coordinates, consumed in parallel.
    radius : int, default 500
        Sent as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per (neighborhood, venue) pair with the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue ID', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. When no venue is found the frame is empty but still
        has these columns.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue ID',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps one stalled call from hanging the loop
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['id'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may have no category; indexing [0] would raise IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema even when
    # `rows` is empty (assigning .columns afterwards raised ValueError then).
    return pd.DataFrame(rows, columns=columns)

#### Now write the code to run the above function on each neighborhood and create a new dataframe called *toronto_venues*.

In [80]:
# Look up the nearby venues of every row in toronto_data.
toronto_venues = getNearbyVenues(
    toronto_data['Neighborhood'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
)

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The Junction Sout

#### Let's keep only the Chinese restaurants and check the size of the resulting dataframe

In [81]:
# Keep only the venues whose category contains "Chinese Restaurant".
# regex=False: plain substring match; na=False: a venue with a missing category
# is excluded instead of producing a NaN mask that cannot be used for indexing.
is_chinese = toronto_venues['Venue Category'].str.contains("Chinese Restaurant", regex=False, na=False)
# copy(): later cells add columns to this frame, which on a slice of the
# original would raise SettingWithCopyWarning.
toronto_venues = toronto_venues[is_chinese].copy()
print(toronto_venues.shape)
toronto_venues

(13, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category
122,North Toronto West,43.715383,-79.405678,C'est Bon,4aec79e5f964a5209fc721e3,43.716785,-79.400406,Chinese Restaurant
234,"Cabbagetown, St. James Town",43.667967,-79.367675,China Gourmet,4bca992068f976b017d35f83,43.66418,-79.368359,Chinese Restaurant
296,Church and Wellesley,43.66586,-79.38316,Crown Princess Fine Dining 伯爵名宴,4c792e4981bca093af5efc14,43.666455,-79.387698,Chinese Restaurant
442,"Ryerson, Garden District",43.657162,-79.378937,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant
649,Central Bay Street,43.657952,-79.387383,Yueh Tung Chinese Restaurant,52a7ae41498eed3af4d0a3fa,43.655281,-79.385337,Chinese Restaurant
650,Central Bay Street,43.657952,-79.387383,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant
836,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,Pearl Harbourfront,4ae33054f964a520759121e3,43.638157,-79.380688,Chinese Restaurant
1162,"Harbord, University of Toronto",43.662696,-79.400049,River Tai Restaurant,4b340703f964a5200d2425e3,43.662902,-79.403167,Chinese Restaurant
1213,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Swatow Restaurant 汕頭小食家,4ae29812f964a520288f21e3,43.653866,-79.398334,Chinese Restaurant
1225,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Rosewood Chinese Cuisine,4fd3ea3ee4b0191b9c2b1aff,43.653171,-79.39671,Chinese Restaurant


In [82]:
#let's get a list of venues

venue_id_list = toronto_venues['Venue ID'].tolist()

In [83]:
#set up to pull the rating of each venue from the API based on venue ID

url_list = []
like_list = []   # holds one rating per venue, in venue_id_list order
json_list = []

for i in venue_id_list:
    venue_url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&v={}'.format(i, CLIENT_ID, CLIENT_SECRET, VERSION)
    url_list.append(venue_url)

for link in url_list:
    result = requests.get(link).json()

    # .get() with defaults replaces the bare try/except: unrelated errors are no
    # longer swallowed, and the printed name always belongs to the venue being
    # processed (the old except branch printed the previous iteration's name,
    # or raised NameError when the very first lookup failed).
    venue = result.get('response', {}).get('venue', {})
    name = venue.get('name', 'Unknown venue')
    likes = venue.get('rating', -1)  # -1 marks "no rating available"
    like_list.append(likes)
    print(name)
    print(likes)

C'est Bon
7.4
China Gourmet
6.0
Crown Princess Fine Dining 伯爵名宴
7.5
GB Hand-Pulled Noodles
7.8
Yueh Tung Chinese Restaurant
7.9
GB Hand-Pulled Noodles
7.8
Pearl Harbourfront
8.2
River Tai Restaurant
6.6
Swatow Restaurant 汕頭小食家
7.7
Rosewood Chinese Cuisine
7.7
New Sky Restaurant 小沙田食家
7.6
Asian Legend 味香村
7.8
Crown Princess Fine Dining 伯爵名宴
7.5


In [84]:
#double check that we did not lose any venues based on if likes were available

print(len(like_list))
print(len(venue_id_list))

13
13


In [85]:
# Attach the fetched ratings by position (like_list was built in the order of
# the venue IDs taken from toronto_venues), then keep a separate copy.
toronto_venues['Rating'] = like_list
toronto_venues2 = toronto_venues.copy()
toronto_venues2

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category,Rating
122,North Toronto West,43.715383,-79.405678,C'est Bon,4aec79e5f964a5209fc721e3,43.716785,-79.400406,Chinese Restaurant,7.4
234,"Cabbagetown, St. James Town",43.667967,-79.367675,China Gourmet,4bca992068f976b017d35f83,43.66418,-79.368359,Chinese Restaurant,6.0
296,Church and Wellesley,43.66586,-79.38316,Crown Princess Fine Dining 伯爵名宴,4c792e4981bca093af5efc14,43.666455,-79.387698,Chinese Restaurant,7.5
442,"Ryerson, Garden District",43.657162,-79.378937,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant,7.8
649,Central Bay Street,43.657952,-79.387383,Yueh Tung Chinese Restaurant,52a7ae41498eed3af4d0a3fa,43.655281,-79.385337,Chinese Restaurant,7.9
650,Central Bay Street,43.657952,-79.387383,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant,7.8
836,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,Pearl Harbourfront,4ae33054f964a520759121e3,43.638157,-79.380688,Chinese Restaurant,8.2
1162,"Harbord, University of Toronto",43.662696,-79.400049,River Tai Restaurant,4b340703f964a5200d2425e3,43.662902,-79.403167,Chinese Restaurant,6.6
1213,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Swatow Restaurant 汕頭小食家,4ae29812f964a520288f21e3,43.653866,-79.398334,Chinese Restaurant,7.7
1225,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Rosewood Chinese Cuisine,4fd3ea3ee4b0191b9c2b1aff,43.653171,-79.39671,Chinese Restaurant,7.7


In [86]:
#restore dataframe
#toronto_venues= toronto_venues2
#toronto_venues.drop('Cluster Labels',1,inplace=True)
#toronto_venues.drop('Rating_cat',1,inplace=True)

Let's check how many venues were returned for each neighborhood

#### Let's find out how many unique categories can be curated from all the returned venues

In [87]:
# let's set up a function that will re-categorize our restaurants based on Rating

def conditions(s):
    """Map the 'Rating' value of a row to a coarse quality label.

    Ratings up to 6 are 'poor', up to 7 'below avg', up to 8 'above avg';
    anything that matches none of these bounds is 'great'.
    """
    rating = s['Rating']
    # inclusive upper bounds, checked from lowest to highest
    for upper_bound, label in ((6, 'poor'), (7, 'below avg'), (8, 'above avg')):
        if rating <= upper_bound:
            return label
    return 'great'

# Label every row with the rating bucket returned by conditions().
toronto_venues['Rating_cat']=toronto_venues.apply(conditions, axis=1)
toronto_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category,Rating,Rating_cat
122,North Toronto West,43.715383,-79.405678,C'est Bon,4aec79e5f964a5209fc721e3,43.716785,-79.400406,Chinese Restaurant,7.4,above avg
234,"Cabbagetown, St. James Town",43.667967,-79.367675,China Gourmet,4bca992068f976b017d35f83,43.66418,-79.368359,Chinese Restaurant,6.0,poor
296,Church and Wellesley,43.66586,-79.38316,Crown Princess Fine Dining 伯爵名宴,4c792e4981bca093af5efc14,43.666455,-79.387698,Chinese Restaurant,7.5,above avg
442,"Ryerson, Garden District",43.657162,-79.378937,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant,7.8,above avg
649,Central Bay Street,43.657952,-79.387383,Yueh Tung Chinese Restaurant,52a7ae41498eed3af4d0a3fa,43.655281,-79.385337,Chinese Restaurant,7.9,above avg
650,Central Bay Street,43.657952,-79.387383,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant,7.8,above avg
836,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,Pearl Harbourfront,4ae33054f964a520759121e3,43.638157,-79.380688,Chinese Restaurant,8.2,great
1162,"Harbord, University of Toronto",43.662696,-79.400049,River Tai Restaurant,4b340703f964a5200d2425e3,43.662902,-79.403167,Chinese Restaurant,6.6,below avg
1213,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Swatow Restaurant 汕頭小食家,4ae29812f964a520288f21e3,43.653866,-79.398334,Chinese Restaurant,7.7,above avg
1225,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Rosewood Chinese Cuisine,4fd3ea3ee4b0191b9c2b1aff,43.653171,-79.39671,Chinese Restaurant,7.7,above avg


## Analyze Each Neighborhood

In [88]:
# one hot encoding of the venue category and the rating bucket
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category','Rating_cat']], prefix="", prefix_sep="")

# add the venue name column back to the dataframe
toronto_onehot['Venue'] = toronto_venues['Venue'] 

# move the venue name (currently the last column) to the first position
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()


Unnamed: 0,Venue,Chinese Restaurant,above avg,below avg,great,poor
122,C'est Bon,1,1,0,0,0
234,China Gourmet,1,0,0,0,1
296,Crown Princess Fine Dining 伯爵名宴,1,1,0,0,0
442,GB Hand-Pulled Noodles,1,1,0,0,0
649,Yueh Tung Chinese Restaurant,1,1,0,0,0


And let's examine the new dataframe size.

In [89]:
toronto_onehot.shape


(13, 6)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

#### Let's confirm the new size

Run *k*-means to cluster the restaurants into 4 clusters.

In [90]:
# set number of clusters
kclusters = 4

# drop the venue name so only the one-hot columns are clustered;
# pandas 2.x no longer accepts the axis as a positional argument to drop()
toronto_grouped_clustering = toronto_onehot.drop(columns='Venue')

# run k-means clustering
# n_init is pinned to the historical default of 10 so the result does not
# depend on the installed scikit-learn version (newer releases default to 'auto').
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10] 

array([0, 3, 0, 0, 0, 0, 1, 2, 0, 0], dtype=int32)

Let's add the cluster label of each restaurant to the venues dataframe.

In [91]:
# add clustering labels as the first column
# Drop a label column left over from an earlier run first: insert() raises
# ValueError when the column already exists, which made this cell fail on re-run.
if 'Cluster Labels' in toronto_venues.columns:
    toronto_venues = toronto_venues.drop(columns='Cluster Labels')
toronto_venues.insert(0, 'Cluster Labels', kmeans.labels_)


In [92]:
# toronto_merged is a second name for toronto_venues; no copy is made here.
toronto_merged = toronto_venues

# (disabled) join of a per-neighborhood summary frame onto the venues
#toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # 'Cluster Labels' was inserted at position 0, so check the first column

Unnamed: 0,Cluster Labels,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category,Rating,Rating_cat
122,0,North Toronto West,43.715383,-79.405678,C'est Bon,4aec79e5f964a5209fc721e3,43.716785,-79.400406,Chinese Restaurant,7.4,above avg
234,3,"Cabbagetown, St. James Town",43.667967,-79.367675,China Gourmet,4bca992068f976b017d35f83,43.66418,-79.368359,Chinese Restaurant,6.0,poor
296,0,Church and Wellesley,43.66586,-79.38316,Crown Princess Fine Dining 伯爵名宴,4c792e4981bca093af5efc14,43.666455,-79.387698,Chinese Restaurant,7.5,above avg
442,0,"Ryerson, Garden District",43.657162,-79.378937,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant,7.8,above avg
649,0,Central Bay Street,43.657952,-79.387383,Yueh Tung Chinese Restaurant,52a7ae41498eed3af4d0a3fa,43.655281,-79.385337,Chinese Restaurant,7.9,above avg


Finally, let's visualize the resulting clusters

In [100]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per restaurant, coloured by its cluster label
for lat, lon, poi, cluster in zip(toronto_merged['Venue Latitude'], toronto_merged['Venue Longitude'], toronto_merged['Venue'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # labels run from 0 to kclusters-1, so index directly; the previous
        # cluster-1 wrapped label 0 around to the last colour
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### Examine Cluster 1

In [94]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0]

Unnamed: 0,Cluster Labels,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category,Rating,Rating_cat
122,0,North Toronto West,43.715383,-79.405678,C'est Bon,4aec79e5f964a5209fc721e3,43.716785,-79.400406,Chinese Restaurant,7.4,above avg
296,0,Church and Wellesley,43.66586,-79.38316,Crown Princess Fine Dining 伯爵名宴,4c792e4981bca093af5efc14,43.666455,-79.387698,Chinese Restaurant,7.5,above avg
442,0,"Ryerson, Garden District",43.657162,-79.378937,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant,7.8,above avg
649,0,Central Bay Street,43.657952,-79.387383,Yueh Tung Chinese Restaurant,52a7ae41498eed3af4d0a3fa,43.655281,-79.385337,Chinese Restaurant,7.9,above avg
650,0,Central Bay Street,43.657952,-79.387383,GB Hand-Pulled Noodles,58e91d60e0adac258fcea481,43.656434,-79.383783,Chinese Restaurant,7.8,above avg
1213,0,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Swatow Restaurant 汕頭小食家,4ae29812f964a520288f21e3,43.653866,-79.398334,Chinese Restaurant,7.7,above avg
1225,0,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Rosewood Chinese Cuisine,4fd3ea3ee4b0191b9c2b1aff,43.653171,-79.39671,Chinese Restaurant,7.7,above avg
1226,0,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,New Sky Restaurant 小沙田食家,4b074bb1f964a52077fb22e3,43.655337,-79.398897,Chinese Restaurant,7.6,above avg
1242,0,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,Asian Legend 味香村,4adb5472f964a520fc2521e3,43.653603,-79.395047,Chinese Restaurant,7.8,above avg
1682,0,Queen's Park,43.662301,-79.389494,Crown Princess Fine Dining 伯爵名宴,4c792e4981bca093af5efc14,43.666455,-79.387698,Chinese Restaurant,7.5,above avg


#### Examine Cluster 2

In [95]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1]

Unnamed: 0,Cluster Labels,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category,Rating,Rating_cat
836,1,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,Pearl Harbourfront,4ae33054f964a520759121e3,43.638157,-79.380688,Chinese Restaurant,8.2,great


#### Examine Cluster 3

In [96]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2]

Unnamed: 0,Cluster Labels,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category,Rating,Rating_cat
1162,2,"Harbord, University of Toronto",43.662696,-79.400049,River Tai Restaurant,4b340703f964a5200d2425e3,43.662902,-79.403167,Chinese Restaurant,6.6,below avg


#### Examine Cluster 4

In [97]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3]

Unnamed: 0,Cluster Labels,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue ID,Venue Latitude,Venue Longitude,Venue Category,Rating,Rating_cat
234,3,"Cabbagetown, St. James Town",43.667967,-79.367675,China Gourmet,4bca992068f976b017d35f83,43.66418,-79.368359,Chinese Restaurant,6.0,poor
