# Title: Coursera IBM Data Science Capstone project

### This notebook clusters food places in different neighborhoods of  Toronto, CA based on number of likes given and categories. 

### Importing necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import requests
import json
# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from bs4 import BeautifulSoup 
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
!pip install folium # uncomment this line if you haven't completed the Foursquare API lab
import folium 



Getting data from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M which contains information about 'Postal codes', 'Borough', and 'name of neighborhoods' of Toronto, CA.

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL) 
# print(r.content) 
soup = BeautifulSoup(r.content, 'html5lib') 
# print(soup.prettify()) 

In [3]:
from collections import defaultdict
dic = defaultdict(list)

### Extracting data from wikipedia page.

In [4]:
table = soup.table
rows = table.find_all('tr')
for r in rows:
    cols = r.find_all('td')
#     row = [ dict[]=i.text[:-1] for i in cols]
    for i in range(len(cols)):
        if i == 0:
            dic['Postal Code'].append(cols[i].text[:-1])
        if i == 1:
            dic['Borough'].append(cols[i].text[:-1])
        if i == 2:
            dic['Neighborhood'].append(cols[i].text[:-1])

In [5]:
data = pd.DataFrame.from_dict(dic)
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
to_remove = data[data['Borough'] == 'Not assigned'].index
data.drop(to_remove, inplace = True)
data

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
data.reset_index(inplace = True)
data.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
data.drop('index', axis = 1)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
data.shape

(103, 4)

### Latitudes and Longitudes for all Boroughs extracted from Wikipedia page.

In [10]:
lat_long = pd.read_csv('http://cocl.us/Geospatial_data')
lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
data.columns = data.columns.str.strip()
data.sort_values('Postal Code', ascending = True, axis = 0, inplace = True)
data.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood
6,9,M1B,Scarborough,"Malvern, Rouge"
12,18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
18,27,M1E,Scarborough,"Guildwood, Morningside, West Hill"
22,36,M1G,Scarborough,Woburn
26,45,M1H,Scarborough,Cedarbrae


In [12]:
lat_long.columns = lat_long.columns.str.strip()
lat_long.sort_values('Postal Code', ascending = True, axis = 0, inplace = True)
lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Adding Latitude and Longitude to 'data'.

In [13]:
data['Latitude'] = lat_long['Latitude']
data['Longitude'] = lat_long['Longitude']
data.reset_index(inplace = True)
data.drop(['level_0', 'index'], axis = 1, inplace = True)
data.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.727929,-79.262029
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.7942,-79.262029
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.778517,-79.346556
3,M1G,Scarborough,Woburn,43.77012,-79.408493
4,M1H,Scarborough,Cedarbrae,43.745906,-79.352188
5,M1J,Scarborough,Scarborough Village,43.728496,-79.495697
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.70906,-79.363452
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.72802,-79.38879
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.667967,-79.367675
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.650571,-79.384568


In [14]:
data['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

In [15]:
import json
# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported.')

Libraries imported.


### Plotting all the neighborhoods of Toronto, CA on map using folium and geocoders.

In [16]:
address = 'Toronto, CA'
geolocater = Nominatim(user_agent = "tor_explorer")
location = geolocater.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coordinates of Toronto are {}, {}'.format(latitude, longitude))

Coordinates of Scarborough are 43.6534817, -79.3839347


In [17]:
map_tor = folium.Map(location = [latitude, longitude], zoom_start = 12)

for lat, lang, label in zip(data['Latitude'], data['Longitude'], data['Neighborhood']):
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat, lang],
        radius=5,
        popup=label,
        color='Blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)
    
map_tor 

### Getting nearby venues of Neighborhoods using Foursquare API, present in 'data' with category, ID, Latitude and Longitude.

In [18]:
CLIENT_ID = 'K0S5WC0VPH3FXOJXNVY1WKEZOSTJCUJBGFMT52TQ2BHAT3MX' # your Foursquare ID
CLIENT_SECRET = 'IJ34AI144DDJLZ2W3ANDQ0NJSD2XMM01YHY3K0JHBRDLM0WX' # your Foursquare Secret
VERSION = '20200531' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: K0S5WC0VPH3FXOJXNVY1WKEZOSTJCUJBGFMT52TQ2BHAT3MX
CLIENT_SECRET:IJ34AI144DDJLZ2W3ANDQ0NJSD2XMM01YHY3K0JHBRDLM0WX


In [19]:
neigh_lat = data.loc[0, 'Latitude']
neigh_long = data.loc[0, 'Longitude']
neigh_name = data.loc[0, 'Neighborhood'].split(',')[0]

print('The latitude and longitude of {} are {}, {}.'.format(neigh_name, neigh_lat, neigh_long))

The latitude and longitude of Malvern are 43.7279292, -79.26202940000002.


In [20]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, neigh_lat, neigh_long, VERSION, radius, LIMIT)
url


'https://api.foursquare.com/v2/venues/explore?client_id=K0S5WC0VPH3FXOJXNVY1WKEZOSTJCUJBGFMT52TQ2BHAT3MX&client_secret=IJ34AI144DDJLZ2W3ANDQ0NJSD2XMM01YHY3K0JHBRDLM0WX&ll=43.7279292,-79.26202940000002&v=20200531&radius=500&limit=100'

In [21]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ed352ed9fcb92001b6f99ef'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 7,
  'suggestedBounds': {'ne': {'lat': 43.7324292045, 'lng': -79.25581377000155},
   'sw': {'lat': 43.723429195499996, 'lng': -79.26824502999848}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b6a37ccf964a520a5cd2be3',
       'name': 'Giant Tiger',
       'location': {'address': '682 Kennedy Road',
        'crossStreet': 'Eglinton Ave. E.',
        'lat': 43.72744662939136,
        'lng': -79.26624035854763,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.72744662939136,
          'lng': -79.26624035854763}],
        'distance': 342,
        'postalCo

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius = 500):
    venues = []
    LIMIT = 30
    for name, lat, long in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            long, 
            radius, 
            LIMIT)
        
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues.append([
            (name,
            lat,
            long,
            v['venue']['id'], 
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues for item in venue_list])
        
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude',
                  'Venue ID',          
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [23]:
toronto_venues = getNearbyVenues(names = data['Neighborhood'],
               latitudes = data['Latitude'],
               longitudes = data['Longitude'])

In [24]:
print(toronto_venues.shape)
toronto_venues.head()

(1348, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.727929,-79.262029,4b6a37ccf964a520a5cd2be3,Giant Tiger,43.727447,-79.26624,Department Store
1,"Malvern, Rouge",43.727929,-79.262029,4c6aa500c946e21ec432ed8e,Tim Hortons,43.726895,-79.266157,Coffee Shop
2,"Malvern, Rouge",43.727929,-79.262029,4f1b1a8fe4b0838231c3badb,Bros. CONVENIENCE,43.727781,-79.265708,Convenience Store
3,"Malvern, Rouge",43.727929,-79.262029,4dc158bd52b1877d85b172c8,Dollarama,43.727092,-79.265784,Discount Store
4,"Malvern, Rouge",43.727929,-79.262029,4bd325219854d13aa0fafc4d,Tandy Leather,43.726974,-79.266513,Hobby Shop


In [25]:
toronto_venues['Venue Category'].value_counts()

Coffee Shop                      97
Café                             71
Park                             45
Restaurant                       40
Pizza Place                      35
Sandwich Place                   33
Bakery                           30
Italian Restaurant               25
Bank                             22
Grocery Store                    22
Japanese Restaurant              21
Pharmacy                         19
Fast Food Restaurant             18
Bar                              18
Gym                              18
Pub                              16
Sushi Restaurant                 16
Breakfast Spot                   14
Ice Cream Shop                   13
Liquor Store                     13
Diner                            13
Hotel                            13
Greek Restaurant                 12
Gastropub                        12
Thai Restaurant                  12
Dessert Shop                     12
American Restaurant              12
Clothing Store              

In [26]:
temp_data = toronto_venues['Venue Category'].str.contains('Restaurant|Café|Coffee|Pizza|Burger|Diner|Sandwich')
temp_data.head() 

0    False
1     True
2    False
3    False
4    False
Name: Venue Category, dtype: bool

In [27]:
temp_data.value_counts()

False    801
True     547
Name: Venue Category, dtype: int64

In [28]:
toronto_res = toronto_venues[temp_data]
toronto_res.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category
1,"Malvern, Rouge",43.727929,-79.262029,4c6aa500c946e21ec432ed8e,Tim Hortons,43.726895,-79.266157,Coffee Shop
8,"Rouge Hill, Port Union, Highland Creek",43.7942,-79.262029,4b50b100f964a520b72d27e3,El Pulgarcito,43.792648,-79.259208,Latin American Restaurant
15,"Guildwood, Morningside, West Hill",43.778517,-79.346556,51212da18055ce6af0c027d7,Hero Certified Burgers,43.777295,-79.344584,Burger Joint
18,"Guildwood, Morningside, West Hill",43.778517,-79.346556,4af5ff72f964a520f6ff21e3,New York Fries - Fairview Mall,43.778605,-79.343577,Restaurant
20,"Guildwood, Morningside, West Hill",43.778517,-79.346556,50bceb3ae4b01dc9b287cc99,Aroma Espresso Bar,43.7777,-79.344652,Coffee Shop


### Finding number of likes using for each venues using its ID.

In [29]:
venue_id = toronto_res['Venue ID']

In [30]:
url_list = []
like_list = []
for ids in venue_id:
    url = 'https://api.foursquare.com/v2/venues/{}/likes?&client_id={}&client_secret={}&v={}'.format(
            ids,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            )
    url_list.append(url)  

In [31]:
like_list.clear()
for url in  url_list:
    results = requests.get(url).json()
    try:
        like_count = results['response']['likes']['count']
        like_list.append(like_count)
    except:
        like_list.append(0)

In [32]:
# results = requests.get('https://api.foursquare.com/v2/venues/4c6aa500c946e21ec432ed8e/likes?&client_id=K0S5WC0VPH3FXOJXNVY1WKEZOSTJCUJBGFMT52TQ2BHAT3MX&client_secret=IJ34AI144DDJLZ2W3ANDQ0NJSD2XMM01YHY3K0JHBRDLM0WX&v=20180605').json()
# results
len(like_list)

547

In [33]:
toronto_res.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category
1,"Malvern, Rouge",43.727929,-79.262029,4c6aa500c946e21ec432ed8e,Tim Hortons,43.726895,-79.266157,Coffee Shop
8,"Rouge Hill, Port Union, Highland Creek",43.7942,-79.262029,4b50b100f964a520b72d27e3,El Pulgarcito,43.792648,-79.259208,Latin American Restaurant
15,"Guildwood, Morningside, West Hill",43.778517,-79.346556,51212da18055ce6af0c027d7,Hero Certified Burgers,43.777295,-79.344584,Burger Joint
18,"Guildwood, Morningside, West Hill",43.778517,-79.346556,4af5ff72f964a520f6ff21e3,New York Fries - Fairview Mall,43.778605,-79.343577,Restaurant
20,"Guildwood, Morningside, West Hill",43.778517,-79.346556,50bceb3ae4b01dc9b287cc99,Aroma Espresso Bar,43.7777,-79.344652,Coffee Shop


In [34]:
toronto_res.shape

(547, 8)

In [35]:
toronto_res['Likes'] = like_list
toronto_res.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,Likes
1,"Malvern, Rouge",43.727929,-79.262029,4c6aa500c946e21ec432ed8e,Tim Hortons,43.726895,-79.266157,Coffee Shop,8
8,"Rouge Hill, Port Union, Highland Creek",43.7942,-79.262029,4b50b100f964a520b72d27e3,El Pulgarcito,43.792648,-79.259208,Latin American Restaurant,9
15,"Guildwood, Morningside, West Hill",43.778517,-79.346556,51212da18055ce6af0c027d7,Hero Certified Burgers,43.777295,-79.344584,Burger Joint,7
18,"Guildwood, Morningside, West Hill",43.778517,-79.346556,4af5ff72f964a520f6ff21e3,New York Fries - Fairview Mall,43.778605,-79.343577,Restaurant,7
20,"Guildwood, Morningside, West Hill",43.778517,-79.346556,50bceb3ae4b01dc9b287cc99,Aroma Espresso Bar,43.7777,-79.344652,Coffee Shop,13


In [36]:
print('max : {}', toronto_res['Likes'].max())
print('min : {}', toronto_res['Likes'].min())
print('median : {}', toronto_res['Likes'].median())
print(toronto_res['Likes'].value_counts())

max : {} 488
min : {} 0
median : {} 16.0
1      37
0      30
6      28
4      21
21     21
9      18
7      16
2      16
11     16
3      15
8      14
10     14
14     13
5      10
16     10
35      9
15      9
28      8
32      8
19      8
47      7
12      7
18      6
30      6
25      6
23      6
24      5
20      5
54      5
13      5
       ..
299     1
306     1
339     1
355     1
149     1
142     1
57      1
141     1
65      1
69      1
74      1
78      1
79      1
81      1
82      1
85      1
89      1
90      1
91      1
94      1
100     1
105     1
113     1
114     1
118     1
121     1
128     1
132     1
136     1
71      1
Name: Likes, Length: 128, dtype: int64


### Applying one hot encoding to categories of places.

In [37]:
tor_res_onehot = pd.get_dummies(toronto_res[['Venue Category', 'Likes']], prefix = "", prefix_sep = "")
tor_res_onehot['Venue'] = toronto_res['Venue']
col = tor_res_onehot.columns.tolist()
ind = col.index('Venue')
fixed_columns = [tor_res_onehot.columns[ind]] + list(tor_res_onehot.columns[0:ind]) + list(tor_res_onehot.columns[ind+1:])
tor_res_onehot = tor_res_onehot[fixed_columns]
tor_res_onehot.head()

Unnamed: 0,Venue,Likes,American Restaurant,Asian Restaurant,Belgian Restaurant,Burger Joint,Café,Cajun / Creole Restaurant,Caribbean Restaurant,Chinese Restaurant,...,Ramen Restaurant,Restaurant,Sandwich Place,Seafood Restaurant,Sushi Restaurant,Taiwanese Restaurant,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
1,Tim Hortons,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,El Pulgarcito,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,Hero Certified Burgers,7,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,New York Fries - Fairview Mall,7,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
20,Aroma Espresso Bar,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Clustering food places using KMeans algorithm based on their categories and number of likes.

In [61]:
clusters = 5

tor_clustering = tor_res_onehot.drop('Venue', 1)

kmeans = KMeans(n_clusters = clusters, random_state = 4).fit(tor_clustering)

kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 3, 0, 3, 0, 0], dtype=int32)

In [62]:
tor_res_onehot['Cluster Labels'] = kmeans.labels_
tor_res_onehot.head()

Unnamed: 0,Venue,Likes,American Restaurant,Asian Restaurant,Belgian Restaurant,Burger Joint,Café,Cajun / Creole Restaurant,Caribbean Restaurant,Chinese Restaurant,...,Restaurant,Sandwich Place,Seafood Restaurant,Sushi Restaurant,Taiwanese Restaurant,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Cluster Labels
1,Tim Hortons,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,El Pulgarcito,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,Hero Certified Burgers,7,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,New York Fries - Fairview Mall,7,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
20,Aroma Espresso Bar,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
toronto_res['Cluster Labels'] = tor_res_onehot['Cluster Labels'] 

toronto_res.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue ID,Venue,Venue Latitude,Venue Longitude,Venue Category,Likes,Cluster Labels
1,"Malvern, Rouge",43.727929,-79.262029,4c6aa500c946e21ec432ed8e,Tim Hortons,43.726895,-79.266157,Coffee Shop,8,0
8,"Rouge Hill, Port Union, Highland Creek",43.7942,-79.262029,4b50b100f964a520b72d27e3,El Pulgarcito,43.792648,-79.259208,Latin American Restaurant,9,0
15,"Guildwood, Morningside, West Hill",43.778517,-79.346556,51212da18055ce6af0c027d7,Hero Certified Burgers,43.777295,-79.344584,Burger Joint,7,0
18,"Guildwood, Morningside, West Hill",43.778517,-79.346556,4af5ff72f964a520f6ff21e3,New York Fries - Fairview Mall,43.778605,-79.343577,Restaurant,7,0
20,"Guildwood, Morningside, West Hill",43.778517,-79.346556,50bceb3ae4b01dc9b287cc99,Aroma Espresso Bar,43.7777,-79.344652,Coffee Shop,13,0


In [64]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_res['Venue Latitude'], toronto_res['Venue Longitude'], toronto_res['Venue'], toronto_res['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [78]:
cluster_0 = toronto_res.loc[toronto_res['Cluster Labels'] == 0, toronto_res.columns[[0, 4, 7, 8, 9] ]]
cluster_1 = toronto_res.loc[toronto_res['Cluster Labels'] == 1, toronto_res.columns[[0, 4, 7, 8, 9] ]]
cluster_2 = toronto_res.loc[toronto_res['Cluster Labels'] == 2, toronto_res.columns[[0, 4, 7, 8, 9] ]]
cluster_3 = toronto_res.loc[toronto_res['Cluster Labels'] == 3, toronto_res.columns[[0, 4, 7, 8, 9] ]]
cluster_4 = toronto_res.loc[toronto_res['Cluster Labels'] == 4, toronto_res.columns[[0, 4, 7, 8, 9] ]]

In [72]:
clusters_df = pd.DataFrame(columns = ['Min', 'Max', 'Mean'])
clusters_df

Unnamed: 0,Min,Max,Mean


In [73]:
clusters_df['Max'] = toronto_res.groupby(['Cluster Labels']).max()['Likes']
clusters_df['Min'] = toronto_res.groupby(['Cluster Labels']).min()['Likes']
clusters_df['Mean'] = toronto_res.groupby(['Cluster Labels']).mean()['Likes']
clusters_df

Unnamed: 0_level_0,Min,Max,Mean
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,37,11.895939
1,118,218,170.225
2,231,355,283.411765
3,38,114,63.913978
4,449,488,462.0


In [74]:
clusters_df['Mean'] = clusters_df['Mean'].round(2)
clusters_df

Unnamed: 0_level_0,Min,Max,Mean
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,37,11.9
1,118,218,170.22
2,231,355,283.41
3,38,114,63.91
4,449,488,462.0


In [75]:
clusters_df.sort_values('Min', inplace = True)
clusters_df

Unnamed: 0_level_0,Min,Max,Mean
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,37,11.9
3,38,114,63.91
1,118,218,170.22
2,231,355,283.41
4,449,488,462.0


## Analysis:

<p>
Here, we have categorised different food places into 7 different clusters based on there category and number of likes. The above dataframe shows different statistics for each cluster namely min, max, and mean. We can easily identify the different ranges od likes from clusters.<br><br>
1. Cluster_0: In this cluster, we have range of 0-37. So, we can say that cluster contains places which are rated poorly based on the data we have used. Because there can be many factors such as how many people go there everyday, what kind of people lives in neighborhood and so on. As there can be many factors to evaluate the same, but keeping in mind the data used in this notebook It contains places with very low likes compared to other places.<br><br>
2. Cluster_3: We have places with number of likes 38-114. We can identify this cluster as slightly better rated places compared to Cluster_0.<br><br>
3. Cluster_1: This places have good rating as many users have liked so people may want to go here than the above two clusters.<br><br>
4. Cluster_2: This contain places with high number of likes so they must be very popular aroung the neighborhood.<br><br>
5. Cluster_4: This are very highly rated places and they should be at the top in the list of recommandation.<br><br>    
</p>

<p>With above analysis we can easily find highly rated places in particular neighborhood in Toronto. It can also be used to analyze that in which area what kind of food places have low number of likes. So, if someone want to open an restaurant or food chain. They can do so and have good sells if they can provide good food as other places are not much popular in neighborhood.</p>

### Plotting data of food places which have number of likes > 200. Blue shows 'Neighborhoods' and Red shows 'Food places'.

In [89]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neigh, like in zip(toronto_res['Neighborhood Latitude'], toronto_res['Neighborhood Longitude'], toronto_res['Neighborhood'], toronto_res['Likes']):
    label = folium.Popup(str(neigh), parse_html=True)
    if like > 200:
        folium.CircleMarker(
            [lat, lon],
            radius=10,
            popup=label,
            color='Blue',
            fill=True,
            fill_color='Blue',
            fill_opacity=0.7).add_to(map_clusters)
for lat, lon, poi, like in zip(toronto_res['Venue Latitude'], toronto_res['Venue Longitude'], toronto_res['Venue'], toronto_res['Likes']):
    label = folium.Popup(str(poi), parse_html=True)
    if like > 200:    
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(map_clusters)
map_clusters