#### Download the dependencies

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
import json 
import requests
import folium
import wikipedia
import urllib

from urllib.request import urlopen
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print('Libraries imported.')

Libraries imported.


# 1. Data Preparation

In [10]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" 
content = urlopen(url).read()
soup = BeautifulSoup(content)
article = soup.find('table', class_ = "wikitable sortable")
neighborhoods = pd.DataFrame()

b = 2
k = 2
for td in article.find_all('td'):
    b = b + 1
    b2 = (b//3) - 1
    k = (k + 1) % 3
    kolom = td.text
    neighborhoods.loc[b2,k]=kolom

neighborhoods = neighborhoods.replace(r'\n',' ', regex=True)
neighborhoods = neighborhoods.set_axis(['Postcode', 'Borough', 'Neighborhood'], axis=1, inplace=False)
neighborhoods = neighborhoods[~(neighborhoods.Borough == "Not assigned")]
neighborhoods = neighborhoods.replace(r'Not assigned',"""Queen's Park""", regex=True)
neighborhoods = neighborhoods.reset_index()
neighborhoods.sort_values(by=['Borough'])
del neighborhoods['index']

neighborhoods = neighborhoods.groupby(['Postcode','Borough'])['Neighborhood'].apply( ', '.join).reset_index()
geos = pd.read_csv('Geospatial_Coordinates.csv')
neighborhoods = neighborhoods.join(geos, how='outer')
del neighborhoods['Postal Code']

neighborhoods

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge , Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood , Morningside , West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park , Ionview , Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea , Golden Mile , Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest , Cliffside , Scarborough Village W...",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff , Cliffside West",43.692657,-79.264848


### Create a map of Toronto with neighborhoods superimposed on top.

In [3]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude + 0.06
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of Toronto, Canada are 43.713963, -79.387207.


In [4]:
# create map using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Selecting East York as Borough of Interest

In [14]:
east_york = neighborhoods[neighborhoods['Borough'] == 'East York'].reset_index(drop=True)
east_york

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4B,East York,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M4G,East York,Leaside,43.70906,-79.363452
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372
4,M4J,East York,East Toronto,43.685347,-79.338106


### Get the geographical coordinates of Central Toronto.

In [31]:
address = 'East York, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude + 0.0055
longitude = location.longitude - 0.005
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of East York, Canada are 43.6968391, -79.3328212.


In [32]:
# create map using latitude and longitude values
map_ey = folium.Map(location=[latitude, longitude], zoom_start=14)

# add markers to map
for lat, lng, label in zip(east_york['Latitude'], east_york['Longitude'], east_york['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ey)  
    
map_ey

#### Define Foursquare Credentials and Version

In [33]:
CLIENT_ID = 'NGXMNDQGMTNYBEUS0QNJBALTPK4SY4M55V3C5132D0Y5OUSR' # your Foursquare ID
CLIENT_SECRET = '3SLRGAELDPYSIW1VRUT4EE3I3A0FHFWRENFMHPOS2B0VNFQC' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: NGXMNDQGMTNYBEUS0QNJBALTPK4SY4M55V3C5132D0Y5OUSR
CLIENT_SECRET:3SLRGAELDPYSIW1VRUT4EE3I3A0FHFWRENFMHPOS2B0VNFQC


#### Let's explore the first neighborhood in our dataframe.

Get the neighborhood's name.

In [49]:
east_york.loc[0, 'Neighborhood']

'Woodbine Gardens , Parkview Hill '

Get the neighborhood's latitude and longitude values.

In [41]:
neighborhood_latitude = east_york.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = east_york.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = east_york.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Woodbine Gardens , Parkview Hill  are 43.7063972, -79.309937.


#### Now, let's get the top 100 venues that are in neighborhood within a radius of 500 meters.

First, let's create the GET request URL. Name your URL **url**.

In [42]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=NGXMNDQGMTNYBEUS0QNJBALTPK4SY4M55V3C5132D0Y5OUSR&client_secret=3SLRGAELDPYSIW1VRUT4EE3I3A0FHFWRENFMHPOS2B0VNFQC&v=20180605&ll=43.7063972,-79.309937&radius=500&limit=100'

Send the GET request and examine the resutls

In [43]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5daf1bdfc58ed7002cae28f2'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': "O'Connor - Parkview",
  'headerFullLocation': "O'Connor - Parkview, Toronto",
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 13,
  'suggestedBounds': {'ne': {'lat': 43.7108972045, 'lng': -79.30372360313615},
   'sw': {'lat': 43.701897195499996, 'lng': -79.31615039686386}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b5a3842f964a52023b528e3',
       'name': 'Jawny Bakers',
       'location': {'address': "804 O'Connor Dr",
        'crossStreet': 'St Clair E',
        'lat': 43.705782646822,
        'lng': -79.31291304477831,
        'labeledLatLngs':

From the Foursquare lab in the previous module, we know that all the information is in the *items* key. Before we proceed, let's borrow the **get_category_type** function from the Foursquare lab.

In [44]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [45]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Jawny Bakers,Gastropub,43.705783,-79.312913
1,East York Gymnastics,Gym / Fitness Center,43.710654,-79.309279
2,Shoppers Drug Mart,Pharmacy,43.705933,-79.312825
3,TD Canada Trust,Bank,43.70574,-79.31227
4,Pizza Pizza,Pizza Place,43.705159,-79.31313


And how many venues were returned by Foursquare?

In [46]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

13 venues were returned by Foursquare.


<a id='item2'></a>

# 2. Explore Neighborhoods in East York

#### Let's create a function to repeat the same process to all the neighborhoods in Manhattan

In [47]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Now write the code to run the above function on each neighborhood and create a new dataframe.

In [48]:
east_york_venues = getNearbyVenues(names=east_york['Neighborhood'],
                                   latitudes=east_york['Latitude'],
                                   longitudes=east_york['Longitude'])

Woodbine Gardens , Parkview Hill 
Woodbine Heights 
Leaside 
Thorncliffe Park 
East Toronto 


#### Let's check the size of the resulting dataframe

In [51]:
print(east_york_venues.shape)
east_york_venues.head()

(77, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937,Jawny Bakers,43.705783,-79.312913,Gastropub
1,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937,East York Gymnastics,43.710654,-79.309279,Gym / Fitness Center
2,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937,Shoppers Drug Mart,43.705933,-79.312825,Pharmacy
3,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937,TD Canada Trust,43.70574,-79.31227,Bank
4,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937,Pizza Pizza,43.705159,-79.31313,Pizza Place


Let's check how many venues were returned for each neighborhood

In [52]:
east_york_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
East Toronto,4,4,4,4,4,4
Leaside,34,34,34,34,34,34
Thorncliffe Park,17,17,17,17,17,17
"Woodbine Gardens , Parkview Hill",13,13,13,13,13,13
Woodbine Heights,9,9,9,9,9,9


#### Let's find out how many unique categories can be curated from all the returned venues

In [53]:
print('There are {} uniques categories.'.format(len(east_york_venues['Venue Category'].unique())))

There are 45 uniques categories.


## 3. Analyze Each Neighborhood

In [54]:
# one hot encoding
east_york_onehot = pd.get_dummies(east_york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
east_york_onehot['Neighborhood'] = east_york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [east_york_onehot.columns[-1]] + list(east_york_onehot.columns[:-1])
east_york_onehot = east_york_onehot[fixed_columns]

east_york_onehot.head()

Unnamed: 0,Neighborhood,Asian Restaurant,Athletics & Sports,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Clothing Store,Coffee Shop,Convenience Store,Cosmetics Shop,Curling Ice,Dessert Shop,Electronics Store,Fast Food Restaurant,Fish & Chips Shop,Furniture / Home Store,Gastropub,Grocery Store,Gym,Gym / Fitness Center,Indian Restaurant,Intersection,Liquor Store,Mexican Restaurant,Park,Pet Store,Pharmacy,Pizza Place,Record Shop,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Smoothie Shop,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Video Store,Warehouse Store,Yoga Studio
0,"Woodbine Gardens , Parkview Hill",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Woodbine Gardens , Parkview Hill",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Woodbine Gardens , Parkview Hill",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Woodbine Gardens , Parkview Hill",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Woodbine Gardens , Parkview Hill",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [55]:
east_york_onehot.shape

(77, 46)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [56]:
east_york_grouped = east_york_onehot.groupby('Neighborhood').mean().reset_index()
east_york_grouped

Unnamed: 0,Neighborhood,Asian Restaurant,Athletics & Sports,Bagel Shop,Bank,Beer Store,Bike Shop,Breakfast Spot,Brewery,Burger Joint,Bus Line,Clothing Store,Coffee Shop,Convenience Store,Cosmetics Shop,Curling Ice,Dessert Shop,Electronics Store,Fast Food Restaurant,Fish & Chips Shop,Furniture / Home Store,Gastropub,Grocery Store,Gym,Gym / Fitness Center,Indian Restaurant,Intersection,Liquor Store,Mexican Restaurant,Park,Pet Store,Pharmacy,Pizza Place,Record Shop,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Smoothie Shop,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Video Store,Warehouse Store,Yoga Studio
0,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Leaside,0.0,0.0,0.029412,0.029412,0.029412,0.029412,0.029412,0.029412,0.058824,0.0,0.029412,0.117647,0.0,0.0,0.0,0.029412,0.029412,0.0,0.029412,0.058824,0.0,0.029412,0.029412,0.0,0.0,0.0,0.029412,0.029412,0.0,0.029412,0.0,0.0,0.029412,0.029412,0.029412,0.029412,0.0,0.029412,0.088235,0.029412,0.029412,0.029412,0.0,0.0,0.0
2,Thorncliffe Park,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.058824,0.0,0.117647,0.0,0.058824,0.0,0.058824,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.058824
3,"Woodbine Gardens , Parkview Hill",0.0,0.076923,0.0,0.076923,0.0,0.0,0.076923,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,0.0,0.076923,0.0,0.0,0.076923,0.0,0.076923,0.0,0.0,0.0,0.076923,0.076923,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Woodbine Heights,0.111111,0.111111,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0


#### Let's confirm the new size

In [58]:
east_york_grouped.shape

(5, 46)

#### Let's print each neighborhood along with the top 5 most common venues

In [59]:
num_top_venues = 5

for hood in east_york_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = east_york_grouped[east_york_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----East Toronto ----
                venue  freq
0                Park  0.50
1         Coffee Shop  0.25
2   Convenience Store  0.25
3        Liquor Store  0.00
4  Mexican Restaurant  0.00


----Leaside ----
                    venue  freq
0             Coffee Shop  0.12
1     Sporting Goods Shop  0.09
2            Burger Joint  0.06
3  Furniture / Home Store  0.06
4                     Gym  0.03


----Thorncliffe Park ----
               venue  freq
0     Sandwich Place  0.12
1       Burger Joint  0.12
2  Indian Restaurant  0.12
3                Gym  0.06
4      Grocery Store  0.06


----Woodbine Gardens , Parkview Hill ----
                  venue  freq
0           Pizza Place  0.15
1  Fast Food Restaurant  0.15
2          Intersection  0.08
3                  Bank  0.08
4  Gym / Fitness Center  0.08


----Woodbine Heights ----
                venue  freq
0    Asian Restaurant  0.11
1         Curling Ice  0.11
2  Athletics & Sports  0.11
3         Video Store  0.11
4          Beer S

#### Let's put that into a *pandas* dataframe

First, let's write a function to sort the venues in descending order.

In [83]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [84]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = east_york_grouped['Neighborhood']

for ind in np.arange(east_york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(east_york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,Park,Coffee Shop,Convenience Store,Yoga Studio,Furniture / Home Store,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Dessert Shop,Curling Ice
1,Leaside,Coffee Shop,Sporting Goods Shop,Furniture / Home Store,Burger Joint,Gym,Bike Shop,Dessert Shop,Electronics Store,Fish & Chips Shop,Clothing Store
2,Thorncliffe Park,Burger Joint,Indian Restaurant,Sandwich Place,Yoga Studio,Pizza Place,Bank,Coffee Shop,Grocery Store,Warehouse Store,Liquor Store
3,"Woodbine Gardens , Parkview Hill",Fast Food Restaurant,Pizza Place,Gastropub,Athletics & Sports,Bank,Gym / Fitness Center,Breakfast Spot,Intersection,Bus Line,Pharmacy
4,Woodbine Heights,Asian Restaurant,Pharmacy,Video Store,Athletics & Sports,Beer Store,Skating Rink,Curling Ice,Cosmetics Shop,Park,Fast Food Restaurant


## 4. Cluster Neighborhoods

Run *k*-means to cluster the neighborhood into 3 clusters.

In [85]:
# set number of clusters
kclusters = 3

east_york_grouped_clustering = east_york_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(east_york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5]

array([2, 1, 1, 1, 0])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [86]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

east_york_merged = east_york

# merge east_york with east_york to add latitude/longitude for each neighborhood
east_york_merged = east_york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

east_york_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4B,East York,"Woodbine Gardens , Parkview Hill",43.706397,-79.309937,1,Fast Food Restaurant,Pizza Place,Gastropub,Athletics & Sports,Bank,Gym / Fitness Center,Breakfast Spot,Intersection,Bus Line,Pharmacy
1,M4C,East York,Woodbine Heights,43.695344,-79.318389,0,Asian Restaurant,Pharmacy,Video Store,Athletics & Sports,Beer Store,Skating Rink,Curling Ice,Cosmetics Shop,Park,Fast Food Restaurant
2,M4G,East York,Leaside,43.70906,-79.363452,1,Coffee Shop,Sporting Goods Shop,Furniture / Home Store,Burger Joint,Gym,Bike Shop,Dessert Shop,Electronics Store,Fish & Chips Shop,Clothing Store
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372,1,Burger Joint,Indian Restaurant,Sandwich Place,Yoga Studio,Pizza Place,Bank,Coffee Shop,Grocery Store,Warehouse Store,Liquor Store
4,M4J,East York,East Toronto,43.685347,-79.338106,2,Park,Coffee Shop,Convenience Store,Yoga Studio,Furniture / Home Store,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Dessert Shop,Curling Ice


Finally, let's visualize the resulting clusters

In [90]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=14)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(east_york_merged['Latitude'], east_york_merged['Longitude'], east_york_merged['Neighborhood'], east_york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<a id='item5'></a>

## 5. Examine Clusters

Now, you can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, you can then assign a name to each cluster. I will leave this exercise to you.

#### Cluster 1

In [91]:
east_york_merged.loc[east_york_merged['Cluster Labels'] == 0, east_york_merged.columns[[1] + list(range(5, east_york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,East York,0,Asian Restaurant,Pharmacy,Video Store,Athletics & Sports,Beer Store,Skating Rink,Curling Ice,Cosmetics Shop,Park,Fast Food Restaurant


#### Cluster 2

In [92]:
east_york_merged.loc[east_york_merged['Cluster Labels'] == 1, east_york_merged.columns[[1] + list(range(5, east_york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East York,1,Fast Food Restaurant,Pizza Place,Gastropub,Athletics & Sports,Bank,Gym / Fitness Center,Breakfast Spot,Intersection,Bus Line,Pharmacy
2,East York,1,Coffee Shop,Sporting Goods Shop,Furniture / Home Store,Burger Joint,Gym,Bike Shop,Dessert Shop,Electronics Store,Fish & Chips Shop,Clothing Store
3,East York,1,Burger Joint,Indian Restaurant,Sandwich Place,Yoga Studio,Pizza Place,Bank,Coffee Shop,Grocery Store,Warehouse Store,Liquor Store


#### Cluster 3

In [93]:
east_york_merged.loc[east_york_merged['Cluster Labels'] == 2, east_york_merged.columns[[1] + list(range(5, east_york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,East York,2,Park,Coffee Shop,Convenience Store,Yoga Studio,Furniture / Home Store,Fish & Chips Shop,Fast Food Restaurant,Electronics Store,Dessert Shop,Curling Ice
