## Segmenting and Clustering Neighborhoods in Toronto

**Web scrapping section:**

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
url_page = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
data = requests.get(url_page).text

In [4]:
soup = BeautifulSoup(data, "html5lib")

**Printing with prettify  (just for visualization)**

In [5]:
# printing with prettify()
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"60e5a44e-3093-42f6-98af-4c1787ac6e13","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":1029579868,"wgRevisionId":1029579868,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Communica

In [6]:
table_contents=[]
table = soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode']=row.p.text[:3]
        cell['Borough']=(row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# printing
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
        

**This is the dataframe created:**

In [7]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


**This is the dataframe shape:**

In [8]:
df.shape

(103, 3)

### Clustering Section

**Imports:** 

In [10]:
import random
from IPython.display import Image
from IPython.core.display import HTML
from pandas.io.json import json_normalize
import geocoder
from geopy.geocoders import Nominatim
import folium
from folium import plugins
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.cluster import KMeans 
from sklearn.datasets.samples_generator import make_blobs

**Reading and concatenate to create the dataframe**

In [11]:
df_tor= pd.read_csv('Geospatial_Coordinates.csv')
tor_frame = [df, df_tor] ## df is the previous dataframe
tor_frame = pd.concat(tor_frame, axis=1, sort=False)
tor_frame # this is the new dataframe

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M1B,43.806686,-79.194353
1,M4A,North York,Victoria Village,M1C,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M1G,43.770992,-79.216917
4,M7A,Queen's Park,Ontario Provincial Government,M1H,43.773136,-79.239476
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M9N,43.706876,-79.518188
99,M4Y,Downtown Toronto,Church and Wellesley,M9P,43.696319,-79.532242
100,M7Y,East Toronto Business,Enclave of M4L,M9R,43.688905,-79.554724
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M9V,43.739416,-79.588437


**Dropping the columns Postal Code**

In [12]:
tor_frame.drop(['Postal Code', 'PostalCode'], axis=1, inplace=True)

In [13]:
tor_frame.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.806686,-79.194353
1,North York,Victoria Village,43.784535,-79.160497
2,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,Queen's Park,Ontario Provincial Government,43.773136,-79.239476


In [14]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(tor_frame['Borough'].unique()),
        tor_frame.shape[0]
    )
)

The dataframe has 15 boroughs and 103 neighborhoods.


**Toronto latitude and longitude:**

In [15]:
city = 'Toronto, ON'
geolocator = Nominatim(user_agent="toronto")
location = geolocator.geocode(city)
latitude = location.latitude
longitude = location.longitude
print("Toronto latitude: ", location.latitude,"\n","Toronto longitude: ", location.longitude )

Toronto latitude:  43.6534817 
 Toronto longitude:  -79.3839347


In [16]:
# this is another way using Nominatim passing the coordinates we can see its name

geolocator = Nominatim(user_agent = "url")
location = geolocator.reverse("43.6534817, -79.3839347")
print(location.address)

Toronto City Hall, 100, Queen Street West, Financial District, Spadina—Fort York, Old Toronto, Toronto, Golden Horseshoe, Ontario, M5H 2N2, Canada


**Toronto map with neighborhoods:**

In [17]:
# create map of Toronto using latitude and longitude values
map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(tor_frame['Latitude'], tor_frame['Longitude'], tor_frame['Borough'], tor_frame['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  
    
map_tor

**Segmenting and clustering the North York Borough:**

In [18]:
north_york_data = tor_frame[tor_frame['Borough'] == 'North York'].reset_index(drop=True)
north_york_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,North York,Parkwoods,43.806686,-79.194353
1,North York,Victoria Village,43.784535,-79.160497
2,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
3,North York,Don Mills North,43.711112,-79.284577
4,North York,Glencairn,43.75741,-79.273304


Let's get the geographical coordinates of North York

In [19]:
address = 'North York, ON, Canada'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude_ny = location.latitude
longitude_ny = location.longitude
print("North York coordinates are: {}, {}".format(latitude_ny, longitude_ny))

North York coordinates are: 43.7543263, -79.44911696639593


### Visualization
This is the neighborhood map from North York 

In [20]:
# create map of Manhattan using latitude and longitude values
map_north_york = folium.Map(location=[latitude_ny, longitude_ny], zoom_start=11)

# add markers to map
for lat, lng, label in zip(north_york_data['Latitude'], north_york_data['Longitude'], north_york_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_north_york)
    

map_north_york


In [21]:
# create map of North York using latitude and longitude values
# this is a way for grouping together neighborhoods
map_north_york = folium.Map(location=[latitude_ny, longitude_ny], zoom_start=11)

# instantiate a mark cluster object for the incidents in the dataframe
group_tor = plugins.MarkerCluster().add_to(map_north_york)

# add markers to map
for lat, lng, label in zip(north_york_data['Latitude'], north_york_data['Longitude'], north_york_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(group_tor)  
    
map_north_york

**Foursquare API:**

In [23]:
# @hidden_cell
CLIENT_ID = 'IYJ1TPEJFEWISEQRDNZNXO4HJQ0MTMY0AVDGBJBOLUEYAGH0' # your Foursquare ID
CLIENT_SECRET = 'NBRRXVQT1SW0MURIG4D2I3WY5BPEGFOLO5HMF0JHHZLXYKB3' # your Foursquare Secret
ACCESS_TOKEN = 'GV2OQEHO0DVFSNTSKPYTFGYKDTENRZWEVTNQYGA2BHF4J40H' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 100

**Let's explore the first neighborhood in the dataframe**

In [24]:
north_york_data.loc[0, 'Neighborhood']

'Parkwoods'

Parkwoods latitud and longitude

In [25]:
neighborhood_latitude = north_york_data.loc[0, 'Latitude']
neighborhood_longitude = north_york_data.loc[0, 'Longitude']
neighborhood_name = north_york_data.loc[0, 'Neighborhood']

print("Latitude and longitude values of {} are {}, {}".format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.806686299999996, -79.19435340000001


Top 100 venues that are in Parkwoods within a radius of 1000 meters.

In [28]:
# @hidden_cell
LIMIT = 100
radius = 1000
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=IYJ1TPEJFEWISEQRDNZNXO4HJQ0MTMY0AVDGBJBOLUEYAGH0&client_secret=NBRRXVQT1SW0MURIG4D2I3WY5BPEGFOLO5HMF0JHHZLXYKB3&v=20180604&ll=43.806686299999996,-79.19435340000001&radius=1000&limit=100'

Send the GET request and examine the resutls

In [29]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60d9cce8e87a100d585584ff'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 19,
  'suggestedBounds': {'ne': {'lat': 43.81568630900001,
    'lng': -79.18190576146081},
   'sw': {'lat': 43.797686290999984, 'lng': -79.20680103853921}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4c6ac7de35d3be9a50bf2206',
       'name': 'RBC Royal Bank',
       'location': {'address': '865 MILNER AVE',
        'crossStreet': 'Morningside',
        'lat': 43.79878248056552,
        'lng': -79.19709031445504,
        'labeledLatLngs': [{'label': 

Function that extracts the **category of the venue**

In [30]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Cleaning the json and structure it into a pandas dataframe

In [31]:
venues = results['response']['groups'][0]['items']

nearby_venues = pd.json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,RBC Royal Bank,Bank,43.798782,-79.19709
1,Wendy’s,Fast Food Restaurant,43.807448,-79.199056
2,Wendy's,Fast Food Restaurant,43.802008,-79.19808
3,Harvey's,Restaurant,43.80002,-79.198307
4,Caribbean Wave,Caribbean Restaurant,43.798558,-79.195777


In [32]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

19 venues were returned by Foursquare.


### Exploring Neighborhoods in North York


**function to repeat the same process to all the neighborhoods in North York**

In [33]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [34]:
# we run the above function on each neighborhood and create a new dataframe called north_york_venues.
north_york_venues = getNearbyVenues(names=north_york_data['Neighborhood'],
                                   latitudes=north_york_data['Latitude'],
                                   longitudes=north_york_data['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor, Lawrence Heights
Don Mills North
Glencairn
Don Mills South
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Fairview, Henry Farm, Oriole
Northwood Park, York University
Bayview Village
Downsview East
York Mills, Silver Hills
Downsview West
North Park, Maple Leaf Park, Upwood Park
Humber Summit
Willowdale, Newtonbrook
Downsview Central
Bedford Park, Lawrence Manor East
Humberlea, Emery
Willowdale South
Downsview Northwest
York Mills West
Willowdale West


**size of the resulting dataframe:**

In [35]:
print(north_york_venues.shape)
north_york_venues.head()

(633, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,Parkwoods,43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,Victoria Village,43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
3,Victoria Village,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop


Let's check how many venues were returned for each neighborhood

In [36]:
north_york_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor, Wilson Heights, Downsview North",17,17,17,17,17,17
Bayview Village,18,18,18,18,18,18
"Bedford Park, Lawrence Manor East",83,83,83,83,83,83
Don Mills North,7,7,7,7,7,7
Don Mills South,12,12,12,12,12,12
Downsview Central,44,44,44,44,44,44
Downsview East,3,3,3,3,3,3
Downsview Northwest,100,100,100,100,100,100
Downsview West,13,13,13,13,13,13
"Fairview, Henry Farm, Oriole",5,5,5,5,5,5


These unique categories can be curated from all the returned venues

In [37]:
print("There are {} unique categories".format(len(north_york_venues['Venue Category'].unique())))

There are 163 unique categories


### Analyzing Each Neighborhood

In [38]:
# one hot encoding
north_york_onehot = pd.get_dummies(north_york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
north_york_onehot['Neighborhood'] = north_york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [north_york_onehot.columns[-1]] + list(north_york_onehot.columns[:-1])
north_york_onehot = north_york_onehot[fixed_columns]

north_york_onehot.head()

Unnamed: 0,Yoga Studio,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
north_york_onehot.shape

(633, 163)

**let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category**

In [40]:
north_york_grouped = north_york_onehot.groupby('Neighborhood').mean().reset_index()
north_york_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop
0,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0
2,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.024096,0.0,0.0,0.012048,0.0,0.012048,...,0.0,0.0,0.0,0.0,0.012048,0.0,0.0,0.0,0.012048,0.0
3,Don Mills North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Don Mills South,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Downsview Central,0.0,0.0,0.0,0.0,0.022727,0.0,0.022727,0.0,0.022727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727
6,Downsview East,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Downsview Northwest,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.04,...,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0
8,Downsview West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Fairview, Henry Farm, Oriole",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
north_york_grouped.shape # new size

(24, 163)

#### Let's print each neighborhood along with the top 5 most common venues

In [42]:
num_top_venues = 5

for hood in north_york_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = north_york_grouped[north_york_grouped['Neighborhood']==hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0                       Bank  0.12
1                Coffee Shop  0.12
2                       Park  0.06
3                Bridal Shop  0.06
4  Middle Eastern Restaurant  0.06


----Bayview Village----
               venue  freq
0     Sandwich Place  0.11
1  Indian Restaurant  0.11
2               Bank  0.06
3     Discount Store  0.06
4           Bus Line  0.06


----Bedford Park, Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.08
1                Café  0.06
2        Cocktail Bar  0.05
3  Italian Restaurant  0.05
4            Beer Bar  0.04


----Don Mills North----
            venue  freq
0          Bakery  0.29
1    Soccer Field  0.14
2   Metro Station  0.14
3            Park  0.14
4  Ice Cream Shop  0.14


----Don Mills South----
                  venue  freq
0  Fast Food Restaurant  0.17
1           Pizza Place  0.17
2           Coffee Shop  0.08
3   Fried Chicken 

**Putting that into a pandas dataframe**

In [43]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Let's create the new dataframe and display the top 10 venues for each neighborhood.

In [44]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = north_york_grouped['Neighborhood']

for ind in np.arange(north_york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(north_york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Pizza Place,Shopping Mall,Bridal Shop,Restaurant,Middle Eastern Restaurant,Sandwich Place,Ice Cream Shop,Sushi Restaurant
1,Bayview Village,Indian Restaurant,Sandwich Place,Pharmacy,Discount Store,Burger Joint,Bus Line,Fast Food Restaurant,Bank,Supermarket,Pizza Place
2,"Bedford Park, Lawrence Manor East",Coffee Shop,Café,Italian Restaurant,Cocktail Bar,Restaurant,Beer Bar,Clothing Store,Gym,Farmers Market,Moroccan Restaurant
3,Don Mills North,Bakery,Soccer Field,Ice Cream Shop,Bus Line,Metro Station,Park,Gay Bar,Electronics Store,Distribution Center,Greek Restaurant
4,Don Mills South,Pizza Place,Fast Food Restaurant,Fried Chicken Joint,Coffee Shop,Pharmacy,Gas Station,Chinese Restaurant,Thai Restaurant,Italian Restaurant,Bank
5,Downsview Central,Coffee Shop,Pub,Park,Bakery,Café,Wine Shop,Electronics Store,Performing Arts Venue,Mexican Restaurant,Hotel
6,Downsview East,Park,Convenience Store,Intersection,Wine Shop,Dessert Shop,Escape Room,Electronics Store,Donut Shop,Distribution Center,Discount Store
7,Downsview Northwest,Coffee Shop,Café,Hotel,Sandwich Place,Asian Restaurant,Deli / Bodega,Salad Place,Japanese Restaurant,Bakery,Pharmacy
8,Downsview West,Coffee Shop,Pet Store,Chinese Restaurant,Diner,Spa,Sporting Goods Shop,Fast Food Restaurant,Bagel Shop,Café,Toy / Game Store
9,"Fairview, Henry Farm, Oriole",Gym / Fitness Center,Shopping Mall,Grocery Store,Athletics & Sports,Liquor Store,Dim Sum Restaurant,Escape Room,Electronics Store,Donut Shop,Distribution Center


### Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [45]:
# set number of clusters
kclusters = 5

nyork_grouped_clustering = north_york_grouped.drop('Neighborhood', 1)

# run k-means clustering

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(nyork_grouped_clustering)

# check cluster labels generated for each row in the dataframe

kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

Creating a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [46]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

north_york_merged = north_york_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
north_york_merged = north_york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

north_york_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,Parkwoods,43.806686,-79.194353,0,Fast Food Restaurant,Print Shop,Wine Shop,Department Store,Electronics Store,Donut Shop,Distribution Center,Discount Store,Diner,Dim Sum Restaurant
1,North York,Victoria Village,43.784535,-79.160497,4,Golf Course,Bar,Wine Shop,Dessert Shop,Escape Room,Electronics Store,Donut Shop,Distribution Center,Discount Store,Diner
2,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917,2,Coffee Shop,Korean BBQ Restaurant,Wine Shop,Dessert Shop,Escape Room,Electronics Store,Donut Shop,Distribution Center,Discount Store,Diner
3,North York,Don Mills North,43.711112,-79.284577,1,Bakery,Soccer Field,Ice Cream Shop,Bus Line,Metro Station,Park,Gay Bar,Electronics Store,Distribution Center,Greek Restaurant
4,North York,Glencairn,43.75741,-79.273304,1,Indian Restaurant,Pet Store,Vietnamese Restaurant,Brewery,Chinese Restaurant,Dessert Shop,Electronics Store,Donut Shop,Distribution Center,Discount Store


Finally, let's visualize the resulting clusters

In [47]:
# create map
map_clusters = folium.Map(location=[latitude_ny, longitude_ny], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(north_york_merged['Latitude'], north_york_merged['Longitude'], north_york_merged['Neighborhood'], north_york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

In [48]:
north_york_merged.loc[north_york_merged['Cluster Labels'] == 0, north_york_merged.columns[[1] + list(range(5, north_york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,Fast Food Restaurant,Print Shop,Wine Shop,Department Store,Electronics Store,Donut Shop,Distribution Center,Discount Store,Diner,Dim Sum Restaurant


In [49]:
north_york_merged.loc[north_york_merged['Cluster Labels'] == 1, north_york_merged.columns[[1] + list(range(5, north_york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Don Mills North,Bakery,Soccer Field,Ice Cream Shop,Bus Line,Metro Station,Park,Gay Bar,Electronics Store,Distribution Center,Greek Restaurant
4,Glencairn,Indian Restaurant,Pet Store,Vietnamese Restaurant,Brewery,Chinese Restaurant,Dessert Shop,Electronics Store,Donut Shop,Distribution Center,Discount Store
5,Don Mills South,Pizza Place,Fast Food Restaurant,Fried Chicken Joint,Coffee Shop,Pharmacy,Gas Station,Chinese Restaurant,Thai Restaurant,Italian Restaurant,Bank
6,Hillcrest Village,Coffee Shop,Restaurant,Gym,Clothing Store,Supermarket,Beer Store,Sporting Goods Shop,Italian Restaurant,Dim Sum Restaurant,Sandwich Place
7,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Pizza Place,Shopping Mall,Bridal Shop,Restaurant,Middle Eastern Restaurant,Sandwich Place,Ice Cream Shop,Sushi Restaurant
8,"Fairview, Henry Farm, Oriole",Gym / Fitness Center,Shopping Mall,Grocery Store,Athletics & Sports,Liquor Store,Dim Sum Restaurant,Escape Room,Electronics Store,Donut Shop,Distribution Center
9,"Northwood Park, York University",Coffee Shop,Pizza Place,Hockey Arena,Portuguese Restaurant,Financial or Legal Service,Intersection,Department Store,Donut Shop,Distribution Center,Discount Store
10,Bayview Village,Indian Restaurant,Sandwich Place,Pharmacy,Discount Store,Burger Joint,Bus Line,Fast Food Restaurant,Bank,Supermarket,Pizza Place
11,Downsview East,Park,Convenience Store,Intersection,Wine Shop,Dessert Shop,Escape Room,Electronics Store,Donut Shop,Distribution Center,Discount Store
12,"York Mills, Silver Hills",Gym / Fitness Center,Hotel,Breakfast Spot,Sandwich Place,Food & Drink Shop,Department Store,Dance Studio,Playground,Pizza Place,Park


In [50]:
north_york_merged.loc[north_york_merged['Cluster Labels'] == 2, north_york_merged.columns[[1] + list(range(5, north_york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,"Lawrence Manor, Lawrence Heights",Coffee Shop,Korean BBQ Restaurant,Wine Shop,Dessert Shop,Escape Room,Electronics Store,Donut Shop,Distribution Center,Discount Store,Diner


In [51]:
north_york_merged.loc[north_york_merged['Cluster Labels'] == 3, north_york_merged.columns[[1] + list(range(5, north_york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Humber Summit,Park,Playground,Trail,Wine Shop,Deli / Bodega,Donut Shop,Distribution Center,Discount Store,Diner,Dim Sum Restaurant


In [52]:
north_york_merged.loc[north_york_merged['Cluster Labels'] == 4, north_york_merged.columns[[1] + list(range(5, north_york_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Victoria Village,Golf Course,Bar,Wine Shop,Dessert Shop,Escape Room,Electronics Store,Donut Shop,Distribution Center,Discount Store,Diner
