# Segmenting and Clustering Neighborhoods in Toronto
In this project we are working with the Postcode, Borough and Neighbourhood data for the City of Toronto ,Canada.The focus here is just to fetch,parse and clean the data. 


In [116]:
import requests
from collections import defaultdict
from collections import namedtuple
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import folium
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## 1 Data Cleaning
### 1.1 Downloading the Wikipedia page 
We use the request library to download the wikipeda page containt the postal codes and neighbourthoods of Toronto ,Canada

In [80]:
#Fetching the Html page content
wiki_html=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
wiki_html=wiki_html.text

### 1.2 Parsing the HTML Table
here we use the Beautiful Package to parse the content of the HTML table in the page and extract the required infromation

In [81]:
#Parsing the table
soup=BeautifulSoup(wiki_html,'html.parser')
headers=[]
toronto=defaultdict(list)
for row_index,row in enumerate(soup.table.find_all('tr')):
    #Parding Headers
    if row_index==0:
        for header in soup.table.find_all('th'):
            headers.append(header.string.strip())
    #Parsing rows
    for col_index,column in enumerate(row.find_all('td')):
        toronto[headers[col_index]].append(column.string)
toronto=pd.DataFrame.from_dict(toronto)
toronto.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,
3,M4A,North York,
4,M5A,Downtown Toronto,


### 1.3 Handeling Missing Values

- Only cells that have an assigned borough are processed. Cells with a borough that is Not assigned will be ignored
- If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [82]:
#Dropping rows with missing Boroughs 
toronto=toronto[(toronto['Borough']!='Not assigned')]
#Handeling missing Neighbourhoods 
for index, row in toronto.iterrows():
    if row['Neighbourhood'] is None:
        row['Neighbourhood']=row['Borough']
#Removing new charatcer line
toronto['Neighbourhood']=toronto['Neighbourhood'].apply(lambda x: x.replace('\n','') )
    
toronto.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,North York
3,M4A,North York,North York
4,M5A,Downtown Toronto,Downtown Toronto
5,M5A,Downtown Toronto,Downtown Toronto
6,M6A,North York,North York


### 1.4  Combing Neighbourhood  for simillar postal codes:
More than one neighborhood can exist in one postal code area. The value of these rows will be combined into one row with the neighborhoods separated with a comma as shown below in row 3.

In [83]:
# Mergin simillar postal codes
toronto_gr=toronto.groupby('Postcode').agg({'Borough' : lambda x: ','.join(set(x)),'Neighbourhood' : lambda x: ','.join(set(x))})
toronto_gr.reset_index(inplace=True)
toronto_gr.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Scarborough
1,M1C,Scarborough,Scarborough
2,M1E,Scarborough,"Scarborough,Guildwood"
3,M1G,Scarborough,Scarborough
4,M1H,Scarborough,Scarborough


### 1.5 Verifing the number of rows

In [84]:
toronto_gr.shape

(103, 3)

## 2. Geocoding
The goecoder APi is highly unreliable and we are using the provided CSV file for coordinates lookup.

In [85]:
lookup=pd.read_csv('Geospatial_Coordinates.csv')
Coordinates = namedtuple('Coordinates', 'latitude longitude')
def geocode(postalcode):
    result=lookup[lookup['Postal Code']==postalcode]
    Coordinates.latitude=float(result['Latitude'])
    Coordinates.longitude=float(result['Longitude'])
    return Coordinates
   

In [86]:
toronto_gr['Latitude']=toronto_gr['Postcode'].apply(lambda x:geocode(x).latitude)
toronto_gr['Longitude']=toronto_gr['Postcode'].apply(lambda x:geocode(x).longitude)
toronto_gr.drop('location' ,errors='ignore',axis=1, inplace=True)
toronto_gr.head() 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Scarborough,43.806686,-79.194353
1,M1C,Scarborough,Scarborough,43.784535,-79.160497
2,M1E,Scarborough,"Scarborough,Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Scarborough,43.770992,-79.216917
4,M1H,Scarborough,Scarborough,43.773136,-79.239476


In [87]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_gr['Borough'].unique()),
        toronto_gr.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


We Use geopy library to get the latitude and longitude of Toronto.


In [88]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Create a map of Toronto with neighborhoods superimposed on top.

In [89]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for index, row in toronto_gr.iterrows():
    label = '{}, {}'.format( row['Neighbourhood'], row['Borough'])
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [row['Latitude'], row['Longitude']],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Exploration
For simplicity we only segment and cluster the neighborhoods in Downtown Toronto. So let's slice the original dataframe and create a new dataframe of the DownTown data.

In [90]:
downtown=toronto_gr[toronto_gr['Borough']=='Downtown Toronto'].reset_index(drop=True)
downtown.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Downtown Toronto,43.679563,-79.377529
1,M4X,Downtown Toronto,Downtown Toronto,43.667967,-79.367675
2,M4Y,Downtown Toronto,Downtown Toronto,43.66586,-79.38316
3,M5A,Downtown Toronto,Downtown Toronto,43.65426,-79.360636
4,M5B,Downtown Toronto,"Downtown Toronto,Garden District",43.657162,-79.378937


To visualize the Downtown Toronto We need to find its central location coordinates

In [91]:
address = 'Downtown Toronto, Toronto,ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.655115, -79.380219.


Now, Lets visualize the Downtown Toronto on the map

In [92]:
# create map of Manhattan using latitude and longitude values
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for index, row in downtown.iterrows():
    label = folium.Popup(row['Neighbourhood'], parse_html=True)
    folium.CircleMarker(
        [row['Latitude'], row['Longitude']],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

In the next section, we will use the Foursquare API to explore the neighborhoods and segment them

In [93]:
CLIENT_ID = 'DDKJFMDJVCQDOMJ5VQ2JN3X3LO3W2AQBG0ZZAVVYLOYCDDCF' # your Foursquare ID
CLIENT_SECRET = 'SQVPM0MWXJP5XVJO3UF45MZAZ3BPRO4LB5PJ21LNCKT4CAC3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: DDKJFMDJVCQDOMJ5VQ2JN3X3LO3W2AQBG0ZZAVVYLOYCDDCF
CLIENT_SECRET:SQVPM0MWXJP5XVJO3UF45MZAZ3BPRO4LB5PJ21LNCKT4CAC3


Let's explore the first neighborhood in our dataframe.
Get the neighborhood's name and its coordinates.

In [94]:
downtown.loc[0, 'Neighbourhood']
neighbourhood_latitude = downtown.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = downtown.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = downtown.loc[0, 'Neighbourhood'] # neighborhood name
print('Neighbourhood:',neighbourhood_name)
print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Neighbourhood: Downtown Toronto
Latitude and longitude values of Downtown Toronto are 43.6795626, -79.37752940000001.


In [95]:
radius=500
LIMIT=100
VERSION = '20180604'
search_query=neighbourhood_name
# url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, search_query, radius, LIMIT)
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighbourhood_latitude, neighbourhood_longitude, VERSION, radius, LIMIT)
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c99741a9fb6b73b712e0873'},
 'response': {'headerLocation': 'Rosedale',
  'headerFullLocation': 'Rosedale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.6840626045, 'lng': -79.37131878274371},
   'sw': {'lat': 43.675062595499995, 'lng': -79.38374001725632}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4aff2d47f964a520743522e3',
       'name': 'Rosedale Park',
       'location': {'address': '38 Scholfield Ave.',
        'crossStreet': 'at Edgar Ave.',
        'lat': 43.68232820227814,
        'lng': -79.37893434347683,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.68232820227814,
          'lng': -79.37893434347683}],
        'distance': 32

We borrow the get_category_type function from the Foursquare lab.

In [96]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we clean the json and structure it into a pandas dataframe.

In [97]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Rosedale Park,Playground,43.682328,-79.378934
1,Whitney Park,Park,43.682036,-79.373788
2,Alex Murray Parkette,Park,43.6783,-79.382773
3,Milkman's Lane,Trail,43.676352,-79.373842


Lets see how many venues were returned by Foursquare?

In [98]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


## 3. Explore Neighborhoods in Downtown Toronto

Let's boroow the function to repeat the same process to all the neighborhoods in DownTown Toronto

In [99]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [100]:
downtown_venues = getNearbyVenues(names=downtown['Neighbourhood'],
                                   latitudes=downtown['Latitude'],
                                   longitudes=downtown['Longitude']
                                  )

Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto
Downtown Toronto,Garden District
Downtown Toronto
Downtown Toronto
Central Bay Street
Downtown Toronto,Richmond
Downtown Toronto,Harbourfront East
Downtown Toronto
Downtown Toronto,Victoria Hotel
Downtown Toronto,Harbord
Downtown Toronto
Downtown Toronto,Bathurst Quay,Harbourfront West,Island airport
Stn A PO Boxes 25 The Esplanade
Downtown Toronto
Christie


Let have a quick look at the resulting dataframe

In [101]:
downtown_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Downtown Toronto,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Downtown Toronto,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Downtown Toronto,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Downtown Toronto,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,Downtown Toronto,43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


We can check to see how many venues were returned for each neighborhood

In [102]:
downtown_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Bay Street,81,81,81,81,81,81
Christie,15,15,15,15,15,15
Downtown Toronto,638,638,638,638,638,638
"Downtown Toronto,Bathurst Quay,Harbourfront West,Island airport",13,13,13,13,13,13
"Downtown Toronto,Garden District",100,100,100,100,100,100
"Downtown Toronto,Harbord",34,34,34,34,34,34
"Downtown Toronto,Harbourfront East",100,100,100,100,100,100
"Downtown Toronto,Richmond",100,100,100,100,100,100
"Downtown Toronto,Victoria Hotel",100,100,100,100,100,100
Stn A PO Boxes 25 The Esplanade,94,94,94,94,94,94


Let's find out how many unique categories can be curated from all the returned venues

In [103]:
print('There are {} uniques categories.'.format(len(downtown_venues['Venue Category'].unique())))

There are 205 uniques categories.


## 4. Analyzing Each Neighborhood

In [104]:
# one hot encoding
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
downtown_onehot['Neighbourhood'] = downtown_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [downtown_onehot.columns[-1]] + list(downtown_onehot.columns[:-1])
downtown_onehot = downtown_onehot[fixed_columns]

downtown_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Lets see the size of the new dataframe

In [105]:
downtown_onehot.shape

(1275, 206)

Next, we want to group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [106]:
downtown_grouped = downtown_onehot.groupby('Neighbourhood').mean().reset_index()
downtown_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012346,...,0.0,0.0,0.0,0.012346,0.0,0.0,0.012346,0.0,0.0,0.012346
1,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Downtown Toronto,0.001567,0.001567,0.0,0.0,0.0,0.0,0.0,0.0,0.015674,...,0.001567,0.001567,0.003135,0.014107,0.001567,0.009404,0.004702,0.001567,0.001567,0.001567
3,"Downtown Toronto,Bathurst Quay,Harbourfront We...",0.0,0.0,0.076923,0.076923,0.076923,0.153846,0.153846,0.153846,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Downtown Toronto,Garden District",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.01,0.0,0.0,0.01,0.01,0.01,0.01,0.0,0.0,0.0
5,"Downtown Toronto,Harbord",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0
6,"Downtown Toronto,Harbourfront East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0
7,"Downtown Toronto,Richmond",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.01,0.0
8,"Downtown Toronto,Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0
9,Stn A PO Boxes 25 The Esplanade,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010638,...,0.0,0.0,0.0,0.010638,0.0,0.0,0.0,0.0,0.0,0.0


Just lets check the new size

In [107]:
downtown_grouped.shape

(10, 206)

Let's borrow the function to print each neighborhood along with the top 5 most common venues

In [108]:
num_top_venues = 5

for hood in downtown_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = downtown_grouped[downtown_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Bay Street----
                venue  freq
0         Coffee Shop  0.16
1  Italian Restaurant  0.05
2                 Bar  0.04
3                Café  0.04
4  Chinese Restaurant  0.04


----Christie----
               venue  freq
0      Grocery Store  0.20
1               Café  0.20
2               Park  0.13
3  Convenience Store  0.07
4          Nightclub  0.07


----Downtown Toronto----
         venue  freq
0  Coffee Shop  0.09
1         Café  0.06
2        Hotel  0.03
3       Bakery  0.03
4   Restaurant  0.03


----Downtown Toronto,Bathurst Quay,Harbourfront West,Island airport----
              venue  freq
0    Airport Lounge  0.15
1   Airport Service  0.15
2  Airport Terminal  0.15
3  Sculpture Garden  0.08
4           Airport  0.08


----Downtown Toronto,Garden District----
                       venue  freq
0                Coffee Shop  0.08
1             Clothing Store  0.07
2                       Café  0.04
3             Cosmetics Shop  0.04
4  Middle Eastern Resta

Let's put that into a pandas dataframe

In [109]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [110]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = downtown_grouped['Neighbourhood']

for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Bubble Tea Shop,Burger Joint,Chinese Restaurant,Bar,Spa,Japanese Restaurant,Thai Restaurant
1,Christie,Grocery Store,Café,Park,Restaurant,Convenience Store,Italian Restaurant,Diner,Nightclub,Coffee Shop,Baby Store
2,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
3,"Downtown Toronto,Bathurst Quay,Harbourfront We...",Airport Lounge,Airport Terminal,Airport Service,Harbor / Marina,Boat or Ferry,Sculpture Garden,Boutique,Airport Gate,Airport,Airport Food Court
4,"Downtown Toronto,Garden District",Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Pizza Place,Theater,Lingerie Store,Japanese Restaurant,Italian Restaurant


## 4. Cluster Neighborhoods
We will use the k-means to cluster the neighborhood into 5 clusters.

In [111]:
# set number of clusters
kclusters = 5

downtown_grouped_clustering = downtown_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 4, 1, 4, 2, 0, 4, 4, 4], dtype=int32)

In the next step we will create a new dataframe that includes the cluster as well as the top 10 venues for each neighbourhood.

In [112]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

downtown_merged = downtown

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
downtown_merged = downtown_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

downtown_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Downtown Toronto,43.679563,-79.377529,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
1,M4X,Downtown Toronto,Downtown Toronto,43.667967,-79.367675,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
2,M4Y,Downtown Toronto,Downtown Toronto,43.66586,-79.38316,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
3,M5A,Downtown Toronto,Downtown Toronto,43.65426,-79.360636,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
4,M5B,Downtown Toronto,"Downtown Toronto,Garden District",43.657162,-79.378937,4,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Pizza Place,Theater,Lingerie Store,Japanese Restaurant,Italian Restaurant


Lets Visualize the data

In [117]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtown_merged['Latitude'], downtown_merged['Longitude'], downtown_merged['Neighbourhood'], downtown_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 6. Examining the Clusters

### Cluster 1 

In [118]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 0, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
7,Downtown Toronto,0,Coffee Shop,Italian Restaurant,Café,Bubble Tea Shop,Burger Joint,Chinese Restaurant,Bar,Spa,Japanese Restaurant,Thai Restaurant
9,Downtown Toronto,0,Coffee Shop,Aquarium,Hotel,Italian Restaurant,Café,Fried Chicken Joint,Scenic Lookout,Brewery,Pizza Place,Bakery


###  Cluster 2 

In [120]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 1, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,Downtown Toronto,1,Airport Lounge,Airport Terminal,Airport Service,Harbor / Marina,Boat or Ferry,Sculpture Garden,Boutique,Airport Gate,Airport,Airport Food Court


###  Cluster 3

In [123]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 2, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Downtown Toronto,2,Café,Restaurant,Bookstore,Japanese Restaurant,Bar,Bakery,Comfort Food Restaurant,Chinese Restaurant,Pub,Poutine Place


###  Cluster 4

In [124]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 3, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Downtown Toronto,3,Grocery Store,Café,Park,Restaurant,Convenience Store,Italian Restaurant,Diner,Nightclub,Coffee Shop,Baby Store


### Cluster 5

In [125]:
downtown_merged.loc[downtown_merged['Cluster Labels'] == 4, downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
1,Downtown Toronto,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
2,Downtown Toronto,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
3,Downtown Toronto,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
4,Downtown Toronto,4,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Pizza Place,Theater,Lingerie Store,Japanese Restaurant,Italian Restaurant
5,Downtown Toronto,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
6,Downtown Toronto,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
8,Downtown Toronto,4,Coffee Shop,Bar,Café,Steakhouse,Thai Restaurant,American Restaurant,Asian Restaurant,Sushi Restaurant,Burger Joint,Gym
10,Downtown Toronto,4,Coffee Shop,Café,Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Gastropub,Park,Breakfast Spot
11,Downtown Toronto,4,Coffee Shop,Hotel,Café,Restaurant,American Restaurant,Gym,Gastropub,Deli / Bodega,Steakhouse,Seafood Restaurant
