# Crawling postal code of Toronto

In [0]:
import pandas as pd
import numpy as np
import os
import re
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe



import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm, trange
from pprint import pprint

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

In [0]:
# crawling
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

In [0]:
list_postcode = list()
list_borough = list()
list_neighbor = list()

# parsing table
table = soup.find('table', class_='sortable')
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        continue
    postcode, borough, neighbor = [td.text.strip() for td in tds[:3]]
    # print(postcode, borough, neighbor)
    list_postcode.append(postcode)
    list_borough.append(borough)
    list_neighbor.append(neighbor)

# make dataframe
df_toronto = pd.DataFrame({'PostalCode': list_postcode,
                           'Borough': list_borough,
                           'Neighbor': list_neighbor})
# delete not assigned postal code
df_toronto = df_toronto.query('Borough != "Not assigned"').reset_index(drop=True)

# fill nan value of Neighbor column to Borough column
df_toronto['Neighbor'] = df_toronto['Neighbor'].replace("Not assigned", np.nan)
df_toronto['Neighbor'].fillna(df_toronto['Borough'], inplace=True)


# merge neighbors that have same PostalCode
df_toronto = df_toronto.groupby(['PostalCode', 'Borough']).apply(lambda x: ', '.join(x['Neighbor'])).reset_index().rename(columns={0: 'Neighbor'})
df_toronto

Unnamed: 0,PostalCode,Borough,Neighbor
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [0]:
df_toronto.shape

(103, 3)

# Get Latitude and Longitude of Toronto by postal code

In [0]:
df_coord = pd.read_csv('Geospatial_Coordinates.csv')
df_coord

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [0]:
df_toronto = pd.merge(df_toronto, df_coord, left_on='PostalCode', right_on='Postal Code').drop(columns='Postal Code')
df_toronto

Unnamed: 0,PostalCode,Borough,Neighbor,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


# Clustering the neighborhoods in Toronto

In [0]:
df_toronto = df_toronto.drop(columns='PostalCode')

In [0]:
# Show Toronto map

latitude = 43.806686
longitude = -79.194353

# create map of Toronto using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)


for row in df_toronto.itertuples(index=False):
    lat = row.Latitude
    lng = row.Longitude
    borough = row.Borough
    neighbor = row.Neighbor

    label = '{}, {}'.format(neighbor, borough)
    label = folium.Popup(label, parse_html=True)


    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False
    ).add_to(map_newyork)  

map_newyork

In [0]:
neighbor_name = df_toronto.iloc[0]['Neighbor'].split(',')[0]
neighbor_lat = df_toronto.iloc[0]['Latitude']
neighbor_lng = df_toronto.iloc[0]['Longitude']
print(neighbor_name, neighbor_lat, neighbor_lng)

Rouge 43.806686299999996 -79.19435340000001


In [0]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [0]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbor_lat, 
    neighbor_lng, 
    radius, 
    LIMIT)
# url # display URL

In [0]:
results = requests.get(url).json()
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.807448,-79.199056
1,Interprovincial Group,Print Shop,43.80563,-79.200378


In [0]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


In [0]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list = list()

    for name, lat, lng in zip(names, latitudes, longitudes):
        if len(name.split(',')) > 1:
            name = name.split(',')[0]

        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=df_toronto['Neighbor'],
                                 latitudes=df_toronto['Latitude'],
                                 longitudes=df_toronto['Longitude'],
                                 )

Rouge
Highland Creek
Guildwood
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park
Clairlea
Cliffcrest
Birch Cliff
Dorset Park
Maryvale
Agincourt
Clarks Corners
Agincourt North
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview
Bayview Village
Silver Hills
Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park
Bathurst Manor
Northwood Park
CFB Toronto
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West
The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park
Deer Park
Rosedale
Cabbagetown
Church and Wellesley
Harbourfront
Ryerson
St. James Town
Berczy Park
Central Bay Street
Adelaide
Harbourfront East
Design Exchange
Commerce Court
Bedford Park
Roselawn
Forest Hill North
The Annex
Harbord
Chinatown
CN Tower
Stn A PO Boxes 25 The Esplanade
First Canadian Place
Lawr

In [0]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rouge,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Rouge,43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,Highland Creek,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,Guildwood,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,Guildwood,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [0]:
toronto_venues.groupby('Neighborhood')['Venue'].count()
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 272 uniques categories.


In [0]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
# print(toronto_onehot.shape)

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped

(100, 272)


Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Auto Workshop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Beach,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Bike Shop,Bistro,Board Shop,Boat or Ferry,Bookstore,Boutique,Brazilian Restaurant,...,Skate Park,Skating Rink,Smoke Shop,Snack Place,Soccer Field,Southern / Soul Food Restaurant,Spa,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Strip Club,Supermarket,Supplement Shop,Sushi Restaurant,Swim School,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.030000,0.000000,0.0,0.0,0.0,0.0,0.0,0.020000,0.000000,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,...,0.0,0.000000,0.01,0.0,0.0,0.0,0.000000,0.01,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.01,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.01
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,...,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00
2,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.333333,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,...,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00
3,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,...,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00
4,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,...,0.0,0.111111,0.00,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Willowdale West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,...,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00
96,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,...,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00
97,Woodbine Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.000000,0.083333,0.0,0.0,0.0,0.0,0.0,0.000000,0.083333,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,...,0.0,0.000000,0.00,0.0,0.0,0.0,0.000000,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.00
98,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.00,0.0,0.090909,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,...,0.0,0.181818,0.00,0.0,0.0,0.0,0.090909,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.00


In [0]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide----
             venue  freq
0      Coffee Shop  0.07
1              Bar  0.04
2       Steakhouse  0.04
3             Café  0.04
4  Thai Restaurant  0.03


----Agincourt----
                       venue  freq
0  Latin American Restaurant  0.33
1             Breakfast Spot  0.33
2                     Lounge  0.33
3                Yoga Studio  0.00
4                Men's Store  0.00


----Agincourt North----
                      venue  freq
0                    Bakery  0.33
1                      Park  0.33
2                Playground  0.33
3  Mediterranean Restaurant  0.00
4         Mobile Phone Shop  0.00


----Albion Gardens----
                  venue  freq
0         Grocery Store  0.18
1          Liquor Store  0.09
2        Sandwich Place  0.09
3  Fast Food Restaurant  0.09
4   Fried Chicken Joint  0.09


----Alderwood----
          venue  freq
0   Pizza Place  0.22
1      Pharmacy  0.11
2           Gym  0.11
3  Skating Rink  0.11
4   Coffee Shop  0.11


----Bathurst M

In [0]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [0]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Steakhouse,Bar,Sushi Restaurant,Clothing Store,Thai Restaurant,Burger Joint,Asian Restaurant,Hotel
1,Agincourt,Latin American Restaurant,Lounge,Breakfast Spot,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
2,Agincourt North,Park,Bakery,Playground,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
3,Albion Gardens,Grocery Store,Fried Chicken Joint,Fast Food Restaurant,Beer Store,Japanese Restaurant,Sandwich Place,Discount Store,Liquor Store,Pizza Place,Pharmacy
4,Alderwood,Pizza Place,Pub,Pharmacy,Sandwich Place,Pool,Skating Rink,Gym,Coffee Shop,Colombian Restaurant,Dance Studio


In [0]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [0]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Steakhouse,Bar,Sushi Restaurant,Clothing Store,Thai Restaurant,Burger Joint,Asian Restaurant,Hotel
1,Agincourt,Latin American Restaurant,Lounge,Breakfast Spot,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Eastern European Restaurant
2,Agincourt North,Park,Bakery,Playground,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
3,Albion Gardens,Grocery Store,Fried Chicken Joint,Fast Food Restaurant,Beer Store,Japanese Restaurant,Sandwich Place,Discount Store,Liquor Store,Pizza Place,Pharmacy
4,Alderwood,Pizza Place,Pub,Pharmacy,Sandwich Place,Pool,Skating Rink,Gym,Coffee Shop,Colombian Restaurant,Dance Studio


In [0]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto.copy()
toronto_merged['Neighbor'] = toronto_merged['Neighbor'].apply(lambda x: x.split(',')[0] if len(x.split(',')) > 1 else x)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbor')
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].apply(lambda x: 5.0 if np.isnan(x) else x)
toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighbor,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,Rouge,43.806686,-79.194353,0.0,Fast Food Restaurant,Print Shop,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Department Store
1,Scarborough,Highland Creek,43.784535,-79.160497,0.0,Bar,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Farmers Market
2,Scarborough,Guildwood,43.763573,-79.188711,0.0,Electronics Store,Pizza Place,Medical Center,Intersection,Rental Car Location,Mexican Restaurant,Breakfast Spot,Spa,Drugstore,Donut Shop
3,Scarborough,Woburn,43.770992,-79.216917,0.0,Coffee Shop,Indian Restaurant,Korean Restaurant,Women's Store,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,Scarborough,Cedarbrae,43.773136,-79.239476,0.0,Fried Chicken Joint,Gas Station,Hakka Restaurant,Bank,Caribbean Restaurant,Athletics & Sports,Thai Restaurant,Bakery,Dessert Shop,Dim Sum Restaurant


In [0]:
toronto_merged['Cluster Labels'].value_counts()

0.0    88
1.0     9
4.0     2
5.0     2
3.0     1
2.0     1
Name: Cluster Labels, dtype: int64

In [0]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters+1)
ys = [i + x + (i*x)**2 for i in range(kclusters+1)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbor'], toronto_merged['Cluster Labels']):
    cluster = int(round(cluster))
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=1).add_to(map_clusters)
       
map_clusters

In [0]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 5.0]

Unnamed: 0,Borough,Neighbor,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,Scarborough,Upper Rouge,43.836125,-79.205636,5.0,,,,,,,,,,
94,Etobicoke,Cloverdale,43.650943,-79.554724,5.0,,,,,,,,,,
