In [1]:
#import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [2]:
#scrap neighbourhood data from Wikipedia
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data = requests.get(url) 

In [3]:
#use beautifulsoup to pull table data and organize into dataframe
soup = BeautifulSoup(data.text, 'html.parser')
tableContent = soup.table.find_all('td')

#initiate main list and sub-list
x = []
y = []

#loop through entire list
for z in range(int(len(tableContent))):
    #append values into sub-list and remove line breaks
    y.append(tableContent[z].text.replace('\n',''))
    #for every third value, append sub-list to main list and reset sub-list
    if z % 3 == 2:
        x.append(y)
        y = []

#parse main list into a dataframe
df = pd.DataFrame(x)

#add column titles, delete borough column
df.columns = ['Postcode','Borough','Neighbourhood']

#check dataframe
print(df.shape)
df.head(10)

(289, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [5]:
#remove rows where Neighbourhood = 'Not assigned'
df = df[df.Neighbourhood != 'Not assigned']

#sort values by postcode for easier observation
df = df.sort_values('Postcode')

#reset index
df.reset_index(inplace=True, drop=True)

print(df.shape)
df.head(10)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Port Union
4,M1C,Scarborough,Rouge Hill
5,M1E,Scarborough,Guildwood
6,M1E,Scarborough,Morningside
7,M1E,Scarborough,West Hill
8,M1G,Scarborough,Woburn
9,M1H,Scarborough,Cedarbrae


In [6]:
#import libraries
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values



In [8]:
def pullLatLong(address):
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    
    if location != None:
        latitude = location.latitude
        longitude = location.longitude
    else:
        latitude = 0
        longitude = 0
    
    print(address+': '+str(latitude)+', '+str(longitude))
    
    time.sleep(0.3)
    
    return {'lat':latitude, 'long':longitude}

In [9]:
df['Latitude'] = 0
df['Longitude'] = 0

for nbhd in df['Neighbourhood']:
    x = pullLatLong(nbhd+' , Toronto, Canada')
    df.loc[df['Neighbourhood'] == nbhd,'Latitude'] = x['lat']
    df.loc[df['Neighbourhood'] == nbhd,'Longitude'] = x['long']

print(df.shape)
df.head(10)

Rouge , Toronto, Canada: 43.8049304, -79.1658374
Malvern , Toronto, Canada: 43.8091955, -79.2217008
Highland Creek , Toronto, Canada: 43.7901172, -79.1733344
Port Union , Toronto, Canada: 43.7755039, -79.1349765
Rouge Hill , Toronto, Canada: 43.7802711, -79.1304992
Guildwood , Toronto, Canada: 43.7548985, -79.1977755
Morningside , Toronto, Canada: 43.7826012, -79.2049579
West Hill , Toronto, Canada: 43.7689144, -79.1872905
Woburn , Toronto, Canada: 43.7598243, -79.2252908
Cedarbrae , Toronto, Canada: 43.75646655, -79.226692442588
Scarborough Village , Toronto, Canada: 43.7437422, -79.2116324
East Birchmount Park , Toronto, Canada: 43.7141672, -79.271109
Ionview , Toronto, Canada: 43.7359904, -79.2765146
Kennedy Park , Toronto, Canada: 43.724878, -79.2539688
Clairlea , Toronto, Canada: 43.7088231, -79.2959856
Golden Mile , Toronto, Canada: 43.7278414, -79.2876217
Oakridge , Toronto, Canada: 43.6971738, -79.2748232
Cliffcrest , Toronto, Canada: 43.7219387, -79.2362324
Cliffside , Toronto

North Park , Toronto, Canada: 43.7186899, -79.4775337
Maple Leaf Park , Toronto, Canada: 43.71584525, -79.4934073307555
Upwood Park , Toronto, Canada: 43.7089606, -79.5008945986058
Silverthorn , Toronto, Canada: 43.682123, -79.4715724
Del Ray , Toronto, Canada: 0, 0
Keelesdale , Toronto, Canada: 43.690157, -79.4750564
Mount Dennis , Toronto, Canada: 43.6869597, -79.4895513
The Junction North , Toronto, Canada: 43.6654775, -79.470352
Runnymede , Toronto, Canada: 43.6655802, -79.4821078
The Junction South , Toronto, Canada: 43.6654775, -79.470352
High Park , Toronto, Canada: 43.64628135, -79.4638213811953
Roncesvalles , Toronto, Canada: 43.6514426, -79.4510381
Parkdale , Toronto, Canada: 43.6371777, -79.4360045
Runnymede , Toronto, Canada: 43.6655802, -79.4821078
Swansea , Toronto, Canada: 43.64494, -79.478313
Canada Post Gateway Processing Centre , Toronto, Canada: 0, 0
Business Reply Mail Processing Centre 969 Eastern , Toronto, Canada: 0, 0
Humber Bay Shores , Toronto, Canada: 0, 0
Ne

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.80493,-79.165837
1,M1B,Scarborough,Malvern,43.809196,-79.221701
2,M1C,Scarborough,Highland Creek,43.790117,-79.173334
3,M1C,Scarborough,Port Union,43.775504,-79.134976
4,M1C,Scarborough,Rouge Hill,43.780271,-79.130499
5,M1E,Scarborough,Guildwood,43.754899,-79.197776
6,M1E,Scarborough,Morningside,43.782601,-79.204958
7,M1E,Scarborough,West Hill,43.768914,-79.187291
8,M1G,Scarborough,Woburn,43.759824,-79.225291
9,M1H,Scarborough,Cedarbrae,43.756467,-79.226692


In [10]:
# delete neighbourhoods that cannot be geocoded
df = df[df.Latitude !=0]

df.reset_index(inplace=True, drop=True)

print(df.shape)
df.head(10)

(198, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,Rouge,43.80493,-79.165837
1,M1B,Scarborough,Malvern,43.809196,-79.221701
2,M1C,Scarborough,Highland Creek,43.790117,-79.173334
3,M1C,Scarborough,Port Union,43.775504,-79.134976
4,M1C,Scarborough,Rouge Hill,43.780271,-79.130499
5,M1E,Scarborough,Guildwood,43.754899,-79.197776
6,M1E,Scarborough,Morningside,43.782601,-79.204958
7,M1E,Scarborough,West Hill,43.768914,-79.187291
8,M1G,Scarborough,Woburn,43.759824,-79.225291
9,M1H,Scarborough,Cedarbrae,43.756467,-79.226692


In [11]:
#save foursquare credentials
CLIENT_ID = '3T1DC2OMPAW5IB2JWHWOTJXS4AHD11GWHAHLFO1K4CRUDLCH' # your Foursquare ID
CLIENT_SECRET = 'W54KY4ZVRZSHEGQEIPJTFZOI5QH3FHDKA5BDRRFWZQTJ4DJH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: 3T1DC2OMPAW5IB2JWHWOTJXS4AHD11GWHAHLFO1K4CRUDLCH
CLIENT_SECRET:W54KY4ZVRZSHEGQEIPJTFZOI5QH3FHDKA5BDRRFWZQTJ4DJH


In [12]:
#define wanted columns
vencols = ['Neighbourhood','Latitude','Longitude','Venue_Name','Venue_Latitude',
               'Venue_Longitude','Venue_Category_1']

#define function to pull venues within the vicinity of each neighbourhood
def getNearbyVenues(nbhd, lat, lng):
    #initialize dataframe
    vendf = pd.DataFrame(columns=vencols)

    radius = 500
    LIMIT = 100
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET, 
                VERSION, 
                lat, 
                lng, 
                radius, 
                LIMIT)
    results = requests.get(url).json()
    
    for item in results['response']['groups'][0]['items']:
        a = item['venue']['name']
        b = item['venue']['location']['lat']
        c = item['venue']['location']['lng']
        d = item['venue']['categories'][0]['name']
        rowdata = {'Neighbourhood':nbhd,'Latitude':lat,'Longitude':lng,
                   'Venue_Name': a,'Venue_Latitude': b,'Venue_Longitude': c,'Venue_Category_1': d}
        vendf = vendf.append(rowdata, ignore_index=True)

    return vendf

In [13]:
#initialize empty dataframe for venue data
df1 = pd.DataFrame(columns=vencols)

#pull venues around each neighbourhood
for nbhd in df['Neighbourhood']:
    print(nbhd)
    tempdf = getNearbyVenues(nbhd, df.loc[df['Neighbourhood'] == nbhd,'Latitude'].values[0],
                                df.loc[df['Neighbourhood'] == nbhd,'Longitude'].values[0])
    df1 = pd.concat([df1,tempdf])

df1.reset_index(inplace=True, drop=True)

df1

Rouge
Malvern
Highland Creek
Port Union
Rouge Hill
Guildwood
Morningside
West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park
Ionview
Kennedy Park
Clairlea
Golden Mile
Oakridge
Cliffcrest
Cliffside
Scarborough Village West
Cliffside West
Birch Cliff
Dorset Park
Scarborough Town Centre
Wexford Heights
Wexford
Maryvale
Agincourt
Tam O'Shanter
Sullivan
Clarks Corners
Steeles East
Milliken
L'Amoreaux East
Agincourt North
L'Amoreaux West
Steeles West
Upper Rouge
Hillcrest Village
Henry Farm
Fairview
Oriole
Bayview Village
York Mills
Silver Hills
Newtonbrook
Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South
Flemingdon Park
Wilson Heights
Bathurst Manor
Downsview North
Northwood Park
York University
Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens
Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West
Riverdale
The Beaches West
Ind

Unnamed: 0,Neighbourhood,Latitude,Longitude,Venue_Name,Venue_Latitude,Venue_Longitude,Venue_Category_1
0,Rouge,43.804930,-79.165837,Dean Park,43.804364,-79.169159,Park
1,Rouge,43.804930,-79.165837,Paul's Breakfast & Burgers,43.803835,-79.169825,Fast Food Restaurant
2,Malvern,43.809196,-79.221701,Shoppers Drug Mart,43.809202,-79.223320,Pharmacy
3,Malvern,43.809196,-79.221701,Subway,43.806805,-79.222515,Sandwich Place
4,Malvern,43.809196,-79.221701,Pizza Hut,43.808326,-79.220616,Pizza Place
5,Malvern,43.809196,-79.221701,Pizza Pizza,43.806613,-79.221243,Pizza Place
6,Malvern,43.809196,-79.221701,Francois' No Frills,43.808518,-79.223399,Grocery Store
7,Malvern,43.809196,-79.221701,Shoppers Drug Mart,43.806489,-79.223024,Pharmacy
8,Malvern,43.809196,-79.221701,McDonald's,43.806375,-79.221588,Fast Food Restaurant
9,Malvern,43.809196,-79.221701,Circle K,43.808097,-79.220449,Convenience Store


In [14]:
#create dictionary including each unique venue category
venuecat2 = {el:'dummy' for el in df1['Venue_Category_1'].unique()}
print(venuecat2)

{'Park': 'dummy', 'Fast Food Restaurant': 'dummy', 'Pharmacy': 'dummy', 'Sandwich Place': 'dummy', 'Pizza Place': 'dummy', 'Grocery Store': 'dummy', 'Convenience Store': 'dummy', 'Bubble Tea Shop': 'dummy', 'Gym / Fitness Center': 'dummy', 'Skating Rink': 'dummy', 'Neighborhood': 'dummy', 'Train Station': 'dummy', 'Storage Facility': 'dummy', 'Baseball Field': 'dummy', 'Coffee Shop': 'dummy', 'Beer Store': 'dummy', 'Supermarket': 'dummy', 'Discount Store': 'dummy', 'Electronics Store': 'dummy', 'Fried Chicken Joint': 'dummy', 'Food & Drink Shop': 'dummy', 'Liquor Store': 'dummy', 'Smoothie Shop': 'dummy', 'Breakfast Spot': 'dummy', 'Sports Bar': 'dummy', 'Bank': 'dummy', 'Burger Joint': 'dummy', 'Greek Restaurant': 'dummy', 'Thrift / Vintage Store': 'dummy', 'Clothing Store': 'dummy', 'Video Game Store': 'dummy', 'Mexican Restaurant': 'dummy', 'Spa': 'dummy', 'Medical Center': 'dummy', 'Gym': 'dummy', 'Vietnamese Restaurant': 'dummy', 'Paper / Office Supplies Store': 'dummy', 'Toy / Ga

In [16]:
#manually categorize unique venue categories into higher level (level 2) categories, convert into dataframe
venuecat22 = {'Park': 'Sports', 'Fast Food Restaurant': 'Dining', 'Chinese Restaurant': 'Dining', 'Pharmacy': 'Healthcare', 'Sandwich Place': 'Dining', 'Pizza Place': 'Dining', 'Grocery Store': 'Shopping', 'Convenience Store': 'Shopping', 'Bubble Tea Shop': 'Dining', 'Gym / Fitness Center': 'Sports', 'Skating Rink': 'Sports', 'Neighborhood': 'Entertainment', 'Train Station': 'Transport', 'Storage Facility': 'Business', 'Baseball Field': 'Sports', 'Coffee Shop': 'Dining', 'Beer Store': 'Nightlife', 'Supermarket': 'Shopping', 'Discount Store': 'Shopping', 'Electronics Store': 'Shopping', 'Fried Chicken Joint': 'Dining', 'Food & Drink Shop': 'Dining', 'Liquor Store': 'Nightlife', 'Smoothie Shop': 'Dining', 'Breakfast Spot': 'Dining', 'Sports Bar': 'Nightlife', 'Bank': 'Household', 'Greek Restaurant': 'Dining', 'Burger Joint': 'Dining', 'Salon / Barbershop': 'Household', 'Shopping Mall': 'Shopping', 'Video Game Store': 'Shopping', 'Mexican Restaurant': 'Dining', 'Spa': 'Entertainment', 'Medical Center': 'Healthcare', 'Gym': 'Sports', 'Vietnamese Restaurant': 'Dining', 'Paper / Office Supplies Store': 'Shopping', 'Toy / Game Store': 'Shopping', 'Big Box Store': 'Shopping', 'Furniture / Home Store': 'Shopping', 'Optical Shop': 'Shopping', 'Clothing Store': 'Shopping', 'Bar': 'Nightlife', 'Pub': 'Nightlife', 'Asian Restaurant': 'Dining', 'Deli / Bodega': 'Dining', 'Business Service': 'Business', 'Metro Station': 'Transport', 'Gas Station': 'Transport', 'Intersection': 'Transport', 'Diner': 'Dining', 'Burrito Place': 'Dining', 'Steakhouse': 'Dining', 'Cosmetics Shop': 'Shopping', 'Warehouse Store': 'Shopping', 'Health Food Store': 'Shopping', 'Japanese Restaurant': 'Dining', 'Arts & Crafts Store': 'Shopping', 'American Restaurant': 'Dining', 'Laser Tag': 'Entertainment', 'Sporting Goods Shop': 'Shopping', 'Department Store': 'Shopping', 'Shopping Plaza': 'Shopping', 'Bus Station': 'Transport', 'Bridal Shop': 'Shopping', 'Frozen Yogurt Shop': 'Dining', 'Automotive Shop': 'Shopping', 'Shoe Store': 'Shopping', 'IT Services': 'Business', 'Hardware Store': 'Shopping', 'Pet Store': 'Shopping', 'Leather Goods Store': 'Shopping', 'Supplement Shop': 'Healthcare', 'Kids Store': 'Shopping', "Men's Store": 'Shopping', 'Accessories Store': 'Shopping', 'Restaurant': 'Dining', 'Dessert Shop': 'Dining', 'Bus Stop': 'Transport', 'Wings Joint': 'Dining', 'Gym Pool': 'Sports', 'Indian Restaurant': 'Dining', 'Latin American Restaurant': 'Dining', 'Bakery': 'Dining', 'Gaming Cafe': 'Entertainment', 'Tea Room': 'Dining', 'Movie Theater': 'Entertainment', 'Italian Restaurant': 'Shopping', 'Food Court': 'Dining', 'Plaza': 'Shopping', "Women's Store": 'Shopping', 'Middle Eastern Restaurant': 'Dining', 'Mediterranean Restaurant': 'Dining', 'Hakka Restaurant': 'Dining', 'Sushi Restaurant': 'Dining', 'Video Store': 'Shopping', 'Bus Line': 'Transport', 'Korean Restaurant': 'Dining', 'Fish Market': 'Shopping', 'Seafood Restaurant': 'Dining', 'Smoke Shop': 'Shopping', 'Hong Kong Restaurant': 'Dining', 'Cantonese Restaurant': 'Dining', 'Thai Restaurant': 'Dining', 'Market': 'Shopping', 'Falafel Restaurant': 'Dining', 'Gift Shop': 'Shopping', 'Caribbean Restaurant': 'Dining', 'Playground': 'Entertainment', 'Health & Beauty Service': 'Shopping', 'Noodle House': 'Dining', 'Dumpling Restaurant': 'Dining', 'Snack Place': 'Dining', 'Miscellaneous Shop': 'Shopping', 'Taiwanese Restaurant': 'Dining', 'BBQ Joint': 'Dining', 'Dongbei Restaurant': 'Dining', 'Athletics & Sports': 'Shopping', 'Juice Bar': 'Dining', 'Ice Cream Shop': 'Dining', 'Taco Place': 'Dining', 'Recreation Center': 'Entertainment', 'Pool': 'Sports', 'Art Museum': 'Entertainment', 'Ramen Restaurant': 'Dining', 'Café': 'Dining', 'Concert Hall': 'Entertainment', 'University': 'Education', 'Poke Place': 'Dining', 'Art Gallery': 'Shopping', 'Tapas Restaurant': 'Dining', 'Opera House': 'Entertainment', 'Jazz Club': 'Entertainment', 'Gastropub': 'Nightlife', 'Vegetarian / Vegan Restaurant': 'Dining', 'Monument / Landmark': 'Entertainment', 'Record Shop': 'Shopping', 'Exhibit': 'Entertainment', 'Office': 'Business', 'Hotel': 'Entertainment', 'Donut Shop': 'Dining', 'French Restaurant': 'Dining', 'Hot Dog Joint': 'Dining', 'Tennis Court': 'Sports', 'Lawyer': 'Business', 'Outdoor Supply Store': 'Shopping', 'Wine Shop': 'Nightlife', 'Auto Dealership': 'Shopping', 'Bookstore': 'Shopping', 'Creperie': 'Dining', 'Udon Restaurant': 'Dining', 'Home Service': 'Entertainment', 'Karaoke Bar': 'Nightlife', 'Laundry Service': 'Household', 'Chocolate Shop': 'Shopping', 'Gourmet Shop': 'Shopping', 'Jewelry Store': 'Shopping', 'Hockey Arena': 'Sports', 'Pakistani Restaurant': 'Dining', 'Science Museum': 'Entertainment', 'Dance Studio': 'Sports', 'Nightclub': 'Nightlife', 'Bagel Shop': 'Dining', 'Construction & Landscaping': 'Business', 'Salad Place': 'Dining', 'Cafeteria': 'Dining', 'Beach': 'Sports', 'Nail Salon': 'Household', 'Tree': 'Entertainment', 'Martial Arts Dojo': 'Sports', 'Cheese Shop': 'Shopping', 'Afghan Restaurant': 'Dining', 'Turkish Restaurant': 'Dining', 'Yoga Studio': 'Sports', 'Music Venue': 'Entertainment', 'Sculpture Garden': 'Entertainment', 'Historic Site': 'Dining', 'Boat or Ferry': 'Transport', 'Fish & Chips Shop': 'Dining', 'Dim Sum Restaurant': 'Dining', 'Light Rail Station': 'Transport', 'Trail': 'Sports', 'Egyptian Restaurant': 'Dining', 'Brewery': 'Nightlife', 'Indian Chinese Restaurant': 'Dining', 'Indie Theater': 'Entertainment', 'Bistro': 'Dining', 'Scenic Lookout': 'Entertainment', 'Theater': 'Entertainment', 'Wine Bar': 'Nightlife', 'Aquarium': 'Entertainment', 'Baseball Stadium': 'Sports', 'Comedy Club': 'Entertainment', 'Brazilian Restaurant': 'Dining', 'Speakeasy': 'Nightlife', 'School': 'Education', 'Beer Bar': 'Nightlife', 'Soup Place': 'Dining', 'History Museum': 'Entertainment', 'Poutine Place': 'Dining', 'Cocktail Bar': 'Nightlife', 'Mobile Phone Shop': 'Shopping', 'Hobby Shop': 'Shopping', 'Lingerie Store': 'Shopping', 'Camera Store': 'Shopping', 'Massage Studio': 'Healthcare', 'Farmers Market': 'Shopping', 'Indoor Play Area': 'Entertainment', 'New American Restaurant': 'Dining', 'Cuban Restaurant': 'Dining', 'Boutique': 'Shopping', 'Mac & Cheese Joint': 'Dining', 'Southern / Soul Food Restaurant': 'Dining', 'Event Space': 'Entertainment', 'Dog Run': 'Entertainment', 'Butcher': 'Shopping', 'Mattress Store': 'Shopping', 'Castle': 'Entertainment', 'Museum': 'Entertainment', 'Jewish Restaurant': 'Dining', 'Chiropractor': 'Healthcare', 'Modern European Restaurant': 'Dining', 'German Restaurant': 'Dining', 'Bike Trail': 'Sports', 'General Entertainment': 'Entertainment', 'Dive Bar': 'Nightlife', 'Outdoor Sculpture': 'Entertainment', 'Filipino Restaurant': 'Dining', 'Library': 'Entertainment', 'Bike Rental / Bike Share': 'Sports', 'Theme Restaurant': 'Dining', 'Ethiopian Restaurant': 'Dining', 'Gay Bar': 'Nightlife', 'Sake Bar': 'Nightlife', 'Portuguese Restaurant': 'Dining', 'Strip Club': 'Nightlife', 'Performing Arts Venue': 'Entertainment', 'Animal Shelter': 'Entertainment', 'Food Truck': 'Dining', 'Lake': 'Sports', 'Basketball Stadium': 'Sports', 'Lounge': 'Nightlife', 'Hotel Bar': 'Nightlife', 'Convention Center': 'Entertainment', 'Whisky Bar': 'Nightlife', 'Comic Shop': 'Shopping', 'College Rec Center': 'Education', 'Baby Store': 'Shopping', 'Church': 'Entertainment', 'Tailor Shop': 'Shopping', 'Fountain': 'Entertainment', 'Belgian Restaurant': 'Dining', 'Molecular Gastronomy Restaurant': 'Dining', 'Comfort Food Restaurant': 'Dining', 'Irish Pub': 'Nightlife', 'Hostel': 'Business', 'Antique Shop': 'Shopping', 'Spanish Restaurant': 'Dining', 'Sri Lankan Restaurant': 'Dining', 'Souvlaki Shop': 'Shopping', 'Street Art': 'Entertainment', 'Other Nightlife': 'Nightlife', 'General Travel': 'Transport', 'Gluten-free Restaurant': 'Dining', 'Beer Garden': 'Nightlife', 'Farm': 'Entertainment', 'Cupcake Shop': 'Dining', 'Building': 'Business', 'Pie Shop': 'Dining', 'Rental Car Location': 'Transport', 'Design Studio': 'Entertainment', 'Pastry Shop': 'Dining', 'College Gym': 'Sports', 'College Arts Building': 'Entertainment', 'Indie Movie Theater': 'Entertainment', 'Eastern European Restaurant': 'Dining', 'Pool Hall': 'Entertainment', 'Organic Grocery': 'Shopping', 'Hotpot Restaurant': 'Dining', 'Thrift / Vintage Store': 'Shopping', 'Hawaiian Restaurant': 'Dining', 'Costume Shop': 'Shopping', 'Doner Restaurant': 'Dining', 'Persian Restaurant': 'Dining', 'Hospital': 'Healthcare', 'African Restaurant': 'Dining', 'Peruvian Restaurant': 'Dining', 'Bowling Alley': 'Sports', 'Garden': 'Sports', 'Harbor / Marina': 'Transport', 'Tunnel': 'Transport', 'Airport Service': 'Transport', 'Kitchen Supply Store': 'Shopping', 'Platform': 'Transport', 'Rock Climbing Spot': 'Sports', 'Scandinavian Restaurant': 'Dining', 'Flower Shop': 'Shopping', 'Food': 'Dining', 'Tanning Salon': 'Entertainment', 'Tibetan Restaurant': 'Dining', 'North Indian Restaurant': 'Dining', 'Soccer Stadium': 'Sports', 'Racetrack': 'Sports', 'Theme Park': 'Entertainment', 'Flea Market': 'Shopping', 'Theme Park Ride / Attraction': 'Entertainment', 'South American Restaurant': 'Dining', 'Check Cashing Service': 'Business', 'Amphitheater': 'Entertainment', 'Other Great Outdoors': 'Entertainment', 'Zoo': 'Entertainment', 'Auto Garage': 'Shopping', 'Social Club': 'Entertainment', 'River': 'Entertainment', 'Vineyard': 'Entertainment', 'Shipping Store': 'Shopping', 'Golf Course': 'Sports', 'Piano Bar': 'Nightlife', 'Empanada Restaurant': 'Dining', 'Soccer Field': 'Sports', 'Laundromat': 'Household'}
venuecat22 = {'Venue_Category_1':list(venuecat22.keys()),'Venue_Category_2':list(venuecat22.values())}
venuecat22 = pd.DataFrame(venuecat22)

print(venuecat22['Venue_Category_2'].unique())
venuecat22.head(10)

['Sports' 'Dining' 'Healthcare' 'Shopping' 'Entertainment' 'Transport'
 'Business' 'Nightlife' 'Household' 'Education']


Unnamed: 0,Venue_Category_1,Venue_Category_2
0,Park,Sports
1,Fast Food Restaurant,Dining
2,Chinese Restaurant,Dining
3,Pharmacy,Healthcare
4,Sandwich Place,Dining
5,Pizza Place,Dining
6,Grocery Store,Shopping
7,Convenience Store,Shopping
8,Bubble Tea Shop,Dining
9,Gym / Fitness Center,Sports


In [17]:
#update original venues dataframe with level 2 categories
df2 = pd.merge(df1, venuecat22, on='Venue_Category_1')
df2 = df2.sort_values('Neighbourhood')
df2 = df2.reset_index(drop=True)

print(df2.shape)
df2.head(10)

(5714, 8)


Unnamed: 0,Neighbourhood,Latitude,Longitude,Venue_Name,Venue_Latitude,Venue_Longitude,Venue_Category_1,Venue_Category_2
0,Adelaide,43.650528,-79.379515,Leña,43.651751,-79.379021,Latin American Restaurant,Dining
1,Adelaide,43.650528,-79.379515,Cactus Club Cafe,43.649552,-79.381671,American Restaurant,Dining
2,Adelaide,43.650528,-79.379515,The Gabardine,43.650988,-79.381225,American Restaurant,Dining
3,Adelaide,43.650528,-79.379515,Assembly Chef's Hall,43.650579,-79.383412,Food Court,Dining
4,Adelaide,43.650528,-79.379515,The Omni King Edward Hotel,43.649191,-79.376006,Hotel,Entertainment
5,Adelaide,43.650528,-79.379515,The Adelaide Hotel Toronto,43.649831,-79.380164,Hotel,Entertainment
6,Adelaide,43.650528,-79.379515,One King West Hotel & Residence,43.648947,-79.377966,Hotel,Entertainment
7,Adelaide,43.650528,-79.379515,Cosmopolitan Toronto Centre Hotel & Spa,43.649064,-79.377598,Hotel,Entertainment
8,Adelaide,43.650528,-79.379515,Cambridge Suites Toronto,43.651836,-79.378107,Hotel,Entertainment
9,Adelaide,43.650528,-79.379515,Richmond Station,43.651569,-79.379266,American Restaurant,Dining


In [33]:
#summary of each venue category
df2.pivot_table(index=['Venue_Category_2'], aggfunc='size', fill_value=0)

Venue_Category_2
Business           39
Dining           3132
Education          10
Entertainment     432
Healthcare         74
Household          76
Nightlife         466
Shopping         1027
Sports            355
Transport         103
dtype: int64

In [39]:
#count of venue categories by neighbourhood
df3 = df2.pivot_table(index=['Neighbourhood'], columns='Venue_Category_2', aggfunc='size', fill_value=0)

print(df3.shape)
df3.head(10)

(195, 10)


Venue_Category_2,Business,Dining,Education,Entertainment,Healthcare,Household,Nightlife,Shopping,Sports,Transport
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Adelaide,1,54,0,11,0,1,8,20,4,1
Agincourt,0,9,0,0,0,0,0,1,0,2
Agincourt North,0,18,0,2,1,1,2,7,0,0
Albion Gardens,0,1,0,3,0,0,0,1,1,0
Alderwood,0,4,0,0,1,0,1,0,3,0
Bathurst Manor,0,0,0,1,0,0,0,1,2,0
Bathurst Quay,0,12,0,1,0,0,1,1,5,3
Bayview Village,0,5,0,0,0,1,0,6,0,1
Bedford Park,1,0,0,0,0,0,0,0,0,0
Berczy Park,1,54,0,10,0,0,11,18,6,0


In [79]:
#create copy of df3, initiate df4 to rank top 10 locations for each category
df33 = df3
df4 = pd.DataFrame(columns=df33.columns)

for column in df33.columns:
    df33 = df33.sort_values(column, ascending=False)
    df4[column]=df33.index[0:10]

df4.index.rename('Rank',inplace=True)
df4.index += 1
df4

Venue_Category_2,Business,Dining,Education,Entertainment,Healthcare,Household,Nightlife,Shopping,Sports,Transport
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Maple Leaf Park,Kensington Market,Jamestown,CN Tower,Deer Park,The Beaches West,Toronto Dominion Centre,Lawrence Heights,CN Tower,Runnymede
2,Adelaide,First Canadian Place,King and Spadina,Studio District,New Toronto,The Beaches,King,Yorkville,Harbourfront West,The Danforth West
3,First Canadian Place,Chinatown,Fairview,Princess Gardens,Malvern,Parkwoods,Commerce Court,Golden Mile,Harbourfront,Bathurst Quay
4,Flemingdon Park,Design Exchange,Mount Olive,Harbourfront West,Runnymede,The Junction North,First Canadian Place,Garden District,Harbourfront East,Wexford
5,Golden Mile,Toronto Dominion Centre,Princess Gardens,Harbourfront East,Lawrence Park,Runnymede,Little Portugal,Trinity,King and Spadina,Guildwood
6,Grange Park,Commerce Court,Martin Grove,Harbourfront,St. James Town,Lawrence Park,Design Exchange,Grange Park,Richmond,Riverdale
7,Guildwood,Willowdale,Richmond,Commerce Court,Parkdale Village,St. James Town,Church and Wellesley,Scarborough Town Centre,Studio District,Maryvale
8,Harbourfront,Willowdale South,Garden District,King,Parkdale,The Junction South,North Toronto West,Adelaide,The Beaches,Parkdale
9,Harbourfront East,Willowdale West,Grange Park,Trinity,Maryvale,First Canadian Place,Northwest,South Niagara,The Beaches West,Harbourfront West
10,Harbourfront West,King,Studio District,Toronto Dominion Centre,West Hill,Martin Grove,Studio District,Richmond,Design Exchange,Thorncliffe Park


In [93]:
#cluster locations to compare similar clusters

#copy df, normalize data
df5 = df3
df5_norm = (df5 - df5.mean()) / (df5.max() - df5.min())

df5_norm.head(10)

Venue_Category_2,Business,Dining,Education,Entertainment,Healthcare,Household,Nightlife,Shopping,Sports,Transport,Cluster
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Adelaide,0.4,0.541978,-0.051282,0.366026,-0.189744,0.203419,0.400733,0.288889,0.217949,0.117949,-0.373077
Agincourt,-0.1,-0.100879,-0.051282,-0.092308,-0.189744,-0.129915,-0.170696,-0.08366,-0.182051,0.367949,-0.123077
Agincourt North,-0.1,0.027692,-0.051282,-0.008974,0.310256,0.203419,-0.027839,0.033987,-0.182051,-0.132051,0.376923
Albion Gardens,-0.1,-0.215165,-0.051282,0.032692,-0.189744,-0.129915,-0.170696,-0.08366,-0.082051,-0.132051,-0.123077
Alderwood,-0.1,-0.172308,-0.051282,-0.092308,0.310256,-0.129915,-0.099267,-0.103268,0.117949,-0.132051,-0.123077
Bathurst Manor,-0.1,-0.229451,-0.051282,-0.050641,-0.189744,-0.129915,-0.170696,-0.08366,0.017949,-0.132051,-0.123077
Bathurst Quay,-0.1,-0.058022,-0.051282,-0.050641,-0.189744,-0.129915,-0.099267,-0.08366,0.317949,0.617949,0.376923
Bayview Village,-0.1,-0.158022,-0.051282,-0.092308,-0.189744,0.203419,-0.170696,0.014379,-0.182051,0.117949,-0.123077
Bedford Park,0.4,-0.229451,-0.051282,-0.092308,-0.189744,-0.129915,-0.170696,-0.103268,-0.182051,-0.132051,-0.123077
Berczy Park,0.4,0.541978,-0.051282,0.324359,-0.189744,-0.129915,0.615018,0.249673,0.417949,-0.132051,-0.373077


In [97]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df5_norm)

# append clustering data to dataframe
df5_clustered = df5
df5_clustered['Cluster'] = kmeans.labels_

print(df5_clustered.shape)
df5_clustered.head(10)

(195, 11)


Venue_Category_2,Business,Dining,Education,Entertainment,Healthcare,Household,Nightlife,Shopping,Sports,Transport,Cluster
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Adelaide,1,54,0,11,0,1,8,20,4,1,3
Agincourt,0,9,0,0,0,0,0,1,0,2,1
Agincourt North,0,18,0,2,1,1,2,7,0,0,2
Albion Gardens,0,1,0,3,0,0,0,1,1,0,1
Alderwood,0,4,0,0,1,0,1,0,3,0,1
Bathurst Manor,0,0,0,1,0,0,0,1,2,0,1
Bathurst Quay,0,12,0,1,0,0,1,1,5,3,2
Bayview Village,0,5,0,0,0,1,0,6,0,1,1
Bedford Park,1,0,0,0,0,0,0,0,0,0,1
Berczy Park,1,54,0,10,0,0,11,18,6,0,3


In [113]:
import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors
import folium # map rendering library

In [111]:
#add lat long to clustered df
df6 = df[['Neighbourhood','Latitude','Longitude']]
df6.set_index('Neighbourhood', inplace=True)
df6 = pd.merge(df6, df5_clustered, on='Neighbourhood')

print(df6.shape)
df6.head(10)

(197, 13)


Unnamed: 0_level_0,Latitude,Longitude,Business,Dining,Education,Entertainment,Healthcare,Household,Nightlife,Shopping,Sports,Transport,Cluster
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Rouge,43.80493,-79.165837,0,1,0,0,0,0,0,0,1,0,1
Malvern,43.809196,-79.221701,0,5,0,0,2,0,0,2,3,0,0
Highland Creek,43.790117,-79.173334,0,0,0,1,0,0,0,0,1,0,1
Port Union,43.775504,-79.134976,0,0,0,0,0,0,0,0,2,0,1
Rouge Hill,43.780271,-79.130499,0,0,0,0,0,0,0,0,0,1,1
Guildwood,43.754899,-79.197776,1,0,0,0,0,0,0,0,1,2,1
Morningside,43.782601,-79.204958,0,5,0,0,1,0,1,4,2,0,1
West Hill,43.768914,-79.187291,0,18,0,1,2,1,3,8,1,0,0
Woburn,43.759824,-79.225291,0,8,0,0,1,1,1,6,1,0,0
Cedarbrae,43.756467,-79.226692,0,6,0,0,0,0,2,12,2,1,1


In [118]:
# create map
torontoLatLong = pullLatLong('Toronto, Canada')
map_clusters = folium.Map(location=[torontoLatLong['lat'], torontoLatLong['long']], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df6['Latitude'], df6['Longitude'], df6.index, df6['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Toronto, Canada: 43.653963, -79.387207


In [116]:
df5_clustered.groupby('Cluster').mean()

Venue_Category_2,Business,Dining,Education,Entertainment,Healthcare,Household,Nightlife,Shopping,Sports,Transport
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.038462,21.807692,0.0,0.769231,1.461538,1.115385,1.730769,5.653846,2.038462,1.0
1,0.15,2.56,0.0,0.52,0.12,0.07,0.31,1.19,1.17,0.41
2,0.136364,23.068182,0.0,1.931818,0.5,0.568182,3.545455,9.954545,1.409091,0.454545
3,0.8,56.066667,0.0,10.666667,0.066667,0.533333,11.0,11.266667,5.533333,1.0
4,0.5,45.3,1.0,11.5,0.1,0.7,6.9,15.4,4.0,0.1
