## Importing Libraries

In [216]:
import folium
import geopy
import geocoder
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import requests
pd.set_option('display.max_rows',100)
pd.set_option('display.max_columns',30)

In [217]:
df = pd.read_csv('Toronto-nb2.csv')
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1752
3,M1G,Scarborough,Woburn,43.7682,-79.21761
4,M1H,Scarborough,Cedarbrae,43.76969,-79.23944


## Assigning Values

In [218]:
df_tor =df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
df_tor.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.67703,-79.29542
1,M4K,East Toronto,"The Danforth West, Riverdale",43.68375,-79.35528
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.66797,-79.31468
3,M4M,East Toronto,Studio District,43.66091,-79.33503
4,M4N,Central Toronto,Lawrence Park,43.72898,-79.39173


In [219]:
latitude = 43.6532
longitude = 79.3832
toronto_m = folium.Map(location = [longitude,latitude], zoom_start=2)
for lat,lng,bor,nb in zip(df['Latitude'],df['Longitude'],df['Borough'],df['Neighborhood']):
    label = '{}, {}'.format(nb, bor)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=4,
        popup = label,
        color='red',
        fill=True,
        fill_opacity=0.3,
        parse_html=False).add_to(toronto_m)
toronto_m    

## Using Foursquare API to compare similarities in Venue

In [220]:
client_id = 'ITKVISZUN50K5OIU5THJ0A330ABDVLFMOSX4NUB41ESFVYSA'
client_secret = 'MACPFVUEFU3BEHD4YZREQUBDEBFX0JFOPQ0YLIEBRHWQ0ELC'
ver = '20180604'
lim = 30

In [221]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            client_id, client_secret, ver, lat, lng, radius, lim)
        
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues_list.append([( name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

In [222]:
df_cnb = getNearbyVenues(df['Neighborhood'],df['Latitude'],df['Longitude'])
df_cnb.head()

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.81153,-79.19552,Wood Bison Paddock,43.811732,-79.200708,Zoo Exhibit
1,"Malvern, Rouge",43.81153,-79.19552,Canadian Appliance Source Whitby,43.808353,-79.191331,Home Service
2,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
3,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871,Royal Canadian Legion,43.782533,-79.163085,Bar
4,"Guildwood, Morningside, West Hill",43.76575,-79.1752,Homestead Roofing Repair,43.76514,-79.178663,Construction & Landscaping


## One-Hot Encoding 

In [223]:
df_oh = pd.get_dummies(df_cnb[['Venue Category']],prefix="", prefix_sep="")
df_oh['Neighborhood'] = df_cnb['Neighborhood']
fixed_columns = [df_oh.columns[-1]] + list(df_oh.columns[:-1])
df_oh = df_oh[fixed_columns]
df_oh.head()

Unnamed: 0,Zoo Exhibit,Airport,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,BBQ Joint,Baby Store,Badminton Court,Bakery,...,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [224]:
df_oh.shape

(1381, 227)

In [225]:
df_gp = df_oh.groupby('Neighborhood').mean().reset_index()
df_gp.head(10)

Unnamed: 0,Neighborhood,Zoo Exhibit,Airport,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,BBQ Joint,Baby Store,Badminton Court,...,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0
6,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0
8,"Business reply mail Processing Centre, South C...",0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.033333,0.066667,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0
9,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333


## Defining a Function to get top 5 venues

In [226]:
def venue_fun(row,num):
    row_ct = row.iloc[1:]
    row_s = row_ct.sort_values(ascending=False)
    return row_s.index.values[0:num]

In [227]:
num=10
indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

nb_vs = pd.DataFrame(columns=columns)
nb_vs['Neighborhood'] = df_gp['Neighborhood']

for ind in np.arange(df_gp.shape[0]):
    nb_vs.iloc[ind, 1:] = venue_fun(df_gp.iloc[ind, :], num)

print('Shape:',nb_vs.shape)
nb_vs.head()

Shape: (97, 11)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Park,Pool,Supermarket,Sushi Restaurant,Discount Store,Badminton Court,Shopping Mall,Cupcake Shop,Doctor's Office,Fast Food Restaurant
1,"Alderwood, Long Branch",Convenience Store,Gym,Pizza Place,Pub,Dance Studio,Sandwich Place,Yoga Studio,Eastern European Restaurant,Dog Run,Donut Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Park,IT Services,Dive Bar,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Electronics Store
3,Bayview Village,Park,Trail,Golf Driving Range,Construction & Landscaping,Gym,Field,Farmers Market,Farm,Falafel Restaurant,Event Space
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Greek Restaurant,Sushi Restaurant,Comfort Food Restaurant,Pharmacy,Pizza Place,Pub,Café


In [228]:
km = KMeans(n_clusters=4,init='k-means++',random_state=1)
dff = df_gp.drop('Neighborhood',1)
km.fit(dff)
print(km.labels_[0:97])

[1 1 2 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 2 1 1 1 2 1 1 1 0 1 2
 1 1 2 0 1 1 1 1 1 1 2 1 1 1 1 1 2 2 2 0 1 1 1 1 1 2 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 3 1 1 1 2 1 0 1]


In [229]:
nb_vs.insert(0, 'Cluster Labels', km.labels_)

In [230]:
toronto_merged = df
toronto_merged = toronto_merged.join(nb_vs.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.81153,-79.19552,1.0,Zoo Exhibit,Home Service,Doctor's Office,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Electronics Store
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78564,-79.15871,1.0,Moving Target,Bar,Doctor's Office,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Electronics Store
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1752,0.0,Construction & Landscaping,Gym / Fitness Center,Park,Convenience Store,Cosmetics Shop,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant
3,M1G,Scarborough,Woburn,43.7682,-79.21761,2.0,Park,Business Service,Korean Restaurant,Coffee Shop,Doctor's Office,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space
4,M1H,Scarborough,Cedarbrae,43.76969,-79.23944,0.0,Playground,Trail,Yoga Studio,Dive Bar,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Event Space,Electronics Store


In [231]:
dffs.dropna(inplace=True)
dffs.isna().sum()

PostalCode                0
Borough                   0
Neighborhood              0
Latitude                  0
Longitude                 0
Cluster_num               0
1st Most Common Venue     0
2nd Most Common Venue     0
3rd Most Common Venue     0
4th Most Common Venue     0
5th Most Common Venue     0
6th Most Common Venue     0
7th Most Common Venue     0
8th Most Common Venue     0
9th Most Common Venue     0
10th Most Common Venue    0
dtype: int64

## Visualizing Clusters

In [232]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
map_cluster = folium.Map(location=[43.653963, -79.387207], zoom_start=15)
rbw = ['red','coral','mediumseagreen','midnightblue']
for lat,lng,nb,cluster in zip(dffs['Latitude'],dffs['Longitude'],dffs['Neighborhood'],km.labels_):
    label = folium.Popup(str(nb) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius = 5,
        popup = label,
        color = rbw[cluster],
        fill=True,
        fill_color=rbw[cluster],
        fill_opacity=0.7).add_to(map_cluster)
map_cluster

In [233]:
print('Thanks for coming by :)')

Thanks for coming by :)
