In [1]:
!pip install geopy
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 7.9MB/s ta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [2]:
import pandas as pd 
import numpy as np 
from geopy.geocoders import Nominatim
import folium

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [3]:
df=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)[0]

    Next cell implements the following requirements:    
    * Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.    
    * If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [4]:
df=df[df['Borough']!='Not assigned'].reset_index(drop=True)
df.loc[df['Neighbourhood'] == "Not assigned", "Neighbourhood"] = df['Borough']

In [5]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
df.shape

(103, 3)

Let's create "toronto neighbourhoods" dataframe

In [16]:
toronto_df = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_df.rename(columns = {'Neighbourhood':'Neighbourhood_List'}, inplace = True)
#toronto_df
neighbourhood_split = pd.DataFrame(toronto_df['Neighbourhood_List'].str.split(',').tolist(),index=toronto_df['Postal Code']).stack()
neighbourhood_df = neighbourhood_split.reset_index([0, 'Postal Code'])
neighbourhood_df.rename(columns = {0:'Neighbourhood'}, inplace = True)
#neighbourhood_df

toronto_neighbourhood_df = pd.merge(toronto_df, neighbourhood_df, on='Postal Code', how='inner')
toronto_neighbourhood_df.drop('Neighbourhood_List', axis=1, inplace=True)
toronto_neighbourhood_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M5A,Downtown Toronto,Regent Park
1,M5A,Downtown Toronto,Harbourfront
2,M7A,Downtown Toronto,Queen's Park
3,M7A,Downtown Toronto,Ontario Provincial Government
4,M5B,Downtown Toronto,Garden District
5,M5B,Downtown Toronto,Ryerson
6,M5C,Downtown Toronto,St. James Town
7,M4E,East Toronto,The Beaches
8,M5E,Downtown Toronto,Berczy Park
9,M5G,Downtown Toronto,Central Bay Street


Let's add Latitude and Longitude columns

In [18]:
toronto_neighbourhood_df['Latitude']=0
toronto_neighbourhood_df['Longitude']=0

In [19]:
toronto_neighbourhood_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park,0,0
1,M5A,Downtown Toronto,Harbourfront,0,0
2,M7A,Downtown Toronto,Queen's Park,0,0
3,M7A,Downtown Toronto,Ontario Provincial Government,0,0
4,M5B,Downtown Toronto,Garden District,0,0


Let's get "Latitude and Longitude " for the boroughs

In [20]:
def get_location(nbgh):
    geolocator = Nominatim(user_agent="explorer")
    location = geolocator.geocode('{}, Toronto, Ontario'.format(nbgh) )
    if location is None:
        latitude = 0
        longitude = 0
    else:
        latitude = location.latitude
        longitude = location.longitude

    return latitude,longitude

In [21]:
for i in toronto_neighbourhood_df.index:
    print(toronto_neighbourhood_df['Neighbourhood'][i])
    r=get_location(toronto_neighbourhood_df['Neighbourhood'][i])
    toronto_neighbourhood_df.loc[i,'Latitude'] = r[0]
    toronto_neighbourhood_df.loc[i,'Longitude'] = r[1]
    

print('done')

Regent Park
 Harbourfront
Queen's Park
 Ontario Provincial Government
Garden District
 Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond
 Adelaide
 King
Dufferin
 Dovercourt Village
Harbourfront East
 Union Station
 Toronto Islands
Little Portugal
 Trinity
The Danforth West
 Riverdale
Toronto Dominion Centre
 Design Exchange
Brockton
 Parkdale Village
 Exhibition Place
India Bazaar
 The Beaches West
Commerce Court
 Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
 Forest Hill Road Park
High Park
 The Junction South
North Toronto West
 Lawrence Park
The Annex
 North Midtown
 Yorkville
Parkdale
 Roncesvalles
Davisville
University of Toronto
 Harbord
Runnymede
 Swansea
Moore Park
 Summerhill East
Kensington Market
 Chinatown
 Grange Park
Summerhill West
 Rathnelly
 South Hill
 Forest Hill SE
 Deer Park
CN Tower
 King and Spadina
 Railway Lands
 Harbourfront West
 Bathurst Quay
 South Niagara
 Island airport
R

In [22]:
toronto_neighbourhood_df.shape

(78, 5)

In [23]:
toronto_neighbourhood_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457
1,M5A,Downtown Toronto,Harbourfront,43.64008,-79.38015
2,M7A,Downtown Toronto,Queen's Park,43.659659,-79.39034
3,M7A,Downtown Toronto,Ontario Provincial Government,0.0,0.0
4,M5B,Downtown Toronto,Garden District,43.6565,-79.377114


Let's get venues

In [24]:
CLIENT_ID = 'DGXI5NPQD2QYSNXIVCQE1DE1GZEME5ZCWW10NI5ADHBX5S3C' # your Foursquare ID
CLIENT_SECRET = '0LY1BZVNKVNYK5BT5MP3AHMNAEB41XF4WO4VV555ZN2ISDNP' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        if lat!=0.0 and lng!=0.0:
            print(name)

            # create the API request URL
            url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET, 
                VERSION, 
                lat, 
                lng, 
                radius, 
                LIMIT)
            #print(url)

            # make the GET request
            results = requests.get(url).json()["response"]['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['TorontoNeighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [26]:
toronto_venues = getNearbyVenues(names=toronto_neighbourhood_df['Neighbourhood'],
                                   latitudes=toronto_neighbourhood_df['Latitude'],
                                   longitudes=toronto_neighbourhood_df['Longitude']
                                  )

Regent Park
 Harbourfront
Queen's Park
Garden District
 Ryerson
St. James Town
The Beaches
Berczy Park
Christie
Richmond
 Adelaide
 King
Dufferin
 Dovercourt Village
Harbourfront East
 Union Station
 Toronto Islands
Little Portugal
 Trinity
The Danforth West
 Riverdale
Toronto Dominion Centre
 Design Exchange
Brockton
 Parkdale Village
 Exhibition Place
India Bazaar
 The Beaches West
Commerce Court
 Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
 Forest Hill Road Park
High Park
 The Junction South
North Toronto West
 Lawrence Park
The Annex
 North Midtown
 Yorkville
Parkdale
 Roncesvalles
Davisville
University of Toronto
 Harbord
Runnymede
 Swansea
Moore Park
 Summerhill East
Kensington Market
 Chinatown
 Grange Park
Summerhill West
 Rathnelly
 South Hill
 Forest Hill SE
 Deer Park
CN Tower
 King and Spadina
 Harbourfront West
 Bathurst Quay
 South Niagara
 Island airport
Rosedale
St. James Town
 Cabbagetown
First Canadian Place
 Undergr

In [29]:
toronto_venues.shape

(3651, 7)

In [30]:
toronto_venues.head()

Unnamed: 0,TorontoNeighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park,43.660706,-79.360457,Regent Park Aquatic Centre,43.6606,-79.361392,Pool
1,Regent Park,43.660706,-79.360457,Daniels Spectrum,43.660137,-79.361808,Performing Arts Venue
2,Regent Park,43.660706,-79.360457,Sumach Espresso,43.658135,-79.359515,Coffee Shop
3,Regent Park,43.660706,-79.360457,Thai To Go,43.663418,-79.36071,Thai Restaurant
4,Regent Park,43.660706,-79.360457,Paintbox Bistro,43.66005,-79.362855,Restaurant


One hot encoding

In [32]:
#one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
fixed_columns=list(toronto_onehot.columns)

# add neighborhood column back to dataframe
toronto_onehot['TorontoNeighborhood'] = toronto_venues['TorontoNeighborhood'] 

toronto_onehot = toronto_onehot[['TorontoNeighborhood'] + fixed_columns]
toronto_onehot

Unnamed: 0,TorontoNeighborhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Regent Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [33]:
toronto_grouped = toronto_onehot.groupby('TorontoNeighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,TorontoNeighborhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.010000,0.000000,0.000000,0.000000,0.00,0.010000,0.00000,0.000000,0.000000
1,Bathurst Quay,0.000000,0.0,0.00000,0.040000,0.000000,0.000000,0.000000,0.040000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.00000,0.000000,0.000000
2,Cabbagetown,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.00000,0.000000,0.000000
3,Chinatown,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.041667,0.013889,0.000000,0.027778,0.00,0.013889,0.00000,0.000000,0.000000
4,Deer Park,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.016949,0.00,0.000000,0.00000,0.000000,0.016949
5,Design Exchange,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.020000,0.000000,0.000000,0.000000,0.00,0.010000,0.00000,0.000000,0.000000
6,Dovercourt Village,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.00000,0.000000,0.000000
7,Exhibition Place,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.00000,0.000000,0.000000
8,Forest Hill Road Park,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.00000,0.000000,0.000000
9,Forest Hill SE,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.00000,0.000000,0.000000


k-means to cluster the Toronto into 6 clusters.

In [36]:
kclusters = 6

toronto_grouped_clustering = toronto_grouped.drop('TorontoNeighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([4, 4, 2, 2, 4, 4, 4, 2, 5, 1, 2, 2, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2,
       2, 4, 2, 2, 2, 3, 2, 2, 3, 2, 2, 4, 4, 2, 4, 2, 4, 2, 4, 4, 2, 2,
       2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 1, 2, 2, 4, 4, 0, 3, 2, 4, 4, 2, 2,
       2, 2, 2, 4, 2], dtype=int32)

In [40]:

toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_grouped

toronto_neighbourhood_df.rename(columns={'Neighbourhood': 'TorontoNeighborhood'},inplace=True)
toronto_neighbourhood_df.head()
toronto_final = pd.merge(toronto_neighbourhood_df, toronto_grouped, on='TorontoNeighborhood', how='inner')
toronto_final



Unnamed: 0,Postal Code,Borough,TorontoNeighborhood,Latitude,Longitude,Cluster Labels,Accessories Store,Afghan Restaurant,African Restaurant,Airport,...,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M5A,Downtown Toronto,Regent Park,43.660706,-79.360457,4,0.000000,0.000000,0.00000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
1,M5A,Downtown Toronto,Harbourfront,43.640080,-79.380150,4,0.000000,0.000000,0.00000,0.000000,...,0.00,0.010000,0.000000,0.000000,0.000000,0.01,0.000000,0.0,0.000000,0.000000
2,M7A,Downtown Toronto,Queen's Park,43.659659,-79.390340,4,0.000000,0.000000,0.00000,0.000000,...,0.00,0.013889,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
3,M5B,Downtown Toronto,Garden District,43.656500,-79.377114,2,0.000000,0.000000,0.00000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
4,M5B,Downtown Toronto,Ryerson,43.658469,-79.378993,4,0.000000,0.000000,0.00000,0.000000,...,0.00,0.000000,0.010000,0.000000,0.000000,0.00,0.010000,0.0,0.000000,0.000000
5,M5C,Downtown Toronto,St. James Town,43.669403,-79.372704,4,0.000000,0.000000,0.00000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
6,M4X,Downtown Toronto,St. James Town,43.669403,-79.372704,4,0.000000,0.000000,0.00000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.000000
7,M4E,East Toronto,The Beaches,43.671024,-79.296712,2,0.000000,0.000000,0.00000,0.000000,...,0.00,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.025000,0.000000
8,M5E,Downtown Toronto,Berczy Park,43.647984,-79.375396,4,0.000000,0.000000,0.00000,0.000000,...,0.00,0.010000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.010000
9,M6G,Downtown Toronto,Christie,43.664111,-79.418405,2,0.000000,0.000000,0.00000,0.000000,...,0.00,0.000000,0.000000,0.017857,0.017857,0.00,0.017857,0.0,0.000000,0.000000


Draw map

In [44]:
# create map
import matplotlib.cm as cm
import matplotlib.colors as colors

address = 'Toronto, CA'

geolocator = Nominatim(user_agent="can_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_final['Latitude'], toronto_final['Longitude'], toronto_final['TorontoNeighborhood'], toronto_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters