# Part I

In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(website_url,'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XjZO1QpAMFIAALOzc6EAAACH","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

In [4]:
table=soup.find('table', class_='wikitable sortable')
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North

In [5]:
A=[]
B=[]
C=[]

for row in table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

In [6]:
df=pd.DataFrame(A,columns=['Postcode'])
df['Borough']=B
df['Neighbourhood']=C
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [7]:
#drop those borough equals to 'Not assigned'
data=df[df['Borough']!='Not assigned']
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [8]:
#combine several rows of the same postcode into one row with the neighbourhoods separated with a comma
data=data.groupby("Postcode").agg(lambda x:','.join(set(x)))
data.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
M1E,Scarborough,"Guildwood\n,West Hill,Morningside"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae\n


In [9]:
#If a cell has a borough but a Not assigned neighbourhood, then the neighbourhood will be the same as the borough. 
data.loc[data['Neighbourhood']=="Not assigned",'Neighbourhood']=data.loc[data['Neighbourhood']=="Not assigned",'Borough']
data.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
M1E,Scarborough,"Guildwood\n,West Hill,Morningside"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae\n


In [10]:
cleandata=data.reset_index()
cleandata.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
2,M1E,Scarborough,"Guildwood\n,West Hill,Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae\n


In [11]:
cleandata.shape

(103, 3)

# Part II

In [12]:
!conda install -c conda-forge geocoder --yes
print("Installation Done!")
import geocoder
print("Geo Coder imported!")

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Installation Done!
Geo Coder imported!


In [13]:
#define a function to get latitude and longtitude by postcode
def get_geocoder(postal_code):
    lat_lng_coords=None
    while(lat_lng_coords is None):
        g=geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code.strip()))
        lat_lng_coords=g.latlng
        latitude=lat_lng_coords[0]
        longitude=lat_lng_coords[1]
    return latitude,longitude

In [14]:
#add latitude and longtitude to the dataframe
cleandata['Latitude'], cleandata['Longitude']=zip(*cleandata['Postcode'].apply(get_geocoder))
cleandata.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood\n,West Hill,Morningside",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae\n,43.769688,-79.23944


# Part III

In [15]:
#filter dataset and leave only Borough containing 'Toronto'
neighbourhoods=cleandata[cleandata['Borough'].str.contains('Toronto')]
neighbourhoods.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676531,-79.295425
41,M4K,East Toronto,"The Danforth West\n,Riverdale",43.683178,-79.355105
42,M4L,East Toronto,"India Bazaar,The Beaches West\n",43.667965,-79.314667
43,M4M,East Toronto,Studio District\n,43.660629,-79.334855
44,M4N,Central Toronto,Lawrence Park,43.72842,-79.387133


In [16]:
neighbourhoods=neighbourhoods.reset_index()
neighbourhoods.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,37,M4E,East Toronto,The Beaches,43.676531,-79.295425
1,41,M4K,East Toronto,"The Danforth West\n,Riverdale",43.683178,-79.355105
2,42,M4L,East Toronto,"India Bazaar,The Beaches West\n",43.667965,-79.314667
3,43,M4M,East Toronto,Studio District\n,43.660629,-79.334855
4,44,M4N,Central Toronto,Lawrence Park,43.72842,-79.387133


In [17]:
neighbourhood=neighbourhoods.drop(['index'],axis=1)
neighbourhood.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676531,-79.295425
1,M4K,East Toronto,"The Danforth West\n,Riverdale",43.683178,-79.355105
2,M4L,East Toronto,"India Bazaar,The Beaches West\n",43.667965,-79.314667
3,M4M,East Toronto,Studio District\n,43.660629,-79.334855
4,M4N,Central Toronto,Lawrence Park,43.72842,-79.387133


In [18]:
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium 
print("folium imported!")

usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab


folium imported!


In [19]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim
print("Nominatim imported!")

Nominatim imported!


usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab


In [20]:
address='Toronto, Ontario'
geolocator=Nominatim(user_agent="toronto_explorer")
location=geolocator.geocode(address)
latitude=location.latitude
longitude=location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


### create map of Toronta using latitude and longitude values

In [21]:
map_Toronto=folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbour in zip(neighbourhood['Latitude'], neighbourhood['Longitude'], neighbourhood['Borough'], neighbourhood['Neighbourhood']):
    label = '{}, {}'.format(neighbour, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

In [22]:
#Define Foursquare Credentials and Version
CLIENT_ID = 'SVSCM14TUGHYRHDIY0ZKGQYIMBCHJYCVJBFYPN3WIFFNSENT' # your Foursquare ID
CLIENT_SECRET = 'J0F2SOLBUJ0SSNLBHTJDAISAEPGKH04YEMUZZG31WAKFE0WC' # your Foursquare Secret
VERSION = '20200204' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: SVSCM14TUGHYRHDIY0ZKGQYIMBCHJYCVJBFYPN3WIFFNSENT
CLIENT_SECRET:J0F2SOLBUJ0SSNLBHTJDAISAEPGKH04YEMUZZG31WAKFE0WC


In [23]:
neighbourhood.loc[0, 'Neighbourhood']
neighbourhood_latitude=neighbourhood.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude=neighbourhood.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name=neighbourhood.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of The Beaches are 43.67653121600006, -79.29542499999997.


In [24]:
LIMIT=100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius


# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=SVSCM14TUGHYRHDIY0ZKGQYIMBCHJYCVJBFYPN3WIFFNSENT&client_secret=J0F2SOLBUJ0SSNLBHTJDAISAEPGKH04YEMUZZG31WAKFE0WC&v=20200204&ll=43.67653121600006,-79.29542499999997&radius=500&limit=100'

In [None]:
results=requests.get(url).json()
results

In [None]:
#function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list=row['categories']
    except:
        categories_list=row['venue.categories']
        
    if len(categories_list)==0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
from pandas.io.json import json_normalize
venues=results['response']['groups'][0]['items']
    
nearby_venues=json_normalize(venues) # flatten JSON

# filter columns
filtered_columns=['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues=nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories']=nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns=[col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

In [25]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [26]:
Toronto_venues = getNearbyVenues(names=neighbourhood['Neighbourhood'],
                                   latitudes=neighbourhood['Latitude'],
                                   longitudes=neighbourhood['Longitude']
                                  )

The Beaches
The Danforth West
,Riverdale
India Bazaar,The Beaches West

Studio District

Lawrence Park
Davisville North

North Toronto West

Davisville

Moore Park,Summerhill East

South Hill,Forest Hill SE
,Rathnelly,Deer Park,Summerhill West

Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Harbourfront
Garden District
,Ryerson

St. James Town
Berczy Park
Central Bay Street

Adelaide
,Richmond
,King

Toronto Islands,Harbourfront East
,Union Station
Toronto Dominion Centre,Design Exchange
Commerce Court,Victoria Hotel

Roselawn

Forest Hill North,Forest Hill West

The Annex,Yorkville,North Midtown

Harbord
,University of Toronto
Grange Park,Kensington Market,Chinatown
Bathurst Quay
,CN Tower,King and Spadina,South Niagara,Harbourfront West
,Railway Lands,Island airport

Stn A PO Boxes 25 The Esplanade

First Canadian Place,Underground city
Christie

Dufferin
,Dovercourt Village
Trinity,Little Portugal
Exhibition Place,Brockton
,Parkdale Village
High Park,The Junction South

Ro

In [27]:
Toronto_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676531,-79.295425,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676531,-79.295425,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676531,-79.295425,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676531,-79.295425,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West\n,Riverdale",43.683178,-79.355105,Dollarama,43.686197,-79.355989,Discount Store


In [28]:
#check how many venues were returned for each neighbourhood
Toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide\n,Richmond\n,King\n",100,100,100,100,100,100
"Bathurst Quay\n,CN Tower,King and Spadina,South Niagara,Harbourfront West\n,Railway Lands,Island airport\n",70,70,70,70,70,70
Berczy Park,61,61,61,61,61,61
Business Reply Mail Processing Centre 969 Eastern\n,100,100,100,100,100,100
"Cabbagetown,St. James Town",42,42,42,42,42,42
Central Bay Street\n,96,96,96,96,96,96
Christie\n,11,11,11,11,11,11
Church and Wellesley,80,80,80,80,80,80
"Commerce Court,Victoria Hotel\n",100,100,100,100,100,100
Davisville\n,26,26,26,26,26,26


In [29]:
#find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 221 uniques categories.


In [30]:
# one hot encoding
Toronto_onehot=pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
Toronto_onehot['Neighbourhood']=Toronto_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns=[Toronto_onehot.columns[-1]]+list(Toronto_onehot.columns[:-1])
Toronto_onehot=Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Toy / Game Store,Trail,Train Station,Tram Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West\n,Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
#group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category
Toronto_grouped=Toronto_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,...,Toy / Game Store,Trail,Train Station,Tram Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide\n,Richmond\n,King\n",0.0,0.0,0.03,0.0,0.01,0.0,0.03,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
1,"Bathurst Quay\n,CN Tower,King and Spadina,Sout...",0.0,0.0,0.0,0.0,0.0,0.0,0.014286,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014286
2,Berczy Park,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,0.016393,...,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 East...,0.0,0.0,0.02,0.0,0.0,0.01,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
4,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381


In [32]:
#write a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [33]:
#create the new dataframe and display the top 10 venues for each neighbourhood
num_top_venues=10

indicators=['st', 'nd', 'rd']

# create columns according to number of top venues
columns=['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted=pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood']=Toronto_grouped['Neighbourhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:]=return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide\n,Richmond\n,King\n",Coffee Shop,Café,Hotel,Steakhouse,Bakery,Burger Joint,Gym,Asian Restaurant,Bar,Gastropub
1,"Bathurst Quay\n,CN Tower,King and Spadina,Sout...",Coffee Shop,Italian Restaurant,Café,Bar,Gym / Fitness Center,Sandwich Place,Restaurant,Electronics Store,Pub,Park
2,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Hotel,Steakhouse,Seafood Restaurant,Café,Beer Bar,Cheese Shop,Restaurant
3,Business Reply Mail Processing Centre 969 East...,Coffee Shop,Steakhouse,Bar,Hotel,Pub,Gym,Seafood Restaurant,Sushi Restaurant,Café,Thai Restaurant
4,"Cabbagetown,St. James Town",Coffee Shop,Café,Italian Restaurant,Restaurant,Pizza Place,Bakery,Park,Yoga Studio,Convenience Store,Chinese Restaurant


## cluster neighbours

In [34]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters=5
Toronto_grouped_clustering=Toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans=KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [38]:
# add clustering labels
#neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged=neighbourhood

# merge toronto_grouped with neighbourhood to add latitude/longitude for each neighbourhood
Toronto_merged=Toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

Toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676531,-79.295425,0,Health Food Store,Pub,Trail,Neighborhood,Eastern European Restaurant,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
1,M4K,East Toronto,"The Danforth West\n,Riverdale",43.683178,-79.355105,1,Bus Line,Grocery Store,Park,Discount Store,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
2,M4L,East Toronto,"India Bazaar,The Beaches West\n",43.667965,-79.314667,0,Park,Sandwich Place,Gym,Italian Restaurant,Pub,Movie Theater,Fast Food Restaurant,Fish & Chips Shop,Burrito Place,Pet Store
3,M4M,East Toronto,Studio District\n,43.660629,-79.334855,0,Diner,Brewery,Italian Restaurant,Pizza Place,Sushi Restaurant,Bar,Gastropub,Café,Coffee Shop,Arts & Crafts Store
4,M4N,Central Toronto,Lawrence Park,43.72842,-79.387133,0,Bus Line,Construction & Landscaping,Swim School,Yoga Studio,Electronics Store,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


In [40]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [41]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters