Transform the data into a pandas DataFrame

In [1]:
# install BeautifulSoup4:
!conda install -c conda-forge beautifulsoup4
print('beautifulsoup4 installed!')

# install lxml:
!conda install -c conda-forge lxml
print('lxml installed!')

# install geocoder:
!conda install -c conda-forge geocoder --yes
print('geocoder installed!')

# install folium:
!conda install -c conda-forge folium=0.5.0
print('folium installed!')


Solving environment: done

# All requested packages already installed.

beautifulsoup4 installed!
Solving environment: done

# All requested packages already installed.

lxml installed!
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          59 KB

The following NEW packages will be INSTALLED:

    geocoder: 1.38.1-py_1 conda-forge
    ratelim:  0.1.6-py_2  conda-forge


Downloading and Extracting Packages
geocoder-1.38.1      | 53 KB     | ##################################### | 100% 
r

In [2]:
# import the libraries:
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import requests
import geocoder # to retrieve latitude and longitude
import csv
import json
import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

print('Libraries imported.')

Libraries imported.


# import the libraries:
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import requests
import geocoder # to retrieve latitude and longitude
import csv
import json
import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans

#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

print('Libraries imported.')

In [4]:
# get the content of the website; a timeout keeps the cell from hanging forever and
# raise_for_status() stops here on an HTTP error instead of parsing an error page:
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
page.raise_for_status()
source = page.text

# create a soup object:
bigsoup = BeautifulSoup(source,'lxml')

# show how the tags are nested in the document:
# print(bigsoup.prettify())

In [5]:
# the postal-code data lives in the table whose CSS class is 'wikitable sortable':
soup = bigsoup.find('table', class_='wikitable sortable')

In [6]:
# Collect the text of every data cell, one inner list per table row.
table = []
# loop over all rows = <tr> tag:
for row in soup.find_all('tr')[1:] :         # skip the first row (header, no data cells)
    data = []
    # loop over all data cells in this row = <td> tag:
    for cell in row.find_all('td') :
        link = cell.find('a')                # link = <a> tag, None when the cell has no link
        # take the link's content when there is one, otherwise the cell's own content;
        # the local is deliberately not called `str`, which would hide the built-in
        contents = link.contents if link is not None else cell.contents
        data.append(contents[0].strip('\n')) # strip possible \n
    table.append(data)
table

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Downtown Toronto', "Queen's Park"],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 'Etobicoke', 'Martin Grove'],
 ['M9B', 'Et

In [7]:
# turn the scraped data into a pandas DataFrame:
df = pd.DataFrame(table)
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

# ignore rows with a borough that is 'Not assigned';
# drop=True renumbers the rows without keeping the old index as an extra 'index' column:
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)

# if a row has a borough but a 'Not assigned' neighborhood, then the neighborhood will be the same as the borough:
unassigned = (df['Neighborhood'] == 'Not assigned')
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

# combine the rows with the same postal code and borough, with the neighborhoods separated with a comma:
neighborhoods = df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [8]:
# (rows, columns) of the grouped frame: one row per postal code / borough pair
neighborhoods.shape

(103, 3)

Geographical coordinates 

In [9]:
# Look up the coordinates of every postal code in `neighborhoods`.
# `lat_lng` receives exactly one [lat, lng] entry per row, in row order.
lat_lng = []
for postal_code in neighborhoods['PostalCode'] :
    # initialize the coordinates to None:
    lat_lng_coords = None

    # loop until you get the coordinates:
    while(lat_lng_coords is None) :
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng

    # record and report only after a successful lookup, so a failed attempt never
    # appends None (which would shift every later row) or crashes on None[0]:
    lat_lng.append(lat_lng_coords)
    print('retrieved for postal code =', postal_code, ' lat =', lat_lng_coords[0], ' lng =', lat_lng_coords[1])

retrieved for postal code = M1B  lat = 43.811525000000074  lng = -79.19551746399998
retrieved for postal code = M1C  lat = 43.78566500000005  lng = -79.15872457299997
retrieved for postal code = M1E  lat = 43.76581500000003  lng = -79.17519294699997
retrieved for postal code = M1G  lat = 43.768369121000035  lng = -79.21758999999997
retrieved for postal code = M1H  lat = 43.76968799900004  lng = -79.23943999999995
retrieved for postal code = M1J  lat = 43.74312500000008  lng = -79.23174973599998
retrieved for postal code = M1K  lat = 43.72627568400003  lng = -79.26362499999993
retrieved for postal code = M1L  lat = 43.71305350000006  lng = -79.28505499999994
retrieved for postal code = M1M  lat = 43.724234575000025  lng = -79.22792499999997
retrieved for postal code = M1N  lat = 43.69677000000007  lng = -79.25996735299998
retrieved for postal code = M1P  lat = 43.759975000000054  lng = -79.26897418299995
retrieved for postal code = M1R  lat = 43.750710464000065  lng = -79.30055999999996

In [10]:
# build the coordinate frame `ll` from lat_lng, naming its two columns on construction:
ll = pd.DataFrame(lat_lng, columns=['Latitude', 'Longitude'])
ll.head()

Unnamed: 0,Latitude,Longitude
0,43.811525,-79.195517
1,43.785665,-79.158725
2,43.765815,-79.175193
3,43.768369,-79.21759
4,43.769688,-79.23944


In [11]:
# add the columns with the geographical coordinates to the dataframe neighborhoods;
# coordinate columns left by an earlier run of this cell are dropped first, so that
# re-running the cell does not produce duplicate Latitude / Longitude columns:
neighborhoods = pd.concat(
    [neighborhoods.drop(columns=['Latitude', 'Longitude'], errors='ignore'), ll],
    axis=1,
)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.217590
4,M1H,Scarborough,Cedarbrae,43.769688,-79.239440
5,M1J,Scarborough,Scarborough Village,43.743125,-79.231750
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726276,-79.263625
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713054,-79.285055
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.724235,-79.227925
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.696770,-79.259967


Explore and cluster the neighborhoods in Toronto

In [12]:
# geocode the city itself:
address = 'Toronto, Ontario'
g = geocoder.arcgis(address)
lat_lng_coords = g.latlng
latitude, longitude = lat_lng_coords[0], lat_lng_coords[1]

# keep the city coordinates under their own names; they are needed again for the cluster map:
Toronto_latitude, Toronto_longitude = latitude, longitude

print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.648690000000045, -79.38543999999996.


In [13]:
# base map centred on the current latitude / longitude values:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# one circle marker per row of `neighborhoods`, with a "<neighborhood>; <borough>" popup
marker_rows = zip(
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
    neighborhoods['Borough'],
    neighborhoods['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    label = folium.Popup('{}; {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [18]:
# The code was removed by Watson Studio for sharing.
# NOTE(review): the Foursquare request cells below read CLIENT_ID and CLIENT_SECRET, and no
# visible cell assigns them — presumably this hidden cell did; confirm and define both
# before running the notebook from a fresh kernel.

In [15]:
# Settings shared by the Foursquare requests in the cells below.
VERSION = '20191209' # Foursquare API version
radius = 1000 # sent as the `radius` query parameter; presumably metres — TODO confirm against the API docs
LIMIT = 100 # sent as the `limit` query parameter of each request

In [16]:
# use the first row of `neighborhoods` as the sample location
# (this overwrites the city-level latitude / longitude values):
latitude = neighborhoods.at[0, 'Latitude']
longitude = neighborhoods.at[0, 'Longitude']
name = neighborhoods.at[0, 'Neighborhood']

summary = 'Latitude and longitude values of <{}> are {}, {}.'
print(summary.format(name, latitude, longitude))

Latitude and longitude values of <Rouge, Malvern> are 43.811525000000074, -79.19551746399998.


In [19]:
# request the venues around the sample location and report how many came back
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    latitude,
    longitude,
    VERSION,
    radius,
    LIMIT,
)
results = requests.get(url).json()
venue_count = len(results['response']['groups'][0]['items'])
print('There are {} venues found in <{}>.'.format(venue_count, name))

There are 7 venues found in <Rouge, Malvern>.


In [20]:
# display the raw JSON of the explore request; the venues are under
# results['response']['groups'][0]['items']:
results

{'meta': {'code': 200, 'requestId': '5df4bdf1b1cac0001b690eac'},
 'response': {'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 7,
  'suggestedBounds': {'ne': {'lat': 43.820525009000086,
    'lng': -79.1830688170168},
   'sw': {'lat': 43.80252499100006, 'lng': -79.20796611098315}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4c97975582b56dcb8320ebaa',
       'name': 'Canadiana exhibit',
       'location': {'lat': 43.81796218928876,
        'lng': -79.19337359666939,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.81796218928876,
          'lng': -79.19337359666939}],
        'distance': 736,
        'cc': 'CA',
        'city': 'Toronto',
        'state': 'ON',

In [21]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    Parameters
    ----------
    row : mapping-like (for example a dict or a pandas Series)
        Must hold the category list under the key 'categories' or, failing
        that, under 'venue.categories'. Each list entry is a dict with a
        'name' key.

    Returns
    -------
    The 'name' of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        When neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; any other error propagates
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [22]:
# flatten the venue items of the sample response into a table
venues = results['response']['groups'][0]['items']
venues_flat = json_normalize(venues)

# keep only the name, category list and coordinates of each venue
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = venues_flat.loc[:, filtered_columns]

# replace each category list by the name of its first category
category_names = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = category_names

# shorten every column name to the part after its last dot
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Canadiana exhibit,Zoo Exhibit,43.817962,-79.193374
1,Wendy's,Fast Food Restaurant,43.807448,-79.199056
2,Grizzly Bear Exhibit,Zoo Exhibit,43.817031,-79.193458
3,Upper Rouge Trail,Trail,43.809988,-79.186147
4,Cheetah Exhibit,Other Great Outdoors,43.817588,-79.187206


In [23]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are walked in parallel.
    radius : number, default 500
        Sent as the `radius` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found at all.
        'Venue Category' is None for a venue whose category list is empty.

    Notes
    -----
    Reads the module-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT,
    and prints each location name as a progress indicator.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        if (len(results) == 0) :
            print('>>>   ... No venues in', name, '!!!')

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories must not abort the whole run
                categories[0]['name'] if categories else None))

    # passing the column names at construction keeps the frame valid when `rows` is empty
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [24]:
# fetch the venues around every row of `neighborhoods`, using the notebook-level radius
toronto_venues = getNearbyVenues(
    names=neighborhoods['Neighborhood'],
    latitudes=neighborhoods['Latitude'],
    longitudes=neighborhoods['Longitude'],
    radius=radius,
)

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
>>>   ... No venues in Upper Rouge !!!
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliff

In [25]:
# size of the combined venue table, then its first ten rows:
print(toronto_venues.shape)
toronto_venues.head(10)

(5097, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.811525,-79.195517,Canadiana exhibit,43.817962,-79.193374,Zoo Exhibit
1,"Rouge, Malvern",43.811525,-79.195517,Wendy's,43.807448,-79.199056,Fast Food Restaurant
2,"Rouge, Malvern",43.811525,-79.195517,Grizzly Bear Exhibit,43.817031,-79.193458,Zoo Exhibit
3,"Rouge, Malvern",43.811525,-79.195517,Upper Rouge Trail,43.809988,-79.186147,Trail
4,"Rouge, Malvern",43.811525,-79.195517,Cheetah Exhibit,43.817588,-79.187206,Other Great Outdoors
5,"Rouge, Malvern",43.811525,-79.195517,Lee Valley,43.803161,-79.199681,Hobby Shop
6,"Rouge, Malvern",43.811525,-79.195517,Zebra Exhibit,43.818045,-79.187056,Zoo Exhibit
7,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725,Fratelli Village Pizzeria,43.784008,-79.169787,Italian Restaurant
8,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725,Shamrock Burgers,43.783823,-79.168406,Burger Joint
9,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725,Ted's Restaurant,43.784468,-79.1692,Breakfast Spot


In [26]:
# number of venues returned for each neighborhood label
toronto_venues.groupby('Neighborhood')[['Venue']].count()

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
"Adelaide, King, Richmond",100
Agincourt,43
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",25
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",12
"Alderwood, Long Branch",30
"Bathurst Manor, Downsview North, Wilson Heights",32
Bayview Village,7
"Bedford Park, Lawrence Manor East",36
Berczy Park,100
"Birch Cliff, Cliffside West",11


In [27]:
# how many distinct venue categories occur in the data
distinct_categories = toronto_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(distinct_categories.size))

There are 343 uniques categories.


In [28]:
# one indicator column per venue category; the empty prefix and separator
# leave the bare category name as the column label
category_frame = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")
toronto_onehot.shape

(5097, 343)

In [29]:
# a venue category may itself be called 'Neighborhood'; move that indicator column out
# of the way so the name is free for the neighborhood label
toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (category)'})

# put the neighborhood label of every venue in front of the indicator columns
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])
toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arcade,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# mean of every indicator column per neighborhood = share of that category among its venues
category_means = toronto_onehot.groupby('Neighborhood').mean()
toronto_grouped = category_means.reset_index()
toronto_grouped.head(10)

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arcade,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,...,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
5,"Bathurst Manor, Downsview North, Wilson Heights",0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,...,0.0,0.027778,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0
8,Berczy Park,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# the 'Neighborhood' label column is not a category, hence the -1
shape = toronto_grouped.shape
row_count, column_count = shape
print('There are {} neighborhoods and {} unique categories.'.format(row_count, column_count - 1))

There are 101 neighborhoods and 343 unique categories.


In [32]:
# print the most frequent venue categories of every neighborhood
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # the row of this neighborhood, turned on its side: one line per column
    hood_rows = toronto_grouped[toronto_grouped['Neighborhood'] == hood]
    temp = hood_rows.T.reset_index()
    temp.columns = ['venue','freq']
    # the first line only holds the 'Neighborhood' label and its value -> skip it
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 3})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                 venue  freq
0                 Café  0.07
1          Coffee Shop  0.06
2                Hotel  0.06
3              Theater  0.04
4  Japanese Restaurant  0.03


----Agincourt----
                  venue   freq
0    Chinese Restaurant  0.163
1         Shopping Mall  0.070
2           Supermarket  0.047
3  Hong Kong Restaurant  0.047
4           Coffee Shop  0.047


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                  venue  freq
0    Chinese Restaurant  0.16
1             BBQ Joint  0.12
2                  Park  0.08
3  Fast Food Restaurant  0.08
4                Bakery  0.08


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                  venue   freq
0           Pizza Place  0.167
1  Fast Food Restaurant  0.083
2                  Park  0.083
3   Fried Chicken Joint  0.083
4        Hardware Store  0.083


----Alderwood, Long Branch----
 

In [33]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped (the caller passes rows whose first
    entry is the neighborhood label); the remaining entries are ranked from
    largest to smallest and the index labels of the leading ones are returned
    as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [34]:
num_top_venues = 10

# ordinal suffixes for ranks 1-3; every later rank gets 'th'
indicators = ['st', 'nd', 'rd']

# column labels: 'Neighborhood' followed by one '<rank> Most Common Venue' per rank
columns = ['Neighborhood'] + [
    '{}{} Most Common Venue'.format(rank + 1, indicators[rank] if rank < len(indicators) else 'th')
    for rank in range(num_top_venues)
]

# start from an empty frame with those columns and copy the neighborhood labels in
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill the ranking columns row by row
for position in range(toronto_grouped.shape[0]):
    top_categories = return_most_common_venues(toronto_grouped.iloc[position, :], num_top_venues)
    neighborhoods_venues_sorted.iloc[position, 1:] = top_categories

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,Hotel,Theater,Japanese Restaurant,Restaurant,Bakery,Gastropub,Breakfast Spot,Asian Restaurant
1,Agincourt,Chinese Restaurant,Shopping Mall,Hong Kong Restaurant,Coffee Shop,Supermarket,Bank,Japanese Restaurant,Noodle House,Restaurant,Sushi Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Chinese Restaurant,BBQ Joint,Fast Food Restaurant,Park,Pharmacy,Bakery,Hobby Shop,Shopping Mall,Caribbean Restaurant,Discount Store
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Pizza Place,Sandwich Place,Fast Food Restaurant,Park,Grocery Store,Beer Store,Coffee Shop,Hardware Store,Pharmacy,Fried Chicken Joint
4,"Alderwood, Long Branch",Coffee Shop,Pizza Place,Pharmacy,Café,Park,Italian Restaurant,Discount Store,Seafood Restaurant,Bar,Burger Joint
5,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Pizza Place,Gas Station,Men's Store,Supermarket,Diner,Bank,Chinese Restaurant,Sushi Restaurant,Sandwich Place
6,Bayview Village,Park,Japanese Restaurant,Chinese Restaurant,Trail,Café,Bank,Fish Market,Fish & Chips Shop,Dumpling Restaurant,Eastern European Restaurant
7,"Bedford Park, Lawrence Manor East",Italian Restaurant,Coffee Shop,Sushi Restaurant,Comfort Food Restaurant,Pizza Place,Sports Club,Bridal Shop,Butcher,Café,Sandwich Place
8,Berczy Park,Coffee Shop,Café,Hotel,Restaurant,Japanese Restaurant,Beer Bar,Farmers Market,Park,Italian Restaurant,Gastropub
9,"Birch Cliff, Cliffside West",Park,College Stadium,Café,Skating Rink,General Entertainment,Gym,Gym Pool,Hotel,Dance Studio,Eastern European Restaurant


In [36]:
# set number of clusters
kclusters = 7

# the label column is not a feature; `columns=` names the axis explicitly instead of
# passing it positionally (positional `axis` is deprecated and no longer accepted by pandas 2.x)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 2, 2, 0, 2, 3, 0, 0, 3], dtype=int32)

In [37]:
# put the k-means label of every neighborhood in front of its ranked venue columns
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach labels and ranked venues to the postal-code / coordinate table, matching on the
# neighborhood name; rows of `neighborhoods` without a match get NaN in the added columns
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
toronto_merged = neighborhoods.join(venues_by_neighborhood, on='Neighborhood')

toronto_merged.head(10) # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517,4.0,Zoo Exhibit,Hobby Shop,Other Great Outdoors,Trail,Fast Food Restaurant,Flea Market,Fish Market,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725,1.0,Breakfast Spot,Park,Burger Joint,Italian Restaurant,Fast Food Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193,2.0,Convenience Store,Grocery Store,Discount Store,Pharmacy,Supermarket,Gym / Fitness Center,Athletics & Sports,Gymnastics Gym,Restaurant,Fast Food Restaurant
3,M1G,Scarborough,Woburn,43.768369,-79.21759,2.0,Pizza Place,Indian Restaurant,Park,Bakery,Department Store,Sandwich Place,Chinese Restaurant,Discount Store,Supplement Shop,Supermarket
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944,0.0,Indian Restaurant,Bakery,Coffee Shop,Fried Chicken Joint,Athletics & Sports,Flower Shop,Bus Line,Music Store,Gas Station,Caribbean Restaurant
5,M1J,Scarborough,Scarborough Village,43.743125,-79.23175,2.0,Fast Food Restaurant,Sandwich Place,Train Station,Coffee Shop,Restaurant,Chinese Restaurant,Big Box Store,Convenience Store,Indian Restaurant,Falafel Restaurant
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726276,-79.263625,2.0,Chinese Restaurant,Discount Store,Coffee Shop,Convenience Store,Bank,Department Store,Sandwich Place,Light Rail Station,Grocery Store,Pharmacy
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713054,-79.285055,2.0,Intersection,Bakery,Park,Fast Food Restaurant,Bus Line,Bus Station,Pharmacy,Soccer Field,Beer Store,Coffee Shop
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.724235,-79.227925,2.0,Fast Food Restaurant,Pharmacy,Gas Station,Coffee Shop,Park,Bistro,Hardware Store,Flower Shop,Sandwich Place,Liquor Store
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69677,-79.259967,3.0,Park,College Stadium,Café,Skating Rink,General Entertainment,Gym,Gym Pool,Hotel,Dance Studio,Eastern European Restaurant


In [38]:
# the row(s) of the neighborhood for which no venues were found
is_upper_rouge = toronto_merged['Neighborhood'] == 'Upper Rouge'
toronto_merged.loc[is_upper_rouge]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,M1X,Scarborough,Upper Rouge,43.834215,-79.216701,,,,,,,,,,,


In [40]:
# Rows whose neighborhood has no entry in neighborhoods_venues_sorted (no venues found,
# e.g. Upper Rouge) came out of the join with NaN as cluster label. Collect all of them
# instead of one hard-coded name, because the map cell converts every label with int(),
# which fails on NaN:
idx = toronto_merged.index[toronto_merged['Cluster Labels'].isnull()].tolist()

# give them the extra cluster number -1:
toronto_merged.loc[idx, 'Cluster Labels'] = -1

# check the changed value(s):
toronto_merged[toronto_merged['Cluster Labels'] == -1]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,M1X,Scarborough,Upper Rouge,43.834215,-79.216701,-1.0,,,,,,,,,,


In [41]:
# map centred on the city coordinates stored earlier
map_clusters = folium.Map(location=[Toronto_latitude, Toronto_longitude], zoom_start=11)

# marker colour per cluster number; label -1 picks the last entry through negative indexing
# (colors_l holds the CSS colour names used in the hand-written legend below)
colors = ['blue', 'beige', 'orange', 'purple', 'lightgreen', 'darkblue', 'black', 'red']
colors_l = ['skyblue', 'navajowhite', 'orange', 'mediumorchid', 'lightgreen', 'steelblue', 'black', 'crimson']

# one marker per row of toronto_merged, coloured by its cluster label
markers_colors = []
cluster_rows = zip(
    toronto_merged['Latitude'],
    toronto_merged['Longitude'],
    toronto_merged['Neighborhood'],
    toronto_merged['Cluster Labels'],
)
for lat, lon, poi, cluster in cluster_rows:
    cluster_id = int(cluster)
    label = folium.Popup('{} : Cluster {}'.format(poi, cluster_id), parse_html=True)
    icon = folium.Icon(color=colors[cluster_id], prefix='fa', icon='circle')
    folium.Marker([lat, lon], popup=label, icon=icon).add_to(map_clusters)

# static HTML legend drawn on top of the map
legend_html = '''
     <div style = "position: fixed; top: 50px; left: 50px; width: 110px; height: 260px; border:2px solid grey; z-index:9999; font-size:14px; ">
     &nbsp; Legend: <br>
     &nbsp; Cluster   0 &nbsp; <i class="fa fa-map-marker fa-2x"
                  style="color:skyblue"></i><br>
     &nbsp; Cluster   1 &nbsp; <i class="fa fa-map-marker fa-2x"
                  style="color:navajowhite"></i><br>
     &nbsp; Cluster   2 &nbsp; <i class="fa fa-map-marker fa-2x"
                  style="color:orange"></i><br>
     &nbsp; Cluster   3 &nbsp; <i class="fa fa-map-marker fa-2x"
                  style="color:mediumorchid"></i><br>
     &nbsp; Cluster   4 &nbsp; <i class="fa fa-map-marker fa-2x"
                  style="color:lightgreen"></i><br>
     &nbsp; Cluster   5 &nbsp; <i class="fa fa-map-marker fa-2x"
                  style="color:steelblue"></i><br>
     &nbsp; Cluster   6 &nbsp; <i class="fa fa-map-marker fa-2x"
                  style="color:black"></i><br>
     &nbsp; Cluster -1 &nbsp; <i class="fa fa-map-marker fa-2x"
                  style="color:crimson"></i>
      </div>
     '''
legend = folium.Element(legend_html)
map_clusters.get_root().html.add_child(legend)

map_clusters

In [42]:
# Cluster 0: keep Borough (column 1) plus every column from 'Cluster Labels' (column 5) onward.
toronto_merged[toronto_merged['Cluster Labels'] == 0].iloc[
    :, [1] + list(range(5, toronto_merged.shape[1]))
]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Scarborough,0.0,Indian Restaurant,Bakery,Coffee Shop,Fried Chicken Joint,Athletics & Sports,Flower Shop,Bus Line,Music Store,Gas Station,Caribbean Restaurant
21,North York,0.0,Coffee Shop,Korean Restaurant,Café,Middle Eastern Restaurant,Hardware Store,Shopping Mall,Sandwich Place,Grocery Store,Fast Food Restaurant,Pizza Place
22,North York,0.0,Coffee Shop,Japanese Restaurant,Ramen Restaurant,Sandwich Place,Fast Food Restaurant,Korean Restaurant,Pizza Place,Sushi Restaurant,Café,Grocery Store
23,North York,0.0,Coffee Shop,Restaurant,Park,Intersection,Sandwich Place,Golf Course,Bank,Gym,Gym / Fitness Center,Dentist's Office
26,North York,0.0,Coffee Shop,Pool,Auto Garage,Japanese Restaurant,Restaurant,Bagel Shop,Burger Joint,Dim Sum Restaurant,Diner,Discount Store
27,North York,0.0,Gym,Japanese Restaurant,Asian Restaurant,Coffee Shop,Beer Store,Concert Hall,Bike Shop,Clothing Store,Supermarket,Restaurant
30,North York,0.0,Turkish Restaurant,Coffee Shop,Athletics & Sports,Food Court,Basketball Court,Latin American Restaurant,Go Kart Track,Climbing Gym,Italian Restaurant,Chinese Restaurant
34,North York,0.0,Portuguese Restaurant,Thrift / Vintage Store,Pet Store,Coffee Shop,Bus Line,French Restaurant,Mediterranean Restaurant,Thai Restaurant,Middle Eastern Restaurant,Wings Joint
36,East York,0.0,Pizza Place,Coffee Shop,Park,Ice Cream Shop,Grocery Store,Thai Restaurant,Sushi Restaurant,Sandwich Place,Bus Line,Pet Store
37,East Toronto,0.0,Pub,Coffee Shop,Bar,Bakery,Park,Breakfast Spot,Caribbean Restaurant,Beach,Tea Room,Bookstore


In [43]:
# Cluster 1: keep Borough (column 1) plus every column from 'Cluster Labels' (column 5) onward.
toronto_merged[toronto_merged['Cluster Labels'] == 1].iloc[
    :, [1] + list(range(5, toronto_merged.shape[1]))
]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,1.0,Breakfast Spot,Park,Burger Joint,Italian Restaurant,Fast Food Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant


In [44]:
# Cluster 2: keep Borough (column 1) plus every column from 'Cluster Labels' (column 5) onward.
toronto_merged[toronto_merged['Cluster Labels'] == 2].iloc[
    :, [1] + list(range(5, toronto_merged.shape[1]))
]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Scarborough,2.0,Convenience Store,Grocery Store,Discount Store,Pharmacy,Supermarket,Gym / Fitness Center,Athletics & Sports,Gymnastics Gym,Restaurant,Fast Food Restaurant
3,Scarborough,2.0,Pizza Place,Indian Restaurant,Park,Bakery,Department Store,Sandwich Place,Chinese Restaurant,Discount Store,Supplement Shop,Supermarket
5,Scarborough,2.0,Fast Food Restaurant,Sandwich Place,Train Station,Coffee Shop,Restaurant,Chinese Restaurant,Big Box Store,Convenience Store,Indian Restaurant,Falafel Restaurant
6,Scarborough,2.0,Chinese Restaurant,Discount Store,Coffee Shop,Convenience Store,Bank,Department Store,Sandwich Place,Light Rail Station,Grocery Store,Pharmacy
7,Scarborough,2.0,Intersection,Bakery,Park,Fast Food Restaurant,Bus Line,Bus Station,Pharmacy,Soccer Field,Beer Store,Coffee Shop
8,Scarborough,2.0,Fast Food Restaurant,Pharmacy,Gas Station,Coffee Shop,Park,Bistro,Hardware Store,Flower Shop,Sandwich Place,Liquor Store
11,Scarborough,2.0,Middle Eastern Restaurant,Pizza Place,Grocery Store,Intersection,Burger Joint,Pharmacy,Korean Restaurant,Rental Car Location,Bakery,Bar
12,Scarborough,2.0,Chinese Restaurant,Shopping Mall,Hong Kong Restaurant,Coffee Shop,Supermarket,Bank,Japanese Restaurant,Noodle House,Restaurant,Sushi Restaurant
13,Scarborough,2.0,Pharmacy,Fast Food Restaurant,Chinese Restaurant,Pizza Place,Coffee Shop,Vietnamese Restaurant,Market,Caribbean Restaurant,Golf Course,Thai Restaurant
14,Scarborough,2.0,Chinese Restaurant,BBQ Joint,Fast Food Restaurant,Park,Pharmacy,Bakery,Hobby Shop,Shopping Mall,Caribbean Restaurant,Discount Store


In [45]:
# Cluster 3: keep Borough (column 1) plus every column from 'Cluster Labels' (column 5) onward.
toronto_merged[toronto_merged['Cluster Labels'] == 3].iloc[
    :, [1] + list(range(5, toronto_merged.shape[1]))
]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,Scarborough,3.0,Park,College Stadium,Café,Skating Rink,General Entertainment,Gym,Gym Pool,Hotel,Dance Studio,Eastern European Restaurant
19,North York,3.0,Park,Japanese Restaurant,Chinese Restaurant,Trail,Café,Bank,Fish Market,Fish & Chips Shop,Dumpling Restaurant,Eastern European Restaurant
20,North York,3.0,Park,Pool,Gym,Gym / Fitness Center,Intersection,Farm,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
32,North York,3.0,Park,Moving Target,Home Service,Baseball Field,Business Service,Flea Market,Flower Shop,Food & Drink Shop,Eastern European Restaurant,Food
50,Downtown Toronto,3.0,Park,Neighborhood (category),Grocery Store,Playground,Candy Store,Trail,Athletics & Sports,Beer Store,Zoo Exhibit,Farm


In [46]:
# Cluster 4: keep Borough (column 1) plus every column from 'Cluster Labels' (column 5) onward.
toronto_merged[toronto_merged['Cluster Labels'] == 4].iloc[
    :, [1] + list(range(5, toronto_merged.shape[1]))
]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,4.0,Zoo Exhibit,Hobby Shop,Other Great Outdoors,Trail,Fast Food Restaurant,Flea Market,Fish Market,Dumpling Restaurant,Eastern European Restaurant,Electronics Store


In [47]:
# Cluster 5: keep Borough (column 1) plus every column from 'Cluster Labels' (column 5) onward.
toronto_merged[toronto_merged['Cluster Labels'] == 5].iloc[
    :, [1] + list(range(5, toronto_merged.shape[1]))
]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
59,Downtown Toronto,5.0,Harbor / Marina,Beach,Café,Park,Boat or Ferry,Disc Golf,Pier,Zoo Exhibit,Farmers Market,Ethiopian Restaurant


In [48]:
# Cluster 6: keep Borough (column 1) plus every column from 'Cluster Labels' (column 5) onward.
toronto_merged[toronto_merged['Cluster Labels'] == 6].iloc[
    :, [1] + list(range(5, toronto_merged.shape[1]))
]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Scarborough,6.0,Furniture / Home Store,Chinese Restaurant,Pizza Place,Fast Food Restaurant,Coffee Shop,Indian Restaurant,Bakery,Bank,Electronics Store,Park
29,North York,6.0,Coffee Shop,Furniture / Home Store,Pizza Place,Bank,Restaurant,Bar,Caribbean Restaurant,Discount Store,Grocery Store,Park
80,York,6.0,Coffee Shop,Furniture / Home Store,Grocery Store,Discount Store,Sandwich Place,Pizza Place,Bus Line,Fast Food Restaurant,Farm,Eastern European Restaurant


In [49]:
# Cluster -1 (the label set manually for 'Upper Rouge'): keep Borough (column 1)
# plus every column from 'Cluster Labels' (column 5) onward.
toronto_merged[toronto_merged['Cluster Labels'] == -1].iloc[
    :, [1] + list(range(5, toronto_merged.shape[1]))
]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,Scarborough,-1.0,,,,,,,,,,


Final Result: 

  Cluster 0   Coffee shops and various restaurants.
  
  Cluster 1   Breakfast spot and fast food but, unlike Cluster 2, no discount store, supermarket, or grocery store.
  
  Cluster 2   Fast food, Discount store, Supermarket, Grocery.
  
  Cluster 3   Parks and various sports venues.
  
  Cluster 4   Zoo Exhibit and Other Great Outdoors.
  
  Cluster 5   Harbor/Marina.
  
  Cluster 6   Furniture/Home Store.
  
  Cluster -1   No venues at all.