# Toronto Neighbourhoods and Postal Codes

In [2]:
import pandas as pd
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page

In [3]:
#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

This step define a function to geolocate and address or postal code

In [4]:
import geocoder 

In [5]:
def geolocate(address):
    lat_lng_coords = None
    while(lat_lng_coords is None):
      g = geocoder.arcgis(address)
      lat_lng_coords = g.latlng
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude,longitude

URL where Toronto's Postal Codes are taken 

In [6]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [7]:
data  = requests.get(url).text 

In [8]:
soup = BeautifulSoup(data,"html5lib")  # create a soup object using the variable 'data'

Using the read_html method to extract the tables from the URL

In [9]:
dataframe_list = pd.read_html(url, flavor='bs4')

## Toronto's Postal Code table

In [10]:
postal_codes = dataframe_list[0]
postal_codes

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M1ANot assigned,M2ANot assigned,M3ANorth York(Parkwoods),M4ANorth York(Victoria Village),M5ADowntown Toronto(Regent Park / Harbourfront),M6ANorth York(Lawrence Manor / Lawrence Heights),M7AQueen's Park(Ontario Provincial Government),M8ANot assigned,M9AEtobicoke(Islington Avenue)
1,M1BScarborough(Malvern / Rouge),M2BNot assigned,M3BNorth York(Don Mills)North,M4BEast York(Parkview Hill / Woodbine Gardens),"M5BDowntown Toronto(Garden District, Ryerson)",M6BNorth York(Glencairn),M7BNot assigned,M8BNot assigned,M9BEtobicoke(West Deane Park / Princess Garden...
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M2CNot assigned,M3CNorth York(Don Mills)South(Flemingdon Park),M4CEast York(Woodbine Heights),M5CDowntown Toronto(St. James Town),M6CYork(Humewood-Cedarvale),M7CNot assigned,M8CNot assigned,M9CEtobicoke(Eringate / Bloordale Gardens / Ol...
3,M1EScarborough(Guildwood / Morningside / West ...,M2ENot assigned,M3ENot assigned,M4EEast Toronto(The Beaches),M5EDowntown Toronto(Berczy Park),M6EYork(Caledonia-Fairbanks),M7ENot assigned,M8ENot assigned,M9ENot assigned
4,M1GScarborough(Woburn),M2GNot assigned,M3GNot assigned,M4GEast York(Leaside),M5GDowntown Toronto(Central Bay Street),M6GDowntown Toronto(Christie),M7GNot assigned,M8GNot assigned,M9GNot assigned
5,M1HScarborough(Cedarbrae),M2HNorth York(Hillcrest Village),M3HNorth York(Bathurst Manor / Wilson Heights ...,M4HEast York(Thorncliffe Park),M5HDowntown Toronto(Richmond / Adelaide / King),M6HWest Toronto(Dufferin / Dovercourt Village),M7HNot assigned,M8HNot assigned,M9HNot assigned
6,M1JScarborough(Scarborough Village),M2JNorth York(Fairview / Henry Farm / Oriole),M3JNorth York(Northwood Park / York University),M4JEast YorkEast Toronto(The Danforth East),M5JDowntown Toronto(Harbourfront East / Union ...,M6JWest Toronto(Little Portugal / Trinity),M7JNot assigned,M8JNot assigned,M9JNot assigned
7,M1KScarborough(Kennedy Park / Ionview / East B...,M2KNorth York(Bayview Village),M3KNorth York(Downsview)East (CFB Toronto),M4KEast Toronto(The Danforth West / Riverdale),M5KDowntown Toronto(Toronto Dominion Centre / ...,M6KWest Toronto(Brockton / Parkdale Village / ...,M7KNot assigned,M8KNot assigned,M9KNot assigned
8,M1LScarborough(Golden Mile / Clairlea / Oakridge),M2LNorth York(York Mills / Silver Hills),M3LNorth York(Downsview)West,M4LEast Toronto(India Bazaar / The Beaches West),M5LDowntown Toronto(Commerce Court / Victoria ...,M6LNorth York(North Park / Maple Leaf Park / U...,M7LNot assigned,M8LNot assigned,M9LNorth York(Humber Summit)
9,M1MScarborough(Cliffside / Cliffcrest / Scarbo...,M2MNorth York(Willowdale / Newtonbrook),M3MNorth York(Downsview)Central,M4MEast Toronto(Studio District),M5MNorth York(Bedford Park / Lawrence Manor East),M6MYork(Del Ray / Mount Dennis / Keelsdale and...,M7MNot assigned,M8MNot assigned,M9MNorth York(Humberlea / Emery)


Exploring the data of Postal Code to select Boroughs and Neighbourhoods

In [11]:
postal_codes.shape

(20, 9)

In [12]:
postal_codes.iloc[0,0][3:17] # Exploring how to identify the Postal Code assigment

'Not assigned'

In [13]:
Toronto = pd.DataFrame(columns = ['postal_code', 'borough', 'neighbourhoods']) # Create Dataframe for data
Toronto

Unnamed: 0,postal_code,borough,neighbourhoods


The following code , extract borough and neighbourhood info from the postal code table - includes to change separator of neighbourhoods with ","

In [14]:
x = 0
borough=''
neighbour=''
complete = True
for index, row in postal_codes.iterrows():
    for region in list(row):
        Toronto.at[x,'postal_code'] = region[0:3]
        for char in region[3:]:
            if (char != '(') & complete:
                borough = borough + char
            else:
                complete = False
                if char != ')':
                    if char != '(':
                        neighbour = neighbour+char
        Toronto.at[x,'borough'] = borough
        Toronto.at[x,'neighbourhoods'] = neighbour.replace('/',',')
        neighbour=''
        borough=''
        complete=True
        x=x+1        

## Segmenting and cleanning the data to identify values to be corrected or eliminated

In [15]:
Toronto.borough.value_counts()

Not assigned                                                    77
North York                                                      24
Scarborough                                                     17
Downtown Toronto                                                17
Etobicoke                                                       11
Central Toronto                                                  9
West Toronto                                                     6
York                                                             5
East Toronto                                                     4
East York                                                        4
East YorkEast Toronto                                            1
East TorontoBusiness reply mail Processing Centre969 Eastern     1
MississaugaCanada Post Gateway Processing Centre                 1
EtobicokeNorthwest                                               1
Queen's Park                                                  

In [16]:
Toronto.postal_code.value_counts()

M5S    1
M3R    1
M6B    1
M4T    1
M4J    1
      ..
M7B    1
M5A    1
M3N    1
M5J    1
M1Z    1
Name: postal_code, Length: 180, dtype: int64

Remove from dataframe the values with 'Not assigened'

In [17]:
Toronto = Toronto[Toronto.borough != 'Not assigned']

In [18]:
Toronto = Toronto.reset_index(drop=True)

In [19]:
Toronto

Unnamed: 0,postal_code,borough,neighbourhoods
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


These are the boroughs that have an invalid name, and they are corrected

In [20]:
Toronto['borough'].replace({'EtobicokeNorthwest': 'Etobicoke Northwest','East YorkEast Toronto':'East York','MississaugaCanada Post Gateway Processing Centre': 'Mississauga','East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto','Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto'}, inplace=True)

In [21]:
Toronto.borough.value_counts()

North York             24
Downtown Toronto       18
Scarborough            17
Etobicoke              11
Central Toronto         9
West Toronto            6
York                    5
East Toronto            5
East York               5
Queen's Park            1
Mississauga             1
Etobicoke Northwest     1
Name: borough, dtype: int64

In [22]:
Toronto

Unnamed: 0,postal_code,borough,neighbourhoods
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [23]:
Toronto.shape

(103, 3)

## Geolocate the Postal codes , getting the latitude and Longitude

In [24]:
addresses = list(Toronto['postal_code'])
lat=[]
lon=[]
for code in addresses:
    location = geolocate(code +', Toronto , Ontario')
    lat.append(location[0])
    lon.append(location[1])
Toronto['latitude']  = lat
Toronto['longitude'] = lon

In [25]:
Toronto.sort_values('longitude').head(30)

Unnamed: 0,postal_code,borough,neighbourhoods,latitude,longitude
89,M9V,Etobicoke,"South Steeles , Silverstone , Humbergate , Jam...",43.74453,-79.58624
94,M9W,Etobicoke Northwest,"Clairville , Humberwood , Woodbine Downs , Wes...",43.71174,-79.57941
17,M9C,Etobicoke,"Eringate , Bloordale Gardens , Old Burnhamthor...",43.64857,-79.57825
77,M9R,Etobicoke,"Kingsview Village , St. Phillips , Martin Grov...",43.68681,-79.55728
50,M9L,North York,Humber Summit,43.75948,-79.55707
11,M9B,Etobicoke,"West Deane Park , Princess Gardens , Martin Gr...",43.65034,-79.55362
93,M8W,Etobicoke,"Alderwood , Long Branch",43.60124,-79.53879
57,M9M,North York,"Humberlea , Emery",43.73367,-79.53769
70,M9P,Etobicoke,Westmount,43.6963,-79.52926
5,M9A,Etobicoke,Islington Avenue,43.66263,-79.52831


In [26]:
Toronto.borough.value_counts()

North York             24
Downtown Toronto       18
Scarborough            17
Etobicoke              11
Central Toronto         9
West Toronto            6
York                    5
East Toronto            5
East York               5
Queen's Park            1
Mississauga             1
Etobicoke Northwest     1
Name: borough, dtype: int64

## MAP of Toronto with Boroughs/Postal Codes

In [27]:
city = 'Toronto, Ontario'

coords = geolocate(city)
latitude_tor = coords[0]
longitude_tor = coords[1]

In [28]:
print('The geograpical coordinate of {} are {}, {}.'.format(city, latitude_tor, longitude_tor))

The geograpical coordinate of Toronto, Ontario are 43.648690000000045, -79.38543999999996.


In [29]:
map_toronto = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=10)

In [30]:
for lat, lng, borough, neighborhood in zip(Toronto['latitude'], Toronto['longitude'], Toronto['borough'], Toronto['neighbourhoods']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

In [31]:
map_toronto

In [32]:
Toronto

Unnamed: 0,postal_code,borough,neighbourhoods,latitude,longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.65319,-79.51113
99,M4Y,Downtown Toronto,Church and Wellesley,43.66659,-79.38133
100,M7Y,East Toronto,Enclave of M4L,43.64869,-79.38544
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.63278,-79.48945


In [33]:
Toronto.shape

(103, 5)

## Exploring Toronto Data for Segmentation

In [34]:
import json
import numpy as np

In [35]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [36]:
CLIENT_ID = 'IMYFYARYBRCQRG2ZPE0TPPZYEFNMVEABKME2UO2MAGRA03YK' # your Foursquare ID
CLIENT_SECRET = 'UGI4RZNBBAHAEWPE031E0YJ1QR4HISS5EMAB35YQXAALQQBK' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: IMYFYARYBRCQRG2ZPE0TPPZYEFNMVEABKME2UO2MAGRA03YK
CLIENT_SECRET:UGI4RZNBBAHAEWPE031E0YJ1QR4HISS5EMAB35YQXAALQQBK


In [37]:
Toronto.loc[0, 'neighbourhoods'] +' ' + Toronto.loc[0, 'postal_code']

'Parkwoods M3A'

In [38]:
postal_code_latitude = Toronto.loc[0, 'latitude'] # neighborhood latitude value
postal_code_longitude = Toronto.loc[0, 'longitude'] # neighborhood longitude value
postal_code_name = Toronto.loc[0, 'postal_code'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(postal_code_name, 
                                                               postal_code_latitude, 
                                                               postal_code_longitude))

Latitude and longitude values of M3A are 43.75245000000007, -79.32990999999998.


In [39]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    postal_code_latitude, 
    postal_code_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=IMYFYARYBRCQRG2ZPE0TPPZYEFNMVEABKME2UO2MAGRA03YK&client_secret=UGI4RZNBBAHAEWPE031E0YJ1QR4HISS5EMAB35YQXAALQQBK&v=20180605&ll=43.75245000000007,-79.32990999999998&radius=1000&limit=100'

In [40]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '60b14bcf844d0e45a549e8fb'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 23,
  'suggestedBounds': {'ne': {'lat': 43.76145000900008,
    'lng': -79.31747364773157},
   'sw': {'lat': 43.743449991000055, 'lng': -79.3423463522684}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
         

In [41]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [42]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Allwyn's Bakery,Caribbean Restaurant,43.75984,-79.324719
2,Tim Hortons,Café,43.760668,-79.326368
3,Bruno's valu-mart,Grocery Store,43.746143,-79.32463
4,A&W,Fast Food Restaurant,43.760643,-79.326865


In [43]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Postal Code Latitude', 
                  'Postal Code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [44]:
toronto_venues = getNearbyVenues(names=Toronto['postal_code'],
                                   latitudes=Toronto['latitude'],
                                   longitudes=Toronto['longitude']
                                  )

M3A
M4A
M5A
M6A
M7A
M9A
M1B
M3B
M4B
M5B
M6B
M9B
M1C
M3C
M4C
M5C
M6C
M9C
M1E
M4E
M5E
M6E
M1G
M4G
M5G
M6G
M1H
M2H
M3H
M4H
M5H
M6H
M1J
M2J
M3J
M4J
M5J
M6J
M1K
M2K
M3K
M4K
M5K
M6K
M1L
M2L
M3L
M4L
M5L
M6L
M9L
M1M
M2M
M3M
M4M
M5M
M6M
M9M
M1N
M2N
M3N
M4N
M5N
M6N
M9N
M1P
M2P
M4P
M5P
M6P
M9P
M1R
M2R
M4R
M5R
M6R
M7R
M9R
M1S
M4S
M5S
M6S
M1T
M4T
M5T
M1V
M4V
M5V
M8V
M9V
M1W
M4W
M5W
M8W
M9W
M1X
M4X
M5X
M8X
M4Y
M7Y
M8Y
M8Z


In [45]:
print(toronto_venues.shape)
toronto_venues.head()

(2371, 7)


Unnamed: 0,Postal Code,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.75245,-79.32991,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.75245,-79.32991,KFC,43.754387,-79.333021,Fast Food Restaurant
2,M3A,43.75245,-79.32991,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M4A,43.73057,-79.31306,Wigmore Park,43.731023,-79.310771,Park
4,M4A,43.73057,-79.31306,Memories of Africa,43.726602,-79.312427,Grocery Store


In [46]:
toronto_venues['Postal Code'].value_counts()

M5K    100
M5H    100
M5X    100
M7R    100
M5B    100
      ... 
M2L      1
M3M      1
M5P      1
M1H      1
M1R      1
Name: Postal Code, Length: 101, dtype: int64

In [47]:
toronto_venues.groupby('Postal Code').count()

Unnamed: 0_level_0,Postal Code Latitude,Postal Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,2,2,2,2,2,2
M1C,2,2,2,2,2,2
M1E,3,3,3,3,3,3
M1G,4,4,4,4,4,4
M1H,1,1,1,1,1,1
...,...,...,...,...,...,...
M9N,7,7,7,7,7,7
M9P,5,5,5,5,5,5
M9R,2,2,2,2,2,2
M9V,15,15,15,15,15,15


In [48]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 267 uniques categories.


In [49]:
toronto_venues['Postal Code'] 

0       M3A
1       M3A
2       M3A
3       M4A
4       M4A
       ... 
2366    M8Z
2367    M8Z
2368    M8Z
2369    M8Z
2370    M8Z
Name: Postal Code, Length: 2371, dtype: object

In [50]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Postal Code'] = toronto_venues['Postal Code'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Postal Code,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,...,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()
toronto_grouped

Unnamed: 0,Postal Code,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,American Restaurant,Antique Shop,Aquarium,Art Gallery,Art Museum,...,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,M9N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
97,M9P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
98,M9R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
99,M9V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
toronto_grouped.shape

(101, 268)

In [53]:
num_top_venues = 5

for hood in toronto_grouped['Postal Code']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Postal Code'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M1B----
                    venue  freq
0    Fast Food Restaurant   0.5
1  Furniture / Home Store   0.5
2                     ATM   0.0
3           Moving Target   0.0
4               Nightclub   0.0


----M1C----
                     venue  freq
0            Moving Target   0.5
1                      Bar   0.5
2                      ATM   0.0
3                Nightclub   0.0
4  New American Restaurant   0.0


----M1E----
                        venue  freq
0  Construction & Landscaping  0.33
1                        Park  0.33
2        Gym / Fitness Center  0.33
3                      Museum  0.00
4                Noodle House  0.00


----M1G----
                   venue  freq
0            Coffee Shop  0.25
1  Korean BBQ Restaurant  0.25
2                   Park  0.25
3       Business Service  0.25
4            Opera House  0.00


----M1H----
                     venue  freq
0                    Trail   1.0
1                      ATM   0.0
2            Moving Target   0.0
3       

                     venue  freq
0              Swim School   0.5
1                 Bus Line   0.5
2                      ATM   0.0
3                Nightclub   0.0
4  New American Restaurant   0.0


----M4P----
               venue  freq
0              Hotel  0.22
1   Department Store  0.11
2  Food & Drink Shop  0.11
3     Breakfast Spot  0.11
4            Dog Run  0.11


----M4R----
           venue  freq
0     Playground  0.33
1           Park  0.33
2       Gym Pool  0.33
3  Moving Target  0.00
4      Nightclub  0.00


----M4S----
                venue  freq
0        Dessert Shop  0.12
1         Pizza Place  0.08
2      Sandwich Place  0.08
3         Coffee Shop  0.08
4  Italian Restaurant  0.08


----M4T----
                     venue  freq
0                      Gym   0.5
1                   Lawyer   0.5
2                      ATM   0.0
3                Nightclub   0.0
4  New American Restaurant   0.0


----M4V----
                venue  freq
0         Coffee Shop  0.25
1  Light R

In [54]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [55]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postalcode_venues_sorted = pd.DataFrame(columns=columns)
postalcode_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

for ind in np.arange(toronto_grouped.shape[0]):
    postalcode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

postalcode_venues_sorted.head(20)

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Furniture / Home Store,ATM,Moving Target,Nightclub,New American Restaurant,Neighborhood,Music Venue,Museum,Movie Theater
1,M1C,Moving Target,Bar,ATM,Nightclub,New American Restaurant,Neighborhood,Music Venue,Museum,Movie Theater,Metro Station
2,M1E,Construction & Landscaping,Park,Gym / Fitness Center,Museum,Noodle House,Nightclub,New American Restaurant,Neighborhood,Music Venue,Moving Target
3,M1G,Coffee Shop,Korean BBQ Restaurant,Park,Business Service,Opera House,Noodle House,Nightclub,New American Restaurant,Neighborhood,ATM
4,M1H,Trail,ATM,Moving Target,Nightclub,New American Restaurant,Neighborhood,Music Venue,Museum,Movie Theater,Office
5,M1J,Spa,Restaurant,Indian Restaurant,Park,Grocery Store,ATM,Moving Target,New American Restaurant,Neighborhood,Music Venue
6,M1K,Convenience Store,Hobby Shop,Discount Store,Coffee Shop,Department Store,Monument / Landmark,Moroccan Restaurant,Movie Theater,Moving Target,Opera House
7,M1L,Bakery,Bus Line,Coffee Shop,Intersection,Bus Station,Soccer Field,Metro Station,Moving Target,Movie Theater,Opera House
8,M1M,Ice Cream Shop,Coffee Shop,Pizza Place,Sandwich Place,Liquor Store,Pharmacy,Discount Store,Restaurant,Music Venue,Nightclub
9,M1N,College Stadium,Café,Skating Rink,General Entertainment,ATM,Music Venue,Noodle House,Nightclub,New American Restaurant,Neighborhood


## Clustering Toronto Postal Codes

In [56]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


In [57]:
# set number of clusters
kclusters = 6

toronto_grouped_clustering = toronto_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

array([4, 0, 2, 0, 3, 2, 0, 0, 0, 0, 0, 5, 0, 4, 2, 4, 1, 0, 3, 1])

In [58]:
# add clustering labels
postalcode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

postalcode_venues_sorted['Cluster Labels'] = postalcode_venues_sorted['Cluster Labels'].astype(int)

toronto_merged = Toronto

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(postalcode_venues_sorted.set_index('Postal Code'), on='postal_code')

toronto_merged.head() # check the last columns!

Unnamed: 0,postal_code,borough,neighbourhoods,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.75245,-79.32991,4.0,Food & Drink Shop,Fast Food Restaurant,Park,ATM,Moving Target,New American Restaurant,Neighborhood,Music Venue,Museum,Moroccan Restaurant
1,M4A,North York,Victoria Village,43.73057,-79.31306,2.0,German Restaurant,Park,Grocery Store,ATM,Office,Nightclub,New American Restaurant,Neighborhood,Music Venue,Museum
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65512,-79.36264,0.0,Coffee Shop,Breakfast Spot,Restaurant,Thrift / Vintage Store,Distribution Center,Pub,Electronics Store,Event Space,Spa,Food Truck
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.72327,-79.45042,0.0,Clothing Store,Cosmetics Shop,Food Court,Women's Store,Men's Store,Bookstore,Restaurant,Furniture / Home Store,Toy / Game Store,Metro Station
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188,0.0,Coffee Shop,Café,Mediterranean Restaurant,Bank,Gastropub,Theater,Falafel Restaurant,Park,Fried Chicken Joint,Sandwich Place


In [217]:
postalcode_venues_sorted.loc[26,'Cluster Labels']

4

In [1]:
# create map
map_clusters = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

toronto_merged = toronto_merged[~toronto_merged['Cluster Labels'].isna()]  # Eliminates the postal codes those do not identify any venues


# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['latitude'], toronto_merged['longitude'], toronto_merged['postal_code'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    #print(cluster, poi)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

NameError: name 'folium' is not defined