# Segmenting and Clustering Neighborhoods in Toronto

### Install and access required libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import json # library to handle JSON files

import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors # Matplotlib and associated plotting modules

from sklearn.cluster import KMeans # import k-means from clustering stage

In [2]:
from bs4 import BeautifulSoup # website scraping libraries and packages in Python from BeautifulSoup 

In [3]:
from geopy.geocoders import Nominatim

In [5]:
import folium # map rendering library

### Question 1

#### Download Dataset

From the Wikipage we need Canada postal code dataset. We will essentially need a dataset that contains the 5 boroughs and the neighborhoods that exist in each borough as well as the the latitude and longitude coordinates of each neighborhood.

Luckily, this dataset exists for free on the web. Feel free to try to find this dataset on your own, but here is the link to the dataset: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

#### Get the html from the wikipedia page with the postal codes of Canada

In [6]:
# GET request
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

#### Use the BeautifulSoup package or any other way you are comfortable with to transform the data in the table on the Wikipedia page into the above pandas dataframe

In [7]:
soup = BeautifulSoup(data, 'html.parser')

#### 3 List in Array

In [8]:
postalCodeList = []
boroughList = []
neighborhoodList = []

#### Locate the table and postal code

In [9]:
soup.find('table').find_all('tr')

# find all the rows of the table
soup.find('table').find_all('tr')

# for each row of the table, find all the table data
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')

In [10]:
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text.rstrip('\n'))
        boroughList.append(cells[1].text.rstrip('\n'))
        neighborhoodList.append(cells[2].text.rstrip('\n'))

#### Load the pandas dataframe

In [11]:
toronto_df = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighborhood": neighborhoodList})

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Remove no-assigned columns

In [12]:
toronto_df_drop = toronto_df[toronto_df.Borough != "Not assigned"].reset_index(drop=True)
toronto_df_drop.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Group neighbors in same borough

In [13]:
toronto_df_grouped = toronto_df_drop.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Neighborhood WHICH IS "Not assigned", make the value the same as Borough

In [14]:
for index, row in toronto_df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
        
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Question 1 requirement table view

In [15]:
column_names = ["PostalCode", "Borough", "Neighborhood"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(toronto_df_grouped[toronto_df_grouped["PostalCode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Wexford, Maryvale"
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


In [16]:
toronto_df_grouped.shape

(103, 3)

### Question 2

#### Read the csv file via panda

In [17]:
coordinates = pd.read_csv('https://cocl.us/Geospatial_data')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
coordinates.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge the data

In [19]:
toronto_df_new = toronto_df_grouped.merge(coordinates, on="PostalCode", how="left")
toronto_df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Check to make sure the coordinates are added as required by question 2

In [20]:
column_names = ["PostalCode", "Borough", "Neighborhood", "Latitude", "Longitude"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(toronto_df_new[toronto_df_new["PostalCode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


### Question 3

#### Get Latitude and Longitude

In [22]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="Toronto")
location = geolocator.geocode(address)
latitude_toronto = location.latitude
longitude_toronto = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude_toronto, longitude_toronto))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Create a map and markers

In [23]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['Borough'], toronto_df_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare Credentials and Version

In [25]:
CLIENT_ID = 'PLGWPPLWG4FVVMRXTIHS1TC1AJEIBIT021TI1DXSHDYU2CUU' # your Foursquare ID
CLIENT_SECRET = '4OY22IDKNMSG1EQJOZXMBIZPFG1OLTOVRMED5HNGFFODHRUO'  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PLGWPPLWG4FVVMRXTIHS1TC1AJEIBIT021TI1DXSHDYU2CUU
CLIENT_SECRET:4OY22IDKNMSG1EQJOZXMBIZPFG1OLTOVRMED5HNGFFODHRUO


In [26]:
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['PostalCode'], toronto_df_new['Borough'], 
                                                  toronto_df_new['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id=PLGWPPLWG4FVVMRXTIHS1TC1AJEIBIT021TI1DXSHDYU2CUU&client_secret=4OY22IDKNMSG1EQJOZXMBIZPFG1OLTOVRMED5HNGFFODHRUO&v=20180605 \
     &ll=43.6534817,-79.3839347&radius=500&limit=100".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))


#### Convert venues list to new dataframe

In [27]:
venues_df = pd.DataFrame(venues)


venues_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(7931, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,Downtown Toronto,43.653232,-79.385296,Neighborhood
1,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,Nathan Phillips Square,43.65227,-79.383516,Plaza
2,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,Chatime 日出茶太,43.655542,-79.384684,Bubble Tea Shop
3,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,Textile Museum of Canada,43.654396,-79.3865,Art Museum
4,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,Indigo,43.653515,-79.380696,Bookstore


#### Check how many venues are returned

In [28]:
venues_df.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
PostalCode,Borough,Neighborhood,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M1B,Scarborough,"Malvern, Rouge",77,77,77,77,77,77
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",77,77,77,77,77,77
M1E,Scarborough,"Guildwood, Morningside, West Hill",77,77,77,77,77,77
M1G,Scarborough,Woburn,77,77,77,77,77,77
M1H,Scarborough,Cedarbrae,77,77,77,77,77,77
M1J,Scarborough,Scarborough Village,77,77,77,77,77,77
M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",77,77,77,77,77,77
M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",77,77,77,77,77,77
M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",77,77,77,77,77,77
M1N,Scarborough,"Birch Cliff, Cliffside West",77,77,77,77,77,77


#### Analysing each area

In [29]:
# one hot encoding
toronto_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
toronto_onehot['PostalCode'] = venues_df['PostalCode'] 
toronto_onehot['Borough'] = venues_df['Borough'] 
toronto_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(toronto_onehot.columns[-3:]) + list(toronto_onehot.columns[:-3])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot

(7931, 60)


Unnamed: 0,PostalCode,Borough,Neighborhoods,American Restaurant,Art Museum,Bank,Bar,Bookstore,Breakfast Spot,Bubble Tea Shop,...,Steakhouse,Sushi Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Women's Store
0,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,M1B,Scarborough,"Malvern, Rouge",0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,M1B,Scarborough,"Malvern, Rouge",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [30]:
toronto_grouped = toronto_onehot.groupby(["PostalCode", "Borough", "Neighborhoods"]).mean().reset_index()

print(toronto_grouped.shape)
toronto_grouped

(103, 60)


Unnamed: 0,PostalCode,Borough,Neighborhoods,American Restaurant,Art Museum,Bank,Bar,Bookstore,Breakfast Spot,Bubble Tea Shop,...,Steakhouse,Sushi Restaurant,Tanning Salon,Tea Room,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Women's Store
0,M1B,Scarborough,"Malvern, Rouge",0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
3,M1G,Scarborough,Woburn,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
4,M1H,Scarborough,Cedarbrae,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
5,M1J,Scarborough,Scarborough Village,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987
9,M1N,Scarborough,"Birch Cliff, Cliffside West",0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,0.012987,...,0.025974,0.012987,0.012987,0.012987,0.025974,0.025974,0.012987,0.012987,0.012987,0.012987


#### Create a new dataframe and display the top 10 venues for each PostalCode

In [31]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_grouped['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_grouped['Neighborhoods']

for ind in np.arange(toronto_grouped.shape[0]):
    row_categories = toronto_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

# neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)
print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

(103, 13)


Unnamed: 0,PostalCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
3,M1G,Scarborough,Woburn,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
4,M1H,Scarborough,Cedarbrae,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant


#### Making Clusters for Neighborhood

In [32]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop(["PostalCode", "Borough", "Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=4).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [33]:
#create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
toronto_merged = toronto_df_new.copy()

# add clustering labels
toronto_merged["Cluster Labels"] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.drop(["Borough", "Neighborhoods"], 1).set_index("PostalCode"), on="PostalCode")

print(toronto_merged.shape)
toronto_merged.head() # check the last columns!

(103, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant


In [34]:
# sort the results by Cluster Labels
print(toronto_merged.shape)
toronto_merged.sort_values(["Cluster Labels"], inplace=True)
toronto_merged.head()

(103, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
74,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
73,M6C,York,Humewood-Cedarvale,43.693781,-79.428191,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
72,M6B,North York,Glencairn,43.709577,-79.445073,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
71,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant


#### Visualizing the clusters

In [38]:
# create map
map_clusters = folium.Map(location=[latitude_toronto, longitude_toronto], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examine Clusters

Cluster 1

In [39]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
74,York,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
73,York,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
72,North York,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
71,North York,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
70,Downtown Toronto,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
69,Downtown Toronto,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
68,Downtown Toronto,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
67,Downtown Toronto,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant
66,Downtown Toronto,0,Coffee Shop,Clothing Store,Restaurant,Steakhouse,Diner,Cosmetics Shop,Plaza,Hotel,Japanese Restaurant,Thai Restaurant


Cluster 2

In [40]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


Cluster 3

In [41]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


Cluster 4

In [42]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


Cluster 5

In [43]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
