# Segmenting and Clustering Neighborhoods in Toronto

## Problem 1: Scrape Wikipedia Data

In [329]:
#Needed to install fake_user agent before I could do imports
#!pip install fake_useragent

### Import necessary modules

In [330]:
# Import necessary modules
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Enter the selected URL

In [331]:
# URL to scrape data from
scrapeLink = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

### Pull HTML code from URL and find all tables

In [332]:
# Request the page with a randomised browser User-Agent and locate the first table
ua1 = UserAgent()
randomHeader = {'User-Agent': str(ua1.random)}
# `headers=` must be given by keyword: the second positional argument of
# requests.get() is `params`, so passing the dict positionally appended the
# User-Agent to the query string instead of sending it as an HTTP header.
page = requests.get(scrapeLink, headers=randomHeader, timeout=30)
# Fail loudly on an HTTP error rather than parsing an error page as if it were the article
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# The postal-code table is taken to be the first <table> element on the page
table = soup.find_all('table')[0]

### Convert HTML code to Pandas DataFrame

In [333]:
# Parse table and convert HTML to DataFrame
# Two passes over the <tr> rows of `table`: the first sizes the frame and collects
# the header text, the second copies every <td> cell into the pre-allocated frame.
n_columns = 0
n_rows=0
column_names = []
    
# Find number of rows and columns
# we also find the column titles if we can
for row in table.find_all('tr'):
                
    # Determine the number of rows in the table
    # (only rows containing at least one <td> cell are counted)
    td_tags = row.find_all('td')
    if len(td_tags) > 0:
        n_rows+=1
        if n_columns == 0:
            # Set the number of columns for our table
            # (taken from the first data row encountered)
            n_columns = len(td_tags)
                        
    # Handle column names if we find them
    # (only the first row containing <th> cells is used; the text is stored exactly
    # as get_text() returns it, so surrounding whitespace/newlines are kept)
    th_tags = row.find_all('th') 
    if len(th_tags) > 0 and len(column_names) == 0:
        for th in th_tags:
            column_names.append(th.get_text())
    
# Safeguard on Column Titles
# (a header row whose length differs from the first data row aborts the cell)
if len(column_names) > 0 and len(column_names) != n_columns:
    raise Exception("Column titles do not match the number of columns")
    
# Fall back to positional column labels when no header row was found
columns = column_names if len(column_names) > 0 else range(0,n_columns)
# Pre-allocate an empty frame of the final size; the cells are filled in below
df = pd.DataFrame(columns = columns, index= range(0,n_rows))

# Second pass: copy the text of each <td> into the frame, cell by cell
row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    # NOTE: `columns` is rebound here to this row's <td> tags; it no longer holds the labels
    columns = row.find_all('td')
    for column in columns:
        df.iat[row_marker,column_marker] = column.get_text()
        column_marker += 1
    # Advance only past rows that held data, mirroring how n_rows was counted above
    if len(columns) > 0:
        row_marker += 1
                    
df

Unnamed: 0,Postal Code\n,Borough\n,Neighbourhood\n
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
...,...,...,...
175,M5Z\n,Not assigned\n,Not assigned\n
176,M6Z\n,Not assigned\n,Not assigned\n
177,M7Z\n,Not assigned\n,Not assigned\n
178,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


### Clean up the data

In [334]:
# Delete all newline characters ('\n') left over from the HTML cells.
# regex=False is passed explicitly: the default of Index.str.replace changed from
# regex=True to regex=False in pandas 2.0, and under the new default the former
# pattern "[\n]" is searched for literally and never matches, leaving the
# newline in the column names (so the 'Borough' lookup below would fail).
df.columns = df.columns.str.replace('\n', '', regex=False)
# Remove newlines inside every cell value; regex=True makes this a substring replacement.
# A new frame is returned instead of mutating in place, so re-running the cell is safe.
df = df.replace('\n', '', regex=True)
# Drop rows where Borough = "Not assigned"
df = df[df['Borough'] != 'Not assigned']
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


<b>*Note: The next two lines seem unnecessary because the data is grouped by Postal Codes in the HTML code, and there do not appear to be any rows where the Neighbourhood is unassigned, but the Borough is assigned. The instructions call for the data to be cleaned this way though, so I added the lines for completeness.*</b>

In [335]:
# Collapse to one row per (Postal Code, Borough), joining the neighbourhood names with ', '
df = (
    df.groupby(['Postal Code', 'Borough'], as_index=False)
      .agg({'Neighbourhood': ', '.join})
)
# Wherever the neighbourhood is still "Not assigned", fall back to the borough name
df.loc[df['Neighbourhood'].eq("Not assigned"), 'Neighbourhood'] = df['Borough']

In [336]:
# View final dataframe
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


### View the shape of the DataFrame

In [337]:
# View shape of DataFrame
df.shape

(103, 3)

## Problem 2: Add Longitude/Latitude to previous DataFrame

### First attempted using the geocoder package, but it was taking too long to run

In [338]:
# Needed to install geocoder
#! pip install geocoder

In [339]:
# import geocoder
import geocoder # import geocoder

In [340]:
# Function to get Long/Lat for Toronto postal codes from Geocoder
def get_geocoder(postal_code, max_attempts=10):
    """Look up the latitude/longitude of a Toronto postal code via ``geocoder.google``.

    Parameters
    ----------
    postal_code : str
        Postal code inserted into the query ``'<postal_code>, Toronto, Ontario'``.
    max_attempts : int, optional
        Maximum number of lookups before giving up (default 10). The original
        implementation retried forever, which hangs the notebook when the
        service never returns coordinates.

    Returns
    -------
    tuple
        ``(latitude, longitude)``: the first two entries of the result's ``latlng``.

    Raises
    ------
    RuntimeError
        If no lookup returned coordinates within ``max_attempts`` tries.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        g = geocoder.google(query)
        lat_lng_coords = g.latlng
        # latlng is None when the lookup produced no coordinates; try again
        if lat_lng_coords is not None:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude,longitude
    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts))

### Read Longitude/Latitude for each postal code from CSV URL into new DataFrame

In [341]:
# Getting Long/Lat from geocoder was taking too long, so reading from this file instead
geo_df = pd.read_csv('http://cocl.us/Geospatial_data')
geo_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Merge Part 1 DataFrame with Longitude/Latitude DataFrame

In [342]:
# Attach Latitude/Longitude to every postal code; the inner join keeps only
# the codes that appear in both frames
geo_data = df.merge(geo_df, on='Postal Code', how='inner')
geo_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


## Problem 3: Explore the Data

### Import modules for getting longitude/latitude of Toronto, mapping, and analytics

In [343]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library
import numpy as np # array library
from sklearn.cluster import KMeans # k-means from clustering stage

<b>In order to define an instance of the geocoder, we need to define a user_agent. We will name our agent <em>toronto_explorer</em>, as shown below.</b>

In [344]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message instead
# of failing on the attribute access below
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Plot map of Toronto with markers for boroughs and neighborhoods

In [345]:
# Base map centred on (latitude, longitude)
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of geo_data, with a "<neighbourhood>, <borough>" popup
marker_rows = zip(geo_data['Latitude'], geo_data['Longitude'],
                  geo_data['Borough'], geo_data['Neighbourhood'])
for lat, lng, borough, neighbourhood in marker_rows:
    label = folium.Popup(f'{neighbourhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

### Deciding which borough to analyze

<b>*I decided to first see which borough corresponded with the most postal codes assuming it would be the largest and possibly contain the most venues.*</b>

In [346]:
# How many postal codes correspond with each borough
geo_data.groupby(['Borough']).size().sort_values(ascending=False)

Borough
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
dtype: int64

<b>*North York corresponds with 24 different postal codes, so I selected that to investigate further.*</b>

In [347]:
# View neighborhoods in North York
northyork_df = geo_data[geo_data.Borough == 'North York'].reset_index(drop=True)
northyork_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"York Mills, Silver Hills",43.75749,-79.374714
4,M2M,North York,"Willowdale, Newtonbrook",43.789053,-79.408493
5,M2N,North York,"Willowdale, Willowdale East",43.77012,-79.408493
6,M2P,North York,York Mills West,43.752758,-79.400049
7,M2R,North York,"Willowdale, Willowdale West",43.782736,-79.442259
8,M3A,North York,Parkwoods,43.753259,-79.329656
9,M3B,North York,Don Mills,43.745906,-79.352188


<b>*Using the same logic, I decided to identify which neighborhood in North York corresponds with the most postal codes.*</b>

In [348]:
# Which neighborhoods in North York are represented the most in the DataFrame
northyork_df.groupby(['Neighbourhood']).size().sort_values(ascending=False)

Neighbourhood
Downsview                                          4
Don Mills                                          2
York Mills, Silver Hills                           1
York Mills West                                    1
Bayview Village                                    1
Bedford Park, Lawrence Manor East                  1
Fairview, Henry Farm, Oriole                       1
Glencairn                                          1
Hillcrest Village                                  1
Humber Summit                                      1
Humberlea, Emery                                   1
Lawrence Manor, Lawrence Heights                   1
North Park, Maple Leaf Park, Upwood Park           1
Northwood Park, York University                    1
Parkwoods                                          1
Victoria Village                                   1
Willowdale, Newtonbrook                            1
Willowdale, Willowdale East                        1
Willowdale, Willowdale West     

<b>*Downsview corresponds with 4 postal codes, which is the most of any neighborhood in North York*</b>

In [349]:
# Let's look at venues in Downview because it has the most postal codes
northyork_df.loc[14, 'Neighbourhood']

'Downsview'

### Define Foursquare Credentials and Version

In [375]:
import os  # imported here so the cell is self-contained

# Credentials are read from the environment so they never have to be typed into
# (or committed with) the notebook; the original placeholders remain as fallbacks.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'CREDENTIALS REDACTED FOR GITHUB') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'CREDENTIALS REDACTED FOR GITHUB') # your Foursquare Secret
VERSION = os.environ.get('FOURSQUARE_VERSION', 'CREDENTIALS REDACTED FOR GITHUB') # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself: cell output is saved and shared with the notebook
print('CLIENT_SECRET:' + '<hidden>')

Your credentails:
CLIENT_ID: CREDENTIALS REDACTED FOR GITHUB
CLIENT_SECRET:CREDENTIALS REDACTED FOR GITHUB


### Find longitude and latitude of selected neighborhood

In [351]:
# Read name and coordinates from the row labelled 14 of northyork_df
neighbourhood_name = northyork_df.loc[14, 'Neighbourhood'] # neighborhood name
neighbourhood_latitude = northyork_df.loc[14, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = northyork_df.loc[14, 'Longitude'] # neighborhood longitude value

print(f'Latitude and longitude values of {neighbourhood_name} are '
      f'{neighbourhood_latitude}, {neighbourhood_longitude}.')

Latitude and longitude values of Downsview are 43.7390146, -79.5069436.


### Get the top 100 venues that are in Downsview within a radius of 500 meters.

In [376]:
# Search parameters for the Foursquare "explore" request
LIMIT = 100 # limit number of venues
radius = 500 # define radius

# Assemble the request URL piece by piece; the resulting string is unchanged
url = (
    'https://api.foursquare.com/v2/venues/explore?'
    f'&client_id={CLIENT_ID}'
    f'&client_secret={CLIENT_SECRET}'
    f'&v={VERSION}'
    f'&ll={neighbourhood_latitude},{neighbourhood_longitude}'
    f'&radius={radius}'
    f'&limit={LIMIT}'
)
# The URL is deliberately not displayed: it embeds the client secret

### Examine the results

In [353]:
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() surfaces HTTP errors before the body is parsed as JSON
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '5f261334ea6a751ce767363b'},
 'response': {'headerLocation': 'Downsview',
  'headerFullLocation': 'Downsview, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 5,
  'suggestedBounds': {'ne': {'lat': 43.7435146045, 'lng': -79.50072681933999},
   'sw': {'lat': 43.734514595499995, 'lng': -79.51316038066001}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4d6d1a3bcf7e41bd97c28285',
       'name': 'TD Canada Trust',
       'location': {'address': '2709 Jane St',
        'lat': 43.7402356,
        'lng': -79.5125502,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.7402356,
          'lng': -79.5125502}],
        'distance': 470,
        'postalCode': 'M3L 1S3',
        'cc': 'CA',
        'city'

### Define a function to get the category of each venue

In [354]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record holding the category list under the key ``'categories'`` or,
        failing that, ``'venue.categories'``.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; a bare `except:` here also
        # hid unrelated errors. Flattened frames use the dotted column name.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

### Clean up the json results and put into DataFrame

In [355]:
# Pull the list of venue items out of the first group of the response
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON, then keep only name, categories and coordinates
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = pd.json_normalize(venues)
nearby_venues = nearby_venues.loc[:, filtered_columns]

# Reduce each row's category list to the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the last dotted component of each label, e.g. 'venue.location.lat' -> 'lat'
nearby_venues = nearby_venues.rename(columns=lambda col: col.rsplit(".", 1)[-1])

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,TD Canada Trust,Bank,43.740236,-79.51255
1,Giltspur Park,Park,43.735724,-79.507821
2,Price Chopper,Grocery Store,43.739908,-79.512261
3,jane sheppard mall,Shopping Mall,43.740104,-79.512552
4,Gecko Hospitality,Hotel,43.74267,-79.503958


### Print how many venues in Downsview were returned by Foursquare

In [356]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


<b>*Only 5 venues were identified within 500 meters of Downsview. That's not very interesting, so I decided to investigate a different area of Toronto.*</b>

### Check number of postal codes that correspond to each Borough again

In [357]:
# How many postal codes correspond with each borough
geo_data.groupby(['Borough']).size().sort_values(ascending=False)

Borough
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
dtype: int64

<b>*Downtown Toronto has the second highest number of postal codes, so I will check how many venues I can find in Downtown Toronto*</b>

In [358]:
# View neighborhoods in Toronto
dtToronto_df = geo_data[geo_data.Borough == 'Downtown Toronto'].reset_index(drop=True)
dtToronto_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752


### Function to get nearby venues for any neighborhood

In [359]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel with ``zip``; one API request is made per triple.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. When no venue
        is returned at all, an empty frame with these columns is produced.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each name as a progress indicator.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout prevents an indefinite hang and
        # raise_for_status() reports HTTP errors before the body is parsed
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            categories = v['venue']['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                v['venue']['name'],
                v['venue']['location']['lat'],
                v['venue']['location']['lng'],
                # a venue may carry no category; indexing [0] blindly raised IndexError
                categories[0]['name'] if categories else None))

    # Passing `columns=` up front (instead of assigning .columns afterwards)
    # keeps this working when no venues were returned
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

### Identify venues in Downtown Toronto using the getNearbyVenues function

In [360]:
dtToronto_venues = getNearbyVenues(names=dtToronto_df['Neighbourhood'],
                                   latitudes=dtToronto_df['Latitude'],
                                   longitudes=dtToronto_df['Longitude']
                                  )

Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Queen's Park, Ontario Provincial Government


### Check shape of Downtown Toronto Venues DataFrame to see how many venues were found

In [361]:
print(dtToronto_venues.shape)
dtToronto_venues.head()

(1239, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"St. James Town, Cabbagetown",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


<b>*Foursquare found 1239 venues in Downtown Toronto, which is much more interesting than just 5 in Downsview.*</b>

### Group venues by Neighborhood

In [362]:
dtToronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,57,57,57,57,57,57
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",15,15,15,15,15,15
Central Bay Street,63,63,63,63,63,63
Christie,17,17,17,17,17,17
Church and Wellesley,75,75,75,75,75,75
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"Harbourfront East, Union Station, Toronto Islands",100,100,100,100,100,100
"Kensington Market, Chinatown, Grange Park",65,65,65,65,65,65


### Count how many unique categories of venues there are in Downtown Toronto

In [363]:
print('There are {} uniques categories.'.format(len(dtToronto_venues['Venue Category'].unique())))

There are 209 uniques categories.


### Analyze the venues in each neighborhood in Downtown Toronto

In [364]:
# One-hot encode the venue categories: one indicator column per category, no prefix
dtToronto_onehot = pd.get_dummies(dtToronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach each venue's neighbourhood (lands in the last column unless a
# category column of the same name already exists)
dtToronto_onehot['Neighbourhood'] = dtToronto_venues['Neighbourhood']

# Rotate the column order so that the last column comes first
*leading_columns, trailing_column = dtToronto_onehot.columns
fixed_columns = [trailing_column] + leading_columns
dtToronto_onehot = dtToronto_onehot[fixed_columns]

dtToronto_onehot.head()

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Rosedale,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rosedale,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,"St. James Town, Cabbagetown",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [365]:
dtToronto_onehot.shape

(1239, 210)

### Group rows by neighborhood, taking the mean of the frequency of occurrence of each category

In [366]:
dtToronto_grouped = dtToronto_onehot.groupby('Neighbourhood').mean().reset_index()
dtToronto_grouped

Unnamed: 0,Neighbourhood,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0
1,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.066667,0.066667,0.133333,0.2,0.133333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015873,0.0,0.0,0.015873,0.0,0.0,0.015873
3,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Church and Wellesley,0.013333,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026667
5,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0
6,"First Canadian Place, Underground city",0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0
7,"Garden District, Ryerson",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0
8,"Harbourfront East, Union Station, Toronto Islands",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0
9,"Kensington Market, Chinatown, Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.061538,0.0,0.046154,0.015385,0.0,0.0,0.0


In [367]:
dtToronto_grouped.shape

(19, 210)

### Print each neighborhood along with the top 5 most common venues

In [368]:
num_top_venues = 5

for hood in dtToronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # Transpose this neighbourhood's row into one (venue, freq) record per column
    temp = dtToronto_grouped[dtToronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Skip the first record (the 'Neighbourhood' label itself). assign() builds a new
    # frame instead of writing into the iloc slice, which raised SettingWithCopyWarning.
    temp = temp.iloc[1:].assign(freq=lambda frame: frame['freq'].astype(float))
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
            venue  freq
0     Coffee Shop  0.11
1    Cocktail Bar  0.04
2        Beer Bar  0.04
3  Farmers Market  0.04
4            Café  0.04


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
              venue  freq
0   Airport Service  0.20
1    Airport Lounge  0.13
2  Airport Terminal  0.13
3             Plane  0.07
4   Harbor / Marina  0.07


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0.17
1       Sandwich Place  0.06
2                 Café  0.05
3   Italian Restaurant  0.05
4  Japanese Restaurant  0.05


----Christie----
                venue  freq
0       Grocery Store  0.24
1                Café  0.18
2                Park  0.12
3  Italian Restaurant  0.06
4               Diner  0.06


----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.11
1  Japanese Restaurant  0.07
2     Sushi Restaurant  0.05
3           Restaurant  0.

## Convert previous data into a DataFrame

### Function to sort the venues in descending order.

In [369]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (positional slice from 1), the
    remaining values are sorted in descending order, and the index labels of
    the leading `num_top_venues` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

### Dataframe to display the top 10 venues for each neighborhood

In [370]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 take their own suffix and every later rank uses 'th'. An explicit
    # bounds check replaces the bare `except:` that relied on an IndexError (and
    # would have hidden any other error too).
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = dtToronto_grouped['Neighbourhood']

# fill each row with that neighbourhood's most common venue categories, in rank order
for ind in np.arange(dtToronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(dtToronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Café,Beer Bar,Bakery,Farmers Market,Cocktail Bar,Seafood Restaurant,Cheese Shop,Restaurant,Jazz Club
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Harbor / Marina,Bar,Plane,Rental Car Location,Sculpture Garden,Boat or Ferry,Airport
2,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Japanese Restaurant,Salad Place,Bubble Tea Shop,Burger Joint,Department Store,Donut Shop
3,Christie,Grocery Store,Café,Park,Athletics & Sports,Diner,Restaurant,Italian Restaurant,Baby Store,Candy Store,Nightclub
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Yoga Studio,Pub,Men's Store,Bubble Tea Shop,Mediterranean Restaurant


## Cluster Neighborhoods

### Run k-means to cluster the neighborhood into 5 clusters.

In [371]:
# set number of clusters
kclusters = 5

# Keep only the numeric frequency columns. The keyword form is required: the
# positional axis argument (`drop('Neighbourhood', 1)`) was deprecated in
# pandas 1.3 and removed in pandas 2.0.
dtToronto_grouped_clustering = dtToronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is pinned to 10 (the long-standing default) so the
# labels do not shift with the scikit-learn release that changes this default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(dtToronto_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([0, 1, 0, 4, 0, 0, 0, 0, 0, 3], dtype=int32)

### Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [372]:
# add clustering labels; any 'Cluster Labels' column left by a previous run of this
# cell is dropped first, because DataFrame.insert raises ValueError when the
# column already exists (which made the cell fail on re-execution)
neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# add the cluster label and top venues to each Downtown Toronto row, matched on
# neighbourhood name (join returns a new frame; dtToronto_df itself is not modified)
dtToronto_merged = dtToronto_df.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

dtToronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,2,Park,Playground,Trail,Yoga Studio,Dance Studio,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675,3,Restaurant,Coffee Shop,Café,Bakery,Pizza Place,Park,Italian Restaurant,Pub,Market,Diner
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Yoga Studio,Pub,Men's Store,Bubble Tea Shop,Mediterranean Restaurant
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Café,Theater,Yoga Studio,Cosmetics Shop,Shoe Store
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Café,Bubble Tea Shop,Italian Restaurant,Cosmetics Shop,Japanese Restaurant,Theater,Plaza,Hotel


### Import matplotlib modules to plot data

In [373]:
import matplotlib.cm as cm
import matplotlib.colors as colors

### Visualize resulting clusters

In [374]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters: one colour per cluster, evenly spaced along
# the rainbow colormap (the former `x`/`ys` helpers only contributed their length)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a cluster label (NaN after the join) cannot be coloured; skip them
clustered = dtToronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighbourhood'], clustered['Cluster Labels']):
    # a NaN anywhere in the column turns the labels into floats; list indexing needs an int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster 0 maps to rainbow[-1], i.e. the last colour, exactly as before
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters