# Clustering Neighborhoods in Toronto

### Part 1 - Data Cleaning

In [1]:
import os

import numpy as np
import pandas as pd

!conda install -c anaconda lxml --yes #needed for read_html

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
# Scrape the Wikipedia table of Toronto ("M") postal codes with their boroughs and neighborhoods.
neighborhood_link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list of DataFrames, one per table found on the page;
# the first entry is taken as the postal-code table.
wiki_tables = pd.read_html(neighborhood_link)
toronto_neighborhoods = wiki_tables[0]

print(toronto_neighborhoods.shape)
toronto_neighborhoods.head()


(180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Clean the data by filtering out rows whose borough is not assigned, grouping rows by postal code and borough, and using the borough name for any neighborhood that is not assigned.

In [3]:
# Keep only postal codes that have an assigned borough, renumbering the rows from 0.
has_borough = toronto_neighborhoods["Borough"] != "Not assigned"
toronto_neighborhoods_filtered = toronto_neighborhoods.loc[has_borough].reset_index(drop=True)
toronto_neighborhoods_filtered.head(15)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [4]:
# Collapse rows sharing a postal code and borough into one row, joining their
# neighborhood names with commas. The source table already has one row per postal
# code, so no names are actually merged here.
neighborhoods_by_code = toronto_neighborhoods_filtered.groupby(["Postal Code", "Borough"])["Neighborhood"]
toronto_neighborhoods_grouped = neighborhoods_by_code.apply(','.join).reset_index()

# Where the neighborhood is "Not assigned", fall back to the borough name.
# All neighborhoods in this data are assigned, so this currently changes nothing.
neighborhood_missing = toronto_neighborhoods_grouped["Neighborhood"] == "Not assigned"
toronto_neighborhoods_grouped.loc[neighborhood_missing, "Neighborhood"] = toronto_neighborhoods_grouped["Borough"]
toronto_neighborhoods_grouped.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
# Sanity check: (rows, columns) of the cleaned postal-code table.
toronto_neighborhoods_grouped.shape

(103, 3)

### Part 2 - Longitude and Latitude of Postal Codes

In [6]:
# The geocoder package answered with "<[REQUEST_DENIED] Google - Geocode [empty]>",
# so the latitude/longitude of each postal code is loaded from a CSV file instead.

Long_lat_location = "https://cocl.us/Geospatial_data"
postal_code_long_lat = pd.read_csv(Long_lat_location)
postal_code_long_lat

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [7]:
# Attach coordinates to each postal code. The left join keeps every neighborhood row,
# leaving NaN coordinates for any postal code that is absent from the CSV.
toronto_neighborhoods_lat_long = pd.merge(
    toronto_neighborhoods_grouped,
    postal_code_long_lat,
    how="left",
    on="Postal Code",
)
toronto_neighborhoods_lat_long


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [8]:
# Merge check: any row shown here is a postal code that found no coordinates.
missing_coordinates = toronto_neighborhoods_lat_long["Latitude"].isna()
toronto_neighborhoods_lat_long.loc[missing_coordinates]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude


### Part 3 - Neighborhood Clustering

Define a helper function that searches Foursquare for venues near each neighborhood

In [9]:
import requests # library to handle requests

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library


In [10]:
def getNearbyVenues(names, latitudes, longitudes, LIMIT=100, radius=500):
    """Collect venues around each location from the Foursquare ``venues/explore`` endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label, latitude and longitude of each location; consumed together with ``zip``.
    LIMIT : int, default 100
        Sent as the ``limit`` query parameter.
    radius : int, default 500
        Sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``access_token`` and
    ``VERSION`` at call time, and prints each location name as it is processed.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'oauth_token': access_token,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # The timeout prevents one stalled request from hanging the whole loop;
        # raise_for_status surfaces HTTP errors instead of a later KeyError on the payload.
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # guard against a venue listed without any category
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor keeps the schema even when no rows were collected.
    return pd.DataFrame(venue_rows, columns=columns)

Find Venues in each neighborhood

In [12]:
# Foursquare credentials are read from environment variables so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET and
# FOURSQUARE_ACCESS_TOKEN before starting the kernel; a missing one raises KeyError.
# NOTE(review): credentials that were ever saved in this notebook should be revoked.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
access_token = os.environ['FOURSQUARE_ACCESS_TOKEN']

# Query venues around every postal-code centroid (one API request per row).
toronto_venues = getNearbyVenues(names=toronto_neighborhoods_lat_long['Neighborhood'],
                                   latitudes=toronto_neighborhoods_lat_long['Latitude'],
                                   longitudes=toronto_neighborhoods_lat_long['Longitude']
                                  )

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

In [13]:
# Report how many distinct venue categories were returned, then show the venue table.
category_count = len(toronto_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))
toronto_venues

There are 321 uniques categories.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Malvern, Rouge",43.806686,-79.194353,NT Home Service Inc.,43.806411,-79.197736,Home Service
2,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
3,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Chris Effects Painting,43.784343,-79.163742,Construction & Landscaping
4,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Great Shine Window Cleaning,43.783145,-79.157431,Home Service
...,...,...,...,...,...,...,...
3100,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437,Albion Cinemas,43.741940,-79.584988,Indie Movie Theater
3101,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437,The Albion Centre,43.741562,-79.584512,Shopping Mall
3102,"Northwest, West Humber - Clairville",43.706748,-79.594054,Economy Rent A Car,43.708471,-79.589943,Rental Car Location
3103,"Northwest, West Humber - Clairville",43.706748,-79.594054,Logistics Distribution,43.707554,-79.589252,Bar


One hot encode and categorize venues

In [14]:
# One-hot encode the venue category: one indicator column per category,
# named after the category itself (no prefix).
category_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the neighborhood each venue belongs to.
# NOTE(review): if a venue category is itself called "Neighborhood", this assignment
# overwrites that indicator column - confirm against the category list.
category_dummies['Neighborhood'] = toronto_venues['Neighborhood']

# Put "Neighborhood" first, followed by the category columns in their existing order.
category_columns = [col for col in category_dummies.columns if col != "Neighborhood"]
toronto_onehot = category_dummies[["Neighborhood"] + category_columns]

# Average the indicators per neighborhood, i.e. the share of its venues in each category.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped


Unnamed: 0,Neighborhood,ATM,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,"Alderwood, Long Branch",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,Bayview Village,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,"Bedford Park, Lawrence Manor East",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.018182,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,"Willowdale, Willowdale West",0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
94,Woburn,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
95,Woodbine Heights,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
96,York Mills West,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [15]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted in
    descending order and the index labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [48]:

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Only the first three ranks have their own suffix; every later rank uses "th".
    # An explicit bounds check is used so that unrelated errors are never swallowed.
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe with one row per neighborhood; the venue columns are filled below
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill each row with that neighborhood's most frequent venue categories, most common first
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Skating Rink,Hardware Store,Breakfast Spot,Fireworks Store,Latin American Restaurant,Lounge,Clothing Store,Empanada Restaurant,Donut Shop,Drugstore
1,"Alderwood, Long Branch",Pizza Place,Pharmacy,Pool,Skating Rink,Dance Studio,Athletics & Sports,Pub,Sandwich Place,Coffee Shop,Gym
2,"Bathurst Manor, Wilson Heights, Downsview North",Pharmacy,Ice Cream Shop,Mobile Phone Shop,Spa,Coffee Shop,Bank,Sushi Restaurant,Frozen Yogurt Shop,Fried Chicken Joint,Deli / Bodega
3,Bayview Village,Japanese Restaurant,Café,Chinese Restaurant,Bank,Spa,Yoga Studio,Electronics Store,Donut Shop,Drugstore,Dumpling Restaurant
4,"Bedford Park, Lawrence Manor East",Spa,Pizza Place,Italian Restaurant,Sandwich Place,Sushi Restaurant,Coffee Shop,Mobile Phone Shop,Juice Bar,Thai Restaurant,Hobby Shop


Cluster Neighborhoods

In [49]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 8

# Feature matrix: the per-neighborhood category frequencies without the name column.
# ``columns=`` is used because the positional ``axis`` argument of ``DataFrame.drop``
# was deprecated in pandas 1.3 and removed in pandas 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so the result does not depend on the
# scikit-learn version's default, and random_state makes the run repeatable
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]




array([2, 2, 2, 5, 2, 5, 5, 5, 2, 2], dtype=int32)

In [50]:
# add clustering labels; labels left over from an earlier run of this cell are dropped
# first, because ``insert`` raises when the column already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

print(neighborhoods_venues_sorted["Cluster Labels"])

# Add the cluster label and top venues to each postal code's borough and coordinates.
# The inner join keeps only neighborhoods that have venue data, so every remaining
# row carries a cluster label.
toronto_merged = toronto_neighborhoods_lat_long.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how='inner')

# Assign the cast back: astype returns a new Series rather than changing the column in place.
toronto_merged["Cluster Labels"] = toronto_merged["Cluster Labels"].astype("int64")

print(toronto_merged["Cluster Labels"].value_counts())

toronto_merged.head() # check the last columns!

0     2
1     2
2     2
3     5
4     2
     ..
93    2
94    5
95    2
96    0
97    1
Name: Cluster Labels, Length: 98, dtype: int32
2    48
5    34
1     6
0     6
3     5
7     1
6     1
4     1
Name: Cluster Labels, dtype: int64


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,3,Home Service,Fast Food Restaurant,Yoga Studio,Empanada Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0,Construction & Landscaping,Moving Target,Home Service,Golf Course,Empanada Restaurant,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,2,Electronics Store,Moving Target,Breakfast Spot,Medical Center,Rental Car Location,Intersection,Spa,Mexican Restaurant,Bank,Eastern European Restaurant
3,M1G,Scarborough,Woburn,43.770992,-79.216917,5,Coffee Shop,Mexican Restaurant,Korean Restaurant,Insurance Office,Indian Restaurant,Pharmacy,Soccer Field,Convenience Store,Electronics Store,Donut Shop
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,2,Caribbean Restaurant,Medical Center,Lounge,Fried Chicken Joint,Gas Station,Athletics & Sports,Thai Restaurant,Spa,Bank,Bakery


In [51]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map centered on the Toronto coordinates below
toronto_lat = 43.6532
toronto_long = -79.3832
map_clusters = folium.Map(location=[toronto_lat, toronto_long], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels run from 0 to kclusters - 1, so they index the palette directly
    # (indexing with cluster - 1 would send cluster 0 to the last color by wrap-around)
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [55]:
# Show the size of every cluster, then list the member(s) of cluster 4.
cluster_sizes = toronto_merged["Cluster Labels"].value_counts()
print(cluster_sizes)
in_cluster_four = toronto_merged["Cluster Labels"] == 4
toronto_merged.loc[in_cluster_four]


2    48
5    34
1     6
0     6
3     5
7     1
6     1
4     1
Name: Cluster Labels, dtype: int64


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
93,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242,4,Pizza Place,Comic Shop,Doctor's Office,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
