# Segmenting and Clustering Neighborhoods in Toronto, Canada

## Part 1: Transform data into a dataframe and clean it

In [42]:
import numpy as np
import pandas as pd

In [43]:
# The Wikipedia postal-code table was copied by hand into a CSV file in the
# working directory; the column names are supplied here when loading it.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.read_csv(
    'toronto_postalcode.csv',
    header=None,
    names=column_names,
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn


In [44]:
# Remove, in place, every row whose Borough is the placeholder 'Not assigned'
unassigned_rows = df.index[df['Borough'] == 'Not assigned']
df.drop(index=unassigned_rows, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae


In [45]:
# reset the index in place so row labels run 0..n-1 again after the rows dropped above;
# drop=True discards the old labels instead of keeping them as a new column
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [46]:
# Show any remaining rows whose Neighborhood is still the placeholder 'Not assigned'
df.loc[df['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood


It looks like there are none, so we are good.

In [47]:
# lastly, check the shape of the cleaned dataframe as (rows, columns)
df.shape

(103, 3)

## Part 2: Get latitude and longitude coordinates

In [48]:
# Load the provided CSV of coordinates (read from the working directory) into
# df_coordinates and preview the first rows
df_coordinates = pd.read_csv('Geospatial_Coordinates.csv')
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
# (rows, columns) of the coordinates table, for comparison with df.shape
df_coordinates.shape

(103, 3)

In [50]:
# Count the rows where both tables hold the same postal code at the same position
(df_coordinates['Postal Code'] == df['PostalCode']).sum()

103

It looks like the rows in df_coordinates match those in df exactly (all 103 postal codes agree, row by row). Thus we can add the latitude and longitude columns directly.

In [51]:
# Look the coordinates up by postal code rather than copying them by row position,
# so the result stays correct even if the two tables are not in the same order.
coordinates_by_code = df_coordinates.drop_duplicates('Postal Code').set_index('Postal Code')
df['Latitude'] = df['PostalCode'].map(coordinates_by_code['Latitude'])
df['Longitude'] = df['PostalCode'].map(coordinates_by_code['Longitude'])

# A postal code missing from df_coordinates would yield NaN here; fail loudly instead.
assert df[['Latitude', 'Longitude']].notna().all().all(), 'postal code without coordinates'
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Cluster and visualize

In [52]:
import folium 
from geopy.geocoders import Nominatim

First, let's visualize the city of Toronto with all the neighborhoods

In [53]:
# get Toronto coordinates
geolocator = Nominatim(user_agent='TO_explorer')
location = geolocator.geocode('Toronto, Ontario')
# geocode() yields None when the service finds no match; stop here with a clear
# message rather than failing below with an AttributeError on None.
if location is None:
    raise ValueError("Nominatim returned no result for 'Toronto, Ontario'")
latitude, longitude = location.latitude, location.longitude

In [54]:
# create map centred on the geocoded Toronto coordinates
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of df, with the neighborhood name(s) as its popup
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    # parse_html is a Popup option, not a CircleMarker one, so it is passed only here
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)

toronto_map

Use Foursquare API to obtain location data

In [55]:
import json
import requests
from pandas.io.json import json_normalize

In [56]:
# define FourSquare credentials
# The keys are read from the environment so that they are never saved inside the
# notebook. Any key that was previously committed here should be revoked and reissued.
import os
import warnings

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not CLIENT_ID or not CLIENT_SECRET:
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'requests to the Foursquare API will be rejected.'
    )
VERSION = '20210119'  # sent as the `v` query parameter of every request
LIMIT = 100  # sent as the `limit` query parameter of every request

In [57]:
# Use the first row of df as a trial location before querying every neighborhood
lat_to = df.at[0, 'Latitude']
lng_to = df.at[0, 'Longitude']

In [58]:
# Search radius passed to the API as the `radius` query parameter
radius = 500

# Assemble the venues/explore request URL for the trial location
url = (
    'https://api.foursquare.com/v2/venues/explore'
    '?client_id={client_id}&client_secret={client_secret}'
    '&ll={lat},{lng}&v={version}&radius={radius}&limit={limit}'
).format(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    lat=lat_to,
    lng=lng_to,
    version=VERSION,
    radius=radius,
    limit=LIMIT,
)

In [59]:
# Fetch the venues around the trial location. The timeout keeps the cell from
# hanging forever on a stalled connection, and raise_for_status() surfaces HTTP
# errors (bad credentials, exceeded quota) here instead of in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

{'meta': {'code': 200, 'requestId': '6007786e1583b9317beda99a'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4bb6b9446edc76b0d771311c-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/fastfood_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d16e941735',
         'name': 'Fast Food Restaurant',
         'pluralName': 'Fast Food Restaurants',
         'primary': True,
         'shortName': 'Fast Food'}],
       'id': '4bb6b9446edc76b0d771311c',
       'location': {'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Morningside & Sheppard',
        'distance': 387,
        'formattedAddress': ['Toronto ON', 'Canada'],
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'ln

In [60]:
# borrow the get_category_type function
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    The category list is read from ``row['categories']``; when that key is
    missing it is read from ``row['venue.categories']`` instead.

    Returns:
        The ``'name'`` of the first entry in the category list, or ``None``
        when the list is empty.

    Raises:
        KeyError: if the row has neither key.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [61]:
# Clean the json
venues = results['response']['groups'][0]['items']

# Flatten the nested venue records into columns such as 'venue.location.lat'.
# pd.json_normalize is the supported entry point since pandas 1.0; the older
# pandas.io.json import path was removed in pandas 2.0.
nearby_venues = pd.json_normalize(venues)

# filter columns (.copy() so the column assignment below writes to an independent frame)
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy’s,Fast Food Restaurant,43.807448,-79.199056
1,Interprovincial Group,Print Shop,43.80563,-79.200378


In [62]:
# do the same for all neighborhoods in Toronto
# (adapted from the function used in the lab)
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Query the Foursquare venues/explore endpoint once per location.

    Args:
        names: iterable of location labels, copied into the 'Neighborhood' column.
        latitudes: iterable of latitudes, parallel to ``names``.
        longitudes: iterable of longitudes, parallel to ``names``.
        radius: value sent as the ``radius`` query parameter.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame has these columns even when no venue is returned at all.
        'Venue Category' is None for a venue that lists no category.

    Raises:
        requests.HTTPError: if a request returns an error status.
    """
    column_labels = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # stop on an HTTP error instead of failing later on a payload without 'groups'
        response.raise_for_status()

        groups = response.json()['response'].get('groups', [])
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue can come back with an empty category list
                categories[0]['name'] if categories else None))

    # passing the labels to the constructor keeps an empty result well-formed
    return pd.DataFrame(rows, columns=column_labels)

In [63]:
# Build the inputs for getNearbyVenues and store its result in toronto_venues.
# Each label is "<postal code> <neighborhood>" so the postal code is part of the name.
names_list = df['PostalCode'] + " " + df['Neighborhood']
lat_list = df['Latitude']
long_list = df['Longitude']

toronto_venues = getNearbyVenues(
    names=names_list,
    latitudes=lat_list,
    longitudes=long_list,
)

M1B Malvern, Rouge
M1C Rouge Hill, Port Union, Highland Creek
M1E Guildwood, Morningside, West Hill
M1G Woburn
M1H Cedarbrae
M1J Scarborough Village
M1K Kennedy Park, Ionview, East Birchmount Park
M1L Golden Mile, Clairlea, Oakridge
M1M Cliffside, Cliffcrest, Scarborough Village West
M1N Birch Cliff, Cliffside West
M1P Dorset Park, Wexford Heights, Scarborough Town Centre
M1R Wexford, Maryvale
M1S Agincourt
M1T Clarks Corners, Tam O'Shanter, Sullivan
M1V Milliken, Agincourt North, Steeles East, L'Amoreaux East
M1W Steeles West, L'Amoreaux West
M1X Upper Rouge
M2H Hillcrest Village
M2J Fairview, Henry Farm, Oriole
M2K Bayview Village
M2L York Mills, Silver Hills
M2M Willowdale, Newtonbrook
M2N Willowdale, Willowdale East
M2P York Mills West
M2R Willowdale, Willowdale West
M3A Parkwoods
M3B Don Mills
M3C Don Mills
M3H Bathurst Manor, Wilson Heights, Downsview North
M3J Northwood Park, York University
M3K Downsview
M3L Downsview
M3M Downsview
M3N Downsview
M4A Victoria Village
M4B Parkvie

In [64]:
# print the (rows, columns) shape of toronto_venues, then preview its first rows
print(toronto_venues.shape)
toronto_venues.head()

(2129, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"M1B Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"M1B Malvern, Rouge",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,"M1C Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"M1C Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,SEBS Engineering Inc. (Sustainable Energy and ...,43.782371,-79.15682,Construction & Landscaping
4,"M1E Guildwood, Morningside, West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank


In [65]:
# get number of venues in each neighborhood: count() reports, per 'Neighborhood'
# label, the number of non-null values in every other column
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"M1B Malvern, Rouge",2,2,2,2,2,2
"M1C Rouge Hill, Port Union, Highland Creek",2,2,2,2,2,2
"M1E Guildwood, Morningside, West Hill",8,8,8,8,8,8
M1G Woburn,3,3,3,3,3,3
M1H Cedarbrae,8,8,8,8,8,8
M1J Scarborough Village,2,2,2,2,2,2
"M1K Kennedy Park, Ionview, East Birchmount Park",5,5,5,5,5,5
"M1L Golden Mile, Clairlea, Oakridge",10,10,10,10,10,10
"M1M Cliffside, Cliffcrest, Scarborough Village West",3,3,3,3,3,3
"M1N Birch Cliff, Cliffside West",4,4,4,4,4,4


^ Note that there are only 101 unique neighborhoods now, compared with 103 before. I believe this is because no venues were returned for two of the neighborhoods.

In [66]:
# Number of distinct values in the 'Venue Category' column
toronto_venues['Venue Category'].nunique(dropna=False)

271

Prepare for clustering

In [67]:
# Build a venue-level indicator table: one row per venue, one column per category.

# one hot encoding (empty prefix/separator so columns carry the bare category names)
category_frame = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(category_frame, prefix='', prefix_sep='')

# put the neighborhood label of each venue in the first column, under 'Names'
toronto_onehot.insert(loc=0, column='Names', value=toronto_venues['Neighborhood'])

toronto_onehot.head()

Unnamed: 0,Names,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"M1B Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"M1B Malvern, Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"M1C Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"M1C Rouge Hill, Port Union, Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"M1E Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
# Average the indicator columns per 'Names' label, giving for each neighborhood
# the share of its venues that falls in each category; 'Names' stays a regular column
toronto_grouped = toronto_onehot.groupby('Names', as_index=False).mean()
toronto_grouped

Unnamed: 0,Names,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"M1B Malvern, Rouge",0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,"M1C Rouge Hill, Port Union, Highland Creek",0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,"M1E Guildwood, Morningside, West Hill",0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,M1G Woburn,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,M1H Cedarbrae,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
5,M1J Scarborough Village,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
6,"M1K Kennedy Park, Ionview, East Birchmount Park",0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
7,"M1L Golden Mile, Clairlea, Oakridge",0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
8,"M1M Cliffside, Cliffcrest, Scarborough Village...",0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.333333,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
9,"M1N Birch Cliff, Cliffside West",0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.00,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


In [69]:
# want to create a dataframe that shows the top 10 venue categories for each neighborhood

# Changed from the lab version: a category whose frequency is zero is reported as
#   the string 'None' rather than by its name
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    Args:
        row: Series whose first entry is skipped (the neighborhood name in this
            notebook) and whose remaining entries are category frequencies.
        num_top_venues: maximum number of labels to return.

    Returns:
        Object array of at most ``num_top_venues`` labels, ordered from the
        highest value to the lowest. A label whose value is 0 is replaced by
        the string 'None'.
    """
    ranked = row.iloc[1:].sort_values(ascending=False).iloc[:num_top_venues]
    # iterate over (label, value) pairs rather than using integer lookups such as
    # ranked[i], which pandas 2.x deprecates on a label-indexed Series
    labels = [label if value != 0 else 'None' for label, value in ranked.items()]
    return np.array(labels, dtype=object)

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Names']
for ind in range(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd' and every later rank takes 'th'; an explicit
    # bounds check replaces a bare except that hid any other error
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per neighborhood
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Names'] = toronto_grouped['Names']

# fill the ranking columns of each row from the matching row of toronto_grouped
for ind in range(toronto_grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,Names,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"M1B Malvern, Rouge",Fast Food Restaurant,Print Shop,,,
1,"M1C Rouge Hill, Port Union, Highland Creek",Bar,Construction & Landscaping,,,
2,"M1E Guildwood, Morningside, West Hill",Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Mexican Restaurant
3,M1G Woburn,Coffee Shop,Korean BBQ Restaurant,,,
4,M1H Cedarbrae,Hakka Restaurant,Gas Station,Bank,Fried Chicken Joint,Thai Restaurant


Finally, it's time to run kmeans!

In [70]:
from sklearn.cluster import KMeans

In [71]:
# set number of clusters
kclusters = 5

# keep only the numeric category frequencies; the axis is named by keyword because
# pandas 2.0 no longer accepts it as a positional argument of drop()
toronto_grouped_clustering = toronto_grouped.drop(columns='Names')

# run k-means clustering; n_init is set explicitly so the result does not depend
# on which default the installed scikit-learn version uses
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [72]:
# add the cluster labels to venues_sorted as its first column

# Drop a column left by an earlier run of this cell first, so that re-running it
# replaces the labels instead of inserting a second 'Cluster Labels' column.
if 'Cluster Labels' in venues_sorted.columns:
    venues_sorted = venues_sorted.drop(columns='Cluster Labels')
venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

venues_sorted

Unnamed: 0,Cluster Labels,Names,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,1,"M1B Malvern, Rouge",Fast Food Restaurant,Print Shop,,,
1,0,"M1C Rouge Hill, Port Union, Highland Creek",Bar,Construction & Landscaping,,,
2,0,"M1E Guildwood, Morningside, West Hill",Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Mexican Restaurant
3,0,M1G Woburn,Coffee Shop,Korean BBQ Restaurant,,,
4,0,M1H Cedarbrae,Hakka Restaurant,Gas Station,Bank,Fried Chicken Joint,Thai Restaurant
5,0,M1J Scarborough Village,Grocery Store,Playground,,,
6,0,"M1K Kennedy Park, Ionview, East Birchmount Park",Department Store,Chinese Restaurant,Coffee Shop,Hobby Shop,Convenience Store
7,0,"M1L Golden Mile, Clairlea, Oakridge",Bus Line,Bakery,Soccer Field,Ice Cream Shop,Bus Station
8,0,"M1M Cliffside, Cliffcrest, Scarborough Village...",Motel,Intersection,American Restaurant,,
9,0,"M1N Birch Cliff, Cliffside West",Café,General Entertainment,College Stadium,Skating Rink,


Initial observation from the above df: whoa, a lot of the neighborhoods fall into cluster 0 :o

In [73]:
# want to add the corresponding longitude and latitude values from the original df to venues_sorted
df['Names'] = df['PostalCode'] + " " + df['Neighborhood']

# Index the coordinates by name once, instead of searching a list for every row.
# drop_duplicates keeps the first row for a repeated name.
coordinates_by_name = df.drop_duplicates('Names').set_index('Names')[['Latitude', 'Longitude']]

# A name that is not present in df is an error, not a silent NaN
unknown_names = venues_sorted.loc[~venues_sorted['Names'].isin(coordinates_by_name.index), 'Names']
if not unknown_names.empty:
    raise ValueError('names missing from df: {}'.format(unknown_names.tolist()))

venues_sorted['Latitude'] = venues_sorted['Names'].map(coordinates_by_name['Latitude'])
venues_sorted['Longitude'] = venues_sorted['Names'].map(coordinates_by_name['Longitude'])
venues_sorted.head()

Unnamed: 0,Cluster Labels,Names,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Latitude,Longitude
0,1,"M1B Malvern, Rouge",Fast Food Restaurant,Print Shop,,,,43.806686,-79.194353
1,0,"M1C Rouge Hill, Port Union, Highland Creek",Bar,Construction & Landscaping,,,,43.784535,-79.160497
2,0,"M1E Guildwood, Morningside, West Hill",Restaurant,Rental Car Location,Breakfast Spot,Medical Center,Mexican Restaurant,43.763573,-79.188711
3,0,M1G Woburn,Coffee Shop,Korean BBQ Restaurant,,,,43.770992,-79.216917
4,0,M1H Cedarbrae,Hakka Restaurant,Gas Station,Bank,Fried Chicken Joint,Thai Restaurant,43.773136,-79.239476


Visualize Toronto with the clusters

In [74]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [75]:
# create map centred on the geocoded Toronto coordinates (the same centre as the
# first map) rather than on the single neighborhood used for the practice request
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(venues_sorted['Latitude'], venues_sorted['Longitude'], venues_sorted['Names'], venues_sorted['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # labels run 0..kclusters-1, so index the palette directly; the previous
    # `cluster-1` sent cluster 0 to the last colour through negative indexing
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters