# Segmenting and Clustering Neighborhoods in Toronto

## Table of Contents

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Download and Explore Dataset</a>

2. <a href="#item2">Explore & Visualize Neighborhoods in Toronto</a>

3. <a href="#item3">Analyze Each Neighborhood</a>

4. <a href="#item4">Cluster Neighborhoods</a>

5. <a href="#item5">Examine Clusters</a>    
</font>
</div>

In [440]:
import pandas as pd
import numpy as np
import geocoder
import requests
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors

In [441]:
# set display options
pd.options.display.max_rows = 25
pd.options.display.max_columns  = 200
pd.options.display.max_colwidth = 2000

## 1. Download and Explore Dataset

#### Read the html table from the Wikipedia page using pandas

In [442]:
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
results = pd.read_html(url, attrs={'class': 'wikitable'})

#### Get the first dataframe from the resulting list and explore the first few rows

In [443]:
tor_boroughs = results[0]
print('Shape all records',tor_boroughs.shape)
tor_boroughs.head()

Shape all records (287, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Drop rows with Boroughs Not Assigned 

In [444]:
tor_boroughs_drop_na = tor_boroughs.drop(
    tor_boroughs[tor_boroughs.Borough == 'Not assigned'].index,
    axis=0).reset_index(drop=True)
print('Shape after dropping na',tor_boroughs_drop_na.shape)
tor_boroughs_drop_na.head(10)

Shape after dropping na (210, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


#### Combine Neighborghoods where more than one neighborhood exists in one postal code area

In [445]:
postcodes = tor_boroughs_drop_na['Postcode'].unique()

In [446]:
boroughs = []
neighborhoods = []

for code in postcodes:
    temp_postcode_df = tor_boroughs_drop_na[tor_boroughs_drop_na['Postcode'] == code]
    boroughs.append(temp_postcode_df['Borough'].sort_values().unique()[0])
    neighborhoods.append(', '.join(temp_postcode_df['Neighborhood'].sort_values().unique()))

In [447]:
tor_boroughs_combined = pd.DataFrame({'Postcode': postcodes, 'Borough': boroughs,
                                      'Neighborhood': neighborhoods}) 

In [448]:
print('Shape after combining neighborhoods',tor_boroughs_combined.shape)
tor_boroughs_combined.head(10)

Shape after combining neighborhoods (103, 3)


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Not assigned
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. 

In [449]:
# select all neighborhoods with Not Assigned and replace with value of Borough
tor_boroughs_combined.loc[
    tor_boroughs_combined[tor_boroughs_combined['Neighborhood'] == 'Not assigned'].index,'Neighborhood']= \
    tor_boroughs_combined[tor_boroughs_combined['Neighborhood'] == 'Not assigned']['Borough']

In [450]:
# Check sample if replaced
tor_boroughs_combined[tor_boroughs_combined['Borough'] == "Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighborhood
5,M9A,Queen's Park,Queen's Park


In [451]:
tor_boroughs_combined.shape

(103, 3)

#### Define function to get latitude and longitude from postal code

In [452]:
def get_postal_code_lat_lon(postal_code):
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('Toronto, ON {}, Canada'.format(postal_code))
      lat_lng_coords = g.latlng
    
    return lat_lng_coords

#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]

In [453]:
## Test function with sample postal code

In [454]:
# print('Sample latitude and longitude',get_postal_code_lat_lon('M4B'))

#### Use provide geocoded CSV due to issues with geocoder (failed above)

In [455]:
tor_boroughs_geocodes = pd.read_csv(r'https://cocl.us/Geospatial_data')
print('Shape geocodes', tor_boroughs_geocodes.shape)
tor_boroughs_geocodes.head(10)

Shape geocodes (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


#### Join Geocoded Postal codes with previous dataframe records

In [456]:
tor_boroughs_geocoded =  tor_boroughs_combined.merge(tor_boroughs_geocodes, how='left', left_on='Postcode', right_on='Postal Code')
tor_boroughs_geocoded.drop('Postal Code',axis=1,inplace=True)
print('Shape after joining lat lons geocodes', tor_boroughs_geocodes.shape)
tor_boroughs_geocoded.head(10)

Shape after joining lat lons geocodes (103, 3)


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## 2. Explore & Visualize Neighborhoods in Toronto

### Visualize Toronto Neighborhoods on a map

In [457]:
CLIENT_ID = 'VYRGXLH1CYLIRXGALW2GVEE1FTWG4EVTJ2LXJCFA0130WEDS' # your Foursquare ID
CLIENT_SECRET = 'HFYLKPQORHLRKKJ5HSARPFLZAJNC2CMCKOFDQHKPGMEB2RJY' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: VYRGXLH1CYLIRXGALW2GVEE1FTWG4EVTJ2LXJCFA0130WEDS
CLIENT_SECRET:HFYLKPQORHLRKKJ5HSARPFLZAJNC2CMCKOFDQHKPGMEB2RJY


#### Get map center

In [458]:
address = 'Toronto, Ontario, Canada'
geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


#### Map boroughs with name Toronto

In [459]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# get a subset of boroughs just the ones with Toronto in the name
tor_boroughs = tor_boroughs_geocoded[tor_boroughs_geocoded['Borough'].str.contains('Toronto')]

# add markers to map
for lat, lng, label in zip(tor_boroughs['Latitude'], tor_boroughs['Longitude'], tor_boroughs['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto


### Explore venues Neighborhoods in Toronto

#### Define a function to get the first top 100 venues that are in a given postcode within a radius of 500 meters.

In [460]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Get venues for select toronto postcodes

In [None]:
tor_venues = getNearbyVenues(names=tor_boroughs['Neighborhood'],
                                   latitudes=tor_boroughs['Latitude'],
                                   longitudes=tor_boroughs['Longitude']
                                  )

Harbourfront
Queen's Park
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
Riverdale, The Danforth West
Design Exchange, Toronto Dominion Centre


#### Check the size of the resulting dataframe

In [None]:
print(tor_venues.shape)
tor_venues.head()

#### How many venues were returned for each postcode

In [None]:
tor_venues.groupby('Neighborhood').count()

#### Find out how many unique categories can be curated from all the returned venues

In [None]:
print('There are {} uniques categories.'.format(len(tor_venues['Venue Category'].unique())))

## 3. Analyze each Neighborhood

In [None]:
# one hot encoding
tor_onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = tor_venues['Neighborhood'] 

# # move neighborhood column to the first column
tor_cols = tor_onehot.columns.values
neigh_col = np.where(tor_cols == 'Neighborhood')[0][0]
fixed_columns = (
    [tor_onehot.columns[neigh_col]] + 
    list(tor_onehot.columns[:neigh_col]) + 
    list(tor_onehot.columns[(neigh_col +1):])
)
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

Examine the size of the dataframe

In [None]:
tor_onehot.shape

#### Group rows by neighborhood  by taking the mean of the frequency of occurrence of each category

In [None]:
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()
tor_grouped

#### Check the size of the new dataframe

In [None]:
tor_grouped.shape

#### Print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in tor_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = tor_grouped[tor_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

#### Define a function to put the top n most common venues into a dataframe

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Display the top 10 most common venues for each neighborhood in a datafarame

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

## 4. Cluster Neighborhoods

#### Run k-means to cluster neighborhoods into 5 clusters

In [None]:
# set number of clusters
kclusters = 5

tor_grouped_clustering = tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100] 

In [None]:
tor_grouped_clustering.head()

#### Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

tor_merged = tor_boroughs

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
tor_merged = tor_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

tor_merged.head() # check the last columns!

#### Map the resulting cluster

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighborhood'], tor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

#### Cluster 1

In [None]:
tor_merged.loc[tor_merged['Cluster Labels'] == 0, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

#### Cluster 2

In [None]:
tor_merged.loc[tor_merged['Cluster Labels'] == 1, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

#### Cluster 3

In [None]:
tor_merged.loc[tor_merged['Cluster Labels'] == 2, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

#### Cluster 4

In [None]:
tor_merged.loc[tor_merged['Cluster Labels'] == 3, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

#### Cluster 5

In [None]:
tor_merged.loc[tor_merged['Cluster Labels'] == 4, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]