# Segmenting and Clustering Neighborhoods in Toronto

## Part 1: Create the dataframe of postal codes of Toronto

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Download the Wikipedia page listing the postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page_response = requests.get(url, timeout=5)
# Stop on an HTTP error status instead of trying to parse an error page.
page_response.raise_for_status()
soup = BeautifulSoup(page_response.content, "html.parser")
table = soup.table.tbody # first table of the page: the list of postal codes

# The first child of the table body is treated as the header row; its cells
# become the column names (postcode, borough and neighbourhood).
table_head = table.contents[0]
col = [
    child.string.replace('\n', '')
    for child in table_head.children
    if child != '\n'  # skip the newline text nodes between the cells
]

# Every remaining child is a data row; collect the rows into the 2-dimension list 'r'.
r = []
for e in table.contents[1:]:
    if e == '\n':  # newline text node between two rows
        continue
    ele_of_r = [s.replace('\n', '') for s in e.strings if s != '\n']
    # Ignore rows whose borough is 'Not assigned'.
    if ele_of_r[1] == 'Not assigned':
        continue
    # A row with a borough but a 'Not assigned' neighbourhood uses the
    # borough name as its neighbourhood.
    if ele_of_r[2] == 'Not assigned':
        ele_of_r[2] = ele_of_r[1]
    r.append(ele_of_r)
toronto_DF = pd.DataFrame(data=r, columns=col)

# More than one neighbourhood can exist in one postal code area.
# Collapse those rows into a single row per postal code: keep the first borough
# listed for the code and join the neighbourhood names with ", ".
# (groupby sorts by postal code, so the result is ordered by 'Postcode'.)
toronto_df2 = (
    toronto_DF
    .groupby('Postcode', as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)


print(toronto_df2.shape) # (number of rows, number of columns)
toronto_df2 

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Part 2: Get the geographical coordinates and add into the dataframe

In [3]:
!pip install geocoder
import geocoder # import geocoder



In [4]:
# try to use the Geocoder Python package
def get_geo_code(postal_code=None, loop_limit=9):
    """Return ``(latitude, longitude)`` for a Toronto postal code.

    The address ``'<postal_code>, Toronto, Ontario'`` is sent to
    ``geocoder.google``. The lookup is attempted at most ``loop_limit + 1``
    times; the first attempt that yields coordinates wins. When every attempt
    fails, the coordinates are taken from ``get_geo_code_csv`` instead.

    Parameters
    ----------
    postal_code : str or None
        Postal code to look up. ``None`` short-circuits to ``(None, None)``.
    loop_limit : int
        Number of retries after the first attempt.

    Returns
    -------
    tuple
        ``(latitude, longitude)``, or ``(None, None)`` when no postal code
        was given.
    """
    if postal_code is None:
        return (None, None)

    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(loop_limit + 1):
        g = geocoder.google(query)
        lat_lng_coords = g.latlng
        # Treat both None and an empty result as a failed attempt, and return
        # as soon as an attempt succeeds so that a success on the last allowed
        # attempt is not thrown away in favour of the CSV fallback.
        if lat_lng_coords:
            return (lat_lng_coords[0], lat_lng_coords[1])

    # Geocoder did not answer within the allowed attempts:
    # use the csv file to get the coordinates of the postal code.
    return get_geo_code_csv(postal_code)
    
# Coordinates table loaded from a local CSV file. get_geo_code_csv filters it on
# the 'Postal Code' column and reads the 'Latitude' and 'Longitude' columns.
geo_df = pd.read_csv('Geospatial_Coordinates.csv')

# use csv file to get the coordinates
def get_geo_code_csv(postal_code=None):
    """Look up ``(latitude, longitude)`` for ``postal_code`` in ``geo_df``.

    The first row of ``geo_df`` whose 'Postal Code' equals ``postal_code``
    supplies the 'Latitude' and 'Longitude' values. An ``IndexError`` is
    raised when no row matches.
    """
    is_match = geo_df['Postal Code'] == postal_code
    latitude = geo_df[is_match]['Latitude'].values[0]
    longitude = geo_df[is_match]['Longitude'].values[0]
    return (latitude, longitude)

# Resolve the coordinates of every postal code, in row order.
coordinates = [
    get_geo_code(postal_code=postal)
    for postal in toronto_df2['Postcode']
]

# Split the (latitude, longitude) pairs into the two new dataframe columns.
toronto_df2['Latitude'] = [latitude for latitude, _ in coordinates]
toronto_df2['Longitude'] = [longitude for _, longitude in coordinates]
toronto_df2

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Part 3: Explore and cluster the neighborhoods

In [5]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

### 0. Trim down the Dataframe to Neighborhoods in Toronto only

In [6]:
# Keep only the rows whose borough name contains "Toronto", restricted to the
# five columns used from here on, and renumber the rows from 0.
# A boolean mask replaces the row-by-row DataFrame.append loop: append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
toronto_columns = ['Postcode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude']
in_toronto = toronto_df2['Borough'].str.contains('Toronto', regex=False, na=False)

toronto_df2 = toronto_df2.loc[in_toronto, toronto_columns].reset_index(drop=True)
toronto_df2

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


### 1. Explore Neighborhoods in Toronto

In [7]:
# Define Foursquare Credentials and Version
#
# The credentials are read from the environment so that no secret is stored in
# the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. A key pair that has been committed to a notebook should
# be treated as leaked and rotated in the Foursquare developer console.

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn right here so a later API failure can be traced back to its cause.
    warnings.warn(
        'FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET is not set; '
        'Foursquare API requests will be rejected.'
    )


In [8]:
# Function to get the top 100 venues that are in a neighborhood within a radius of 1000 meters

LIMIT = 100 # maximum number of venues requested per neighbourhood

def getNearbyVenues(names, latitudes, longitudes, radius=1000, timeout=10):
    """Collect the venues Foursquare reports around each neighbourhood.

    One request is sent to the Foursquare ``venues/explore`` endpoint per
    ``(name, latitude, longitude)`` triple, using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` values.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood names and their coordinates, consumed in parallel.
    radius : int
        Search radius passed to the API (default 1000).
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found at all.

    Raises
    ------
    Whatever ``response.raise_for_status()`` raises when the API answers
    with an HTTP error status.
    """
    venue_columns = ['Neighbourhood',
                     'Neighbourhood Latitude',
                     'Neighbourhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # make the GET request; the query string is built (and escaped) from the parameters
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # surface an HTTP error here rather than as a KeyError on the payload below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without any category gets a missing value instead of an IndexError
                categories[0]['name'] if categories else None))

    # passing the columns explicitly keeps the schema stable for an empty result
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

In [10]:
# Query the venues around every neighbourhood of toronto_df2 and keep the
# combined result in a new dataframe called toronto_venues
toronto_venues = getNearbyVenues(
    names=toronto_df2['Neighbourhood'],
    latitudes=toronto_df2['Latitude'],
    longitudes=toronto_df2['Longitude'],
)

### 2. Analyze Each Neighborhood

In [11]:
# One-hot encode the venue categories: one indicator column per category,
# named after the category itself (no prefix).
venue_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighbourhood each venue belongs to.
venue_dummies['Neighbourhood'] = toronto_venues['Neighbourhood']

# Reorder so that the last column becomes the first one.
fixed_columns = [venue_dummies.columns[-1], *venue_dummies.columns[:-1]]
toronto_onehot = venue_dummies[fixed_columns]

In [12]:
# Average the one-hot rows per neighbourhood: each value becomes the share of
# the neighbourhood's venues that fall into that category.
toronto_grouped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()

In [13]:
# In case no venue was found for a neighbourhood, it is absent from the grouped
# dataframe: add it back with a frequency of 0 for every venue category.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.

present = set(toronto_grouped['Neighbourhood'])
# dict.fromkeys drops repeated names while keeping the order of toronto_df2,
# so each missing neighbourhood is added exactly once.
missing = [
    name
    for name in dict.fromkeys(toronto_df2['Neighbourhood'])
    if name not in present
]

if missing:
    zero_rows = pd.DataFrame(0, index=range(len(missing)), columns=toronto_grouped.columns)
    # set the name by column label rather than assuming it is the first column
    zero_rows['Neighbourhood'] = missing
    toronto_grouped = pd.concat([toronto_grouped, zero_rows], ignore_index=True)


In [14]:
# create the new dataframe with the top 10 venues for each neighborhood

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values of ``row``, best first.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]


num_top_venues = 10


def ordinal_label(position):
    """Return the 1-based ``position`` with its English ordinal suffix.

    Examples: 1 -> '1st', 2 -> '2nd', 3 -> '3rd', 4 -> '4th', 11 -> '11th',
    21 -> '21st'.
    """
    # 10..20 all take 'th' (this covers 11th, 12th and 13th); otherwise the
    # last digit decides.
    if 10 <= position % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return '{}{}'.format(position, suffix)


# create columns according to number of top venues
columns = ['Neighbourhood'] + [
    '{} Most Common Venue'.format(ordinal_label(rank))
    for rank in range(1, num_top_venues + 1)
]

# Start from an empty frame with the prepared columns and one row per neighbourhood.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# Fill the venue columns of every row with that neighbourhood's most common categories.
for position in range(toronto_grouped.shape[0]):
    top_venues = return_most_common_venues(toronto_grouped.iloc[position, :], num_top_venues)
    neighborhoods_venues_sorted.iloc[position, 1:] = top_venues
    

### 3. Cluster Neighborhoods

In [15]:
# run k-means to cluster Neighbourhoods into 5 clusters

# set number of clusters
kclusters = 5

# Feature matrix: the category frequencies without the name column.
# The axis is given by keyword; the positional form drop(label, 1) was removed in pandas 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering
# n_init is pinned to 10 (the former scikit-learn default) so the result does
# not depend on which default the installed version uses.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)



In [16]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels
# DataFrame.insert raises ValueError when the column already exists, so drop a
# label column left over from a previous run to keep this cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# add the cluster label and the top venues to the postal code / coordinate
# columns of toronto_df2, matching rows on the neighbourhood name
toronto_merged = toronto_df2.join(
    neighborhoods_venues_sorted.set_index('Neighbourhood'),
    on='Neighbourhood',
)

In [17]:
# draw the map with clustering marks

address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() found nothing: fail with a clear message instead of an
    # AttributeError on the next line
    raise RuntimeError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

# create map centred on the geocoded address
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per neighbourhood to the map
marker_rows = zip(
    toronto_merged['Latitude'],
    toronto_merged['Longitude'],
    toronto_merged['Neighbourhood'],
    toronto_merged['Cluster Labels'],
)
for lat, lon, poi, cluster in marker_rows:
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # rainbow[cluster-1]: label 0 gets index -1, i.e. the last colour of the
    # list, so every label still maps to its own colour
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [18]:
# Display the merged dataframe: postal code, borough, neighbourhood and
# coordinates, followed by the cluster label and the top 10 most common venue
# categories of each neighbourhood.
toronto_merged

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Coffee Shop,Pub,Pizza Place,Beach,Park,Breakfast Spot,Bar,Caribbean Restaurant,Sandwich Place,Japanese Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Coffee Shop,Pub,Café,Fast Food Restaurant,Spa,Pizza Place,Ice Cream Shop,Diner,Bakery
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,1,Indian Restaurant,Coffee Shop,Beach,Café,Burrito Place,Burger Joint,Restaurant,Italian Restaurant,Bakery,Brewery
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Coffee Shop,Bar,American Restaurant,Bakery,Café,Diner,Italian Restaurant,Vietnamese Restaurant,Brewery,Juice Bar
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Park,Bookstore,College Quad,College Gym,Coffee Shop,Gym / Fitness Center,Café,Trail,Event Space,Dumpling Restaurant
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197,1,Coffee Shop,Fast Food Restaurant,Italian Restaurant,Café,Pizza Place,Gym,Sushi Restaurant,Dessert Shop,Pharmacy,Mexican Restaurant
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678,1,Italian Restaurant,Coffee Shop,Sporting Goods Shop,Mexican Restaurant,Diner,Café,Skating Rink,Burger Joint,Baseball Field,Sushi Restaurant
7,M4S,Central Toronto,Davisville,43.704324,-79.38879,1,Coffee Shop,Sushi Restaurant,Italian Restaurant,Indian Restaurant,Café,Dessert Shop,Fast Food Restaurant,Restaurant,Sandwich Place,Gym
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316,2,Coffee Shop,Italian Restaurant,Café,Grocery Store,Park,Pizza Place,Sandwich Place,Pub,Bank,Thai Restaurant
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049,2,Coffee Shop,Sushi Restaurant,Italian Restaurant,Park,Gym / Fitness Center,Pizza Place,Café,Thai Restaurant,Grocery Store,Bagel Shop
