In [1]:
from IPython.display import display
from tqdm import tqdm

# Segmenting and Clustering Neighborhoods in Toronto

This assignment is about data clustering on example of Toronto neigborhoods.


## Part 1 - Creating dataset
------
Before we can start with machine learning algorithms, we have to create and prepare our dataset. This is what I will do in thist part. So after importing necessary libraries this code will scarpe [wikipedia page](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M) containing needed data. To do so I have used pandas function `read_html`, which works like magic.

In [2]:
import pandas as pd 
import numpy as np

tqdm.pandas()
import requests

In [3]:
link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

r=requests.get(link)

tables = pd.read_html(r.text)
raw_df = tables[0]
raw_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


I is necessary to handle `null` values according to assignment. Even thought assignment says that there could be some postal codes with borought assigned but neighbourhood not assigned, I have noticed that this doesn't occur in this case. Therefore part of code responsible for handling this situation won't have any impact on the dataset.

In [4]:
df = raw_df.copy().replace({'Not assigned': None})
df.dropna(subset=['Borough'],inplace=True)
df['Neighbourhood'][df['Neighbourhood'].isnull()] = df['Borough'][df['Neighbourhood'].isnull()]
df.reset_index(drop=True,inplace=True)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Now we can check first 10 rows of our dataset and next cell, will show shape of dataset. This closes part 1.

In [5]:
df.shape

(103, 3)

## Part 2 - Adding geospatial data
-----
In this part I add geospatial data to dataset. Unfortunatelly I was unable to use geocoder, so use of csv file is necessary. 

In [6]:
df = df.join(pd.read_csv('Geospatial_Coordinates.csv').set_index('Postal Code'), on='Postal Code')
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3 - Final segmentation and clustering
-------
In this part I will pull data about venues in each distinct of Toronto and than cluster distincts according to those data.

At first some necesarry imports.

In [7]:
import sklearn
import folium

In [8]:
longitude=(df['Longitude'].max()+df['Longitude'].min())/2
latitude=(df['Latitude'].max()+df['Latitude'].min())/2

Now lest show distincts on map to see what we already have

In [9]:
f = folium.Figure(width=600, height=400)
my_map = folium.Map(location=[latitude, longitude], zoom_start=10).add_to(f)

hoods = folium.map.FeatureGroup()
for index, row in df.iterrows():
    label = '{}, {}'.format(row['Borough'], row['Neighbourhood'])
    label = folium.Popup(label)
    hoods.add_child(
        folium.CircleMarker(
        [row['Latitude'],row['Longitude']],
        radius=5,
        popup=label,
        parse_html=False)
    )

my_map.add_child(hoods)
display(f)

Using foursquare I will get as much venues as possible for each Toronto distinct. 

In [10]:
CLIENT_ID = 'ZGBSSJDBVOA5V1KT5BKZJSLJMN2RIRJDS0BRABFHTD0XXVKJ'
CLIENT_SECRET = 'PLPRZCGGX3BFDT13Y4FPS2RGSIA2L3PTIQHGMD5MMMM3ZHE4'
ACCESS_TOKEN = '' # your FourSquare Access Token
VERSION = '20210101'
LIMIT = 300

In [11]:
search_query = 'Toronto, Toronto'
radius = 3
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)

# url = 'https://api.foursquare.com/v2/venues/search?near={}&limit={}&client_id={}&client_secret={}&v={}'.format(search_query, LIMIT, CLIENT_ID, CLIENT_SECRET, VERSION)

results = requests.get(url).json()

In [12]:
filtered_columns = ['location.postalCode', 'location.lat', 'location.lng', 'categories']

# venues = pd.json_normalize(results['response']['venues']).loc[:, filtered_columns]
venues = pd.DataFrame()

for index, row in df.iterrows():
    url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, row[['Latitude', 'Longitude']].values[0], row[['Latitude', 'Longitude']].values[1], VERSION, radius, LIMIT)
    results = requests.get(url).json()
    venues = venues.append(pd.json_normalize(results['response']['venues']).loc[:, filtered_columns])

display(venues.head())
display(venues.tail())

len(venues)

Unnamed: 0,location.postalCode,location.lat,location.lng,categories
0,,43.751976,-79.33214,"[{'id': '4bf58dd8d48988d163941735', 'name': 'P..."
1,,43.752672,-79.326351,"[{'id': '52f2ab2ebcbc57f1066b8b4f', 'name': 'B..."
2,M3A 1Z5,43.760334,-79.326906,"[{'id': '4bf58dd8d48988d1c5941735', 'name': 'S..."
3,M3A 1Z5,43.760341,-79.325519,"[{'id': '52dea92d3cf9994f4e043dbb', 'name': 'D..."
4,M3A 1Z5,43.75984,-79.324719,"[{'id': '4bf58dd8d48988d144941735', 'name': 'C..."


Unnamed: 0,location.postalCode,location.lat,location.lng,categories
104,,43.631302,-79.518623,"[{'id': '4bf58dd8d48988d1fd941735', 'name': 'S..."
105,M8C 2G9,43.62693,-79.527209,"[{'id': '4bf58dd8d48988d11d951735', 'name': 'B..."
106,M8Z 2G9,43.626726,-79.526092,"[{'id': '4bf58dd8d48988d124951735', 'name': 'A..."
107,,43.630783,-79.519706,"[{'id': '4bf58dd8d48988d196941735', 'name': 'H..."
108,M8Z 2Z2,43.634029,-79.519712,"[{'id': '4bf58dd8d48988d16a941735', 'name': 'B..."


11546

Now there is about 11.5k rows in venues table. In next cell I get main categories of each venue and than drop venues without category and duplicate rows.

In [13]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

venues['categories'] = venues.apply(get_category_type, axis=1)
venues.dropna(inplace= True, subset= ['categories'])
venues.head()

venues.drop_duplicates(inplace=True)
len(venues)

10324

Now there is about 10k rows in venues table. It seems insufficient to me to sort venues by distance from center of distinct only so I will use postal code from foursquare and if missing find it by location using geolocator. This can be quite time consuming (about one hour of runtime).

In [14]:
from geopy.geocoders import Nominatim
import re
geolocator = Nominatim(user_agent="foursquare_agent")

In [15]:
def clear_post_code(row):
    postal_code = str(row['location.postalCode'])
    if postal_code == 'nan':
        location = geolocator.reverse(row[['location.lat', 'location.lng']])
        return str.upper(re.findall('(, Ontario, )(...)', location.address)[0][1])
    return str.upper(postal_code[0:3])

venues['location.postalCode'] = venues.progress_apply(clear_post_code, axis= 1)
venues.head()

100%|██████████| 10324/10324 [50:18<00:00,  3.42it/s]


Unnamed: 0,location.postalCode,location.lat,location.lng,categories
0,M3A,43.751976,-79.33214,Park
1,M3A,43.752672,-79.326351,Bus Stop
2,M3A,43.760334,-79.326906,Sandwich Place
3,M3A,43.760341,-79.325519,Discount Store
4,M3A,43.75984,-79.324719,Caribbean Restaurant


Now I will create pivot table counting number of venue category in each distinct. 

In [18]:
categories = venues['categories'].drop_duplicates().values
venues_pivot = venues[['location.postalCode', 'categories']].pivot_table(index='location.postalCode', columns= 'categories', aggfunc=np.count_nonzero).fillna(0)/2

In [19]:
pd.DataFrame(venues_pivot.to_records())

Unnamed: 0,location.postalCode,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,...,Watch Shop,Water Park,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CAN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,G7X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,N4W,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,N7S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145,O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,ON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
joined = df.join(pd.DataFrame(venues_pivot.to_records()).set_index('location.postalCode'), on='Postal Code').fillna(0)

Finally I can use kmeans to cluster distincts. 

In [23]:
from sklearn.cluster import KMeans 

In [43]:
kclusters = 5

df_grouped_clustering = joined.drop(['Postal Code',	'Borough',	'Neighbourhood',	'Latitude',	'Longitude'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

In [44]:
try:
    df.insert(0, 'Cluster Labels', kmeans.labels_)
except ValueError as e:
    df['Cluster Labels'] = kmeans.labels_

Now lets take a look at our new map of clustered distincts:

In [47]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [48]:
f2 = folium.Figure(width=600, height=400)
my_map2 = folium.Map(location=[latitude, longitude], zoom_start=10).add_to(f2)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

hoods = folium.map.FeatureGroup()
for index, row in df.iterrows():
    label = '{}, {}'.format(row['Borough'], row['Neighbourhood'])
    label = folium.Popup(label)
    cluster = row['Cluster Labels']
    hoods.add_child(
        folium.CircleMarker(
        [row['Latitude'],row['Longitude']],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7,
        parse_html=False)
    )

my_map2.add_child(hoods)
display(f2)