# Section 1: Prepare source data

Import required libraries

In [99]:
#!pip3 install pandas
#!pip3 install lxml
import pandas as pd
import numpy as np
import lxml

Read html to dataframe

In [114]:
# Parse every HTML table on the Wikipedia page; the first one is used as the source table.
wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df_source = wiki_tables[0]
df_working = df_source  # alias of df_source, not a copy
df_working.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Drop rows whose 'Borough' is 'Not assigned'

In [117]:
# Keep only rows with an assigned borough and renumber the index from 0.
has_borough = df_working['Borough'] != 'Not assigned'
df_working = df_working[has_borough].reset_index(drop=True)
df_working1 = df_working
df_working1.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


Combine rows with the same 'Postal code' into one row, joining their 'Neighborhood' values with the delimiter ', '

In [130]:
# Build one comma-separated 'Neighborhood' string per postal code.
# Missing names are replaced by the 'Not assigned' marker first, because
# ', '.join raises TypeError when it meets NaN. sort=False keeps the postal
# codes in their order of first appearance.
neighborhoods_by_code = (
    df_working1
    .assign(Neighborhood=df_working1['Neighborhood'].fillna('Not assigned'))
    .groupby('Postal code', sort=False)['Neighborhood']
    .agg(', '.join)
    .rename('Neighborhood_new')
    .reset_index()
)

# Swap the per-row neighbourhood for the combined one and keep a single row
# per distinct (postal code, borough, combined neighbourhoods) combination.
df_working2 = (
    df_working1
    .drop(columns='Neighborhood')
    .merge(neighborhoods_by_code, on='Postal code')
    .drop_duplicates()
)

# The source separates neighbourhoods inside one cell with " / "; normalise to ", ".
# Assigning the result back (rather than Series.replace(..., inplace=True) on a
# selected column) guarantees that the frame itself is updated.
df_working2['Neighborhood_new'] = df_working2['Neighborhood_new'].str.replace(' / ', ', ', regex=False)
df_working3 = df_working2.rename(columns={'Neighborhood_new': 'Neighborhood'})

df_working3.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Replace 'Not assigned' values of field 'Neighborhood' with the value of field 'Borough'

In [131]:
# Where the neighbourhood is the 'Not assigned' marker, use the borough name instead.
is_unnamed = df_working3['Neighborhood'] == 'Not assigned'
df_working3['Neighborhood'] = df_working3['Neighborhood'].mask(is_unnamed, df_working3['Borough'])
df_working4 = df_working3
df_working4.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Retrieve dataframe shape

In [133]:
# Section 1 output: df_output1 is another name for df_working4 (no copy is made).
df_output1 = df_working4
# Display (number of rows, number of columns) of the cleaned table.
df_output1.shape

(103, 3)

--------------------------

# Section 2: Merge Geo Data

Read csv to dataframe

In [134]:
# Load the table that maps each postal code to a latitude/longitude pair.
geo_csv_url = 'https://cocl.us/Geospatial_data'
df_geo_source = pd.read_csv(geo_csv_url)
df_geo_working = df_geo_source  # alias of df_geo_source, not a copy
df_geo_working.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge 2 dataframes

In [136]:
# Inner-join on the postal code (spelled 'Postal code' on the left and
# 'Postal Code' on the right), then discard the redundant right-hand key column.
df_geo_working1 = pd.merge(
    df_output1,
    df_geo_working,
    left_on='Postal code',
    right_on='Postal Code',
).drop(columns='Postal Code')

df_geo_working2 = df_geo_working1
df_geo_working2.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Retrieve dataframe shape

In [137]:
# Section 2 output: df_output2 is another name for df_geo_working2 (no copy is made).
df_output2 = df_geo_working2
# Display (number of rows, number of columns) of the merged table.
df_output2.shape

(103, 5)

-----------------------------

# Section 3: Cluster Neighborhoods in Toronto

Import required libraries

In [147]:
import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# json_normalize is a top-level pandas function since pandas 1.0; the old
# pandas.io.json import path no longer works on current pandas. Fall back to
# it only for old installations.
try:
    from pandas import json_normalize # transforming json file into a pandas dataframe library
except ImportError:
    from pandas.io.json import json_normalize

import matplotlib.cm as cm  # Matplotlib and associated plotting modules
import matplotlib.colors as colors  # Matplotlib and associated plotting modules

#!pip3 install scikit-learn
from sklearn.cluster import KMeans

import folium # map rendering library

Filter only 'Borough' containing 'Toronto'

In [162]:
# Restrict the table to boroughs whose name contains "Toronto".
df_source3 = df_output2
in_toronto = df_source3['Borough'].str.contains("Toronto")
df_tor_working1 = df_source3.loc[in_toronto]
df_tor_working1.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


Show map of Toronto

In [152]:
# retrieve Toronto latitude, longitude
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_ontario")
# An explicit timeout avoids spurious failures from geopy's short default.
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

In [157]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per postal code; its popup reads
# "<postal code> - <neighborhood> - <borough>"
for lat, lng, postal, borough, neighborhood in zip(df_tor_working1['Latitude'], df_tor_working1['Longitude'], df_tor_working1['Postal code'], df_tor_working1['Borough'], df_tor_working1['Neighborhood']):
    label = '{} - {} - {}'.format(postal, neighborhood, borough)
    # parse_html=True makes folium escape the label text (names contain quotes, "/").
    label = folium.Popup(label, parse_html=True)
    # parse_html is an option of Popup, not of CircleMarker, so it is not passed here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

Define Foursquare Parameters

In [166]:
import os

# Foursquare credentials are read from the environment so that secrets are never
# saved in the notebook; unset variables fall back to '' exactly as before.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # sent as the `v` query parameter of the API request
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

Create dataframe with venues

In [175]:
# Query the Foursquare "explore" endpoint once per postal code and collect one
# record per returned venue, tagged with the postal code it was found near.
explore_url = 'https://api.foursquare.com/v2/venues/explore'
venue_columns = ['Postal code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude', 'Venue_Name', 'Venue_Category', 'Venue_Latitude', 'Venue_Longitude']

venues_list = []
for lat, lng, postal, borough, neighborhood in zip(df_tor_working1['Latitude'], df_tor_working1['Longitude'], df_tor_working1['Postal code'], df_tor_working1['Borough'], df_tor_working1['Neighborhood']):
    # Let requests build and URL-encode the query string instead of formatting it by hand.
    response = requests.get(
        explore_url,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Surface HTTP errors (bad credentials, quota exceeded, ...) as a clear
    # HTTPError rather than a KeyError while digging through the JSON below.
    response.raise_for_status()

    results = response.json()["response"]['groups'][0]['items']
    for item in results:
        venue = item['venue']
        categories = venue.get('categories') or []
        if not categories:
            # A venue without any category cannot be one-hot encoded later; skip it
            # instead of failing with IndexError on categories[0].
            continue
        venues_list.append((
            postal,
            borough,
            neighborhood,
            lat,
            lng,
            venue['name'],
            categories[0]['name'],
            venue['location']['lat'],
            venue['location']['lng']
            ))

# Passing the column names to the constructor also works when no venue was
# returned at all (assigning .columns on an empty frame raises a length mismatch).
df_ven_working1 = pd.DataFrame(venues_list, columns=venue_columns)
df_ven_working1.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Venue_Name,Venue_Category,Venue_Latitude,Venue_Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,Bakery,43.653447,-79.362017
1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,Spa,43.654735,-79.359874


Analyze Each 'Neighborhood'

In [204]:
# One-hot encode the venue category: one indicator column per category name
# (empty prefix, so the column names are the bare category names).
df_ven_working3 = pd.get_dummies(df_ven_working1[['Venue_Category']], prefix="", prefix_sep="")

# Put the neighbourhood name in front of the indicator columns. The trailing
# underscore keeps the name distinct from a venue category called 'Neighborhood'.
df_ven_working3.insert(0, 'Neighborhood_', df_ven_working1['Neighborhood'])

# Averaging the indicators per neighbourhood gives, for each category, the
# share of that neighbourhood's venues that fall into it.
per_neighborhood = df_ven_working3.groupby('Neighborhood_')
df_ven_working4 = per_neighborhood.mean().reset_index()
df_ven_working4.head()

Unnamed: 0,Neighborhood_,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.071429,0.071429,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.016129,0.0,0.0,0.0,0.0,0.0,0.016129


Put top 10 venues into dataframe

In [205]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first element of `row` is skipped; the remaining entries are sorted
    in descending order of value and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

In [245]:
num_top_venues = 10


def _ordinal(n):
    """Return n followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# create columns according to number of top venues
# (an explicit suffix rule replaces the former bare `except:` around a list lookup)
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# Rank the categories of every neighbourhood row, then build the frame once
# instead of filling it cell range by cell range.
top_venues = [
    return_most_common_venues(df_ven_working4.iloc[ind, :], num_top_venues)
    for ind in range(df_ven_working4.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=df_ven_working4.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', df_ven_working4['Neighborhood_'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Farmers Market,Seafood Restaurant,Restaurant,Bakery,Italian Restaurant,Beer Bar,Café
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Nightclub,Grocery Store,Stadium,Burrito Place,Restaurant,Climbing Gym,Performing Arts Venue
2,Business reply mail Processing CentrE,Light Rail Station,Auto Workshop,Pizza Place,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park,Spa
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Airport,Harbor / Marina,Plane,Rental Car Location,Sculpture Garden,Boat or Ferry,Boutique
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Burger Joint,Ice Cream Shop,Fried Chicken Joint,Bubble Tea Shop,Salad Place,Falafel Restaurant


Run *k*-means to cluster the neighborhoods into 5 clusters

In [246]:
# set number of clusters
kclusters = 5

# prepare dataframe for clustering: every column except the neighbourhood name.
# The keyword form is required; DataFrame.drop no longer accepts `axis` positionally.
toronto_grouped_clustering = df_ven_working4.drop(columns='Neighborhood_')

# run k-means clustering
# n_init is pinned so the result does not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 3, 0, 0, 1, 1, 0, 1, 1, 2, 0,
       1, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1])

Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [247]:
# add clustering labels
# DataFrame.insert raises ValueError if the column already exists, so drop a
# 'Cluster Labels' column left by a previous run to keep this cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Farmers Market,Seafood Restaurant,Restaurant,Bakery,Italian Restaurant,Beer Bar,Café
1,0,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Nightclub,Grocery Store,Stadium,Burrito Place,Restaurant,Climbing Gym,Performing Arts Venue
2,1,Business reply mail Processing CentrE,Light Rail Station,Auto Workshop,Pizza Place,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park,Spa
3,1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Airport,Harbor / Marina,Plane,Rental Car Location,Sculpture Garden,Boat or Ferry,Boutique
4,0,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Burger Joint,Ice Cream Shop,Fried Chicken Joint,Bubble Tea Shop,Salad Place,Falafel Restaurant


In [241]:
# Re-display the Toronto-only frame that the following cell joins with the cluster labels.
df_tor_working1.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [248]:
# Attach each neighbourhood's cluster label and top venues to the Toronto rows
# (which already carry latitude/longitude), matching on the neighbourhood name.
venues_by_neighborhood = neighborhoods_venues_sorted.set_index('Neighborhood')
toronto_merged = df_tor_working1.join(venues_by_neighborhood, on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Park,Pub,Bakery,Breakfast Spot,Café,Theater,Mexican Restaurant,Shoe Store,Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Sushi Restaurant,Diner,Yoga Studio,Burrito Place,Spa,Beer Bar,Italian Restaurant,Juice Bar,Sandwich Place
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Café,Middle Eastern Restaurant,Bubble Tea Shop,Restaurant,Cosmetics Shop,Japanese Restaurant,Tea Room,Hotel
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Italian Restaurant,Cocktail Bar,Gastropub,American Restaurant,Department Store,Gym,Clothing Store,Farmers Market
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Trail,Neighborhood,Pub,Health Food Store,Dog Run,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Yoga Studio


Visualize the resulting clusters

In [249]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, evenly spaced
# along matplotlib's rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# Rows that found no match in the join have a NaN cluster label (which also
# turns the column into floats); skip them and restore integer labels so they
# can be used as list indices.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels'].astype(int)):
    # The popup shows a 1-based cluster number.
    label = folium.Popup('Cluster ' + str(cluster+1) + ': ' + str(poi), parse_html=True)
    # Index the palette with the label itself; `cluster-1` made cluster 0 wrap
    # around to the last colour through negative indexing.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters