# Capstone: Segmenting and Clustering Neighborhoods in Toronto

This notebook satisfies Coursera Data Science Capstone Week 3 Project.

## Part 1: Scrape Web for Dataset and Cleanse Table

In [1]:
# Import packages
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re

In [2]:
# Scrape Wikipedia for information about Toronto Neighborhoods using BeautifulSoup
target_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
target_url_content = requests.get(target_url).content
soup = BeautifulSoup(target_url_content,'lxml') 
html_table = soup.find_all('table')[0] 

# Convert HTML table to a data frame
df = pd.read_html(str(html_table))[0]

# Assign the column names to the first row, then trim the first row
df.columns = df.iloc[0]
df = df[1:]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [3]:
df.shape

(288, 3)

In [4]:
# Remove the rows where the Borough == 'Not Assigned'
df = df[df.Borough != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [5]:
df.shape

(211, 3)

In [6]:
# Group Neighborhoods by Postcode (Usually avoid lambda functions but I think it makes more sense to use one here)
grouped_df = df.groupby(['Postcode','Borough'], as_index=False).agg(lambda x: ', '.join(x))

grouped_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# If there aren't any neighborhoods listed, assign the neighborhood as the borough name

# Reassign rows where boolean table == true
no_neighborhood_rows_df = grouped_df.Neighbourhood == 'Not assigned'
grouped_df.loc[no_neighborhood_rows_df, 'Neighbourhood'] = grouped_df.loc[no_neighborhood_rows_df, 'Borough']  
grouped_df[no_neighborhood_rows_df]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


In [8]:
# Rename dataframe
cleansed_df = grouped_df
cleansed_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
cleansed_df.shape

(103, 3)

## Part 2: Adding Latitude & Longitude from CSV

In [10]:
coordinates_df = pd.read_csv('http://cocl.us/Geospatial_data')
coordinates_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
cleansed_coord_df = cleansed_df.join(coordinates_df.set_index('Postal Code'), on='Postcode')
cleansed_coord_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Exploratory Data Analysis with Foursquare API

In [12]:
import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
print('imports successful')

imports successful


In [13]:
# Get the coordinates for the City of Toronto (for folium)
address = "Toronto"
geolocator = Nominatim(user_agent="toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The coordinates for the City of Toronto are {}, {}.'.format(latitude, longitude))

The coordinates for the City of Toronto are 43.653963, -79.387207.


First, let's map out the boroughs and neighborhoods on a map of the city.

In [14]:
# Create a folium map around the coordinates of the city
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10.3)

# Add markers for each neighbourhood
for lat, long, borough, neighbourhood in zip(cleansed_coord_df['Latitude'], cleansed_coord_df['Longitude'], cleansed_coord_df['Borough'], cleansed_coord_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, long],
                        radius=5,
                        popup=label,
                        color='blue',
                        fill=True,
                        fill_opacity=0.1,
                        parse_html=False).add_to(map_toronto)  

# Show map with labels on neighbourhood 
map_toronto

Now that we have a map of the city, we can use Foursquare to explore venues.

In [15]:
# Provide API Keys for connection
CLIENT_ID = "2Q2AQSZBXD1TK0EA5SGGOYPSSP2Q2IGHLDFY4YSZ0UL0EIDS"
CLIENT_SECRET_KEY= "OQUJ0TNNX1MSNBTPZWGHJIL20DMH1WVLFC2Y1FGFOSYA154A"
VERSION = "20180604" # Older API Version
LIMIT = 100
radius = 500

In [16]:
# Make a list to store the venues (to convert to df later)
venues_list = []

# Loop thru the cleansed coordinate dataframe, get the venues in them, then get the coordinates and categories of venues
for lat, long, post, borough, neighbourhood in zip(cleansed_coord_df['Latitude'], cleansed_coord_df['Longitude'], cleansed_coord_df['Postcode'], cleansed_coord_df['Borough'], cleansed_coord_df['Neighbourhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET_KEY,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    results = requests.get(url).json()['response']['groups'][0]['items']
    for venue in results:
        venues_list.append((post, 
                            borough,
                            neighbourhood,
                            lat, 
                            long, 
                            venue['venue']['name'], 
                            venue['venue']['location']['lat'], 
                            venue['venue']['location']['lng'],  
                            venue['venue']['categories'][0]['name']))

In [17]:
# Convert the venues list to a dataframe and rename columns
venues_df = pd.DataFrame(venues_list)
venues_df.columns = ['Postcode', 'Borough', 'Neighbourhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [18]:
# Check the number of venues
venues_df.shape

(2258, 9)

Show the number of venues by postal code

In [19]:
# Get the number of venues in each postal code
venues_df.groupby(['Postcode', 'Borough', 'Neighbourhood'])['VenueName'].count()

Postcode  Borough           Neighbourhood                                                                                                                         
M1B       Scarborough       Rouge, Malvern                                                                                                                             1
M1C       Scarborough       Highland Creek, Rouge Hill, Port Union                                                                                                     2
M1E       Scarborough       Guildwood, Morningside, West Hill                                                                                                          8
M1G       Scarborough       Woburn                                                                                                                                     3
M1H       Scarborough       Cedarbrae                                                                                                                            

### K-Means clustering for venues (categorical data)

Generate a one-hot encoded table of venue categories by postal code

In [20]:
# Perform one hot encoding on venues category
onehot_df = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
onehot_df['Postcode'] = venues_df['Postcode'] 
onehot_df['Borough'] = venues_df['Borough'] 
onehot_df['Neighborhoods'] = venues_df['Neighbourhood'] 

# move postal, borough and neighborhood column to the first column
fixed_columns = list(onehot_df.columns[-3:]) + list(onehot_df.columns[:-3])
onehot_df = onehot_df[fixed_columns]

print(onehot_df.shape)
onehot_df.head()

(2258, 280)


Unnamed: 0,Postcode,Borough,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,"Rouge, Malvern",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Group the one hot values by postal code
venues_freq_df = onehot_df.groupby(['Postcode', 'Borough', 'Neighborhoods']).mean().reset_index()
print(venues_freq_df.shape)
venues_freq_df.head()

(99, 280)


Unnamed: 0,Postcode,Borough,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,"Rouge, Malvern",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,Scarborough,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,Scarborough,Cedarbrae,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Number of top venues to record (ie top 10)
num_top_venues = 10

# Generate column names
area_cols = ['Postcode', 'Borough', 'Neighborhoods']
freq_cols = []
for i in range(num_top_venues):
    freq_cols.append('#{} Most Common Venue Category'.format(i + 1))

cols = area_cols + freq_cols

# Build the dataframe
neighborhood_venues_df = pd.DataFrame(columns=cols)
neighborhood_venues_df['Postcode'] = venues_freq_df['Postcode']
neighborhood_venues_df['Borough'] = venues_freq_df['Borough']
neighborhood_venues_df['Neighborhoods'] = venues_freq_df['Neighborhoods']

print(venues_freq_df.shape)
for i in range(venues_freq_df.shape[0]):
    row_categories = venues_freq_df.iloc[i, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhood_venues_df.iloc[i, 3:] = row_categories_sorted.index.values[0:num_top_venues]

neighborhood_venues_df.sort_values(freq_cols, inplace=True)
neighborhood_venues_df

(99, 280)


Unnamed: 0,Postcode,Borough,Neighborhoods,#1 Most Common Venue Category,#2 Most Common Venue Category,#3 Most Common Venue Category,#4 Most Common Venue Category,#5 Most Common Venue Category,#6 Most Common Venue Category,#7 Most Common Venue Category,#8 Most Common Venue Category,#9 Most Common Venue Category,#10 Most Common Venue Category
65,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Terminal,Airport Lounge,Plane,Coffee Shop,Sculpture Garden,Boat or Ferry,Boutique,Bar,Airport Gate
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",American Restaurant,Motel,Diner,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Yoga Studio
30,M3N,North York,Downsview Northwest,Athletics & Sports,Gym / Fitness Center,Liquor Store,Grocery Store,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore
4,M1H,Scarborough,Cedarbrae,Bakery,Bank,Caribbean Restaurant,Hakka Restaurant,Athletics & Sports,Thai Restaurant,Fried Chicken Joint,Dumpling Restaurant,Doner Restaurant,Donut Shop
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",Bakery,Bus Line,Soccer Field,Metro Station,Fast Food Restaurant,Bus Station,Intersection,Park,Empanada Restaurant,Ethiopian Restaurant
73,M6H,West Toronto,"Dovercourt Village, Dufferin",Bakery,Pharmacy,Supermarket,Park,Bar,Furniture / Home Store,Café,Music Venue,Middle Eastern Restaurant,Bank
90,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ...",Bank,Yoga Studio,Electronics Store,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Empanada Restaurant,Discount Store
74,M6J,West Toronto,"Little Portugal, Trinity",Bar,Coffee Shop,Asian Restaurant,Bakery,New American Restaurant,French Restaurant,Vietnamese Restaurant,Restaurant,Men's Store,Café
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",Bar,Moving Target,Yoga Studio,Eastern European Restaurant,Dog Run,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Electronics Store
93,M9M,North York,"Emery, Humberlea",Baseball Field,Yoga Studio,Electronics Store,Doner Restaurant,Donut Shop,Drugstore,Dumpling Restaurant,Eastern European Restaurant,Empanada Restaurant,Discount Store


In [23]:
kclusters = 3

venues_freq_clustering_df = venues_freq_df.drop(['Postcode', 'Borough', 'Neighborhoods'], 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues_freq_clustering_df)

clustered_df = coordinates_df

print(clustered_df.shape)
print(kmeans.labels_)
clustered_df['Cluster'] = kmeans.labels_

clustered_df = clustered_df.join(neighborhood_venues_df.drop(['Borough', 'Neighborhoods'], 1).set_index('Postcode'), on='Postcode')
clustered_df.sort_values(['Cluster'] + freq_cols, inplace=True)
clustered_df

(103, 3)
[0 0 0 0 0 2 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


ValueError: Length of values does not match length of index