# Introduction

Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto


In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Import the Data

Get the wiki page with the table that needs to be extracted

In [2]:
# Wikipedia page holding the table of Toronto ("M") postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [3]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(r.text, 'lxml')
# Take the first <tbody> in the document and collect every table row inside it.
table = soup.find('tbody')
rows = table.find_all('tr')

# Column Headers and Row Extraction

Get the column headers from the html file and put them as the header for the data frame.
Then extract each row's data and put it in the data frame.  Remove the newline at the end of each row.

In [4]:
# The first table row carries the <th> header cells; strip their newlines.
header_cells = rows[0].find_all('th')
columns = [cell.get_text().replace('\n', '') for cell in header_cells]
print(columns)

['Postcode', 'Borough', 'Neighbourhood']


In [5]:
# Start from an empty frame whose columns are the scraped header names.
df = pd.DataFrame(columns=columns)

In [6]:
# Collect one [postcode, borough, neighbourhood] record per data row (the first
# row is the header) and build the frame in a single step. Appending to a
# DataFrame inside a loop copies the whole frame on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
records = []
for row in rows[1:]:
    tds = row.find_all('td')
    # Only the last cell has its newline removed here.
    records.append([tds[0].text, tds[1].text, tds[2].text.replace('\n', '')])

df = pd.DataFrame(records, columns=columns)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


# Remove Borough == 'Not assigned'

In [7]:
# Keep only the rows whose Borough text does not contain "Not assigned",
# then renumber the index from zero.
has_borough = ~df['Borough'].str.contains("Not assigned")
df = df.loc[has_borough].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [9]:
# Where a Neighbourhood is 'Not assigned', fall back to the Borough name;
# every other Neighbourhood value is kept as is.
unnamed = df['Neighbourhood'] == 'Not assigned'
df['Neighbourhood'] = df['Neighbourhood'].mask(unnamed, df['Borough'])


# Combine postcodes

Combine the Postcodes when there are multiple 'Neighbourhoods' that share the same Postcode

In [10]:
# Gather the neighbourhoods of each (Postcode, Borough) pair into one list.
by_postcode = df.groupby(['Postcode', 'Borough'])
temp_df = by_postcode['Neighbourhood'].apply(list)

In [11]:
# Show the grouped result.
temp_df

Postcode  Borough    
M1B       Scarborough                                     [Rouge, Malvern]
M1C       Scarborough             [Highland Creek, Rouge Hill, Port Union]
M1E       Scarborough                  [Guildwood, Morningside, West Hill]
M1G       Scarborough                                             [Woburn]
M1H       Scarborough                                          [Cedarbrae]
                                               ...                        
M9N       York                                                    [Weston]
M9P       Etobicoke                                            [Westmount]
M9R       Etobicoke      [Kingsview Village, Martin Grove Gardens, Rich...
M9V       Etobicoke      [Albion Gardens, Beaumond Heights, Humbergate,...
M9W       Etobicoke                                            [Northwest]
Name: Neighbourhood, Length: 103, dtype: object

In [12]:
# Shuffle the rows and turn the (Postcode, Borough) index back into columns.
# random_state pins the shuffle so a fresh "Restart & Run All" yields the same
# row order (and therefore the same order of API calls further down).
temp_df = temp_df.sample(frac=1, random_state=0).reset_index()
# Collapse each list of neighbourhoods into one comma-separated string.
temp_df['Neighbourhood'] = temp_df['Neighbourhood'].str.join(',')

In [13]:
# Show the flattened frame (one row per postcode, neighbourhoods comma-joined).
temp_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar..."
1,M9C,Etobicoke,"Bloordale Gardens,Eringate,Markland Wood,Old B..."
2,M1X,Scarborough,Upper Rouge
3,M7A,Downtown Toronto,Queen's Park
4,M5M,North York,"Bedford Park,Lawrence Manor East"
...,...,...,...
98,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto"
99,M5S,Downtown Toronto,"Harbord,University of Toronto"
100,M5E,Downtown Toronto,Berczy Park
101,M1B,Scarborough,"Rouge,Malvern"


In [14]:
# Row and column count of the combined frame.
temp_df.shape

(103, 3)

# Start of Geocoder Section

Could not get geocoder to work, so the postal code coordinates are imported from a CSV file and merged with the data frame instead.

In [15]:
# Load the coordinate table and rename its 'Postal Code' column to 'Postcode'
# so it shares a key name with the scraped frame.
postal = (
    pd.read_csv("Geospatial_Coordinates.csv", sep=',')
    .rename(columns={'Postal Code': 'Postcode'})
)

In [16]:
# Outer-join the neighbourhood table with the coordinates on 'Postcode'.
df_loc = temp_df.merge(postal, how='outer', on='Postcode')

In [17]:
# Show the merged frame with the coordinate columns attached.
df_loc

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar...",43.650943,-79.554724
1,M9C,Etobicoke,"Bloordale Gardens,Eringate,Markland Wood,Old B...",43.643515,-79.577201
2,M1X,Scarborough,Upper Rouge,43.836125,-79.205636
3,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
4,M5M,North York,"Bedford Park,Lawrence Manor East",43.733283,-79.419750
...,...,...,...,...,...
98,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321
99,M5S,Downtown Toronto,"Harbord,University of Toronto",43.662696,-79.400049
100,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
101,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353


In [20]:
import json # library to handle JSON files

In [21]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [22]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [23]:
from sklearn.cluster import KMeans

In [25]:
import folium

# Nearby Venues

Get the nearby venues for all of our Postcodes.  Using the function from the Manhattan lab.

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each given location.

    The Foursquare ``venues/explore`` endpoint is queried once per
    (name, latitude, longitude) triple. The module-level ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` values are read at call time.

    Parameters
    ----------
    names : iterable
        Label attached to every venue found around the matching location.
    latitudes, longitudes : iterable
        Coordinates of each location, in the same order as ``names``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue
        was collected.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting
        # the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on an HTTP error instead of hitting
        # an opaque KeyError while digging through an error payload
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # A venue without any category would otherwise raise IndexError.
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the result well-formed
    # even when no venue was collected; assigning seven names to an empty,
    # zero-column frame raises ValueError.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return nearby_venues

In [31]:
# Foursquare credentials come from the environment so that they never end up in
# the notebook source or in its saved output. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# published with the notebook and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether each value is present; never echo the values themselves.
print('Your credentails:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'missing'))
print('CLIENT_SECRET:' + ('set' if CLIENT_SECRET else 'missing'))

LIMIT = 100 # limit of number of venues returned by Foursquare API



Your credentails:
CLIENT_ID: [REDACTED]
CLIENT_SECRET:[REDACTED]


In [32]:
# The borough name is passed as the label, so the 'Neighborhood' column of the
# resulting frame holds borough names.
toronto_venues = getNearbyVenues(
    names=df_loc['Borough'],
    latitudes=df_loc['Latitude'],
    longitudes=df_loc['Longitude'],
)

Etobicoke
Etobicoke
Scarborough
Downtown Toronto
North York
Etobicoke
Etobicoke
Scarborough
North York
North York
York
North York
North York
Etobicoke
Scarborough
North York
West Toronto
Central Toronto
Scarborough
North York
Central Toronto
East York
Scarborough
Scarborough
Downtown Toronto
Etobicoke
West Toronto
Etobicoke
North York
North York
North York
York
Downtown Toronto
East Toronto
Downtown Toronto
North York
Downtown Toronto
West Toronto
West Toronto
Central Toronto
North York
East York
Queen's Park
Downtown Toronto
Scarborough
North York
North York
Central Toronto
Downtown Toronto
North York
North York
Scarborough
West Toronto
North York
North York
Downtown Toronto
Central Toronto
Scarborough
East Toronto
Etobicoke
Etobicoke
Central Toronto
Downtown Toronto
East York
Etobicoke
Downtown Toronto
East York
Central Toronto
Scarborough
Downtown Toronto
Central Toronto
Downtown Toronto
East Toronto
Downtown Toronto
York
North York
Scarborough
East York
Scarborough
Downtown Toronto

In [33]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    The category list is looked up under ``'categories'``; when that key is
    missing, ``'venue.categories'`` is used instead.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row[key]`` that raises ``KeyError`` for a
        missing key.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the
    category list is empty.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [35]:
# Row/column count of the venue table, followed by a preview of its first rows.
print(toronto_venues.shape)
toronto_venues.head()

(2237, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Etobicoke,43.650943,-79.554724,Seaforth Golf Club,43.651183,-79.556107,Golf Course
1,Etobicoke,43.643515,-79.577201,LCBO,43.642099,-79.576592,Liquor Store
2,Etobicoke,43.643515,-79.577201,Starbucks,43.641312,-79.576924,Coffee Shop
3,Etobicoke,43.643515,-79.577201,The Beer Store,43.641313,-79.576925,Beer Store
4,Etobicoke,43.643515,-79.577201,Shoppers Drug Mart,43.641312,-79.576924,Cosmetics Shop


In [36]:
# Count the non-null entries of every column for each 'Neighborhood' label.
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,117,117,117,117,117,117
Downtown Toronto,1309,1309,1309,1309,1309,1309
East Toronto,124,124,124,124,124,124
East York,77,77,77,77,77,77
Etobicoke,75,75,75,75,75,75
Mississauga,12,12,12,12,12,12
North York,238,238,238,238,238,238
Scarborough,95,95,95,95,95,95
West Toronto,171,171,171,171,171,171
York,19,19,19,19,19,19


In [37]:
# Number of distinct values in the 'Venue Category' column.
unique_categories = toronto_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(unique_categories)))

There are 270 uniques categories.


# Analyze the neighborhood

In [38]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Average each indicator column per 'Neighborhood' label, then turn the label
# back into an ordinary column.
category_means = toronto_onehot.groupby('Neighborhood').mean()
toronto_grouped = category_means.reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Central Toronto,0.008547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.008547,0.0,0.0,0.008547,0.0,0.0,0.0,0.0,0.0
1,Downtown Toronto,0.003056,0.0,0.000764,0.000764,0.000764,0.000764,0.001528,0.002292,0.001528,...,0.002292,0.011459,0.001528,0.0,0.004584,0.0,0.006875,0.0,0.001528,0.000764
2,East Toronto,0.016129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.008065,0.0,0.0,0.0
3,East York,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.0,0.0,0.0
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013333,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.0,0.004202,0.0,0.004202,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.004202,0.004202,0.008403,0.0,0.0,0.0,0.004202,0.012605
7,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.021053,0.0,0.0,0.0,0.0,0.0
8,West Toronto,0.011696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011696,0.0,0.0,0.011696,0.0,0.005848,0.005848,0.0,0.0
9,York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Print the most common venues for each neighborhood



In [41]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Single-row slice for this label, transposed so every column becomes a row.
    is_hood = toronto_grouped['Neighborhood'] == hood
    venue_freq = toronto_grouped[is_hood].T.reset_index()
    venue_freq.columns = ['venue','freq']
    # The first row is the 'Neighborhood' label itself, not a venue category.
    venue_freq = venue_freq.iloc[1:]
    venue_freq['freq'] = venue_freq['freq'].astype(float)
    venue_freq = venue_freq.round({'freq': 2})
    ranked = venue_freq.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.07
1  Sandwich Place  0.06
2            Café  0.05
3            Park  0.05
4     Pizza Place  0.04


----Downtown Toronto----
         venue  freq
0  Coffee Shop  0.10
1         Café  0.05
2       Bakery  0.03
3   Restaurant  0.03
4          Bar  0.02


----East Toronto----
                venue  freq
0    Greek Restaurant  0.07
1         Coffee Shop  0.06
2  Italian Restaurant  0.05
3             Brewery  0.04
4                Café  0.04


----East York----
                 venue  freq
0          Coffee Shop  0.05
1         Burger Joint  0.04
2             Pharmacy  0.04
3                 Park  0.04
4  Sporting Goods Shop  0.04


----Etobicoke----
                  venue  freq
0           Pizza Place  0.09
1        Sandwich Place  0.07
2           Coffee Shop  0.07
3  Fast Food Restaurant  0.04
4         Grocery Store  0.04


----Mississauga----
                 venue  freq
0                Hotel  0.17
1          Cof

In [42]:
# Write a function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, biggest first.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [44]:
import numpy as np

In [46]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# '1st', '2nd', '3rd', then '<n>th' for every later rank
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also hide
    # unrelated errors raised while building the label.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per label in toronto_grouped
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill every row with its most frequent venue categories, most common first
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(15)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Café,Park,Pizza Place,Restaurant,Sushi Restaurant,Dessert Shop,Clothing Store,Pub
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Bakery,Hotel,Italian Restaurant,Japanese Restaurant,Bar,Park,Seafood Restaurant
2,East Toronto,Greek Restaurant,Coffee Shop,Italian Restaurant,Brewery,Café,Ice Cream Shop,Pizza Place,Pub,Sandwich Place,Bookstore
3,East York,Coffee Shop,Burger Joint,Park,Pharmacy,Pizza Place,Sporting Goods Shop,Bank,Beer Store,Fast Food Restaurant,Supermarket
4,Etobicoke,Pizza Place,Coffee Shop,Sandwich Place,Pharmacy,Fast Food Restaurant,Café,Grocery Store,Gym,Bakery,Park
5,Mississauga,Hotel,Coffee Shop,American Restaurant,Burrito Place,Intersection,Middle Eastern Restaurant,Sandwich Place,Fried Chicken Joint,Mediterranean Restaurant,Gym
6,North York,Coffee Shop,Clothing Store,Restaurant,Fast Food Restaurant,Japanese Restaurant,Pizza Place,Sandwich Place,Park,Café,Bank
7,Scarborough,Coffee Shop,Chinese Restaurant,Fast Food Restaurant,Breakfast Spot,Bakery,Pizza Place,Pharmacy,Thai Restaurant,Bank,Playground
8,West Toronto,Bar,Café,Coffee Shop,Italian Restaurant,Restaurant,Bakery,Pizza Place,Music Venue,Mexican Restaurant,Asian Restaurant
9,York,Park,Convenience Store,Grocery Store,Trail,Bus Line,Sandwich Place,Caribbean Restaurant,Pool,Hockey Arena,Field


# Clustering neighborhoods

Cluster the neighborhoods into 8 clusters

In [48]:
# set number of clusters
kclusters = 8

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 4, 7, 5, 6, 0, 2, 1, 4, 3])

In [54]:
# create map
latitude = 43.653226
longitude = -79.383184

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [57]:
# add markers to map
for lat, lng, borough, neighbourhood in zip(df_loc['Latitude'], df_loc['Longitude'], df_loc['Borough'], df_loc['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto