## This notebook will be used for the Segmenting and Clustering Neighborhoods in Toronto assignment in the Coursera Capstone project

Version 3

*This version will be the final version which includes: scrapping the Toronto postal data from Wikipedia with the cordinates, plotting the neighborhoods (segmention and clustering) on a map and making some conclusions (analysis)

In [64]:
# installing packages from system
import sys
!{sys.executable} -m pip install geocoder
!{sys.executable} -m pip install folium

print('Packages installed.')

import pandas as pd # data analysis library
import numpy as np # library for handeling data in a vectorized manner
import requests # for handeling API requests
from bs4 import BeautifulSoup # for scraping HTML content
import geocoder # import geocoder
import folium # map rendering library
from sklearn.cluster import KMeans # import k-means from clustering stage scikit-learn 
from pandas.io.json import json_normalize # for normalizing json files

# Matplotlib plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries are imported')

Packages installed.
Libraries are imported


In [45]:
# setup URL and request type
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(URL)

# setup scraper
scraper = BeautifulSoup(r.content, 'html.parser')
html_table = scraper.find('div', attrs = {'id':'container'})

print('Page is scrapped')

Page is scrapped


In [46]:
# collecting data from scrapped page
torontoPostalCodes = [];
boroughs= [];
neighborhoods = [];
columnNumber = 1;
passValidation = False

for row in scraper.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string) > 2:
            passValidation = False
            if columnNumber == 1:
                if passValidation == False and cell.string[1].isdigit():
                    torontoPostalCodes.append(cell.string);   
                    columnNumber = 2
                else:
                    continue
            elif columnNumber == 2 :
                if cell.string == 'Not assigned':
                    passValidation = True
                    del torontoPostalCodes[-1]
                    columnNumber = 1
                    continue
                else:
                    boroughs.append(cell.string);      
                    columnNumber = 3
            elif columnNumber == 3 :
                if cell.string == 'Not assigned\n':
                    neighborhoods.append(boroughs[-1])
                else:
                    neighborhoods.append(cell.string); 
                columnNumber = 1
                
print('Data is collected')

Data is collected


In [47]:
# setup panda datagrame

# define the dataframe columns
column_names = ['PostalCode', 
                'Borough', 
                'Neighborhood',
                'Latitude',
                'Longitude'] 

# instantiate the dataframe
neighbors = pd.DataFrame(columns=column_names)

print('Dataframe is initialized')

Dataframe is initialized


In [49]:
lat_lng_coords = None

# fill dataframe with data from scraper
for data in range(0, len(neighborhoods)-1):
    code = torontoPostalCodes[data]
    borough = boroughs[data]
    neighborhood_name = neighborhoods[data]
    
    # setup geocoder and cordinates
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
    lat_lng_coords = g.latlng

    neighbors = neighbors.append({ 'PostalCode': code,
                                   'Borough': borough,
                                   'Neighborhood': neighborhood_name, 
                                   'Latitude': lat_lng_coords[0], 
                                  'Longitude': lat_lng_coords[1]
                                 }, ignore_index=True)
    
print('Dataframe is filled with data including cordinates')

Dataframe is filled with data including cordinates


In [50]:
# display first five neighbors
neighbors.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75242,-79.329242
1,M4A,North York,Victoria Village,43.7306,-79.313265
2,M5A,Downtown Toronto,Harbourfront,43.650295,-79.359166
3,M6A,North York,Lawrence Heights,43.72327,-79.451286
4,M6A,North York,Lawrence Manor,43.72327,-79.451286


In [52]:
# display shape of neighbors
neighbors.shape

(229, 5)

In [59]:
# *got api keys and settings from the previous excersize lab

# set variables of the foursquare api
radius = 500 # set radius
LIMIT = 100 # set limit of number of venues returned
VERSION = '20180605' # Foursquare API version
CLIENT_ID = '*************************HIDDEN'
CLIENT_SECRET = '*************************HIDDEN'

# extract first neighborhood to test API
neighborhood_name = neighbors.loc[0, 'Neighborhood'] # neighborhood name
neighborhood_latitude = neighbors.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighbors.loc[0, 'Longitude'] # neighborhood longitude value

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5dd160fd760a7f001bddcf8f'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.75692000450003,
    'lng': -79.32302427998279},
   'sw': {'lat': 43.74791999550003, 'lng': -79.33546062601711}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 238,
        'cc': 'CA',
       

In [62]:
# function that extracts the category of the venue *function from the Foursquare alb
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [79]:
# extract category of venue
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

print('categories returned by the Foursquare API: ', nearby_venues.shape[0])

categories returned by the Foursquare API:  3


In [81]:
# define function to get nearby venues
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [85]:
# get nearby venues
toronto_venues = getNearbyVenues(names=neighborhoods,
                                 latitudes=neighbors['Latitude'],
                                 longitudes=neighbors['Longitude'])


Parkwoods
Victoria Village
Harbourfront
Lawrence Heights
Lawrence Manor
Queen's Park
Queen's Park
Rouge
Malvern
Don Mills North

Woodbine Gardens
Parkview Hill
Ryerson

Garden District

Glencairn

Cloverdale

Islington
Martin Grove

Princess Gardens
West Deane Park
Highland Creek
Rouge Hill
Port Union
Flemingdon Park
Don Mills South

Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens

Eringate

Markland Wood
Old Burnhamthorpe

Guildwood

Morningside
West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks

Woburn
Leaside
Central Bay Street

Christie

Cedarbrae

Hillcrest Village
Bathurst Manor
Downsview North

Wilson Heights
Thorncliffe Park
Adelaide

King

Richmond

Dovercourt Village
Dufferin

Scarborough Village
Fairview

Henry Farm
Oriole

Northwood Park
York University
East Toronto
Harbourfront East

Toronto Islands
Union Station
Little Portugal
Trinity
East Birchmount Park

Ionview
Kennedy Park
Bayview Village
CFB Toronto
Downsview East

The Danforth West

Riverdal

In [89]:
# check results and size of resulting dataframe
print('Total entries in toronto_venues dataframe: ', toronto_venues.shape[0])
print('Total columns in toronto_venues dataframe: ', toronto_venues.shape[1])

print('get first five results of toronto_venues dataframe:')
toronto_venues.head()

Total entries in toronto_venues dataframe:  630
Total columns in toronto_venues dataframe:  7
get first five results of toronto_venues dataframe:


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.75242,-79.329242,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.75242,-79.329242,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.75242,-79.329242,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Victoria Village,43.7306,-79.313265,Brookbanks Park,43.751976,-79.33214,Park
4,Victoria Village,43.7306,-79.313265,Variety Store,43.751974,-79.333114,Food & Drink Shop


In [96]:
# find out how many unique categories can be curated from all the returned venues

print('There are {} categories who are unique'.format(len(toronto_venues['Venue Category'].unique())))

There are 3 categories who are unique


In [99]:
# set one hot encoding by panda's get dummies
toronto_onehot = pd.get_dummies(
    toronto_venues[['Venue Category']], 
    prefix="", 
    prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

print('New dataframe size: ', toronto_onehot.shape)

New dataframe size:  (630, 4)


In [110]:
# filter group rows by neighborhood, set the mean of the frequency of each category

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

#show entries and show the first five in dataframe to test
print('Entries and columns inside toronto_grouped dataframe: ', toronto_grouped.shape)
toronto_grouped.head()

Entries and columns inside toronto_grouped dataframe:  (208, 4)


Unnamed: 0,Neighborhood,Bus Stop,Food & Drink Shop,Park
0,Adelaide,0.333333,0.333333,0.333333
1,Agincourt,0.333333,0.333333,0.333333
2,Agincourt North,0.333333,0.333333,0.333333
3,Albion Gardens,0.333333,0.333333,0.333333
4,Alderwood,0.333333,0.333333,0.333333


In [126]:
# print top 3 most common venues of each neighborhood 

num_top_venues = 3

for hood in toronto_grouped['Neighborhood']:
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(hood+':')
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
    
    


Adelaide
:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Agincourt:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Agincourt North:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Albion Gardens
:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Alderwood:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Bathurst Manor:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Bathurst Quay
:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Bayview Village:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Beau

Henry Farm:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


High Park:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Highland Creek:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Hillcrest Village:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Humber Bay:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Humber Bay Shores
:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Humber Summit:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Humbergate
:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Hu

Richview Gardens
:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Riverdale:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Roncesvalles:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Rosedale:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Roselawn
:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Rouge:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Rouge Hill:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Royal York South East:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Royal York So

York Mills:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


York Mills West
:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


York University:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33


Yorkville:
               venue  freq
0           Bus Stop  0.33
1  Food & Drink Shop  0.33
2               Park  0.33




In [118]:
# sort the venues in descending order.

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

print('Venues sorted!')

Venues sorted!


In [127]:
# create the new dataframe and display the top 10 venues for each neighborhood.

num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

# show first five of neighborhoods_venues_sorted to test    
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Adelaide,Park,Food & Drink Shop,Bus Stop
1,Agincourt,Park,Food & Drink Shop,Bus Stop
2,Agincourt North,Park,Food & Drink Shop,Bus Stop
3,Albion Gardens,Park,Food & Drink Shop,Bus Stop
4,Alderwood,Park,Food & Drink Shop,Bus Stop


In [128]:
# Run k-means to cluster the neighborhood into 5 clusters.

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [129]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neighbors

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M3A,North York,Parkwoods,43.75242,-79.329242,0,Park,Food & Drink Shop,Bus Stop
1,M4A,North York,Victoria Village,43.7306,-79.313265,0,Park,Food & Drink Shop,Bus Stop
2,M5A,Downtown Toronto,Harbourfront,43.650295,-79.359166,0,Park,Food & Drink Shop,Bus Stop
3,M6A,North York,Lawrence Heights,43.72327,-79.451286,0,Park,Food & Drink Shop,Bus Stop
4,M6A,North York,Lawrence Manor,43.72327,-79.451286,0,Park,Food & Drink Shop,Bus Stop


In [132]:
# create map
map_clusters = folium.Map(location=[43.75692000450003, -79.32302427998279], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
# show map
map_clusters

In [137]:
# Show shape of first cluster and display first five entries for test

toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, 
                   toronto_merged.columns[[1] + 
                                          list(range(5, toronto_merged.shape[1]))]]

print('Entries and columns inside toronto_merged: ', toronto_merged.shape)
toronto_merged.head()

Entries and columns inside toronto_merged:  (229, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M3A,North York,Parkwoods,43.75242,-79.329242,0,Park,Food & Drink Shop,Bus Stop
1,M4A,North York,Victoria Village,43.7306,-79.313265,0,Park,Food & Drink Shop,Bus Stop
2,M5A,Downtown Toronto,Harbourfront,43.650295,-79.359166,0,Park,Food & Drink Shop,Bus Stop
3,M6A,North York,Lawrence Heights,43.72327,-79.451286,0,Park,Food & Drink Shop,Bus Stop
4,M6A,North York,Lawrence Manor,43.72327,-79.451286,0,Park,Food & Drink Shop,Bus Stop
