## Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [146]:
! pip install pandas
import pandas as pd
import numpy as np
import requests
! pip install lxml
import lxml



### Part 1: Loading the table from Wikipedia to a Dataframe

In [147]:
wiki_page_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_table_df = pd.read_html(wiki_page_url, header=0)[0]

In [148]:
wiki_table_df.rename(columns={"Postal Code": "PostalCode"}, inplace=True)

In [149]:
wiki_table_df = wiki_table_df[wiki_table_df['Borough'] != 'Not assigned']
wiki_table_df['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [150]:
wiki_table_df.reset_index(drop=True, inplace=True)
wiki_table_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [151]:
# checking for duplicate PostalCode rows
wiki_table_df.groupby(by='PostalCode').agg({"Borough": "nunique", "Neighborhood": "nunique"}).sort_values('Borough', ascending=False)

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,1,1
M5R,1,1
M6G,1,1
M6E,1,1
M6C,1,1
...,...,...
M3L,1,1
M3K,1,1
M3J,1,1
M3H,1,1


In [152]:
# checking for Not assigned neighborhoods, there are no such
wiki_table_df[wiki_table_df['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


<b>Comments:</b>  
Not all the cleaning had to be done that were requested in the description. I didn't find any duplicate rows for PostalCode. Neither did I find Not assigned Neighborhoods. The size of the final table is as follows:

In [153]:
wiki_table_df.shape

(103, 3)

### Part 2: Loading the coordinates for Toronto neighborhoods

In [154]:
# !pip install geocoder
# import geocoder # import geocoder

##### I installed and tried to use geocoder, but it didn't work for me. It was just running endlessly without delivering anything.

In [155]:
# so I used the provided back-up file
geospc_url = 'http://cocl.us/Geospatial_data'
geospc_df=pd.read_csv(geospc_url, index_col=0)

In [156]:
wiki_table_df = wiki_table_df.join(geospc_df, on ='PostalCode')

In [157]:
wiki_table_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [158]:
wiki_table_df.shape

(103, 5)

### Part 3: Exploring and clustering the neighborhoods in Toronto

In [159]:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
import json 
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



First, I visualize all the neighbourhoods on the map of Toronto.

In [160]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Canada are 43.6534817, -79.3839347.


In [161]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(wiki_table_df['Latitude'], wiki_table_df['Longitude'], wiki_table_df['Borough'], wiki_table_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Secondly, I connect to the Foursquare API and load the venue category data for all neighbourhoods.

In [182]:
# CLIENT_ID and CLIENT_SECRET removed from submission
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [163]:
# function fully copied from leacture lab
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [164]:
toronto_venues = getNearbyVenues(names=wiki_table_df['Neighborhood'],
                                   latitudes=wiki_table_df['Latitude'],
                                   longitudes=wiki_table_df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview
The Danforth West, Ri

In [165]:
print(toronto_venues.shape)
toronto_venues.head()

(2128, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [166]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 273 uniques categories.


Thirdly, I process the venue category data and proceed with clustering.

In [167]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1]) # move neighborhood column to the first column
toronto_onehot = toronto_onehot[fixed_columns]
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.shape

(93, 273)

In [168]:
# function fully copied from leacture lab
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [169]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
1,"Alderwood, Long Branch",Pizza Place,Gym,Sandwich Place,Pharmacy,Pool,Pub,Coffee Shop,Airport Lounge,Falafel Restaurant,Ethiopian Restaurant
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Park,Fried Chicken Joint,Shopping Mall,Sandwich Place,Bridal Shop,Diner,Restaurant,Deli / Bodega
3,Bayview Village,Café,Bank,Chinese Restaurant,Japanese Restaurant,Department Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Italian Restaurant,Coffee Shop,Breakfast Spot,Thai Restaurant,Butcher,Café,Sushi Restaurant,Japanese Restaurant,Pizza Place


In [170]:
kclusters = 5
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [171]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = wiki_table_df
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,Park,Food & Drink Shop,Women's Store,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,Portuguese Restaurant,Pizza Place,Coffee Shop,Financial or Legal Service,Hockey Arena,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,1.0,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Café,Theater,Mexican Restaurant,Shoe Store,Restaurant
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1.0,Clothing Store,Women's Store,Coffee Shop,Boutique,Miscellaneous Shop,Furniture / Home Store,Event Space,Accessories Store,Vietnamese Restaurant,Construction & Landscaping
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,Coffee Shop,Sushi Restaurant,College Cafeteria,Gym,Distribution Center,Park,Music Venue,Mexican Restaurant,Japanese Restaurant,Italian Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,0.0,Park,Pool,River,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Doner Restaurant,Ethiopian Restaurant,Dance Studio
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,1.0,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant,Mediterranean Restaurant,Hotel,Gastropub,Café,Yoga Studio
100,M7Y,East Toronto,Business reply mail Processing Centre,43.662744,-79.321558,1.0,Park,Auto Workshop,Comic Shop,Pizza Place,Butcher,Recording Studio,Restaurant,Burrito Place,Brewery,Skate Park
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,2.0,Construction & Landscaping,Baseball Field,Women's Store,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore


Finally, I visualize the clusters on the map.

In [172]:
toronto_merged[['Cluster Labels', 'PostalCode']].fillna(-1).groupby(by='Cluster Labels').count()

Unnamed: 0_level_0,PostalCode
Cluster Labels,Unnamed: 1_level_1
-1.0,5
0.0,11
1.0,81
2.0,3
3.0,2
4.0,1


In [173]:
#droping those neighbourhoods for which there was no venue data for the clustering.
toronto_merged = toronto_merged.dropna().reset_index()

In [174]:
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype('int')
toronto_merged['Cluster Labels'].dtype

dtype('int64')

In [175]:
toronto_merged.shape

(98, 17)

In [176]:
# create map

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [177]:
# cluster 0 (looks like the focus is the Park)
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]-5))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,0,Park,Food & Drink Shop,Women's Store,Dog Run,Dessert Shop
19,M6E,0,Park,Pool,Women's Store,Golf Course,Curling Ice
33,M4J,0,Park,Metro Station,Convenience Store,Ethiopian Restaurant,Electronics Store
57,M4N,0,Park,Swim School,Bus Line,Dog Run,Dim Sum Restaurant
60,M9N,0,Park,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant
62,M2P,0,Park,Bank,Convenience Store,Ethiopian Restaurant,Electronics Store
73,M9R,0,Park,Mobile Phone Shop,Sandwich Place,Discount Store,Department Store
79,M4T,0,Park,Trail,Restaurant,Ethiopian Restaurant,Electronics Store
81,M1V,0,Park,Playground,Dog Run,Department Store,Dessert Shop
87,M4W,0,Park,Playground,Trail,Electronics Store,Eastern European Restaurant


In [178]:
# cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]-5))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,M4A,1,Portuguese Restaurant,Pizza Place,Coffee Shop,Financial or Legal Service,Hockey Arena
2,M5A,1,Coffee Shop,Bakery,Pub,Park,Breakfast Spot
3,M6A,1,Clothing Store,Women's Store,Coffee Shop,Boutique,Miscellaneous Shop
4,M7A,1,Coffee Shop,Sushi Restaurant,College Cafeteria,Gym,Distribution Center
6,M3B,1,Gym,Restaurant,Café,Japanese Restaurant,Coffee Shop
...,...,...,...,...,...,...,...
91,M4X,1,Restaurant,Coffee Shop,Pub,Park,Chinese Restaurant
92,M5X,1,Coffee Shop,Café,Gym,Hotel,Restaurant
94,M4Y,1,Coffee Shop,Sushi Restaurant,Japanese Restaurant,Gay Bar,Restaurant
95,M7Y,1,Park,Auto Workshop,Comic Shop,Pizza Place,Butcher


In [179]:
# cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]-5))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
30,M1J,2,Playground,Construction & Landscaping,Women's Store,Doner Restaurant,Dessert Shop
53,M9M,2,Baseball Field,Women's Store,Dim Sum Restaurant,Diner,Discount Store
96,M8Y,2,Construction & Landscaping,Baseball Field,Women's Store,Donut Shop,Diner


In [180]:
# cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]-5))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
9,M6B,3,Park,Pub,Sushi Restaurant,Japanese Restaurant,Eastern European Restaurant
64,M5P,3,Trail,Park,Sushi Restaurant,Jewelry Store,Dog Run


In [181]:
# cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]-5))]]

Unnamed: 0,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
5,M1B,4,Fast Food Restaurant,Print Shop,Women's Store,Dog Run,Dessert Shop
