# TORONTO Neighbourhood Clustering Project

## Gathering Data

In [139]:
#import relevant packages
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests 
import random # library for random number generation


!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize


! pip install folium==0.5.0
import folium # plotting library



In [140]:
#extract information as beautiful soup object
toronto_data = pd.DataFrame()
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
toronto_neigh = requests.get(url).text
soup = BeautifulSoup(toronto_neigh, 'html5lib')
toronto_rows = soup.find_all('tr')

In [141]:
# creating a pandas dataframe 
## 1. Extract all data
postal_code_dict={}


for row in soup.find_all('tr'):
    for col in row.find_all('td'):
        try:
            ps_code=col.p.b.text
            burrough=col.p.span.text
            if burrough == "Not assigned":
                pass;
            else:
                postal_code_dict[ps_code]={}
                postal_code_dict[ps_code]['borough']=burrough.split("(")[0]
                neighbourhood = burrough.split("(")[1]
                neighbourhood = neighbourhood.replace(")", " ")
                neighbourhood_names = neighbourhood.split("/")
                postal_code_dict[ps_code]['neigh']=neighbourhood_names
        except:
            pass;

In [142]:
## Append the dataframe
for ind, postal_code in enumerate(postal_code_dict):
    borough = postal_code_dict[postal_code]['borough']
    neighborhood = ", ".join(postal_code_dict[postal_code]['neigh'])
    toronto_data = toronto_data.append({"Postal Code": postal_code, 
                                        "Borough": borough, 
                                        "Neighborhood": neighborhood},
                                        ignore_index=True)
toronto_data = toronto_data[['Postal Code', 'Borough', 'Neighborhood']]

In [143]:
#shape command
toronto_data.shape

(103, 3)

## Adding Latitudes and Longitudes

In [144]:
longitude_Data = pd.read_csv("Geospatial_Coordinates.csv")

In [145]:
longitude_Data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [146]:
toronto_data = toronto_data.merge(longitude_Data, how='inner', on='Postal Code')

## Part 3 repeating the new york assignment on Toronto

In [147]:
import numpy as np
import os
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [148]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


## Map of Toronto with neighbourhoods superimposed on top

In [149]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [151]:
# @hidden_cell
## use foursquare
CLIENT_ID = 'YRQ5K2STSKH3WTVNDNWKPFXSXRXPZBROAV40M1RP0AVV3BUI' # your Foursquare ID
CLIENT_SECRET = '4ELG5BTXF1CGTUAV1JWTBJ11RRQ05NVH0STBEUTJ3SPRRN02' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [152]:
radius = 500 # define radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
results = requests.get(url).json()

In [153]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [154]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Parkwoods 
Victoria Village 
Regent Park ,  Harbourfront 
Lawrence Manor ,  Lawrence Heights 
Ontario Provincial Government 
Islington Avenue 
Malvern ,  Rouge 
Don Mills North
Parkview Hill ,  Woodbine Gardens 
Garden District, Ryerson 
Glencairn 
West Deane Park ,  Princess Gardens ,  Martin Grove ,  Islington ,  Cloverdale 
Rouge Hill ,  Port Union ,  Highland Creek 
Don Mills South
Woodbine Heights 
St. James Town 
Humewood-Cedarvale 
Eringate ,  Bloordale Gardens ,  Old Burnhamthorpe ,  Markland Wood 
Guildwood ,  Morningside ,  West Hill 
The Beaches 
Berczy Park 
Caledonia-Fairbanks 
Woburn 
Leaside 
Central Bay Street 
Christie 
Cedarbrae 
Hillcrest Village 
Bathurst Manor ,  Wilson Heights ,  Downsview North 
Thorncliffe Park 
Richmond ,  Adelaide ,  King 
Dufferin ,  Dovercourt Village 
Scarborough Village 
Fairview ,  Henry Farm ,  Oriole 
Northwood Park ,  York University 
The Danforth  East 
Harbourfront East ,  Union Station ,  Toronto Islands 
Little Portugal ,  Trinity 

In [117]:
print(toronto_venues.shape)
toronto_venues.head()

(1984, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Towns On The Ravine,43.754754,-79.332552,Hotel
3,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
4,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena


In [86]:
toronto_venues.groupby('Neighborhood').count()

(1984, 7)


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood , Long Branch",7,7,7,7,7,7
"Bathurst Manor , Wilson Heights , Downsview North",20,20,20,20,20,20
Bayview Village,4,4,4,4,4,4
"Bedford Park , Lawrence Manor East",21,21,21,21,21,21
...,...,...,...,...,...,...
Willowdale South,28,28,28,28,28,28
Willowdale West,6,6,6,6,6,6
Woburn,4,4,4,4,4,4
Woodbine Heights,5,5,5,5,5,5


In [118]:
print('There are {} uniques categories.'.format(len(manhattan_venues['Venue Category'].unique())))

There are 256 uniques categories.


## One hot encoding

In [119]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
toronto_onehot.shape

(1984, 256)

In [120]:
# define function for most common venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]
#top 10 most common venues
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Breakfast Spot,Latin American Restaurant,Skating Rink,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Women's Store
1,"Alderwood , Long Branch",Pizza Place,Pharmacy,Skating Rink,Pub,Sandwich Place,Coffee Shop,German Restaurant,Dog Run,Distribution Center,Discount Store
2,"Bathurst Manor , Wilson Heights , Downsview ...",Bank,Coffee Shop,Convenience Store,Ice Cream Shop,Sushi Restaurant,Sandwich Place,Chinese Restaurant,Bridal Shop,Diner,Pizza Place
3,Bayview Village,Chinese Restaurant,Bank,Café,Japanese Restaurant,Drugstore,Donut Shop,Doner Restaurant,Dog Run,Department Store,Distribution Center
4,"Bedford Park , Lawrence Manor East",Sandwich Place,Coffee Shop,Restaurant,Café,Comfort Food Restaurant,Fast Food Restaurant,Italian Restaurant,Butcher,Indian Restaurant,Sushi Restaurant


In [125]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [126]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

ValueError: cannot insert Cluster Labels, already exists

In [136]:
toronto_merged = toronto_merged.dropna(axis=0)

In [138]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters