#### Import necessary libraries

In [1]:
import os
from io import StringIO

import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# import k-means from clustering stage
from sklearn.cluster import KMeans

#### Get the raw content of Toronto wiki page

In [2]:
# Get the raw content of Toronto wiki page
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of handing an
# error page to the parsing cells below.
raw_random_wikipedia_page = requests.get(wikipedia_link, timeout=30)
raw_random_wikipedia_page.raise_for_status()
page = raw_random_wikipedia_page.text

#### Extract the toronto table using BeautifulSoup

In [3]:
# Extract the toronto table using BeautifulSoup
# The parser is named explicitly so the result does not depend on which
# optional parser libraries happen to be installed.
toronto_soup = BeautifulSoup(page, 'html.parser')
toronto_table = toronto_soup.find('table', class_='wikitable sortable')
# find() returns None when nothing matches; fail here with a clear message
# rather than later when the table is converted to a dataframe.
if toronto_table is None:
    raise ValueError("No table with class 'wikitable sortable' found on the page")

#### Transform the BeautifulSoup table to a dataframe

In [4]:
# Transform the BeautifulSoup table to a dataframe
# The table markup is wrapped in StringIO because passing literal HTML text
# directly to read_html is deprecated; a file-like object works on old and
# new pandas versions alike.
df = pd.read_html(StringIO(str(toronto_table)))[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Preprocessing 
The data preprocessing consists of 3 steps: 1) remove records whose Borough is 'Not assigned'; 2) where Neighbourhood is 'Not assigned', set it to the same value as Borough; and 3) combine the Neighbourhoods that share the same Postcode into a single comma-separated record.

In [5]:
# Data Preprocessing 

# 1. Remove records with a borough 'Not assigned'.
#    .copy() makes the filtered frame independent of the original, so the
#    assignment in step 2 writes to it directly (no SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()
# 2. Set 'Neighbourhood' the same value as Borough, if 'Not assigned'
neighbourhood_missing = df['Neighbourhood'] == 'Not assigned'
df.loc[neighbourhood_missing, 'Neighbourhood'] = df.loc[neighbourhood_missing, 'Borough']
# 3. Combine Neighborhoods within the same Postcode group:
#    keep the first Borough and join the Neighbourhood names with ', '.
df = df.groupby('Postcode', as_index=False).agg({'Borough': 'first', 'Neighbourhood': ', '.join})

#### Reset index for the dataframe

In [6]:
# Reset index
# Rebind instead of mutating in place, and preview only the first rows
# rather than rendering the entire dataframe as the cell output.
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Print the number of rows of the dataframe

In [7]:
# Report how many rows (one per postcode) the dataframe holds
row_count = df.shape[0]
message = 'The number of rows of your dataframe: {}'
print(message.format(row_count))

The number of rows of your dataframe: 103


#### Since the geocoder is highly unreliable, use the coordinate CSV file to get the latitude and longitude of each postcode.

In [8]:
# Given geocoder is highly unreliable, use the coordinate csv file instead 
# The CSV names its key column 'Postal Code'; rename it so it matches the
# 'Postcode' column of df.
df_lat_lng = pd.read_csv('Geospatial_Coordinates.csv').rename(columns={'Postal Code': 'Postcode'})

# Join on the postcode explicitly instead of relying on whichever column
# names the two frames happen to share.
df = df.merge(df_lat_lng, on='Postcode')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Cluster Toronto neighborhood ('Postcode')

In this assignment, we focus only on the boroughs whose names contain the word 'Toronto'. The neighborhoods are retrieved based on their postcodes. 

#### 1. Define Foursquare Credentials and Version

In [9]:
# Foursquare credentials are read from the environment so that no secret is
# stored in (or printed by) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605' # Foursquare API version

# Report only whether the credentials are present, never their values.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


#### 2. Define a function to get venues around a certain location

In [10]:
# Define a function to get venues around a certain location
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Query the Foursquare 'explore' endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One label and one coordinate pair per location; they are iterated
        together with zip().
    radius : int, default 500
        Sent to the API as the ``radius`` query parameter.
    limit : int, default 100
        Sent to the API as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postcode',
        'Postcode Latitude', 'Postcode Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty (but has
        these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    column_names = ['Postcode',
                    'Postcode Latitude',
                    'Postcode Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # make the GET request; the timeout prevents hanging on a stalled
        # connection and raise_for_status() surfaces API errors explicitly
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0].get('items', []) if groups else []

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may carry no category; record None instead of crashing
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name))

    # Passing the column names to the constructor also works when there are no
    # rows at all, where assigning .columns afterwards would fail.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

#### 3. Retrieve only boroughs that contain the word 'Toronto' and get venues for each neighborhood (postcode)

In [11]:
# Keep only the rows whose borough name contains the word 'Toronto'
is_toronto_borough = df['Borough'].str.contains('Toronto')
df_tor = df[is_toronto_borough]

# Look up the venues around each remaining postcode, labelled by postcode
toronto_venues = getNearbyVenues(
    df_tor['Postcode'],
    df_tor['Latitude'],
    df_tor['Longitude'],
)
toronto_venues.head()

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,M4E,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,M4E,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
3,M4E,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


#### 4. Analyze each neighborhood (Postcode)

In [12]:
# One-hot encode the venue category: one indicator column per category,
# named after the category itself (empty prefix and separator).
venue_categories = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(venue_categories, prefix="", prefix_sep="")

# Put the postcode of each venue back as the first column
toronto_onehot.insert(
    loc=0,
    column="Postcode",
    value=toronto_venues['Postcode'],
    allow_duplicates=True,
)
toronto_onehot.head()

Unnamed: 0,Postcode,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's group the rows by Postcode and take the mean of the frequency of occurrence of each category.

In [13]:
# Average the one-hot indicators per postcode, i.e. the share of that
# postcode's venues falling into each category.
venues_by_postcode = toronto_onehot.groupby('Postcode')
category_means = venues_by_postcode.mean()
toronto_grouped = category_means.reset_index()
toronto_grouped.head()

Unnamed: 0,Postcode,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,...,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054054,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Write a function to sort the venues in descending order

In [14]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (the callers in this notebook
    pass a row whose first entry is the postcode). The remaining entries are
    sorted by value in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [15]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postcode']
for ind in range(num_top_venues):
    # Ranks 1-3 take their own suffix from `indicators`; every later rank
    # uses 'th'. An explicit check replaces the former bare `except:`, which
    # would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per postcode
postcode_venues_sorted = pd.DataFrame(columns=columns)
postcode_venues_sorted['Postcode'] = toronto_grouped['Postcode']

# fill the ranking columns of each row with that postcode's top categories
for ind in range(toronto_grouped.shape[0]):
    postcode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

postcode_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Health Food Store,Coffee Shop,Pub,Neighborhood,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,M4K,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Yoga Studio,Juice Bar,Japanese Restaurant,Sports Bar,Spa
2,M4L,Pizza Place,Brewery,Burrito Place,Sushi Restaurant,Italian Restaurant,Movie Theater,Pub,Ice Cream Shop,Fish & Chips Shop,Fast Food Restaurant
3,M4M,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Gastropub,Yoga Studio,Cheese Shop,Latin American Restaurant,Bookstore
4,M4N,Park,Lake,Swim School,Bus Line,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


Run k-means to group the neighborhoods into 5 clusters.

In [16]:
# set number of clusters
kclusters = 5

# Features only: drop the postcode label. `columns=` is used because the
# positional axis argument (`drop('Postcode', 1)`) is rejected by pandas 2.x.
toronto_grouped_clustering = toronto_grouped.drop(columns='Postcode')

# run k-means clustering
# n_init is pinned because its default differs between scikit-learn versions;
# together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 2, 0, 0, 0, 3, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each Postcode.

In [17]:
# add clustering labels
# insert() raises ValueError if the column already exists, which made this
# cell fail when run a second time; overwrite the column in that case.
if 'Cluster Labels' in postcode_venues_sorted.columns:
    postcode_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    postcode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# add the cluster label and top venues to each postcode's borough/coordinates
# (postcodes without a row in postcode_venues_sorted drop out of this merge)
toronto_merged = toronto_merged.merge(postcode_venues_sorted.set_index('Postcode'), on='Postcode')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Coffee Shop,Pub,Neighborhood,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Yoga Studio,Juice Bar,Japanese Restaurant,Sports Bar,Spa
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Pizza Place,Brewery,Burrito Place,Sushi Restaurant,Italian Restaurant,Movie Theater,Pub,Ice Cream Shop,Fish & Chips Shop,Fast Food Restaurant
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Gastropub,Yoga Studio,Cheese Shop,Latin American Restaurant,Bookstore
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Park,Lake,Swim School,Bus Line,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


Finally, let's visualize the resulting clusters

In [18]:
# Centre the map on Toronto (coordinates looked up through Nominatim)
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: ' + address)
latitude = location.latitude
longitude = location.longitude


# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'],
                                  toronto_merged['Postcode'] + ', ' + toronto_merged['Borough'],
                                  toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + '; Cluster ' + str(cluster), parse_html=True)
    # Labels index the palette directly. The former `cluster-1` sent label 0
    # to the last color only through negative-index wraparound.
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### 5. Examine Clusters

Cluster 0 (the cells that follow show cluster labels 1 to 4 in turn)

In [19]:
# Rows labelled cluster 0, showing the columns at positions 0, 1 and 5 onward
cluster_0_columns = toronto_merged.columns[[0, 1] + list(range(5, toronto_merged.shape[1]))]
in_cluster_0 = toronto_merged['Cluster Labels'] == 0
toronto_merged.loc[in_cluster_0, cluster_0_columns]

Unnamed: 0,Postcode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,0,Health Food Store,Coffee Shop,Pub,Neighborhood,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,M4K,East Toronto,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Furniture / Home Store,Yoga Studio,Juice Bar,Japanese Restaurant,Sports Bar,Spa
2,M4L,East Toronto,0,Pizza Place,Brewery,Burrito Place,Sushi Restaurant,Italian Restaurant,Movie Theater,Pub,Ice Cream Shop,Fish & Chips Shop,Fast Food Restaurant
3,M4M,East Toronto,0,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant,Gastropub,Yoga Studio,Cheese Shop,Latin American Restaurant,Bookstore
5,M4P,Central Toronto,0,Hotel,Gym,Park,Breakfast Spot,Sandwich Place,Asian Restaurant,Burger Joint,Clothing Store,Food & Drink Shop,Dumpling Restaurant
6,M4R,Central Toronto,0,Sporting Goods Shop,Clothing Store,Coffee Shop,Yoga Studio,Mexican Restaurant,Rental Car Location,Italian Restaurant,Salon / Barbershop,Sandwich Place,Fast Food Restaurant
7,M4S,Central Toronto,0,Sandwich Place,Dessert Shop,Pizza Place,Pharmacy,Italian Restaurant,Restaurant,Café,Sushi Restaurant,Coffee Shop,Gourmet Shop
9,M4V,Central Toronto,0,Coffee Shop,Pub,Pizza Place,Spa,Bagel Shop,Fried Chicken Joint,Sports Bar,Sushi Restaurant,American Restaurant,Convenience Store
11,M4X,Downtown Toronto,0,Coffee Shop,Restaurant,Pizza Place,Pharmacy,Café,Italian Restaurant,Pub,Chinese Restaurant,Bakery,Beer Store
12,M4Y,Downtown Toronto,0,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Nightclub,Bubble Tea Shop,Men's Store,Café,Mediterranean Restaurant


In [20]:
# Rows labelled cluster 1, showing the columns at positions 0, 1 and 5 onward
cluster_1_columns = toronto_merged.columns[[0, 1] + list(range(5, toronto_merged.shape[1]))]
in_cluster_1 = toronto_merged['Cluster Labels'] == 1
toronto_merged.loc[in_cluster_1, cluster_1_columns]

Unnamed: 0,Postcode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,M5N,Central Toronto,1,Garden,Yoga Studio,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [21]:
# Rows labelled cluster 2, showing the columns at positions 0, 1 and 5 onward
cluster_2_columns = toronto_merged.columns[[0, 1] + list(range(5, toronto_merged.shape[1]))]
in_cluster_2 = toronto_merged['Cluster Labels'] == 2
toronto_merged.loc[in_cluster_2, cluster_2_columns]

Unnamed: 0,Postcode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,M4N,Central Toronto,2,Park,Lake,Swim School,Bus Line,Yoga Studio,Diner,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


In [22]:
# Rows labelled cluster 3, showing the columns at positions 0, 1 and 5 onward
cluster_3_columns = toronto_merged.columns[[0, 1] + list(range(5, toronto_merged.shape[1]))]
in_cluster_3 = toronto_merged['Cluster Labels'] == 3
toronto_merged.loc[in_cluster_3, cluster_3_columns]

Unnamed: 0,Postcode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,M4T,Central Toronto,3,Playground,Park,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop
10,M4W,Downtown Toronto,3,Park,Playground,Trail,Yoga Studio,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


In [23]:
# Rows labelled cluster 4, showing the columns at positions 0, 1 and 5 onward
cluster_4_columns = toronto_merged.columns[[0, 1] + list(range(5, toronto_merged.shape[1]))]
in_cluster_4 = toronto_merged['Cluster Labels'] == 4
toronto_merged.loc[in_cluster_4, cluster_4_columns]

Unnamed: 0,Postcode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,M5P,Central Toronto,4,Mexican Restaurant,Trail,Sushi Restaurant,Jewelry Store,Yoga Studio,Dim Sum Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant


**Observation**: Most of the postcode neighbourhoods are labelled as Cluster 0, with high frequencies of 'Coffee Shop' and 'Café' venue categories.

**How are these postcode neighborhoods clustered?** <br>
The clustering is based on the frequency of each venue category within each postcode neighborhood.