# 1. Answer to Question 1: scrape Toronto neighborhood website to get neighborhoods information

In [None]:
## Import libaries that will be used in the task

In [102]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import csv
import geocoder

## Use BeautifulSoup to scrape the website to get the table of neighborhoods

In [103]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
#print(soup)
iTable = soup.find('table', attrs={"class":"wikitable sortable"})
#print(iTable)

## Write the content of the table into a csv file. And ignore boroghs with "Not assigned"

In [104]:
with open('tasks/neighborhoods_6.csv','w', newline='') as csvfile:
    output_file = csv.writer(csvfile, delimiter=',')
    output_file.writerow(['PostalCode','Borough','Neighborhood'])
    
    # handle each row
    for tr in iTable.tbody.find_all("tr"):
        #print(tr)
        tds = []
        for td in tr.find_all("td"):
            tds.append(td.text.replace('\n','').strip())
        
        if (len(tds) > 0):  # avoid empty list from th tag
            if (tds[1] == 'Not assigned'):
                print("drop this line: ", tds[0])
            else:
                output_file.writerow(tds)

drop this line:  M1A
drop this line:  M2A
drop this line:  M8A
drop this line:  M2B
drop this line:  M7B
drop this line:  M8B
drop this line:  M2C
drop this line:  M7C
drop this line:  M8C
drop this line:  M2E
drop this line:  M3E
drop this line:  M7E
drop this line:  M8E
drop this line:  M9E
drop this line:  M2G
drop this line:  M3G
drop this line:  M7G
drop this line:  M8G
drop this line:  M9G
drop this line:  M7H
drop this line:  M8H
drop this line:  M9H
drop this line:  M7J
drop this line:  M8J
drop this line:  M9J
drop this line:  M7K
drop this line:  M8K
drop this line:  M9K
drop this line:  M7L
drop this line:  M8L
drop this line:  M7M
drop this line:  M8M
drop this line:  M7N
drop this line:  M8N
drop this line:  M3P
drop this line:  M7P
drop this line:  M8P
drop this line:  M3R
drop this line:  M8R
drop this line:  M2S
drop this line:  M3S
drop this line:  M7S
drop this line:  M8S
drop this line:  M9S
drop this line:  M2T
drop this line:  M3T
drop this line:  M6T
drop this lin

## Read csv file into pandas dataframe. There are 103 rows and 3 columns in the dataframe.

In [141]:
df = pd.read_csv('tasks/neighborhoods_6.csv')
print(df.head())

  PostalCode           Borough                                 Neighborhood
0        M3A        North York                                    Parkwoods
1        M4A        North York                             Victoria Village
2        M5A  Downtown Toronto                    Regent Park, Harbourfront
3        M6A        North York             Lawrence Manor, Lawrence Heights
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


In [142]:
df.shape

(103, 3)

## Now a dataframe of Toronto neighborhoods has been established.

# 2. Answer to Question 2: Join coordinates into the dataframe

## First, try to use Geocoder to get latitude/longitude of each postal code

In [107]:
#postal_code = df['PostalCode']
#print(postal_code)
#lat_lng_coords = None
#while(lat_lng_coords is None):
#    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#    lat_lng_coords = g.latlng
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]


## But Geocoder doesn't work. So I comment out the code above. 

## Now I use the existing csv file which includes latitude/longitude I need

## First, read coordinates into a dataframe called df_coordinate

In [143]:
# read coordinates from file
df_coordinate = pd.read_csv('tasks/Geospatial_Coordinates.csv')
print(df_coordinate.head())
print("done")
df_coordinate.shape

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
done


(103, 3)

## Rename "Postal Code" to "PostalCode"

In [144]:
df_coordinate.rename(columns = {'Postal Code': 'PostalCode'}, inplace=True)

In [145]:
df_coordinate.columns.values

array(['PostalCode', 'Latitude', 'Longitude'], dtype=object)

## Then, merge the two dataframes based on column "PostalCode" 

In [146]:
df_total = pd.merge(df, df_coordinate, on='PostalCode', how='left')

In [147]:
print(df_total.head())
df_total.shape

  PostalCode           Borough                                 Neighborhood  \
0        M3A        North York                                    Parkwoods   
1        M4A        North York                             Victoria Village   
2        M5A  Downtown Toronto                    Regent Park, Harbourfront   
3        M6A        North York             Lawrence Manor, Lawrence Heights   
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government   

    Latitude  Longitude  
0  43.753259 -79.329656  
1  43.725882 -79.315572  
2  43.654260 -79.360636  
3  43.718518 -79.464763  
4  43.662301 -79.389494  


(103, 5)

In [149]:
print('The dataframe has {} boroughs and {} postal codes.'.format(
        len(df_total['Borough'].unique()),
        df_total.shape[0]
    )
)

The dataframe has 10 boroughs and 103 postal codes.


In [150]:
print("Unique boroughs in the dataframe:")
print(df_total['Borough'].unique())

Unique boroughs in the dataframe:
['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']


# Now a dataframe with both neighborhood and latitude/longitude information has been created.

# 3. Answer to Question 3: Explore and Cluster neighborhoods in Toronto

## First, explore neighborhoods in Toronto by creating a map

In [151]:
# Get coordinates of Toronto, CA

from geopy.geocoders import Nominatim

address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [152]:
# create map of Toronto using folium library

import folium # map rendering library

Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers and popup wondin to map
for lat, lng, postalcode, borough, neighborhood in zip(df_total['Latitude'], df_total['Longitude'], df_total['PostalCode'], df_total['Borough'], df_total['Neighborhood']):
    label = '{}, {}, {}'.format(postalcode, borough, neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_map)  
    
Toronto_map

In [None]:
## Note: Folium map can't be rendered in Github. But you can review map at this link:

## Now we need to request venues near each postal code in entire Toronto

## First, define Foursquare Credentials and Version

In [153]:
CLIENT_ID = 'S3PRMDQX0MOCKBHZ40GXLRO3NWJ3EIOY0F0GIN52JDY5FQ0J' # your Foursquare ID
CLIENT_SECRET = 'OVMTVGTK5EW0WM5OIMT2CMJ4MGN5EQNUUKRKU1IRCCAQ0MCL' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

## Now, request venues within 500 meter from coordinates of each postal code with Foursquare service

In [169]:
def getNearbyVenues(names, latitudes, longitudes, neighborhoods, radius=500):
    LIMIT = 100
    venues_list=[]
    for name, lat, lng, neighborhood in zip(names, latitudes, longitudes, neighborhoods):
        #print(postalcode)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        #print (results[0])
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            neighborhood,
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    #nearby_venues.columns = ['Neighborhood', 
    #              'Neighborhood Latitude', 
    #              'Neighborhood Longitude', 
    nearby_venues.columns = ['Postal Code', 
                  'PostalCode Latitude', 
                  'PostalCode Longitude', 
                  'Neighborhood',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [170]:
Toronto_venues = getNearbyVenues(names=df_total['PostalCode'],
                                   latitudes=df_total['Latitude'],
                                   longitudes=df_total['Longitude'],
                                   neighborhoods=df_total['Neighborhood']
                                )
  

In [171]:
print(Toronto_venues.shape)

(2146, 8)


In [172]:
Toronto_venues.head()

Unnamed: 0,Postal Code,PostalCode Latitude,PostalCode Longitude,Neighborhood,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,43.753259,-79.329656,Parkwoods,Brookbanks Park,43.751976,-79.33214,Park
1,M3A,43.753259,-79.329656,Parkwoods,TTC stop #8380,43.752672,-79.326351,Bus Stop
2,M3A,43.753259,-79.329656,Parkwoods,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,M4A,43.725882,-79.315572,Victoria Village,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,M4A,43.725882,-79.315572,Victoria Village,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [181]:
print(Toronto_venues['Venue Category'].unique())

['Park' 'Bus Stop' 'Food & Drink Shop' 'Hockey Arena' 'Coffee Shop'
 'Portuguese Restaurant' 'Intersection' 'Pizza Place' 'Bakery'
 'Distribution Center' 'Restaurant' 'Spa' 'Pub' 'Historic Site'
 'Breakfast Spot' 'Gym / Fitness Center' 'Farmers Market' 'Chocolate Shop'
 'Performing Arts Venue' 'Dessert Shop' 'Mexican Restaurant' 'Theater'
 'French Restaurant' 'Café' 'Yoga Studio' 'Event Space' 'Shoe Store'
 'Ice Cream Shop' 'Art Gallery' 'Cosmetics Shop' 'Electronics Store'
 'Bank' 'Beer Store' 'Hotel' 'Antique Shop' 'Boutique'
 'Furniture / Home Store' 'Vietnamese Restaurant' 'Accessories Store'
 'Clothing Store' 'Italian Restaurant' 'Beer Bar' 'Sushi Restaurant'
 'Persian Restaurant' 'Creperie' 'Arts & Crafts Store' 'Hobby Shop'
 'Diner' 'Fried Chicken Joint' 'Chinese Restaurant' 'Smoothie Shop'
 'Sandwich Place' 'Gym' 'College Auditorium' 'Bar' 'Fast Food Restaurant'
 'Caribbean Restaurant' 'Japanese Restaurant' 'Baseball Field'
 'Athletics & Sports' 'Gastropub' 'Pharmacy' 'Pet Stor

In [182]:
print(len(Toronto_venues['Venue Category'].unique()))

266


In [187]:
print(Toronto_venues.columns)

Index(['Postal Code', 'PostalCode Latitude', 'PostalCode Longitude',
       'Neighborhood', 'Venue', 'Venue Latitude', 'Venue Longitude',
       'Venue Category'],
      dtype='object')


In [212]:
toronto_venues1 = Toronto_venues[['Postal Code', 'Neighborhood']]
toronto_venues1.head()

Unnamed: 0,Postal Code,Neighborhood
0,M3A,Parkwoods
1,M3A,Parkwoods
2,M3A,Parkwoods
3,M4A,Victoria Village
4,M4A,Victoria Village


In [213]:
toronto_venues1.shape

(2146, 2)

## Create a new dataframe by encoding values of Venue Category

In [215]:
# one hot encoding
toronto_onehot1 = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

print(toronto_onehot1.columns)

Index(['Accessories Store', 'Afghan Restaurant', 'Airport',
       'Airport Food Court', 'Airport Gate', 'Airport Lounge',
       'Airport Service', 'Airport Terminal', 'American Restaurant',
       'Antique Shop',
       ...
       'Trail', 'Train Station', 'Vegetarian / Vegan Restaurant',
       'Video Game Store', 'Vietnamese Restaurant', 'Warehouse Store',
       'Wine Bar', 'Wings Joint', 'Women's Store', 'Yoga Studio'],
      dtype='object', length=266)


In [205]:
toronto_onehot1.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [216]:

# add postal code column back to dataframe
toronto_onehot1['Postal Code'] = Toronto_venues['Postal Code'] 

# move postal code column to the first column
fixed_columns = [toronto_onehot1.columns[-1]] + list(toronto_onehot1.columns[:-1])
toronto_onehot1 = toronto_onehot1[fixed_columns]

toronto_onehot1.head()

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [207]:
#toronto_venues1 = pd.concat([toronto_venues1, toronto_onehot1])
#toronto_venues_final = pd.merge(toronto_venues1, toronto_onehot1, on='Postal Code', how='inner')
#toronto_venues_final.head()

#results are not correct

In [217]:
toronto_onehot1.shape

(2146, 267)

## Group Toronto neighborhoods by Postal Code

In [218]:
toronto_grouped = Toronto_onehot.groupby('Postal Code').mean().reset_index()
toronto_grouped
#Toronto_grouped.shape

Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,M9N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,M9P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,M9R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,M9V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Print each neighborhood (postal code area) along with the top 5 most common venues

In [219]:
num_top_venues = 5

for postalarea in toronto_grouped['Postal Code']:
    print("----"+postalarea+"----")
    temp = toronto_grouped[toronto_grouped['Postal Code'] == postalarea].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----M1B----
                             venue  freq
0             Fast Food Restaurant   1.0
1  Molecular Gastronomy Restaurant   0.0
2                    Luggage Store   0.0
3                 Malay Restaurant   0.0
4                           Market   0.0


----M1C----
                        venue  freq
0                         Bar   1.0
1           Accessories Store   0.0
2               Metro Station   0.0
3  Modern European Restaurant   0.0
4           Mobile Phone Shop   0.0


----M1E----
                 venue  freq
0           Restaurant  0.12
1    Electronics Store  0.12
2  Rental Car Location  0.12
3                 Bank  0.12
4         Intersection  0.12


----M1G----
                        venue  freq
0                 Coffee Shop  0.67
1           Korean Restaurant  0.33
2           Accessories Store  0.00
3  Modern European Restaurant  0.00
4           Mobile Phone Shop  0.00


----M1H----
                 venue  freq
0     Hakka Restaurant  0.11
1      Thai Restaurant

## Create a new dataframe for top 10 venues in each neighborhood (postal code area)

In [220]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [222]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop
1,M1C,Bar,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Farm
2,M1E,Electronics Store,Rental Car Location,Restaurant,Bank,Intersection,Medical Center,Mexican Restaurant,Breakfast Spot,Doner Restaurant,Donut Shop
3,M1G,Coffee Shop,Korean Restaurant,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Dumpling Restaurant
4,M1H,Fried Chicken Joint,Thai Restaurant,Bank,Athletics & Sports,Caribbean Restaurant,Gas Station,Hakka Restaurant,Lounge,Bakery,Eastern European Restaurant


## Cluster Neighborhoods (Postal Code Area)

In [223]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

Toronto_grouped_clustering = toronto_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 2, 1, 1, 1, 1, 1, 1, 1, 1])

In [229]:
# add clustering labels

neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,4,M1B,Fast Food Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop
1,2,M1C,Bar,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Farm
2,1,M1E,Electronics Store,Rental Car Location,Restaurant,Bank,Intersection,Medical Center,Mexican Restaurant,Breakfast Spot,Doner Restaurant,Donut Shop
3,1,M1G,Coffee Shop,Korean Restaurant,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Dumpling Restaurant
4,1,M1H,Fried Chicken Joint,Thai Restaurant,Bank,Athletics & Sports,Caribbean Restaurant,Gas Station,Hakka Restaurant,Lounge,Bakery,Eastern European Restaurant


In [239]:
neighborhoods_venues_sorted.rename(columns={'Postal Code':'PostalCode'}, inplace=True)

In [240]:
neighborhoods_venues_sorted.columns

Index(['Cluster Labels', 'PostalCode', '1st Most Common Venue',
       '2nd Most Common Venue', '3rd Most Common Venue',
       '4th Most Common Venue', '5th Most Common Venue',
       '6th Most Common Venue', '7th Most Common Venue',
       '8th Most Common Venue', '9th Most Common Venue',
       '10th Most Common Venue'],
      dtype='object')

In [241]:
df_merged1 = df_total

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_merged1 = df_merged1.join(neighborhoods_venues_sorted.set_index('PostalCode'), on='PostalCode')

df_merged1.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,Park,Food & Drink Shop,Bus Stop,Colombian Restaurant,Dessert Shop,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,Intersection,Coffee Shop,Portuguese Restaurant,Hockey Arena,Pizza Place,Yoga Studio,Distribution Center,Dim Sum Restaurant,Diner,Discount Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1.0,Coffee Shop,Bakery,Park,Café,Pub,Restaurant,Theater,Breakfast Spot,Event Space,Shoe Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1.0,Furniture / Home Store,Clothing Store,Accessories Store,Event Space,Coffee Shop,Vietnamese Restaurant,Boutique,Doner Restaurant,Diner,Discount Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,Coffee Shop,Yoga Studio,Persian Restaurant,Distribution Center,Italian Restaurant,Smoothie Shop,Beer Bar,Diner,Portuguese Restaurant,Café


In [244]:
#df_merged1.drop(columns=['Cluster Labels'], axis=1)

## Create cluster map for Toronto

In [264]:
# import color map and colors
import matplotlib.cm as cm
import matplotlib.colors as colors
k = 5
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, postalcode, neibor, cluster in zip(df_merged1['Latitude'], df_merged1['Longitude'], df_merged1['PostalCode'], df_merged1['Neighborhood'], df_merged1['Cluster Labels']):
    #print (poi, cluster)
    label = folium.Popup(str(postalcode) + ',' + str(neibor) + '.', parse_html=True)
    
    # handle when cluster is nan
    cluster_ind = -1
    
    if (not (np.isnan(cluster))):
        cluster_ind = int(cluster)
        
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster_ind-1],
        fill=True,
        fill_color=rainbow[cluster_ind-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Try again with 3 as cluster number 

In [249]:
# set number of clusters
kclusters = 3

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:30] 

array([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1])

In [250]:
# add clustering labels

neighborhoods_venues_sorted.insert(0, 'Cluster Labels3', kmeans.labels_)
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels3,Cluster Labels,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,4,M1B,Fast Food Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop
1,2,2,M1C,Bar,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Farm
2,1,1,M1E,Electronics Store,Rental Car Location,Restaurant,Bank,Intersection,Medical Center,Mexican Restaurant,Breakfast Spot,Doner Restaurant,Donut Shop
3,1,1,M1G,Coffee Shop,Korean Restaurant,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Dumpling Restaurant
4,1,1,M1H,Fried Chicken Joint,Thai Restaurant,Bank,Athletics & Sports,Caribbean Restaurant,Gas Station,Hakka Restaurant,Lounge,Bakery,Eastern European Restaurant


## Merge the result with df_total

In [252]:
df_merged3 = df_total

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_merged3 = df_merged3.join(neighborhoods_venues_sorted.set_index('PostalCode'), on='PostalCode')

df_merged3.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels3,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,0.0,Park,Food & Drink Shop,Bus Stop,Colombian Restaurant,Dessert Shop,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,1.0,Intersection,Coffee Shop,Portuguese Restaurant,Hockey Arena,Pizza Place,Yoga Studio,Distribution Center,Dim Sum Restaurant,Diner,Discount Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1.0,1.0,Coffee Shop,Bakery,Park,Café,Pub,Restaurant,Theater,Breakfast Spot,Event Space,Shoe Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1.0,1.0,Furniture / Home Store,Clothing Store,Accessories Store,Event Space,Coffee Shop,Vietnamese Restaurant,Boutique,Doner Restaurant,Diner,Discount Store
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1.0,1.0,Coffee Shop,Yoga Studio,Persian Restaurant,Distribution Center,Italian Restaurant,Smoothie Shop,Beer Bar,Diner,Portuguese Restaurant,Café


## Create a cluster map

In [263]:
# import color map and colors
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, postalcode, neibor, cluster in zip(df_merged3['Latitude'], df_merged3['Longitude'], df_merged3['PostalCode'], df_merged3['Neighborhood'], df_merged3['Cluster Labels3']):
    #print (poi, cluster)
    label = folium.Popup(str(postalcode) + ',' + str(neibor) + '.', parse_html=True)
    
    # handle when cluster is nan
    cluster_ind = -1
    
    if (not (np.isnan(cluster))):
        cluster_ind = int(cluster)
        
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster_ind-1],
        fill=True,
        fill_color=rainbow[cluster_ind-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine clusters

In [275]:
df_merged3.drop(columns=['Cluster Labels'], inplace=True)

## Cluster 1 - Recreation and fitness places

In [277]:
# explore venues with cluster label 0
df_merged3.loc[df_merged3['Cluster Labels3'] == 0, df_merged3.columns[[0] + list(range(5, df_merged3.shape[1]))]]

Unnamed: 0,PostalCode,Cluster Labels3,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,0.0,Park,Food & Drink Shop,Bus Stop,Colombian Restaurant,Dessert Shop,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore
16,M6C,0.0,Trail,Park,Field,Hockey Arena,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
21,M6E,0.0,Park,Women's Store,Pool,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant
35,M4J,0.0,Park,Convenience Store,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Department Store,Donut Shop
40,M3K,0.0,Park,Airport,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore
52,M2M,0.0,Park,Department Store,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant
61,M4N,0.0,Park,Bus Line,Jewelry Store,Swim School,Yoga Studio,Dog Run,Dim Sum Restaurant,Diner,Discount Store,Distribution Center
64,M9N,0.0,Park,Jewelry Store,Convenience Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Ethiopian Restaurant,Dessert Shop,Drugstore
66,M2P,0.0,Park,Convenience Store,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Department Store,Donut Shop
68,M5P,0.0,Trail,Park,Jewelry Store,Sushi Restaurant,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center


## Cluster 2 - Food and drink places

In [278]:
# explore venues with cluster label 1
df_merged3.loc[df_merged3['Cluster Labels3'] == 1, df_merged3.columns[[1] + list(range(5, df_merged3.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels3,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,1.0,Intersection,Coffee Shop,Portuguese Restaurant,Hockey Arena,Pizza Place,Yoga Studio,Distribution Center,Dim Sum Restaurant,Diner,Discount Store
2,Downtown Toronto,1.0,Coffee Shop,Bakery,Park,Café,Pub,Restaurant,Theater,Breakfast Spot,Event Space,Shoe Store
3,North York,1.0,Furniture / Home Store,Clothing Store,Accessories Store,Event Space,Coffee Shop,Vietnamese Restaurant,Boutique,Doner Restaurant,Diner,Discount Store
4,Downtown Toronto,1.0,Coffee Shop,Yoga Studio,Persian Restaurant,Distribution Center,Italian Restaurant,Smoothie Shop,Beer Bar,Diner,Portuguese Restaurant,Café
6,Scarborough,1.0,Fast Food Restaurant,Yoga Studio,Dessert Shop,Ethiopian Restaurant,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop
...,...,...,...,...,...,...,...,...,...,...,...,...
98,Etobicoke,1.0,River,Yoga Studio,Department Store,Escape Room,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant
99,Downtown Toronto,1.0,Coffee Shop,Gay Bar,Japanese Restaurant,Sushi Restaurant,Restaurant,Yoga Studio,Pub,Café,Bubble Tea Shop,Hotel
100,East Toronto,1.0,Light Rail Station,Fast Food Restaurant,Auto Workshop,Pizza Place,Restaurant,Brewery,Garden,Garden Center,Smoke Shop,Park
101,Etobicoke,1.0,Construction & Landscaping,Baseball Field,Yoga Studio,Drugstore,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant


## Cluster 3 - Bar

In [279]:
# explore venues with cluster label 2
df_merged3.loc[df_merged3['Cluster Labels3'] == 2, df_merged3.columns[[1] + list(range(5, df_merged3.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels3,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Scarborough,2.0,Bar,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,Farm
