In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import csv
import geocoder

## Use BeautifulSoup to scrape the website to get the table of neighborhoods

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
#print(soup)
iTable = soup.find('table', attrs={"class":"wikitable sortable"})
#print(iTable)

## Write the content of the table into a csv file

In [3]:
with open('tasks/neighborhoods_6.csv','w', newline='') as csvfile:
    output_file = csv.writer(csvfile, delimiter=',')
    output_file.writerow(['PostalCode','Borough','Neighborhood'])
    
    # handle each row
    for tr in iTable.tbody.find_all("tr"):
        #print(tr)
        tds = []
        for td in tr.find_all("td"):
            tds.append(td.text.replace('\n','').strip())
        
        if (len(tds) > 0):  # avoid empty list from th tag
            if (tds[1] == 'Not assigned'):
                print("drop this line: ", tds[0])
            else:
                output_file.writerow(tds)

drop this line:  M1A
drop this line:  M2A
drop this line:  M8A
drop this line:  M2B
drop this line:  M7B
drop this line:  M8B
drop this line:  M2C
drop this line:  M7C
drop this line:  M8C
drop this line:  M2E
drop this line:  M3E
drop this line:  M7E
drop this line:  M8E
drop this line:  M9E
drop this line:  M2G
drop this line:  M3G
drop this line:  M7G
drop this line:  M8G
drop this line:  M9G
drop this line:  M7H
drop this line:  M8H
drop this line:  M9H
drop this line:  M7J
drop this line:  M8J
drop this line:  M9J
drop this line:  M7K
drop this line:  M8K
drop this line:  M9K
drop this line:  M7L
drop this line:  M8L
drop this line:  M7M
drop this line:  M8M
drop this line:  M7N
drop this line:  M8N
drop this line:  M3P
drop this line:  M7P
drop this line:  M8P
drop this line:  M3R
drop this line:  M8R
drop this line:  M2S
drop this line:  M3S
drop this line:  M7S
drop this line:  M8S
drop this line:  M9S
drop this line:  M2T
drop this line:  M3T
drop this line:  M6T
drop this lin

## Read csv file into pandas dataframe

In [4]:
df = pd.read_csv('tasks/neighborhoods_6.csv')
print(df.head())

  PostalCode           Borough                                 Neighborhood
0        M3A        North York                                    Parkwoods
1        M4A        North York                             Victoria Village
2        M5A  Downtown Toronto                    Regent Park, Harbourfront
3        M6A        North York             Lawrence Manor, Lawrence Heights
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


In [5]:
df.shape

(103, 3)

## Get long/lat of postal codes with geocoder 

In [6]:
postal_code = df['PostalCode']
print(postal_code)


0      M3A
1      M4A
2      M5A
3      M6A
4      M7A
      ... 
98     M8X
99     M4Y
100    M7Y
101    M8Y
102    M8Z
Name: PostalCode, Length: 103, dtype: object


## Geocoder doesn't work. Use coordinates from existing csv file instead

In [7]:
#lat_lng_coords = None
#while(lat_lng_coords is None):
#    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#    lat_lng_coords = g.latlng
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

# read coordinates from file
df_coordinate = pd.read_csv('tasks/Geospatial_Coordinates.csv')
print(df_coordinate.head())
print("done")

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
done


In [8]:
df_coordinate.rename(columns = {'Postal Code': 'PostalCode'}, inplace=True)

In [9]:
df_coordinate.columns.values

array(['PostalCode', 'Latitude', 'Longitude'], dtype=object)

In [10]:
df_total = pd.merge(df, df_coordinate, on='PostalCode', how='inner')

In [11]:
print(df_total.head())

  PostalCode           Borough                                 Neighborhood  \
0        M3A        North York                                    Parkwoods   
1        M4A        North York                             Victoria Village   
2        M5A  Downtown Toronto                    Regent Park, Harbourfront   
3        M6A        North York             Lawrence Manor, Lawrence Heights   
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government   

    Latitude  Longitude  
0  43.753259 -79.329656  
1  43.725882 -79.315572  
2  43.654260 -79.360636  
3  43.718518 -79.464763  
4  43.662301 -79.389494  


In [14]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df_total['Borough'].unique()),
        df_total.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [17]:
print("Unique boroughs in the dataframe:")
print(df_total['Borough'].unique())

Unique boroughs in the dataframe:
['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']


## Create map of neighborhoods in Toronto 

In [12]:
from geopy.geocoders import Nominatim

address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [78]:
# create map of Toronto using latitude and longitude values

import folium # map rendering library

Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, postalcode in zip(df_total['Latitude'], df_total['Longitude'], df_total['Borough'], df_total['Neighborhood'], df_total['PostalCode']):
    label = '{}, {}, {}'.format(neighborhood, borough, postalcode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_map)  
    
Toronto_map

## Explore the data and segment neighborhoods

### Define Foursquare Credentials and Version

In [19]:
CLIENT_ID = 'S3PRMDQX0MOCKBHZ40GXLRO3NWJ3EIOY0F0GIN52JDY5FQ0J' # your Foursquare ID
CLIENT_SECRET = 'OVMTVGTK5EW0WM5OIMT2CMJ4MGN5EQNUUKRKU1IRCCAQ0MCL' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: S3PRMDQX0MOCKBHZ40GXLRO3NWJ3EIOY0F0GIN52JDY5FQ0J
CLIENT_SECRET:OVMTVGTK5EW0WM5OIMT2CMJ4MGN5EQNUUKRKU1IRCCAQ0MCL


In [29]:
def getNearbyVenues(names, latitudes, longitudes, postalcodes, radius=500):
    LIMIT = 100
    venues_list=[]
    for name, lat, lng, postalcode in zip(names, latitudes, longitudes, postalcodes):
        #print(postalcode)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            postalcode,
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'PostalCode',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [30]:
Toronto_venues = getNearbyVenues(names=df_total['Neighborhood'],
                                   latitudes=df_total['Latitude'],
                                   longitudes=df_total['Longitude'],
                                   postalcodes=df_total['PostalCode']
                                  )


In [32]:
print(Toronto_venues.shape)
Toronto_venues.head()

(2149, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,PostalCode,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,M3A,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,M3A,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,M4A,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,M4A,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,M4A,Portugril,43.725819,-79.312785,Portuguese Restaurant


In [39]:
print('There are {} neighborhoods.'.format(len(Toronto_venues['Neighborhood'].unique())))
Toronto_venues.groupby('Neighborhood').count()

There are 95 neighborhoods.


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,PostalCode,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Agincourt,4,4,4,4,4,4,4
"Alderwood, Long Branch",9,9,9,9,9,9,9
"Bathurst Manor, Wilson Heights, Downsview North",22,22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",22,22,22,22,22,22,22
...,...,...,...,...,...,...,...
"Willowdale, Willowdale East",34,34,34,34,34,34,34
"Willowdale, Willowdale West",5,5,5,5,5,5,5
Woburn,3,3,3,3,3,3,3
Woodbine Heights,8,8,8,8,8,8,8


In [35]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 269 uniques categories.


In [37]:
print(Toronto_venues['Venue Category'].unique())

['Park' 'Food & Drink Shop' 'Hockey Arena' 'Coffee Shop'
 'Portuguese Restaurant' 'Intersection' 'Pizza Place' 'Bakery'
 'Distribution Center' 'Spa' 'Restaurant' 'Pub' 'Breakfast Spot'
 'Historic Site' 'Gym / Fitness Center' 'Farmers Market' 'Chocolate Shop'
 'Dessert Shop' 'Performing Arts Venue' 'Theater' 'Mexican Restaurant'
 'French Restaurant' 'Café' 'Yoga Studio' 'Event Space' 'Shoe Store'
 'Ice Cream Shop' 'Art Gallery' 'Cosmetics Shop' 'Bank'
 'Electronics Store' 'Beer Store' 'Hotel' 'Health Food Store' 'Wine Shop'
 'Antique Shop' 'Boutique' 'Furniture / Home Store'
 'Vietnamese Restaurant' 'Accessories Store' 'Clothing Store'
 'Italian Restaurant' 'Sushi Restaurant' 'Persian Restaurant' 'Creperie'
 'Beer Bar' 'Arts & Crafts Store' 'Hobby Shop' 'Japanese Restaurant'
 'Diner' 'Fried Chicken Joint' 'Chinese Restaurant' 'Smoothie Shop'
 'Sandwich Place' 'Gym' 'College Auditorium' 'Bar'
 'Vegetarian / Vegan Restaurant' 'Fast Food Restaurant'
 'Caribbean Restaurant' 'Baseball Field'

### Analyze Each Neighborhood

In [43]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
Toronto_onehot.shape

(2149, 269)

In [46]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped
#Toronto_grouped.shape

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,...,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,"Willowdale, Willowdale East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000,0.029412,0.0,0.0,0.0,0.0,0.0
91,"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,0.0
92,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,0.0
93,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.125,0.000000,0.0,0.0,0.0,0.0,0.0


### Print each neighborhood along with the top 5 most common venues

In [47]:
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0  Latin American Restaurant  0.25
1                     Lounge  0.25
2               Skating Rink  0.25
3             Breakfast Spot  0.25
4              Metro Station  0.00


----Alderwood, Long Branch----
          venue  freq
0   Pizza Place  0.22
1      Pharmacy  0.11
2  Skating Rink  0.11
3           Gym  0.11
4   Coffee Shop  0.11


----Bathurst Manor, Wilson Heights, Downsview North----
              venue  freq
0              Bank  0.09
1       Coffee Shop  0.09
2    Sandwich Place  0.05
3  Sushi Restaurant  0.05
4       Supermarket  0.05


----Bayview Village----
                 venue  freq
0  Japanese Restaurant  0.25
1                 Café  0.25
2                 Bank  0.25
3   Chinese Restaurant  0.25
4                Motel  0.00


----Bedford Park, Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.09
1         Coffee Shop  0.09
2      Sandwich Place  0.09
3             Butcher  0.05
4        

### create a new dataframe for top 10 venues in each neighborhood

In [50]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [51]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Breakfast Spot,Skating Rink,Latin American Restaurant,Lounge,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant
1,"Alderwood, Long Branch",Pizza Place,Pool,Coffee Shop,Skating Rink,Gym,Pharmacy,Pub,Sandwich Place,Women's Store,Department Store
2,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Ice Cream Shop,Pharmacy,Supermarket,Middle Eastern Restaurant,Sushi Restaurant,Mobile Phone Shop,Restaurant,Fried Chicken Joint
3,Bayview Village,Café,Japanese Restaurant,Bank,Chinese Restaurant,Women's Store,Dim Sum Restaurant,Discount Store,Distribution Center,Dog Run,Doner Restaurant
4,"Bedford Park, Lawrence Manor East",Sandwich Place,Coffee Shop,Italian Restaurant,Liquor Store,Juice Bar,Pub,Butcher,Café,Sushi Restaurant,Restaurant


## Cluster Neighborhoods

In [83]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4])

In [84]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

df_merged = df_total

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_merged = df_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

df_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels2,Cluster Labels1,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2.0,0.0,2.0,Food & Drink Shop,Park,Women's Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,4.0,1.0,4.0,Intersection,Coffee Shop,Pizza Place,Portuguese Restaurant,Hockey Arena,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4.0,1.0,4.0,Coffee Shop,Bakery,Park,Café,Pub,Breakfast Spot,Theater,Restaurant,Event Space,Shoe Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4.0,1.0,4.0,Clothing Store,Event Space,Accessories Store,Vietnamese Restaurant,Coffee Shop,Boutique,Furniture / Home Store,Doner Restaurant,Dim Sum Restaurant,Diner
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4.0,1.0,4.0,Coffee Shop,Yoga Studio,Café,Fried Chicken Joint,Sushi Restaurant,Mexican Restaurant,Bank,Bar,Portuguese Restaurant,Italian Restaurant


In [96]:
df_merged.drop(columns=['Cluster Labels1', 'Cluster Labels2'], axis=1)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2.0,Food & Drink Shop,Park,Women's Store,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,4.0,Intersection,Coffee Shop,Pizza Place,Portuguese Restaurant,Hockey Arena,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,4.0,Coffee Shop,Bakery,Park,Café,Pub,Breakfast Spot,Theater,Restaurant,Event Space,Shoe Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4.0,Clothing Store,Event Space,Accessories Store,Vietnamese Restaurant,Coffee Shop,Boutique,Furniture / Home Store,Doner Restaurant,Dim Sum Restaurant,Diner
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4.0,Coffee Shop,Yoga Studio,Café,Fried Chicken Joint,Sushi Restaurant,Mexican Restaurant,Bank,Bar,Portuguese Restaurant,Italian Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,2.0,Park,River,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,College Gym,Dog Run
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,4.0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Bubble Tea Shop,Mediterranean Restaurant,Café,Men's Store,Yoga Studio
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,4.0,Light Rail Station,Yoga Studio,Auto Workshop,Garden Center,Garden,Fast Food Restaurant,Farmers Market,Comic Shop,Park,Pizza Place
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,2.0,Park,Baseball Field,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Donut Shop,Farmers Market


In [86]:
df_merged['Cluster Labels'].unique()

array([ 2.,  4., nan,  1.,  0.,  3.])

### Create cluster map for Toronto

In [75]:
# import color map and colors
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    #print (poi, cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    
    # handle when cluster is nan
    cluster_ind = -1
    if (np.isnan(cluster)):
        cluster_ind = 5
    else:
        cluster_ind = int(cluster)
        
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster_ind-1],
        fill=True,
        fill_color=rainbow[cluster_ind-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters