In [207]:
import requests
import lxml.html as lh
import pandas as pd
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import numpy as np
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from pandas.io.json import json_normalize

In [208]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [209]:
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


In [210]:
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [211]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

#Removing \n from the dataframe
df = df.replace('\n',' ', regex=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [212]:
df.columns=['Postal Code','Borough','Neighbourhood']
#Grouping by postal codes and aggregating neighbourhoods
df=df.groupby("Postal Code").agg(lambda x:','.join(set(x)))

In [213]:
#Assigning not assigned neighbourhoods same as borough
df.loc[df['Neighbourhood'].isin( ["Not assigned",'Neighbourhood'])]=df.loc[df['Neighbourhood'].isin( ["Not assigned",'Borough'])]
    


In [214]:
#Ignoring not assigned boroughs
df = df.loc[df['Borough'] != 'Not assigned']
df


Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge ,Malvern"
M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union"
M1E,Scarborough,"West Hill ,Guildwood ,Morningside"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park ,Kennedy Park ,Ionview"
M1L,Scarborough,"Clairlea ,Golden Mile ,Oakridge"
M1M,Scarborough,"Cliffcrest ,Scarborough Village West ,Cliffside"
M1N,Scarborough,"Cliffside West ,Birch Cliff"


In [215]:
#Resetting index
df.reset_index()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge ,Malvern"
1,M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union"
2,M1E,Scarborough,"West Hill ,Guildwood ,Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park ,Kennedy Park ,Ionview"
7,M1L,Scarborough,"Clairlea ,Golden Mile ,Oakridge"
8,M1M,Scarborough,"Cliffcrest ,Scarborough Village West ,Cliffside"
9,M1N,Scarborough,"Cliffside West ,Birch Cliff"


In [216]:
#Reading geospatial_coordinates csv into a dataframe
df2 = pd.read_csv('Geospatial_Coordinates.csv')
df2

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [217]:
#Merging the two dataframes
dfinal = df.merge(df2, on="Postal Code", how = 'inner')
dfinal

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge ,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"West Hill ,Guildwood ,Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park ,Kennedy Park ,Ionview",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea ,Golden Mile ,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest ,Scarborough Village West ,Cliffside",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West ,Birch Cliff",43.692657,-79.264848


In [218]:
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(
        len(dfinal['Borough'].unique()),
        dfinal.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighbourhoods.


In [219]:
dfinal['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [220]:
# convert an address into latitude and longitude values

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="t_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [221]:

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for latitude, longitude, label in zip(dfinal['Latitude'], dfinal['Longitude'], dfinal['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [222]:
CLIENT_ID = 'client_id' # Foursquare ID was inserted and removed for security reasons
CLIENT_SECRET = 'client-secret' #Foursquare Secret was inserted and removed for security reasons

VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JHV042UB1TCUG4QS0TMLXLALVNAUOT5A23GOSPVGN1AII3QE
CLIENT_SECRET:30QUMQ2JOJWPNZD2ZZVBCSAU4BSWSCX3DWP3PVB44E4EWQ3A


In [223]:
dfinal.loc[0, 'Neighbourhood']

'Rouge ,Malvern '

In [224]:
neighborhood_latitude = dfinal.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = dfinal.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = dfinal.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Rouge ,Malvern  are 43.806686299999996, -79.19435340000001.


In [225]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius


url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL


'https://api.foursquare.com/v2/venues/explore?&client_id=JHV042UB1TCUG4QS0TMLXLALVNAUOT5A23GOSPVGN1AII3QE&client_secret=30QUMQ2JOJWPNZD2ZZVBCSAU4BSWSCX3DWP3PVB44E4EWQ3A&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100'

In [226]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d1b82d1d29cbb00234cb4c8'},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.8111863045, 'lng': -79.18812958073042},
   'sw': {'lat': 43.80218629549999, 'lng': -79.2005772192696}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bb6b9446edc76b0d771311c',
       'name': "Wendy's",
       'location': {'crossStreet': 'Morningside & Sheppard',
        'lat': 43.80744841934756,
        'lng': -79.19905558052072,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'lng': -79.19905558052072}],
        'distance': 387,
        'cc': 'CA',
        'city': 'Toronto',
    

In [227]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [228]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.807448,-79.199056
1,Interprovincial Group,Print Shop,43.80563,-79.200378


In [229]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [230]:
toronto_venues = getNearbyVenues(names=dfinal['Neighbourhood'],
                                   latitudes=dfinal['Latitude'],
                                   longitudes=dfinal['Longitude']
                                  )



Rouge ,Malvern 
Highland Creek ,Rouge Hill ,Port Union 
West Hill ,Guildwood ,Morningside 
Woburn 
Cedarbrae 
Scarborough Village 
East Birchmount Park ,Kennedy Park ,Ionview 
Clairlea ,Golden Mile ,Oakridge 
Cliffcrest ,Scarborough Village West ,Cliffside 
Cliffside West ,Birch Cliff 
Wexford Heights ,Scarborough Town Centre ,Dorset Park 
Wexford ,Maryvale 
Agincourt 
Tam O'Shanter ,Sullivan ,Clarks Corners 
L'Amoreaux East ,Milliken ,Steeles East ,Agincourt North 
L'Amoreaux West 
Upper Rouge 
Hillcrest Village 
Oriole ,Fairview ,Henry Farm 
Bayview Village 
Silver Hills ,York Mills 
Newtonbrook ,Willowdale 
Willowdale South 
York Mills West 
Willowdale West 
Parkwoods 
Don Mills North 
Flemingdon Park ,Don Mills South 
Bathurst Manor ,Downsview North ,Wilson Heights 
Northwood Park ,York University 
Downsview East ,CFB Toronto 
Downsview West 
Downsview Central 
Downsview Northwest 
Victoria Village 
Parkview Hill ,Woodbine Gardens 
Woodbine Heights 
The Beaches 
Leaside 
Thorncliff

In [231]:
print(dfinal.shape)
dfinal.head()

(103, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge ,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"West Hill ,Guildwood ,Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [232]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood ,Long Branch",10,10,10,10,10,10
"Bathurst Manor ,Downsview North ,Wilson Heights",18,18,18,18,18,18
Bayview Village,4,4,4,4,4,4
"Bedford Park ,Lawrence Manor East",25,25,25,25,25,25
Berczy Park,55,55,55,55,55,55
Business Reply Mail Processing Centre 969 Eastern,19,19,19,19,19,19
"Cabbagetown ,St. James Town",46,46,46,46,46,46
Caledonia-Fairbanks,6,6,6,6,6,6
Canada Post Gateway Processing Centre,10,10,10,10,10,10


In [233]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 280 uniques categories.


In [234]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [235]:
toronto_onehot.shape

(2259, 280)

In [236]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,"Alderwood ,Long Branch",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Bathurst Manor ,Downsview North ,Wilson Heights",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000
3,Bayview Village,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,"Bedford Park ,Lawrence Manor East",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,Berczy Park,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.018182,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,Business Reply Mail Processing Centre 969 East...,0.052632,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,"Cabbagetown ,St. James Town",0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Caledonia-Fairbanks,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.166667
9,Canada Post Gateway Processing Centre,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [237]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt ----
                venue  freq
0      Sandwich Place  0.25
1              Lounge  0.25
2      Breakfast Spot  0.25
3  Chinese Restaurant  0.25
4         Yoga Studio  0.00


----Alderwood ,Long Branch ----
            venue  freq
0     Pizza Place   0.2
1    Skating Rink   0.1
2  Sandwich Place   0.1
3    Dance Studio   0.1
4             Pub   0.1


----Bathurst Manor ,Downsview North ,Wilson Heights ----
                       venue  freq
0                Coffee Shop  0.11
1  Middle Eastern Restaurant  0.06
2                 Restaurant  0.06
3           Sushi Restaurant  0.06
4         Frozen Yogurt Shop  0.06


----Bayview Village ----
                 venue  freq
0  Japanese Restaurant  0.25
1                 Bank  0.25
2   Chinese Restaurant  0.25
3                 Café  0.25
4    Mobile Phone Shop  0.00


----Bedford Park ,Lawrence Manor East ----
                  venue  freq
0  Fast Food Restaurant  0.08
1    Italian Restaurant  0.08
2           Coffee Shop  0.08


           venue  freq
0          Trail  0.25
1          Field  0.25
2   Tennis Court  0.25
3   Hockey Arena  0.25
4  Metro Station  0.00


----Island airport ,Bathurst Quay ,South Niagara ,Railway Lands ,CN Tower ,King and Spadina ,Harbourfront West ----
              venue  freq
0  Airport Terminal  0.12
1    Airport Lounge  0.12
2   Airport Service  0.12
3   Harbor / Marina  0.06
4  Sculpture Garden  0.06


----Keelesdale ,Mount Dennis ,Del Ray ,Silverthorn ----
                   venue  freq
0             Restaurant  0.25
1         Discount Store  0.25
2         Sandwich Place  0.25
3  Check Cashing Service  0.25
4          Movie Theater  0.00


----King ,Richmond ,Adelaide ----
                 venue  freq
0          Coffee Shop  0.06
1                 Café  0.05
2           Steakhouse  0.04
3  American Restaurant  0.04
4                  Bar  0.04


----Kingsview Village ,St. Phillips ,Richview Gardens ,Martin Grove Gardens ----
                venue  freq
0         Pizza Place  

                             venue  freq
0                     Tennis Court   0.5
1                       Playground   0.5
2                      Yoga Studio   0.0
3               Mexican Restaurant   0.0
4  Molecular Gastronomy Restaurant   0.0


----Sunnylea ,Old Mill South ,Mimico NE ,King's Mill Park ,The Queensway East ,Royal York South East ,Kingsway Park South East ,Humber Bay ----
                             venue  freq
0                   Baseball Field   1.0
1                      Yoga Studio   0.0
2  Molecular Gastronomy Restaurant   0.0
3       Modern European Restaurant   0.0
4                Mobile Phone Shop   0.0


----Tam O'Shanter ,Sullivan ,Clarks Corners ----
                 venue  freq
0          Pizza Place   0.2
1             Pharmacy   0.1
2         Noodle House   0.1
3  Fried Chicken Joint   0.1
4                 Bank   0.1


----The Annex ,Yorkville ,North Midtown ----
            venue  freq
0     Coffee Shop  0.13
1  Sandwich Place  0.13
2            Café 

In [238]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [125]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Breakfast Spot,Sandwich Place,Chinese Restaurant,Electronics Store,Eastern European Restaurant,Empanada Restaurant,Dumpling Restaurant,Drugstore,Department Store
1,"Alderwood ,Long Branch",Pizza Place,Gym,Pharmacy,Pub,Sandwich Place,Pool,Dance Studio,Skating Rink,Coffee Shop,Drugstore
2,"Bathurst Manor ,Downsview North ,Wilson Heights",Coffee Shop,Middle Eastern Restaurant,Frozen Yogurt Shop,Sandwich Place,Bridal Shop,Fast Food Restaurant,Diner,Restaurant,Bank,Supermarket
3,Bayview Village,Chinese Restaurant,Café,Japanese Restaurant,Bank,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,"Bedford Park ,Lawrence Manor East",Italian Restaurant,Coffee Shop,Fast Food Restaurant,Liquor Store,Butcher,Indian Restaurant,Ice Cream Shop,Café,Sushi Restaurant,Japanese Restaurant


In [239]:
# set number of clusters
kclusters = 3

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1])

In [250]:
#neighborhoods_venues_sorted  = neighborhoods_venues_sorted .reset_index(level=0, drop=True).reset_index()
neighborhoods_venues_sorted.reset_index(level=-1, drop=True)



Unnamed: 0,Cluster_Labels,Cluster Labels,ClusterLabels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,1,1,Agincourt,Lounge,Breakfast Spot,Sandwich Place,Chinese Restaurant,Electronics Store,Eastern European Restaurant,Empanada Restaurant,Dumpling Restaurant,Drugstore,Department Store
1,1,1,1,"Alderwood ,Long Branch",Pizza Place,Gym,Pharmacy,Pub,Sandwich Place,Pool,Dance Studio,Skating Rink,Coffee Shop,Drugstore
2,1,1,1,"Bathurst Manor ,Downsview North ,Wilson Heights",Coffee Shop,Middle Eastern Restaurant,Frozen Yogurt Shop,Sandwich Place,Bridal Shop,Fast Food Restaurant,Diner,Restaurant,Bank,Supermarket
3,1,1,1,Bayview Village,Chinese Restaurant,Café,Japanese Restaurant,Bank,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,1,1,1,"Bedford Park ,Lawrence Manor East",Italian Restaurant,Coffee Shop,Fast Food Restaurant,Liquor Store,Butcher,Indian Restaurant,Ice Cream Shop,Café,Sushi Restaurant,Japanese Restaurant
5,1,1,1,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Seafood Restaurant,Italian Restaurant,Steakhouse,Café,Cheese Shop,Beer Bar,Farmers Market
6,1,1,1,Business Reply Mail Processing Centre 969 East...,Yoga Studio,Auto Workshop,Park,Comic Shop,Pizza Place,Butcher,Recording Studio,Restaurant,Burrito Place,Brewery
7,1,1,1,"Cabbagetown ,St. James Town",Coffee Shop,Restaurant,Pub,Pizza Place,Park,Café,Italian Restaurant,Bakery,Diner,Sandwich Place
8,3,3,3,Caledonia-Fairbanks,Park,Women's Store,Pharmacy,Fast Food Restaurant,Market,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
9,1,1,1,Canada Post Gateway Processing Centre,Coffee Shop,Hotel,Sandwich Place,Gym / Fitness Center,Burrito Place,Mediterranean Restaurant,Fried Chicken Joint,American Restaurant,Women's Store,Dim Sum Restaurant


In [255]:

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood

toronto_merged=pd.concat([neighborhoods_venues_sorted,toronto_merged], axis=0)
toronto_merged.head() # check the last columns!



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Unnamed: 0,10th Most Common Venue,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,Borough,Cluster Labels,ClusterLabels,Cluster_Labels,Latitude,Longitude,Neighborhood,Neighbourhood,Postal Code
0,Department Store,Lounge,Breakfast Spot,Sandwich Place,Chinese Restaurant,Electronics Store,Eastern European Restaurant,Empanada Restaurant,Dumpling Restaurant,Drugstore,,1.0,1.0,1.0,,,Agincourt,,
1,Drugstore,Pizza Place,Gym,Pharmacy,Pub,Sandwich Place,Pool,Dance Studio,Skating Rink,Coffee Shop,,1.0,1.0,1.0,,,"Alderwood ,Long Branch",,
2,Supermarket,Coffee Shop,Middle Eastern Restaurant,Frozen Yogurt Shop,Sandwich Place,Bridal Shop,Fast Food Restaurant,Diner,Restaurant,Bank,,1.0,1.0,1.0,,,"Bathurst Manor ,Downsview North ,Wilson Heights",,
3,Donut Shop,Chinese Restaurant,Café,Japanese Restaurant,Bank,Women's Store,Diner,Discount Store,Dog Run,Doner Restaurant,,1.0,1.0,1.0,,,Bayview Village,,
4,Japanese Restaurant,Italian Restaurant,Coffee Shop,Fast Food Restaurant,Liquor Store,Butcher,Indian Restaurant,Ice Cream Shop,Café,Sushi Restaurant,,1.0,1.0,1.0,,,"Bedford Park ,Lawrence Manor East",,


In [263]:
print(latitude)
print(longitude)


43.706748299999994
-79.5940544


<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x0000010F5D97EC88>

In [256]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    #label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [43.706748299999994, -79.5940544],
        radius=5,
        popup=label,
        color=[colors.rgb2hex(i) for i in colors_array],

        fill=True,
        fill_color=[colors.rgb2hex(i) for i in colors_array],

        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters