In [2]:
import pandas as pd
import numpy as np
import json
import requests
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

from geopy.geocoders import Nominatim 
from bs4 import BeautifulSoup

In [3]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

# All requested packages already installed.



## Scrape the wikipedia page

In [4]:
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(page.content,'html.parser')
table = soup.find('tbody')
rows = table.select('tr')
row = [r.get_text() for r in rows]

## Create and clean dataframe

In [5]:
df = pd.DataFrame(row)
df1 = df[0].str.split('\n', expand = True)
df2 = df1.rename(columns = df1.iloc[0])
df3 = df2.drop(df2.index[0])
df3.head()

Unnamed: 0,Unnamed: 1,Postal code,Unnamed: 3,Borough,Unnamed: 5,Neighborhood,Unnamed: 7
1,,M1A,,Not assigned,,,
2,,M2A,,Not assigned,,,
3,,M3A,,North York,,Parkwoods,
4,,M4A,,North York,,Victoria Village,
5,,M5A,,Downtown Toronto,,Regent Park / Harbourfront,


In [6]:
df4 = df3[df3.Borough != 'Not assigned']
df4.head(10)

Unnamed: 0,Unnamed: 1,Postal code,Unnamed: 3,Borough,Unnamed: 5,Neighborhood,Unnamed: 7
3,,M3A,,North York,,Parkwoods,
4,,M4A,,North York,,Victoria Village,
5,,M5A,,Downtown Toronto,,Regent Park / Harbourfront,
6,,M6A,,North York,,Lawrence Manor / Lawrence Heights,
7,,M7A,,Downtown Toronto,,Queen's Park / Ontario Provincial Government,
9,,M9A,,Etobicoke,,Islington Avenue,
10,,M1B,,Scarborough,,Malvern / Rouge,
12,,M3B,,North York,,Don Mills,
13,,M4B,,East York,,Parkview Hill / Woodbine Gardens,
14,,M5B,,Downtown Toronto,,"Garden District, Ryerson",


## Combine neighborhoods which have the same postcode

In [7]:
df5 = df4.groupby(['Postal code','Borough'], sort = False).agg(','.join)
df5.reset_index(inplace = True)
df5.head(10)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [8]:
df5.shape

(103, 3)

In [9]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.columns = ['Postal code','Latitude','Longitude']

In [10]:
df_pos = pd.merge(df5, df_geo, on=['Postal code'], how='inner')
df_tor = df_pos[['Borough', 'Neighborhood', 'Postal code', 'Latitude', 'Longitude']].copy()
df_tor.head(10)

Unnamed: 0,Borough,Neighborhood,Postal code,Latitude,Longitude
0,North York,Parkwoods,M3A,43.753259,-79.329656
1,North York,Victoria Village,M4A,43.725882,-79.315572
2,Downtown Toronto,Regent Park / Harbourfront,M5A,43.65426,-79.360636
3,North York,Lawrence Manor / Lawrence Heights,M6A,43.718518,-79.464763
4,Downtown Toronto,Queen's Park / Ontario Provincial Government,M7A,43.662301,-79.389494
5,Etobicoke,Islington Avenue,M9A,43.667856,-79.532242
6,Scarborough,Malvern / Rouge,M1B,43.806686,-79.194353
7,North York,Don Mills,M3B,43.745906,-79.352188
8,East York,Parkview Hill / Woodbine Gardens,M4B,43.706397,-79.309937
9,Downtown Toronto,"Garden District, Ryerson",M5B,43.657162,-79.378937


In [11]:
df_tor.groupby('Borough').count()['Neighborhood']

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Scarborough         17
West Toronto         6
York                 5
Name: Neighborhood, dtype: int64

In [12]:
df_toronto = df_tor[df_tor['Borough'].str.contains('Toronto')]
df_toronto.reset_index(inplace = True)
df_toronto.drop('index',axis = 1, inplace = True)
df_toronto.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Borough,Neighborhood,Postal code,Latitude,Longitude
0,Downtown Toronto,Regent Park / Harbourfront,M5A,43.65426,-79.360636
1,Downtown Toronto,Queen's Park / Ontario Provincial Government,M7A,43.662301,-79.389494
2,Downtown Toronto,"Garden District, Ryerson",M5B,43.657162,-79.378937
3,Downtown Toronto,St. James Town,M5C,43.651494,-79.375418
4,East Toronto,The Beaches,M4E,43.676357,-79.293031
5,Downtown Toronto,Berczy Park,M5E,43.644771,-79.373306
6,Downtown Toronto,Central Bay Street,M5G,43.657952,-79.387383
7,Downtown Toronto,Christie,M6G,43.669542,-79.422564
8,Downtown Toronto,Richmond / Adelaide / King,M5H,43.650571,-79.384568
9,West Toronto,Dufferin / Dovercourt Village,M6H,43.669005,-79.442259


In [13]:
df_toronto.groupby('Borough').count()['Neighborhood']

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
West Toronto         6
Name: Neighborhood, dtype: int64

In [14]:
#Create list with the Boroughs (to be used later)
boroughs = df_toronto['Borough'].unique().tolist()

In [15]:
lat_toronto = df_toronto['Latitude'].mean()
lon_toronto = df_toronto['Longitude'].mean()
print('The geographical coordinates of Toronto are {}, {}'.format(lat_toronto, lon_toronto))

The geographical coordinates of Toronto are 43.66713498717949, -79.38987324871795


In [16]:
map_toronto = folium.Map(location=[lat_toronto, lon_toronto], zoom_start=10)

# add markers to map
for lat, lng, borough, Neighbourhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(Neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Define Foursquare Credentials and Version

In [17]:
CLIENT_ID = 'F4RRK0B2LFZBDX3SFFUCBG25AZX5A5DMYEPC3124KAQ5PVBJ' # your Foursquare ID
CLIENT_SECRET = '2L55SWTBQASVTTUYCIQM0AW4A4LRZMYDZ5HXQ4RUPQNOD3KD' # your Foursquare Secret
VERSION = '20203004' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: F4RRK0B2LFZBDX3SFFUCBG25AZX5A5DMYEPC3124KAQ5PVBJ
CLIENT_SECRET:2L55SWTBQASVTTUYCIQM0AW4A4LRZMYDZ5HXQ4RUPQNOD3KD


In [18]:
# defining radius and limit of venues to get
radius=500
LIMIT=100

In [19]:
import requests

## Define a function getNearbyVenues

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

Regent Park / Harbourfront
Queen's Park / Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond / Adelaide / King
Dufferin / Dovercourt Village
Harbourfront East / Union Station / Toronto Islands
Little Portugal / Trinity
The Danforth West / Riverdale
Toronto Dominion Centre / Design Exchange
Brockton / Parkdale Village / Exhibition Place
India Bazaar / The Beaches West
Commerce Court / Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park / The Junction South
North Toronto West
The Annex / North Midtown / Yorkville
Parkdale / Roncesvalles
Davisville
University of Toronto / Harbord
Runnymede / Swansea
Moore Park / Summerhill East
Kensington Market / Chinatown / Grange Park
Summerhill West / Rathnelly / South Hill / Forest Hill SE / Deer Park
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport
Roseda

In [22]:
toronto_venues.shape

(1604, 7)

In [23]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,55,55,55,55,55,55
Brockton / Parkdale Village / Exhibition Place,24,24,24,24,24,24
Business reply mail Processing CentrE,16,16,16,16,16,16
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,16,16,16,16,16,16
Central Bay Street,60,60,60,60,60,60
Christie,16,16,16,16,16,16
Church and Wellesley,77,77,77,77,77,77
Commerce Court / Victoria Hotel,100,100,100,100,100,100
Davisville,33,33,33,33,33,33
Davisville North,8,8,8,8,8,8


In [24]:
toronto_venues['Venue Category'].unique()[:100]

array(['Bakery', 'Coffee Shop', 'Breakfast Spot', 'Distribution Center',
       'Spa', 'Restaurant', 'Park', 'Gym / Fitness Center',
       'Historic Site', 'Farmers Market', 'Chocolate Shop', 'Pub',
       'Performing Arts Venue', 'Dessert Shop', 'French Restaurant',
       'Yoga Studio', 'Café', 'Theater', 'Event Space', 'Ice Cream Shop',
       'Shoe Store', 'Art Gallery', 'Asian Restaurant', 'Cosmetics Shop',
       'Bank', 'Electronics Store', 'Beer Store', 'Hotel',
       'Health Food Store', 'Antique Shop', 'Playground',
       'Sushi Restaurant', 'Italian Restaurant', 'Creperie',
       'Mexican Restaurant', 'Beer Bar', 'Arts & Crafts Store',
       'Burrito Place', 'Hobby Shop', 'Diner', 'Fried Chicken Joint',
       'Discount Store', 'Japanese Restaurant', 'Burger Joint',
       'Juice Bar', 'Sandwich Place', 'Gym', 'Bar', 'College Auditorium',
       'College Cafeteria', 'General Entertainment', 'Clothing Store',
       'Tea Room', 'Comic Shop', 'Plaza', 'Music Venue',
     

In [25]:
"Indian Restaurant" in toronto_venues['Venue Category'].unique()

True

In [26]:
to_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
to_onehot['Neighborhoods'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [to_onehot.columns[-1]] + list(to_onehot.columns[:-1])
to_onehot = to_onehot[fixed_columns]

print(to_onehot.shape)
to_onehot.head()

(1604, 229)


Unnamed: 0,Neighborhoods,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
to_grouped = to_onehot.groupby(['Neighborhoods']).mean().reset_index()
print(to_grouped.shape)
to_grouped

(39, 229)


Unnamed: 0,Neighborhoods,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0
1,Brockton / Parkdale Village / Exhibition Place,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business reply mail Processing CentrE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CN Tower / King and Spadina / Railway Lands / ...,0.0625,0.0625,0.0625,0.125,0.1875,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.0,0.0,0.016667
5,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.0,...,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025974
7,Commerce Court / Victoria Hotel,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0
8,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
len(to_grouped[to_grouped["Indian Restaurant"] > 0])

7

In [29]:
to_indian = to_grouped[["Neighborhoods", "Indian Restaurant"]]

In [30]:
to_indian.head(9)

Unnamed: 0,Neighborhoods,Indian Restaurant
0,Berczy Park,0.0
1,Brockton / Parkdale Village / Exhibition Place,0.0
2,Business reply mail Processing CentrE,0.0
3,CN Tower / King and Spadina / Railway Lands / ...,0.0
4,Central Bay Street,0.016667
5,Christie,0.0
6,Church and Wellesley,0.012987
7,Commerce Court / Victoria Hotel,0.0
8,Davisville,0.030303


In [31]:
nclusters = 3

to_clustering = to_indian.drop(['Neighborhoods'],1)

kmeans = KMeans(n_clusters = nclusters, random_state = 1)
kmeans.fit(to_clustering)

kmeans.labels_[0:20]

array([0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
      dtype=int32)

In [32]:
to_merged = to_indian.copy()
to_merged['Cluster labels'] = kmeans.labels_

In [33]:
to_merged.rename(columns = {'Neighborhoods':'Neighborhood'}, inplace = True)
to_merged.head(10)

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster labels
0,Berczy Park,0.0,0
1,Brockton / Parkdale Village / Exhibition Place,0.0,0
2,Business reply mail Processing CentrE,0.0,0
3,CN Tower / King and Spadina / Railway Lands / ...,0.0,0
4,Central Bay Street,0.016667,1
5,Christie,0.0,0
6,Church and Wellesley,0.012987,1
7,Commerce Court / Victoria Hotel,0.0,0
8,Davisville,0.030303,2
9,Davisville North,0.0,0


In [34]:
to_merged = to_merged.join(toronto_venues.set_index("Neighborhood"), on="Neighborhood")

print(to_merged.shape)
to_merged.head()

(1604, 9)


Unnamed: 0,Neighborhood,Indian Restaurant,Cluster labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Berczy Park,0.0,0,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store
0,Berczy Park,0.0,0,43.644771,-79.373306,The Keg Steakhouse + Bar - Esplanade,43.646712,-79.374768,Restaurant
0,Berczy Park,0.0,0,43.644771,-79.373306,Fresh On Front,43.647815,-79.374453,Vegetarian / Vegan Restaurant
0,Berczy Park,0.0,0,43.644771,-79.373306,Meridian Hall,43.646292,-79.376022,Concert Hall
0,Berczy Park,0.0,0,43.644771,-79.373306,Hockey Hall Of Fame (Hockey Hall of Fame),43.646974,-79.377323,Museum


In [36]:
to_merged.sort_values(["Cluster labels"], inplace = True)
to_merged.head()

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Berczy Park,0.0,0,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store
11,First Canadian Place / Underground city,0.0,0,43.648429,-79.38228,King Taps,43.648476,-79.382058,Gastropub
7,Commerce Court / Victoria Hotel,0.0,0,43.648198,-79.379817,Ruby Thai (First Canadian Place),43.649091,-79.3816,Thai Restaurant
7,Commerce Court / Victoria Hotel,0.0,0,43.648198,-79.379817,Rosalinda,43.650252,-79.385156,Vegetarian / Vegan Restaurant
7,Commerce Court / Victoria Hotel,0.0,0,43.648198,-79.379817,Cactus Club Cafe,43.649552,-79.381671,American Restaurant


In [37]:
map_clusters = folium.Map(location=[lat_toronto, lon_toronto],zoom_start=14)

# set color scheme for the clusters


# add markers to the map
markers_colors={}
markers_colors[0] = 'red'
markers_colors[1] = 'blue'
markers_colors[2] = 'green'
markers_colors[3] = 'yellow'
markers_colors[4] = 'cyan'
markers_colors[5] = 'black'
for lat, lon, cluster in zip(to_merged['Neighborhood Latitude'], to_merged['Neighborhood Longitude'], to_merged['Cluster labels']):
    
    
    folium.features.CircleMarker(
        [lat, lon],
        radius=5,
       
        color =markers_colors[cluster],
        fill_color=markers_colors[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [38]:
# cluster 0
to_merged.loc[(to_merged['Cluster labels'] == 0) & (to_merged['Venue Category'] == 'Indian Restaurant')]

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category


In [39]:
# cluster 1
to_merged.loc[(to_merged['Cluster labels'] == 1) & (to_merged['Venue Category'] == 'Indian Restaurant')]

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
30,St. James Town / Cabbagetown,0.022222,1,43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant
6,Church and Wellesley,0.012987,1,43.66586,-79.38316,Kothur Indian Cuisine,43.667872,-79.385659,Indian Restaurant
4,Central Bay Street,0.016667,1,43.657952,-79.387383,Colaba Junction,43.66094,-79.385635,Indian Restaurant
36,The Danforth West / Riverdale,0.02381,1,43.679557,-79.352188,Sher-E-Punjab,43.677308,-79.353066,Indian Restaurant
14,Harbourfront East / Union Station / Toronto Is...,0.01,1,43.640816,-79.381752,Indian Roti House,43.63906,-79.385422,Indian Restaurant


In [40]:
# cluster 2
to_merged.loc[(to_merged['Cluster labels'] == 2) & (to_merged['Venue Category'] == 'Indian Restaurant')]

Unnamed: 0,Neighborhood,Indian Restaurant,Cluster labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
34,The Annex / North Midtown / Yorkville,0.041667,2,43.67271,-79.405678,Roti Cuisine of India,43.674618,-79.408249,Indian Restaurant
8,Davisville,0.030303,2,43.704324,-79.38879,Marigold Indian Bistro,43.702881,-79.388008,Indian Restaurant
