In [81]:
import requests
from bs4 import BeautifulSoup

In [82]:
TARGET_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [83]:
res = requests.get(TARGET_URL).text
soup = BeautifulSoup(res,'lxml')

In [84]:
postal_codes = []
neighborhoods = []
boroughs = []

In [85]:
for items in soup.find('table', class_='wikitable sortable').find_all('tr')[1::1]:
    data = items.find_all(['th','td'])
    try:
        if data[1].text != 'Not assigned\n':
            
            postal_code = data[0].text.translate({ord('\n'): None})
            borough = data[1].text.translate({ord('\n'): None})
            neighborhood = data[2].text.translate({ord('\n'): None})
            
            print(postal_code, borough, neighborhood, sep=" | ")
            
            postal_codes.append(postal_code)
            boroughs.append(borough)
            neighborhoods.append(neighborhood)
            

    except IndexError:
        pass

M3A | North York | Parkwoods
M4A | North York | Victoria Village
M5A | Downtown Toronto | Regent Park, Harbourfront
M6A | North York | Lawrence Manor, Lawrence Heights
M7A | Downtown Toronto | Queen's Park, Ontario Provincial Government
M9A | Etobicoke | Islington Avenue, Humber Valley Village
M1B | Scarborough | Malvern, Rouge
M3B | North York | Don Mills
M4B | East York | Parkview Hill, Woodbine Gardens
M5B | Downtown Toronto | Garden District, Ryerson
M6B | North York | Glencairn
M9B | Etobicoke | West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
M1C | Scarborough | Rouge Hill, Port Union, Highland Creek
M3C | North York | Don Mills
M4C | East York | Woodbine Heights
M5C | Downtown Toronto | St. James Town
M6C | York | Humewood-Cedarvale
M9C | Etobicoke | Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
M1E | Scarborough | Guildwood, Morningside, West Hill
M4E | East Toronto | The Beaches
M5E | Downtown Toronto | Berczy Park
M6E | York | Caledonia-F

In [86]:
import pandas as pd
import numpy as np

In [87]:
data_dict = {'PostalCode':postal_codes,'Borough':boroughs,'Neighborhood':neighborhoods}
data = pd.DataFrame(data_dict)

In [88]:
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [89]:
missing_data = data.isnull()
print(missing_data[missing_data['PostalCode']==True].shape)
print(missing_data[missing_data['Borough']==True].shape)
print(missing_data[missing_data['Neighborhood']==True].shape)

(0, 3)
(0, 3)
(0, 3)


So none of the data values are missing, which is good

In [90]:
data['Neighborhood'].value_counts()

Downsview                                   4
Don Mills                                   2
Davisville North                            1
York Mills West                             1
Humberlea, Emery                            1
                                           ..
Wexford, Maryvale                           1
Toronto Dominion Centre, Design Exchange    1
Central Bay Street                          1
Parkview Hill, Woodbine Gardens             1
Scarborough Village                         1
Name: Neighborhood, Length: 99, dtype: int64

In [91]:
data.shape

(103, 3)

In [92]:
lat_long_data = pd.read_csv('Geospatial_Coordinates.csv')

In [93]:
lat = []
long = []
for i in np.arange(0,data.shape[0],1):
    postal_code = data['PostalCode'][i]
    lat.append(float(lat_long_data['Latitude'][lat_long_data['Postal Code']==postal_code]))
    long.append(float(lat_long_data['Longitude'][lat_long_data['Postal Code']==postal_code]))
data['Latitude'] = lat
data['Longitude'] = long

In [94]:
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [95]:
missing_data = data.isnull()
print(missing_data[missing_data['Latitude']==True].shape)
print(missing_data[missing_data['Longitude']==True].shape)

(0, 5)
(0, 5)


In [96]:
import folium
from geopy.geocoders import Nominatim

In [97]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [98]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(data['Latitude'], data['Longitude'], data['Borough'], data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

We'll consider only those boroughs that contain the word 'Toronto'

In [100]:
data = data[data['Borough'].str.contains('Toronto')].reset_index()

In [101]:
data.shape

(39, 6)

In [103]:
data['Borough'].value_counts()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

So the 4 Boroughs are :    
* Downtown Toronto   
* Central Toronto   
* West Toronto   
* East Toronto    

In [104]:
CLIENT_ID = "NHZQYFYG31KQDECK5ADMERSUAU31STR4GD11BE1IAACJFETM"
CLIENT_SECRET = "ZYJZGROI4TFWWFEKA5UC03KT23RASIYP5T1GRURD2RWWWOST"
VERSION = '20180604'
LIMIT = 30

In [105]:
downtown_data = data[data['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [109]:
# create map of Downtown Toronto using latitude and longitude values
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, label in zip(downtown_data['Latitude'], downtown_data['Longitude'], downtown_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

In [110]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [119]:
resp_cache = {}

In [125]:
def get_nearby_venues(names, latitudes, longitudes, radius, Limit):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        key = name + ' ' + str(lat) + ' , ' + str(lng)
        
        print(key)
        
        if key in resp_cache:
            print( 'using cache')
            response = resp_cache[key]
        else :  
            # create the API request URL
            url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET, 
                VERSION, 
                lat, 
                lng, 
                radius, 
                Limit)
            
            # make the GET request
            response = requests.get(url).json()["response"]
            resp_cache[key]=response
        
        if 'groups' not in response:
            print("response :", response, " skipping ...")
            continue
        
        results = response['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])

        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [126]:
r  = 500
LIMIT = 1000

toronto_venues = get_nearby_venues(names=data['Neighborhood'],
                                latitudes=data['Latitude'],
                                longitudes=data['Longitude'],
                                radius=r,
                                Limit=LIMIT)

Regent Park, Harbourfront 43.6542599 , -79.3606359
using cache
Queen's Park, Ontario Provincial Government 43.6623015 , -79.3894938
using cache
Garden District, Ryerson 43.6571618 , -79.37893709999999
using cache
response : {}  skipping ...
St. James Town 43.6514939 , -79.3754179
The Beaches 43.67635739999999 , -79.2930312
Berczy Park 43.644770799999996 , -79.3733064
Central Bay Street 43.6579524 , -79.3873826
Christie 43.669542 , -79.4225637
Richmond, Adelaide, King 43.65057120000001 , -79.3845675
Dufferin, Dovercourt Village 43.66900510000001 , -79.4422593
Harbourfront East, Union Station, Toronto Islands 43.6408157 , -79.38175229999999
Little Portugal, Trinity 43.647926700000006 , -79.4197497
The Danforth West, Riverdale 43.6795571 , -79.352188
Toronto Dominion Centre, Design Exchange 43.6471768 , -79.38157640000001
Brockton, Parkdale Village, Exhibition Place 43.6368472 , -79.42819140000002
India Bazaar, The Beaches West 43.6689985 , -79.31557159999998
Commerce Court, Victoria Hote

For some reason the API returned emp response for 'Garden District, Ryerson', skipping records for tatneighborhood

In [127]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub


In [131]:
toronto_venues['Venue Category'].unique()

array(['Bakery', 'Coffee Shop', 'Distribution Center', 'Spa', 'Pub',
       'Park', 'Restaurant', 'Breakfast Spot', 'Gym / Fitness Center',
       'Historic Site', 'Farmers Market', 'Performing Arts Venue',
       'Chocolate Shop', 'Dessert Shop', 'French Restaurant', 'Café',
       'Yoga Studio', 'Theater', 'Event Space', 'Shoe Store',
       'Ice Cream Shop', 'Art Gallery', 'Cosmetics Shop',
       'Asian Restaurant', 'Electronics Store', 'Bank', 'Beer Store',
       'Health Food Store', 'Antique Shop', 'Italian Restaurant',
       'Sushi Restaurant', 'Creperie', 'Beer Bar', 'Arts & Crafts Store',
       'Burrito Place', 'Mexican Restaurant', 'Hobby Shop', 'Diner',
       'Fried Chicken Joint', 'Discount Store', 'Smoothie Shop',
       'Sandwich Place', 'Gym', 'Bar', 'College Auditorium',
       'Japanese Restaurant', 'Food Truck', 'Middle Eastern Restaurant',
       'Gastropub', 'Church', 'Poke Place', 'BBQ Joint', 'Hotel',
       'American Restaurant', 'Thai Restaurant',
       'Ve

In [132]:
# one hot encoding
toronto_encoded = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_encoded['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_encoded.columns[-1]] + list(toronto_encoded.columns[:-1])
toronto_encoded = toronto_encoded[fixed_columns]

toronto_encoded.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [134]:
toronto_grouped = toronto_encoded.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.066667,0.066667,0.133333,0.2,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.015385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015385,0.0,0.0,0.015385,0.0,0.0


In [135]:
num_top_venues = 5

for neighborhood in toronto_grouped['Neighborhood']:
    print(neighborhood, ":")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == neighborhood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print()

Berczy Park :
          venue  freq
0   Coffee Shop  0.09
1  Cocktail Bar  0.05
2   Cheese Shop  0.03
3        Bakery  0.03
4    Restaurant  0.03

Brockton, Parkdale Village, Exhibition Place :
                    venue  freq
0                    Café  0.14
1             Coffee Shop  0.09
2          Breakfast Spot  0.09
3                     Gym  0.05
4  Furniture / Home Store  0.05

Business reply mail Processing Centre, South Central Letter Processing Plant Toronto :
                venue  freq
0  Light Rail Station  0.11
1       Garden Center  0.05
2    Recording Studio  0.05
3             Brewery  0.05
4          Smoke Shop  0.05

CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport :
              venue  freq
0   Airport Service  0.20
1    Airport Lounge  0.13
2          Boutique  0.07
3   Harbor / Marina  0.07
4  Sculpture Garden  0.07

Central Bay Street :
                 venue  freq
0          Coffee Shop  0.17
1       Sand

In [136]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)   
    return row_categories_sorted.index.values[0:num_top_venues]

In [143]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Café,Beer Bar,Restaurant,Cheese Shop,Seafood Restaurant,Bistro,Beach
1,"Brockton, Parkdale Village, Exhibition Place",Café,Coffee Shop,Breakfast Spot,Pet Store,Stadium,Burrito Place,Restaurant,Climbing Gym,Gym,Bakery
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Auto Workshop,Brewery,Skate Park,Smoke Shop,Farmers Market,Spa,Fast Food Restaurant,Burrito Place,Restaurant
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Coffee Shop,Harbor / Marina,Rental Car Location,Sculpture Garden,Boutique,Bar,Boat or Ferry,Airport Terminal
4,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Japanese Restaurant,Thai Restaurant,Burger Joint,Bar,Salad Place,Bubble Tea Shop


In [144]:
from sklearn.cluster import KMeans

In [145]:
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

array([0, 0, 7, 6, 0, 0, 0, 0, 0, 7, 8, 0, 5, 0, 0, 7, 0, 4, 0, 1, 0, 0,
       0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0])

In [146]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [147]:
toronto_merged = data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,index,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Coffee Shop,Park,Bakery,Pub,Breakfast Spot,Café,Theater,Cosmetics Shop,Shoe Store,Restaurant
1,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0.0,Coffee Shop,Diner,Sushi Restaurant,Gym,Park,Mexican Restaurant,Italian Restaurant,Hobby Shop,Fried Chicken Joint,Distribution Center
2,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,,,,,,,,,,,
3,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0.0,Coffee Shop,Café,Gastropub,Cocktail Bar,American Restaurant,Restaurant,Gym,Hotel,Creperie,Lingerie Store
4,19,M4E,East Toronto,The Beaches,43.676357,-79.293031,9.0,Asian Restaurant,Pub,Health Food Store,Trail,Distribution Center,Department Store,Dessert Shop,Diner,Discount Store,Doner Restaurant


In [153]:
toronto_merged.dropna(subset=['Cluster Labels'], inplace=True)

In [157]:
toronto_merged['Cluster Labels']=toronto_merged['Cluster Labels'].astype(int)

In [158]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [159]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters