In [222]:
import csv
import os

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

# from urllib.request import urlopen as uReq

---

## Get html code accessible for manipulation

In [223]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops an HTTP error page from being parsed as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML so individual elements can be looked up by tag.
soup = BeautifulSoup(source, 'lxml')

In [224]:
# Take the first <table> element of the parsed page.
table = soup.find('table')

# find() returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError in a later cell.
if table is None:
    raise ValueError('No <table> element found in the downloaded page; the page layout may have changed.')

---

## Write to csv file

In [225]:
# Write the scraped table to CSV, one row per (postal code, borough, neighbourhood).
# The <td> cells come back as one flat list, so they are regrouped three at a time.
# Cell text is written unmodified (including any trailing newline it carries).
with open('canadian_postal_codes_scrape.csv', 'w', newline='') as f:

    csv_file = csv.writer(f)
    csv_file.writerow(['Postal Code', 'Borough', 'Neighbourhood'])

    cells = [td.text for td in table.find_all('td')]

    # Step over complete triples only, so the final row of the table is written
    # too; an incomplete trailing group of one or two cells is ignored.
    for start in range(0, len(cells) - 2, 3):
        csv_file.writerow(cells[start:start + 3])
        

In [226]:
# Load the scraped CSV back into a DataFrame for cleaning.
pc = pd.read_csv('canadian_postal_codes_scrape.csv')
pc

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Downtown Toronto,Queen's Park\n
8,M8A,Not assigned,Not assigned\n
9,M9A,Queen's Park,Not assigned\n


---

## Cleaning the data and deleting unwanted rows

In [227]:
# Remove the newline characters that the scraped neighbourhood text carries.
pc['Neighbourhood'] = pc['Neighbourhood'].str.replace('\n', '')

# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = pc['Borough'].ne('Not assigned')
pc = pc[has_borough]
pc

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


---
### Create a dictionary that holds the combined Neighbourhood value for each postal code.
### Then update the dataframe's Neighbourhood column with the values from that dictionary.

In [228]:
# Build pCodes: postal code -> comma-separated neighbourhood names seen so far.
# Each row's Neighbourhood becomes the value accumulated up to and including that
# row, so the first row of a postal code keeps its own name and later rows carry
# the running combination (e.g. "Rouge", then "Rouge, Malvern").
#
# The new column is collected in a list and assigned in one step. Writing through
# pc.iloc[i]['Neighbourhood'] = ... is chained assignment on a temporary row and
# pandas does not guarantee that it reaches the DataFrame.
#
# NOTE(review): earlier rows of a repeated postal code are kept, so the frame
# still has several rows per postal code -- confirm that this is intended.
pCodes = {}
combined_names = []
for postal_code, neighbourhood_name in zip(pc['Postal Code'], pc['Neighbourhood']):
    if postal_code in pCodes:
        pCodes[postal_code] = pCodes[postal_code] + ', ' + neighbourhood_name
    else:
        pCodes[postal_code] = neighbourhood_name
    combined_names.append(pCodes[postal_code])

pc = pc.assign(Neighbourhood=combined_names)

# Change Neighbourhood column to be in accordance with the values in pCodes dictionary.
# i = 0
# while i < pc.shape[0]:
#     pc.iloc[i]['Neighbourhood'] = pCodes[pc.iloc[i]['Postal Code']]
#     i += 1

---
## Giving un-named Neighbourhoods the same name as their respective borough

In [229]:
# Rows whose neighbourhood is the placeholder 'Not assigned' take the name of
# their borough instead. This is done as one vectorised replacement that builds
# a new frame, rather than writing through pc.iloc[i][...], which is chained
# assignment and is not guaranteed to modify the DataFrame.
unnamed = pc['Neighbourhood'] == 'Not assigned'
pc = pc.assign(Neighbourhood=pc['Neighbourhood'].mask(unnamed, pc['Borough']))
pc

Not assigned


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,"Rouge, Malvern"
13,M3B,North York,Don Mills North


In [230]:
# Sanity check: (rows, columns) of the cleaned frame.
pc.shape

(210, 3)

---
## Read in latitude / longitude data from another .csv file.  Merge multiple dataframes on their shared column.

In [231]:
# Coordinates are read from a separate local CSV file.
gs = pd.read_csv('Geospatial_Coordinates.csv')

# Join the coordinate columns onto the postal-code frame via the shared
# 'Postal Code' column (pandas' default join type is used).
pc = pc.merge(gs, on='Postal Code')
pc

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
6,M9A,Queen's Park,Queen's Park,43.667856,-79.532242
7,M1B,Scarborough,Rouge,43.806686,-79.194353
8,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188


---
# Create a map of Toronto with neighborhoods superimposed on top.

In [232]:
# Base map centred on Toronto.
map_toronto = folium.Map(location=[43.65, -79.4], zoom_start=10)

# Draw one circle marker per row of pc; its popup reads "<neighbourhood>, <borough>".
for lat, lng, borough, neighbourhood in zip(pc['Latitude'], pc['Longitude'], pc['Borough'], pc['Neighbourhood']):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Make a subset dataframe that contains only Boroughs with the word Toronto in them.

In [233]:
# Select the rows whose borough name contains "Toronto" and renumber them from 0.
in_toronto = pc['Borough'].str.contains('Toronto')
toronto_data = pc[in_toronto].reset_index(drop=True)
toronto_data

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
5,M4E,East Toronto,The Beaches,43.676357,-79.293031
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M5H,Downtown Toronto,Adelaide,43.650571,-79.384568


In [234]:
# Base map centred on Toronto, zoomed in one level further than the first map.
map_t = folium.Map(location=[43.65, -79.4], zoom_start=11)

# Draw one circle marker per row of toronto_data; its popup is the neighbourhood name.
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_t)

map_t

In [235]:
# Foursquare credentials are read from the environment so that they are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE(review): the values that used to be hard-coded here have been exposed in
# the notebook and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Say so up front when a credential is missing, rather than leaving the
# problem to surface later as an API error.
if not CLIENT_ID or not CLIENT_SECRET:
    print('Foursquare credentials not found: set FOURSQUARE_CLIENT_ID and '
          'FOURSQUARE_CLIENT_SECRET before running the API cells.')

In [238]:
# Take the coordinates and the name from the first row of toronto_data.
neighbourhood_latitude = toronto_data.at[0, 'Latitude']
neighbourhood_longitude = toronto_data.at[0, 'Longitude']
neighbourhood_name = toronto_data.at[0, 'Neighbourhood']

summary = 'Latitude and longitude values of {} are {}, {}.'.format(
    neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude)
print(summary)

Latitude and longitude values of Harbourfront are 43.6542599, -79.3606359.


In [239]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# Build the "explore" request URL for the selected neighbourhood's coordinates.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighbourhood_latitude,
    neighbourhood_longitude,
    radius,
    LIMIT)

# Display the URL with the client secret masked, so the secret is not saved in
# the notebook output. (An empty secret is skipped: replacing '' would insert
# the mask between every character.)
url.replace(CLIENT_SECRET, '<CLIENT_SECRET>') if CLIENT_SECRET else url

'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

In [240]:
# Send the request. The timeout keeps the cell from hanging indefinitely, and
# raise_for_status() surfaces HTTP errors instead of decoding an error body.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

{'meta': {'code': 200, 'requestId': '5e3f1e9b9da7ee001b5eefd9'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 46,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

In [241]:
def get_category_type(row):
    """Return the name of the first category of a venue row, or None.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Looked up under the key 'categories' first; if that key is absent,
        under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the category value
    is empty or is not a list/tuple.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates.
        categories_list = row['venue.categories']

    # Treat a missing / non-list value the same as an empty list of categories.
    if not isinstance(categories_list, (list, tuple)) or not categories_list:
        return None
    return categories_list[0]['name']

In [242]:
# The venue entries are the items of the first group in the response.
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON and keep only the columns listed here.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues)[filtered_columns]

# Replace each row's category list with a single category name.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the last dotted component of every column name.
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Gym / Fitness Center,43.653191,-79.357947
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


In [243]:
# Report how many rows the flattened venue table has.
venue_count = len(nearby_venues)
print('{} venues were returned by Foursquare.'.format(venue_count))

46 venues were returned by Foursquare.


In [244]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One (name, latitude, longitude) triple per location to query.
    radius : int, default 500
        Passed to the API as the ``radius`` parameter.
    limit : int, optional
        Passed to the API as the ``limit`` parameter; defaults to the
        notebook-level ``LIMIT`` when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venues are returned.
        'Venue Category' is None for a venue that lists no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    if limit is None:
        limit = LIMIT

    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string from the parameters.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        response.raise_for_status()

        # A response without groups simply contributes no rows.
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for v in items:
            venue = v['venue']
            # Some venues carry no category; record None instead of failing.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names here keeps an empty result well-formed.
    return pd.DataFrame(rows, columns=columns)

In [245]:
# Collect the nearby venues for every row of toronto_data.
toronto_venues = getNearbyVenues(
    toronto_data['Neighbourhood'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
)

Harbourfront
Queen's Park
Ryerson
Ryerson, Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide
Adelaide, King
Adelaide, King, Richmond
Dovercourt Village
Dovercourt Village, Dufferin
Harbourfront East
Harbourfront East, Toronto Islands
Harbourfront East, Toronto Islands, Union Station
Little Portugal
Little Portugal, Trinity
The Danforth West
The Danforth West, Riverdale
Design Exchange
Design Exchange, Toronto Dominion Centre
Brockton
Brockton, Exhibition Place
Brockton, Exhibition Place, Parkdale Village
The Beaches West
The Beaches West, India Bazaar
Commerce Court
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill North, Forest Hill West
High Park
High Park, The Junction South
North Toronto West
The Annex
The Annex, North Midtown
The Annex, North Midtown, Yorkville
Parkdale
Parkdale, Roncesvalles
Davisville
Harbord
Harbord, University of Toronto
Runnymede
Runnymede, Swansea
Moo

---
## Putting toronto venues into a dataframe

In [246]:
# Print (rows, columns), then display the collected venues.
print(toronto_venues.shape)
toronto_venues

(3203, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.654260,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.654260,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.654260,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,43.654260,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.654260,-79.360636,Impact Kitchen,43.656369,-79.356980,Restaurant
5,Harbourfront,43.654260,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
6,Harbourfront,43.654260,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub
7,Harbourfront,43.654260,-79.360636,Corktown Common,43.655618,-79.356211,Park
8,Harbourfront,43.654260,-79.360636,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot
9,Harbourfront,43.654260,-79.360636,The Distillery Historic District,43.650244,-79.359323,Historic Site


---
## Group this dataframe by neighborhood

In [247]:
# Count the non-null entries of every column for each neighbourhood.
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
"Adelaide, King",100,100,100,100,100,100
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,55,55,55,55,55,55
Brockton,24,24,24,24,24,24
"Brockton, Exhibition Place",24,24,24,24,24,24
"Brockton, Exhibition Place, Parkdale Village",24,24,24,24,24,24
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17
CN Tower,16,16,16,16,16,16
"CN Tower, Bathurst Quay",16,16,16,16,16,16


In [248]:
# Report how many distinct values the 'Venue Category' column holds.
distinct_categories = pd.unique(toronto_venues['Venue Category'])
print('There are {} uniques categories.'.format(len(distinct_categories)))

There are 229 uniques categories.


In [249]:
# Turn 'Venue Category' into one indicator column per category; the column
# names are the bare category names (no prefix, no separator).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighbourhood of each venue; the new column lands at the end.
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# Rotate the last column to the front and keep the rest in their existing order.
all_columns = toronto_onehot.columns
fixed_columns = list(all_columns[-1:]) + list(all_columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [250]:
# Average the one-hot indicator columns per neighbourhood, i.e. the share of
# that neighbourhood's venue rows falling in each category.
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()

---
### Show top 5 venues for each neighborhood

In [251]:
num_top_venues = 5

# For every neighbourhood, print its most frequent venue categories.
for hood in toronto_grouped['Neighbourhood']:
    print("----" + hood + "----")

    # Transpose the neighbourhood's row into a two-column (venue, freq) table.
    is_hood = toronto_grouped['Neighbourhood'] == hood
    temp = toronto_grouped[is_hood].T.reset_index()
    temp.columns = ['venue', 'freq']

    # Skip the first entry (the neighbourhood name itself), then make the
    # frequencies numeric and round them to two decimals.
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})

    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Adelaide----
            venue  freq
0     Coffee Shop  0.07
1      Steakhouse  0.04
2             Bar  0.04
3            Café  0.04
4  Cosmetics Shop  0.03


----Adelaide, King----
            venue  freq
0     Coffee Shop  0.07
1      Steakhouse  0.04
2             Bar  0.04
3            Café  0.04
4  Cosmetics Shop  0.03


----Adelaide, King, Richmond----
            venue  freq
0     Coffee Shop  0.07
1      Steakhouse  0.04
2             Bar  0.04
3            Café  0.04
4  Cosmetics Shop  0.03


----Berczy Park----
                venue  freq
0         Coffee Shop  0.07
1        Cocktail Bar  0.05
2          Steakhouse  0.04
3  Seafood Restaurant  0.04
4         Cheese Shop  0.04


----Brockton----
            venue  freq
0  Breakfast Spot  0.08
1            Café  0.08
2     Coffee Shop  0.08
3       Nightclub  0.08
4             Gym  0.04


----Brockton, Exhibition Place----
            venue  freq
0  Breakfast Spot  0.08
1            Café  0.08
2     Coffee Shop  0.08
3    

         venue  freq
0  Coffee Shop  0.17
1         Park  0.07
2       Bakery  0.07
3          Pub  0.07
4         Café  0.07


----Harbourfront East----
                venue  freq
0         Coffee Shop  0.12
1            Aquarium  0.05
2               Hotel  0.04
3                Café  0.04
4  Italian Restaurant  0.04


----Harbourfront East, Toronto Islands----
                venue  freq
0         Coffee Shop  0.12
1            Aquarium  0.05
2               Hotel  0.04
3                Café  0.04
4  Italian Restaurant  0.04


----Harbourfront East, Toronto Islands, Union Station----
                venue  freq
0         Coffee Shop  0.12
1            Aquarium  0.05
2               Hotel  0.04
3                Café  0.04
4  Italian Restaurant  0.04


----High Park----
                venue  freq
0  Mexican Restaurant  0.08
1                Café  0.08
2     Thai Restaurant  0.08
3                 Bar  0.08
4         Music Venue  0.04


----High Park, The Junction South----
         

In [252]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, ignoring its first entry.

    ``row`` is a pandas Series whose first element is skipped. The remaining
    values are sorted in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

---
### Display the top 10 most common venues per neighborhood in the Toronto area.

In [254]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(n):
    """Return n with its English ordinal suffix, e.g. 1 -> '1st', 11 -> '11th', 22 -> '22nd'."""
    # 10-20 always take 'th' (11th, 12th, 13th); otherwise the last digit decides.
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# Column headers: 'Neighbourhood' followed by '1st Most Common Venue', '2nd ...', and so on.
columns = ['Neighbourhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank)) for rank in range(1, num_top_venues + 1)
]

# Start from an empty frame with those columns and copy the neighbourhood names in.
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# Fill every row with that neighbourhood's top categories, most frequent first.
for ind in range(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Bar,Steakhouse,Café,Restaurant,Cosmetics Shop,Asian Restaurant,Burger Joint,Breakfast Spot,Thai Restaurant
1,"Adelaide, King",Coffee Shop,Bar,Steakhouse,Café,Restaurant,Cosmetics Shop,Asian Restaurant,Burger Joint,Breakfast Spot,Thai Restaurant
2,"Adelaide, King, Richmond",Coffee Shop,Bar,Steakhouse,Café,Restaurant,Cosmetics Shop,Asian Restaurant,Burger Joint,Breakfast Spot,Thai Restaurant
3,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Seafood Restaurant,Bakery,Farmers Market,Steakhouse,Cheese Shop,Café,Bistro
4,Brockton,Breakfast Spot,Café,Nightclub,Coffee Shop,Yoga Studio,Gym,Pet Store,Performing Arts Venue,Office,Italian Restaurant
5,"Brockton, Exhibition Place",Breakfast Spot,Café,Nightclub,Coffee Shop,Yoga Studio,Gym,Pet Store,Performing Arts Venue,Office,Italian Restaurant
6,"Brockton, Exhibition Place, Parkdale Village",Breakfast Spot,Café,Nightclub,Coffee Shop,Yoga Studio,Gym,Pet Store,Performing Arts Venue,Office,Italian Restaurant
7,Business Reply Mail Processing Centre 969 Eastern,Skate Park,Recording Studio,Burrito Place,Fast Food Restaurant,Light Rail Station,Farmers Market,Auto Workshop,Restaurant,Spa,Pizza Place
8,CN Tower,Airport Service,Airport Terminal,Airport Lounge,Boat or Ferry,Sculpture Garden,Rental Car Location,Coffee Shop,Harbor / Marina,Airport Gate,Airport Food Court
9,"CN Tower, Bathurst Quay",Airport Service,Airport Terminal,Airport Lounge,Boat or Ferry,Sculpture Garden,Rental Car Location,Coffee Shop,Harbor / Marina,Airport Gate,Airport Food Court
