### Book 1 - create a dataframe of Toronto boroughs and neighbourhoods

Import pandas library

In [1]:
import pandas as pd

Scrape the Wikipedia page and load its tables into a list of dataframes called df_postcode

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list of DataFrames (one per table parsed from the page),
# so df_postcode is a list; later cells pick the table they need by index.
df_postcode = pd.read_html(url)

Number of tables found

In [3]:
len(df_postcode)

3

Show the appropriate dataframe

In [8]:
df_postcode[0]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Remove "Not assigned" rows from the dataframe where "Not assigned" appears in the column "Borough"

In [33]:
# Keep only the rows whose Borough is assigned. An exact comparison is used
# instead of str.contains, which does substring/regex matching and produces NaN
# for missing values (NaN cannot be negated with ~). The .copy() makes df an
# independent frame rather than a view-like slice of the scraped table.
df = df_postcode[0].loc[lambda table: table['Borough'].ne('Not assigned')].copy()

df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [34]:
df.shape

(103, 3)

### Book 2 - merge latitude and longitude to Toronto dataframe

In [None]:
# Load the table of postal-code coordinates from the remote CSV file
df_latlon = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')

df_latlon

In [39]:
# Inner-join the borough table with the coordinate table on the shared
# 'Postal Code' column.
df2 = df.merge(df_latlon, how='inner', on='Postal Code')
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Book 3 - explore and cluster neighbourhoods in Toronto

Import and install appropriate libraries

In [40]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library



  current version: 4.8.3
  latest version: 4.9.1

Please update conda by running

    $ conda update -n base -c defaults conda




Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\James\anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.9.0                |   py38h9bdc248_1         3.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.0.0                |     pyh9f0ad1d_0          63 KB  conda-forge
    python_abi-3.8             |           1_cp38           4 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-2.0.0-pyh9f0ad1d_0
  python_abi 

Find the latitude and longitude of Toronto

In [41]:
address = 'Toronto'

# Look the address up through the Nominatim geocoder
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; stop here with a clear message
    # instead of an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Produce a map with Toronto neighbourhoods added

In [43]:
# Base map centred on the Toronto coordinates looked up earlier
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of df2, with a "<neighbourhood>, <borough>" popup
marker_fields = zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighbourhood'])
for lat, lng, borough, neighborhood in marker_fields:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto



Define FourSquare credentials 

In [46]:
import os

# Read the Foursquare credentials from the environment so real keys never need
# to be saved in the notebook; the original placeholder strings stay as fallback.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'FourSquare ID - Restricted')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'FourSquare Secret - Restricted')
VERSION = '20180605'  # sent as the `v` query parameter of each request
LIMIT = 100  # sent as the `limit` query parameter of each request

Function that extracts the category of the venue from FourSquare

In [47]:

def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping-like
        Record indexable by ``'categories'`` or, when that key is missing, by
        ``'venue.categories'``. The value is a sequence of dicts that each
        carry a ``'name'`` entry.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the sequence is empty.

    Raises
    ------
    KeyError
        If the record has neither key.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other failure is a real
        # error and is no longer swallowed by a bare except.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [48]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; one API request is made per (name, lat, lng) triple.
    radius : int, default 500
        Sent as the ``radius`` query parameter of each request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. A venue whose
        category list is empty gets ``None`` as its category. With no input
        locations the frame is empty but still has all seven columns.

    Notes
    -----
    Reads CLIENT_ID, CLIENT_SECRET, VERSION, LIMIT and ``requests`` from module
    scope, and prints each name as it is processed.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # guard against venues that come back without any category
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the frame well-formed
    # even when no rows were collected.
    return pd.DataFrame(rows, columns=columns)
    


Create a new dataframe with venues in the different neighbourhoods

In [139]:
import requests # library to handle requests
from pandas.io.json import json_normalize

# Borough names are passed as the labels, so the 'Neighborhood' column of the
# result holds boroughs.
toronto_venues = getNearbyVenues(
    names=df2['Borough'],
    latitudes=df2['Latitude'],
    longitudes=df2['Longitude'],
)

North York
North York
Downtown Toronto
North York
Downtown Toronto
Etobicoke
Scarborough
North York
East York
Downtown Toronto
North York
Etobicoke
Scarborough
North York
East York
Downtown Toronto
York
Etobicoke
Scarborough
East Toronto
Downtown Toronto
York
Scarborough
East York
Downtown Toronto
Downtown Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
North York
North York
Scarborough
North York
North York
East Toronto
North York
York
North York
Scarborough
North York
North York
Central Toronto
Central Toronto
York
York
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Etobicoke
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Mississauga
Etobicoke
Scarborough
Central Toronto
Downtown Toronto
West Toron

In [140]:
print(toronto_venues.shape)
toronto_venues.head()

(2139, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,North York,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,North York,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,North York,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,North York,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
4,North York,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [141]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,104,104,104,104,104,104
Downtown Toronto,1248,1248,1248,1248,1248,1248
East Toronto,119,119,119,119,119,119
East York,79,79,79,79,79,79
Etobicoke,74,74,74,74,74,74
Mississauga,13,13,13,13,13,13
North York,240,240,240,240,240,240
Scarborough,89,89,89,89,89,89
West Toronto,153,153,153,153,153,153
York,20,20,20,20,20,20


In [152]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 273 uniques categories.


One-hot encode

In [154]:
# One indicator column per distinct value of 'Venue'; the empty prefix/prefix_sep
# keep the raw values as column names.
# NOTE(review): this encodes venue names, not 'Venue Category' - confirm that
# clustering on individual venue names is what was intended.
toronto2_onehot = pd.get_dummies(toronto_venues[['Venue']], prefix="", prefix_sep="")

toronto2_onehot

Unnamed: 0,(The New) Moksha Yoga Bloor West,195 Jane Rocket,2 Brothers shawarma,241 Pizza,306 Yonge Street - Jordan Store,360 Restaurant,401 Games,7 Numbers,7 West Cafe,7-Eleven,...,famous last words,freshii,iQ Food Co,iQ Food Co. (First Canadian Place),jane sheppard mall,lululemon,lululemon athletica,nodo,souvlaki express,金城超級市場 Lucky Moose Food Mart
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2134,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Add borough column back into the dataframe as neighborhood

In [157]:
#Add borough column back into the onehot dataframe
toronto2_onehot['Neighborhood']= toronto_venues['Neighborhood']

toronto2_onehot.head()

Unnamed: 0,(The New) Moksha Yoga Bloor West,195 Jane Rocket,2 Brothers shawarma,241 Pizza,306 Yonge Street - Jordan Store,360 Restaurant,401 Games,7 Numbers,7 West Cafe,7-Eleven,...,freshii,iQ Food Co,iQ Food Co. (First Canadian Place),jane sheppard mall,lululemon,lululemon athletica,nodo,souvlaki express,金城超級市場 Lucky Moose Food Mart,Neighborhood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York


Group data together and average based on neighborhood

In [158]:
toronto_grouped = toronto2_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,(The New) Moksha Yoga Bloor West,195 Jane Rocket,2 Brothers shawarma,241 Pizza,306 Yonge Street - Jordan Store,360 Restaurant,401 Games,7 Numbers,7 West Cafe,...,famous last words,freshii,iQ Food Co,iQ Food Co. (First Canadian Place),jane sheppard mall,lululemon,lululemon athletica,nodo,souvlaki express,金城超級市場 Lucky Moose Food Mart
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.009615,0.0,0.0,0.009615,0.0
1,Downtown Toronto,0.0,0.0,0.0,0.000801,0.000801,0.000801,0.001603,0.0,0.000801,...,0.0,0.000801,0.004006,0.002404,0.0,0.0,0.001603,0.0,0.0,0.000801
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008403,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.004167,0.0,0.0,0.0,0.0,0.0
7,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,West Toronto,0.006536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006536,0.0,0.0,0.0,0.0,0.0,0.0,0.006536,0.0,0.0
9,York,0.0,0.05,0.05,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
toronto_grouped.shape

(10, 1405)



Create a function to return the most common venues found in each neighbourhood then use it to generate a new dataframe containing the top 10 venues

In [160]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a grouped row.

    The first element of ``row`` is skipped, the remaining values are ranked in
    descending order, and the index labels of the first ``num_top_venues``
    entries are returned as an array.
    """
    return (
        row.iloc[1:]
        .sort_values(ascending=False)
        .index.values[:num_top_venues]
    )

In [161]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Build the column headers: 'Neighborhood' followed by one ordinal column per
# rank. Ranks beyond the entries of `indicators` get the generic 'th' suffix; an
# explicit bounds check replaces the bare except around the list lookup.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per neighbourhood of toronto_grouped
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# Fill the ranked venue labels row by row; the built-in range() keeps this cell
# free of a numpy dependency.
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Subway,Starbucks,Tim Hortons,Second Cup,Pizzaiolo,LCBO,Shoppers Drug Mart,Crossfit Metric,Uncle Betty's Diner,Starving Artist
1,Downtown Toronto,Starbucks,Tim Hortons,Pilot Coffee Roasters,Buster's Sea Cove,John & Sons Oyster House,Balzac's Coffee,Dineen Coffee,Winners,Equinox Bay Street,Beerbistro
2,East Toronto,Starbucks,LCBO,Cafe Frappe,Bonjour Brioche,Queen Books,Purple Penguin Cafe,Moksha Yoga Danforth,Subway,IQ Living,IL FORNELLO on Danforth
3,East York,The Beer Store,East York Memorial Arena,LCBO,Shoppers Drug Mart,Subway,Tim Hortons,Pizza Pizza,TD Canada Trust,Danforth & Jones,Stan Wadlow Park
4,Etobicoke,Subway,Shoppers Drug Mart,McDonald's,LCBO,Pizza Hut,Dollarama,Pizza Pizza,Starbucks,The Beer Store,Popeyes Louisiana Kitchen


Fit data to a k-means cluster algorithm

In [162]:
# set number of clusters
kclusters = 5

# Feature matrix: every column except the 'Neighborhood' label. The keyword form
# is used because pandas 2.0 no longer accepts `axis` as a positional argument
# of drop().
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering with a fixed random_state
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 4, 2, 1, 0, 0, 0, 3])

Add cluster labels and create new merged dataframe with latitude and longitude dataframe created earlier (df2)

In [171]:
# add clustering labels; remove a previous copy first so that re-running this
# cell does not fail because the 'Cluster Labels' column already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and ranked venues to every row of df2 by matching
# df2's 'Borough' against the 'Neighborhood' column of the ranked table
toronto_merged = df2.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Borough')

In [172]:
toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,Tim Hortons,Subway,Starbucks,TD Canada Trust,Shoppers Drug Mart,Aroma Espresso Bar,LCBO,Pizza Pizza,Price Chopper,Dairy Queen
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Tim Hortons,Subway,Starbucks,TD Canada Trust,Shoppers Drug Mart,Aroma Espresso Bar,LCBO,Pizza Pizza,Price Chopper,Dairy Queen
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Starbucks,Tim Hortons,Pilot Coffee Roasters,Buster's Sea Cove,John & Sons Oyster House,Balzac's Coffee,Dineen Coffee,Winners,Equinox Bay Street,Beerbistro
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,Tim Hortons,Subway,Starbucks,TD Canada Trust,Shoppers Drug Mart,Aroma Espresso Bar,LCBO,Pizza Pizza,Price Chopper,Dairy Queen
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Starbucks,Tim Hortons,Pilot Coffee Roasters,Buster's Sea Cove,John & Sons Oyster House,Balzac's Coffee,Dineen Coffee,Winners,Equinox Bay Street,Beerbistro


In [174]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
color_steps = max(kclusters - 1, 1)  # avoids dividing by zero with a single cluster
rainbow = [colors.rgb2hex(cm.rainbow(i / color_steps)) for i in range(kclusters)]

# Rows whose borough matched no clustered row carry NaN after the join and
# cannot index the colour list, so they are left off the map.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Borough'], clustered['Cluster Labels']):
    cluster = int(cluster)  # labels become floats if the column ever held NaN
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # index by the label itself; `cluster-1` sent label 0 to the last
        # colour through negative-index wraparound
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters