In [1]:
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colours
import numpy as np
import pandas as pd
import requests
import wikipedia as wp
from folium.plugins import MarkerCluster
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

#### create our dataframe of Postal Codes, Boroughs, Neighbourhoods, and Coordinates

##### read and scrape the table from Wikipedia

In [2]:
# Fetch the rendered Wikipedia page, then let pandas parse every <table> in it;
# the postal-code table is the first one on the page.
page = wp.page("List of postal codes of Canada: M")
html = page.html().encode("UTF-8")
tables = pd.read_html(html)
df = tables[0]

##### adjust the data in the dataframe to our specifications

In [3]:
# Keep only postal codes that have a borough assigned.
df = df.loc[df['Borough'] != 'Not assigned']

# Collapse to one row per (Postcode, Borough), listing its neighbourhoods
# as a single comma-separated string.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

##### add latitude and longitude to the dataframe

In [4]:
# Attach coordinates by matching on the postal code itself. Joining on row
# position only works while the CSV happens to list exactly the same postal
# codes in exactly the same order as the grouped dataframe; any mismatch would
# silently assign the wrong coordinates.
latlng = pd.read_csv('Geospatial_Coordinates.csv')
df = (
    df.merge(latlng, how='left', left_on='Postcode', right_on='Postal Code')
    .drop(columns='Postal Code')  # duplicate of 'Postcode' after the merge
)

#### cluster the neighbourhoods of Toronto proper and map them

##### slice the original dataframe and create a new dataframe of the Toronto proper data

In [5]:
# Boroughs that make up Toronto proper are relabelled to a single 'Toronto';
# every other borough is removed from the working dataframe.
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
other_boroughs = ["Queen's Park", 'Scarborough', 'North York', 'East York',
                  'York', 'Etobicoke', 'Mississauga']

toronto_data = df.replace(to_replace=toronto_boroughs, value='Toronto')
toronto_data = (
    toronto_data[~toronto_data['Borough'].isin(other_boroughs)]
    .reset_index(drop=True)
)

##### set number of clusters

In [6]:
# Number of k-means clusters to fit.
kclusters = 4

# Cluster on coordinates only. The columns are selected explicitly (rather than
# by dropping the label columns) so that re-running this cell after 'Cluster'
# and 'Radius' have been added to toronto_data does not feed them back in as
# features. This also avoids the positional `axis` argument of DataFrame.drop,
# which newer pandas releases no longer accept.
toronto_grouped_clust = toronto_data[['Latitude', 'Longitude']].copy()

##### run k-means clustering

In [7]:
# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(toronto_grouped_clust)

##### check cluster labels generated for each row in the dataframe

In [8]:
# Preview the cluster label assigned to the first ten rows.
kmeans.labels_[:10]

array([0, 0, 0, 0, 2, 2, 2, 2, 2, 2])

##### add clustering labels

In [9]:
# DataFrame.insert raises if the column already exists, so drop any stale
# 'Cluster' column first; this keeps the cell safe to re-run.
if 'Cluster' in toronto_data.columns:
    toronto_data = toronto_data.drop(columns='Cluster')
toronto_data.insert(0, 'Cluster', kmeans.labels_)

##### add a radius size to each neighbourhood depending on cluster

##### the radius size (in metres) was determined in Excel through a rough calculation of the median neighbourhood size per cluster

In [10]:
# Search radius in metres for each cluster label.
# NOTE(review): the label -> radius pairing relies on k-means numbering the
# clusters the same way on every run; confirm after changing kclusters,
# random_state, or the input data.
cluster_radius_m = {0: 1000, 1: 600, 2: 900, 3: 950}
toronto_data['Radius'] = toronto_data['Cluster'].replace(cluster_radius_m)

#### get venue names, locations, and categories from the Foursquare API

##### define Foursquare credentials and version

In [11]:
# Credentials are read from the environment so that secrets are never stored in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. Any key pair that was previously committed in this
# notebook should be treated as leaked and rotated.
client_id = os.environ.get('FOURSQUARE_CLIENT_ID', '')
client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
version = '20180605'  # Foursquare API version date sent as the `v` parameter

if not (client_id and client_secret):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

##### create a function to build our dataframe from the Foursquare data

In [12]:
def get_venues(names, latitudes, longitudes, radii, limit=100):
    """Query the Foursquare "explore" endpoint around each neighbourhood.

    Parameters
    ----------
    names, latitudes, longitudes, radii : iterables of equal length
        One entry per neighbourhood: its label, its centre coordinates and the
        search radius passed to the API as `radius`.
    limit : int, default 100
        Maximum number of venues requested per neighbourhood.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. rejected credentials).

    Notes
    -----
    Uses the module-level `client_id`, `client_secret` and `version`.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng, radius in zip(names, latitudes, longitudes, radii):
        # let requests build and escape the query string
        params = {
            'client_id': client_id,
            'client_secret': client_secret,
            'v': version,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # a timeout stops one stalled request from hanging the whole notebook;
        # raise_for_status surfaces API errors instead of a confusing KeyError
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # some venues carry no category; record them as missing rather
            # than failing on categories[0]
            category = categories[0]['name'] if categories else None
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         category))

    # passing `columns` up front keeps the schema intact for an empty result
    return pd.DataFrame(rows, columns=columns)

##### pull up to 100 venues within each neighbourhood's cluster radius

In [13]:
# One Foursquare query per neighbourhood, using that neighbourhood's
# cluster-specific search radius.
toronto_venues = get_venues(
    toronto_data['Neighbourhood'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Radius'],
)

##### filter out all venues that aren't from our list of categories

In [14]:
# Foursquare category names that count as a bar / nightlife venue here.
bars = [
    'Beer Bar', 'Bar', 'Cocktail Bar', 'Dive Bar', 'Gastropub', 'Gay Bar',
    'Hotel Bar', 'Irish Pub', 'Jazz Club', 'Lounge', 'Nightclub',
    'Other Nightlife', 'Piano Bar', 'Pub', 'Rock Club', 'Sake Bar',
    'Speakeasy', 'Sports Bar', 'Whisky Bar', 'Wine Bar',
]

# Keep only venues in one of those categories and renumber the rows.
is_bar = toronto_venues['Venue Category'].isin(bars)
toronto_venues = toronto_venues[is_bar].reset_index(drop=True)

##### save the dataframe as a .csv

In [15]:
# Look up each venue's postal code from its neighbourhood coordinates.
# Matching on latitude AND longitude identifies the neighbourhood centre
# unambiguously; latitude alone would duplicate venue rows if two postal codes
# ever shared the same latitude.
latlng = pd.read_csv('Geospatial_Coordinates.csv').rename(
    columns={'Latitude': 'Neighbourhood Latitude',
             'Longitude': 'Neighbourhood Longitude'}
)
toronto_venues = (
    toronto_venues
    # drop a 'Postal Code' left over from a previous run so the cell can be
    # re-executed without producing 'Postal Code_x' / 'Postal Code_y'
    .drop(columns='Postal Code', errors='ignore')
    .merge(latlng, on=['Neighbourhood Latitude', 'Neighbourhood Longitude'])
)

toronto_venues.to_csv('toronto_venues.csv')

##### count the number of venues per neighbourhood and per postal code

In [16]:
# Row counts per group: once keyed by neighbourhood label, once by postal code.
venue_count_nhood, venue_count_pcode = (
    toronto_venues.groupby(key).size() for key in ('Neighbourhood', 'Postal Code')
)
print(venue_count_nhood)

Neighbourhood
Adelaide, King, Richmond                                                                                      11
Berczy Park                                                                                                   14
Brockton, Exhibition Place, Parkdale Village                                                                  11
Business Reply Mail Processing Centre 969 Eastern                                                              2
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara     1
Cabbagetown, St. James Town                                                                                    5
Central Bay Street                                                                                             4
Chinatown, Grange Park, Kensington Market                                                                     10
Christie                                                                          

#### plot the data on a map

##### create the venue map

In [17]:
# Geocode the city centre to position both maps.
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="to_explorer")
# a longer timeout than geopy's short default makes the lookup less likely to
# fail on a slow response
location = geolocator.geocode(address, timeout=10)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

# Two independent base maps (venue markers and choropleth) plus the marker
# cluster layer that the venue markers are added to.
map_venues = folium.Map(location=[latitude, longitude], zoom_start=12)
map_choropleth = folium.Map(location=[latitude, longitude], zoom_start=12)
marker_cluster = MarkerCluster()

##### create the markers

In [18]:
# One marker per venue. The venue name was already being iterated but never
# used; attach it as a popup so each marker is identifiable on the map.
# parse_html=True escapes the name, so quotes or markup in it cannot break the
# generated page.
for lat, lon, venue in zip(toronto_venues['Venue Latitude'],
                           toronto_venues['Venue Longitude'],
                           toronto_venues['Venue']):
    marker_cluster.add_child(
        folium.Marker(location=[lat, lon],
                      popup=folium.Popup(str(venue), parse_html=True))
    )

##### map it

In [19]:
# Attach the clustered markers to the venue map, then render the map as the
# cell output.
marker_cluster.add_to(map_venues)
map_venues

##### this map displays the locations of bar venues in Toronto proper - it groups nearby venues together when zoomed out

#### plot a choropleth map

In [20]:
toronto_json = "./TorontoFSAs.geojson"

# Give folium an explicit two-column table (key column, value column). The
# previous columns=['Postal Code', ''] named a column that does not exist and
# only worked because the data was passed as a Series.
venue_counts = (
    venue_count_pcode
    .rename_axis('Postal Code')
    .rename('Venue Count')
    .reset_index()
)

choropleth = folium.Choropleth(
    geo_data=toronto_json,
    data=venue_counts,
    columns=['Postal Code', 'Venue Count'],
    # property of each GeoJSON feature that is matched against 'Postal Code'
    key_on='feature.properties.CFSAUID',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Number of Venues by Postal Code',
).add_to(map_choropleth)

map_choropleth

##### this choropleth map displays the number of venues in each individual postal code / neighbourhood