In [None]:
import os

import pandas as pd
#!pip install geocoder
#import geocoder
import requests
import folium

# Importing from the web
Import the data from the Wikipedia table at https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

I use `pd.read_html` to transform the HTML table into a dataframe.
The content of each cell is then transferred to the mapping function `getCellData` that splits the postal code, the borough and the neighbourhood name. An intermediate list (array) of dictionaries is used to create the final dataframe.

The dataframe is then cleaned by getting rid of the rows that contain undefined boroughs. 

In [None]:
origin_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Pandas can already parse the HTML tables found in a web page. We do not need
# this information organised as a dataframe yet, but we can use the dataframe
# to parse the table and get all the postal codes.
# We flatten the table and apply a function to every cell to clean its content.
# Postal codes are composed of three characters, which correspond to
# the first three characters of each cell.

# Cell processing function
def getCellData(x):
  """Split one raw table cell into postal code, borough and neighbourhood.

  The first three characters of the stripped text are the postal code. If
  the remainder contains a "(...)" part, the text before "(" is the borough
  and the text inside the parentheses is the neighbourhood list, with "/"
  separators turned into commas. Otherwise the whole remainder is the
  borough and no 'Neighbourhood' key is set.

  Parameters
  ----------
  x : str
    Raw cell text, e.g. "M5ADowntown Toronto(Regent Park / Harbourfront)".

  Returns
  -------
  dict
    Always holds 'postalCode' and 'Borough'; holds 'Neighbourhood' only
    when the cell has a well-formed "(...)" part.
  """
  x = x.strip()
  r = {'postalCode': x[0:3]}
  rest = x[3:]
  # Look for "(" first and only then for a ")" that closes it, so that a
  # stray ")" without "(" falls through to the borough-only branch instead
  # of raising ValueError.
  start = rest.find('(')
  end = rest.find(')', start + 1) if start > -1 else -1
  if start > -1 and end > -1:
    r['Borough'] = rest[:start].strip()
    r['Neighbourhood'] = rest[start+1:end].replace('/', ',').replace(' ,', ',').strip()
  else:
    r['Borough'] = rest.strip()
  return r

# From web to pandas, applying the cell-processing function to every cell.
# (Newer pandas releases deprecate `applymap` in favour of `DataFrame.map`;
# `applymap` is kept so the cell keeps working on older installs.)
postal_codes = (
  pd.read_html(origin_url)[0]
    .applymap(getCellData)
    .stack()
    .values
)

# List of dicts to dataframe
postal_codes = pd.DataFrame.from_records(postal_codes)

# Drop rows with unassigned borough. The copy makes the filtered frame
# independent, so the assignment below does not write into a slice.
postal_codes = postal_codes[ postal_codes['Borough'] != 'Not assigned' ].copy()

# Set the unassigned neighbourhood to be the same as the borough.
# A `.loc[...] = <DataFrame>` assignment aligns on column labels, so copying a
# 'Borough' frame into the 'Neighbourhood' column would write NaN; `fillna`
# with the Borough series aligns on the row index only.
postal_codes['Neighbourhood'] = postal_codes['Neighbourhood'].fillna(postal_codes['Borough'])

# Shape & show
print("Shape:", postal_codes.shape)
postal_codes.head()

Shape: (103, 3)


Unnamed: 0,postalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government


# Getting the geolocation code

The lat/lon location of each postal code is obtained from the geocoder module (failed) or the CSV (success).

In [None]:
# Iterate each postal code and ask the geocoder service for its coordinates.
# NOTE: this cell needs the `geocoder` package, whose import in the first cell
# is commented out; uncomment it before running this cell.
MAX_GEOCODER_ATTEMPTS = 5  # stop asking for a postal code after this many empty answers
locations = []
for pc in postal_codes['postalCode']:
  loc = None
  print("Getting location for postal code {}...".format(pc))
  for _ in range(MAX_GEOCODER_ATTEMPTS):
    g = geocoder.google('{}, Toronto, Ontario'.format(pc))
    loc = g.latlng
    if loc:
      break
  if loc:
    locations.append({"lat": loc[0], "lon": loc[1]})
  else:
    # Keep one entry per postal code so `locations` stays aligned with the dataframe rows
    locations.append({"lat": None, "lon": None})

## Since geocoder did not work for me I use the next cell

In [None]:
#!wget "http://cocl.us/Geospatial_data" -O pre_loc.csv
# Read the coordinates CSV and rename its key column to the name used in `postal_codes`
pre_loc = (
  pd.read_csv('pre_loc.csv')
    .rename(columns={'Postal Code': 'postalCode'})
)
pre_loc.head()

Unnamed: 0,postalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Producing the dataframe with the list of neighbourhoods and their location

The two dataframes are merged using the values of the column "postalCode" as the matching key.

In [None]:
# Nice!
# Now, we merge the two dataframes on their shared 'postalCode' column.
# `postal_codes` is left untouched (it used to be overwritten with None here)
# so that this cell can be run again without redoing the scraping cell.
dataFrame = postal_codes.merge(pre_loc, on='postalCode')
dataFrame.head()

Unnamed: 0,postalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# Setting up FourSquare API

In [None]:
# /!\ /!\ /!\ /!\ /!\ 
# HIDE THIS CELL
# /!\ /!\ /!\ /!\ /!\ 
# The key is taken from the FOURSQUARE_APIKEY environment variable so that the
# secret does not have to be typed into (or saved with) the notebook. The
# placeholder string is only a fallback and is not a working key.
FOURSQUARE_APIKEY = os.environ.get('FOURSQUARE_APIKEY', ' -- HIDDEN KEY --')

In [None]:
# Function to ease queries to the FourSquare API
# Note this is for FourSquare API v3, and needs the API key sent via header 
# instead of user id and user secret sent via GET parameter.
def foursquare_request(endp, id, query=None):
  """Send a GET request to the FourSquare v3 API and return the decoded JSON.

  Parameters
  ----------
  endp : str
    First path segment after ``/v3/`` (e.g. ``'places'``).
  id : str
    Second path segment (e.g. ``'search'``).
  query : dict, optional
    Query-string parameters. Omitted or empty means no parameters.

  Returns
  -------
  The response body as decoded by ``Response.json()``.

  Raises
  ------
  requests.HTTPError
    When the API answers with an error status. Raising here keeps an error
    body from being handed back (and stored by the caching wrapper) as if
    it were a valid result.
  """
  url = 'https://api.foursquare.com/v3/'+endp+'/'+id
  r = requests.get(
    url,
    # Let requests build the query string so values such as "lat,lon" or
    # free-text searches are URL-encoded.
    params=query or {},
    headers={"accept": "application/json", "Authorization": FOURSQUARE_APIKEY},
    timeout=30)  # requests waits indefinitely unless a timeout is given
  r.raise_for_status()
  return r.json()

# I will store requested data in my local drive in order to reduce
# future identical requests to the API
from os.path import exists
import json
def cache_foursquare_request(endp, id, query):
  """Return the FourSquare response for a request, using an on-disk JSON cache.

  The cache file lives under ``./drive/MyDrive/$TMP/`` and its name is built
  from the endpoint, the id and the query items in their dict order, so the
  same parameters given in a different order map to a different file.

  Parameters
  ----------
  endp, id : str
    Path segments forwarded to ``foursquare_request``.
  query : dict
    Query parameters forwarded to ``foursquare_request``.

  Returns
  -------
  The decoded JSON, read from the cache file when it exists, otherwise
  fetched through ``foursquare_request`` and then written to the cache.
  """
  cache_dir = './drive/MyDrive/$TMP/'
  fname = 'FOURSQUARE$'+endp+'-'+id+'?'
  for k, v in query.items(): fname += '&'+str(k)+'='+str(v)
  fname = cache_dir+fname

  if exists(fname):
    with open(fname, encoding='utf-8') as f:
      return json.load(f)

  data = foursquare_request(endp, id, query)
  # Create the cache folder on first use; otherwise the write below fails
  # with FileNotFoundError on a fresh drive.
  os.makedirs(cache_dir, exist_ok=True)
  with open(fname, 'w', encoding='utf-8') as outfile:
    json.dump(data, outfile)
  return data

In [None]:
# Try the helper out: a handful of restaurants near one of my favourite locations
cache_foursquare_request(
  'places',
  'search',
  {'query': 'restaurant', 'll': '40.9726091,-5.6698419', 'radius': 500, 'limit': 5},
)

With the API tools set up, we can continue by plotting the Toronto neighbourhoods

We will define a function to plot the neighbourhoods from the dataframe, and will use the same function for the final result after classification.

In [None]:
color_groups = ['black', 'red', 'blue', 'yellow', 'green', 'purple'] # For later use
# I use a similar function to the one described along the course
def do_map(df):
  """Draw one circle marker per row of `df` on a folium map.

  Parameters
  ----------
  df : DataFrame
    Must provide the columns 'Latitude', 'Longitude', 'Neighbourhood'
    (used as popup text) and 'category' (integer index into
    `color_groups`; indices past the end of the list wrap around).

  Returns
  -------
  folium.Map
    Map centred on the midpoint of the bounding box of all coordinates.
  """
  if len(df) > 0:
    center = [
      (df['Latitude'].max() + df['Latitude'].min()) / 2,
      (df['Longitude'].max() + df['Longitude'].min()) / 2,
    ]
  else:
    # Same centre the former sentinel-based min/max loop gave for an empty frame
    center = [0.0, 0.0]

  # Named `fmap` so the builtin `map` is not shadowed
  fmap = folium.Map(location=center, zoom_start=11)
  for lat, lng, label, cat in zip(
      df['Latitude'],
      df['Longitude'],
      df['Neighbourhood'],
      df['category']):
    # Wrap the index so having more categories than colours reuses colours
    # instead of raising IndexError
    color = color_groups[cat % len(color_groups)]
    folium.CircleMarker(
      [lat, lng],
      radius=4,
      popup=folium.Popup(label, parse_html=True),
      color=color,
      fill=True,
      fill_color=color,
      fill_opacity=0.4,
      parse_html=False).add_to(fmap)
  return fmap

In [None]:
# Add the 'category' column to the dataset. Every row gets 0, which do_map
# turns into color_groups[0] ('black') for all markers.
dataFrame['category'] = 0
do_map(dataFrame)

Now we get the places from the FOURSQUARE API using a loop for each neighbourhood

In [None]:
# Now we will extract the closest (up to 50) nearby (<500 m) places from
# FourSquare, stored per postal code
results = {
  pc: cache_foursquare_request(
    'places',
    'search',
    {'ll': "{},{}".format(lat, lon), 'query': "", 'radius': 500, 'limit': 50},
  )['results']
  for pc, lat, lon in zip(dataFrame['postalCode'], dataFrame['Latitude'], dataFrame['Longitude'])
}

And extract all the possible categories from the results.

In [None]:
# Get full category list: number of appearances of every category name
# over all the places of all the postal codes
categories = {}
for cat in (c['name'] for nh in results.values() for p in nh for c in p['categories']):
  categories[cat] = categories.get(cat, 0)+1

# Now we transform the dictionary into a dataframe, most frequent category first
categories = pd.DataFrame(data=categories.items() ,columns=['Category', 'N']) \
  .sort_values(by=['N'], ascending=False)

# How many categories are left for a few minimum-frequency thresholds
print("all", categories.shape)
for threshold in (1, 2, 5):
  print(">{}".format(threshold), categories[categories['N']>threshold].shape)

all (466, 2)
>1 (348, 2)
>2 (287, 2)
>5 (197, 2)
>10 (197, 2)


We want to simplify categories. 466 are too many!

Since this is just an exercise, I will take those that repeat more than 5 times, and will include them in one of the following types:
  - Restaurants (restaurants of all kind)
  - Leissure (art galleries, bars, sports, parks, spas, cafés...)
  - Personal and health care (hospitals, hairdressers', etc.)
  - Provisions (groceries, bakeries, supermarkets, drugstores, butchers...)
  - Shops (clothes, electronics, malls...)
  - Other business

I export to a CSV and set categories externally in excel

In [None]:
# There are too many categories to label one by one.
# Since this is just an exercise, only those that repeat more than 5 times
# are kept; each of them is then given a coarse type by hand.
# The kept rows are exported to a CSV and the types are set externally in excel.
categories = categories.loc[categories['N']>5]
categories.to_csv('./drive/MyDrive/$TMP/categories.csv', sep=";")
# Link: https://drive.google.com/file/d/15rMxKlbxbogypEWao5dysJZklR7uCw6s

After some excel editing...

In [None]:
# Load the hand-edited table back; it now carries a 'Type' column
categories = pd.read_csv(
  './drive/MyDrive/$TMP/categories_types.csv',
  sep=";",
)
categories.head()

Unnamed: 0.1,Unnamed: 0,Category,N,Type
0,35,Restaurant,257,Restaurants
1,21,Fast Food Restaurant,141,Restaurants
2,5,Business and Professional Services,117,Other business
3,38,General Contractor,108,Other business
4,47,Coffee Shop,101,Leissure


In [None]:
# Insert one counter column per category type (named 'ct_<Type>') into the
# main dataFrame, holding for every postal code the number of places of that type.
types = categories['Type'].unique()

# Category name -> type lookup, built once instead of scanning `categories`
# for every category of every place. The first row wins when a category name
# is listed more than once.
category_type = {}
for name, kind in zip(categories['Category'], categories['Type']):
  category_type.setdefault(name, kind)

# Take into account that some places have several categories that can be
# of the same type, and they should not be accounted for more than once:
# collecting the types of a place in a set counts each type at most once.
type_counts = {t: [] for t in types}
for pc in dataFrame['postalCode']:
  tally = dict.fromkeys(types, 0)
  for p in results[pc]:
    place_types = {category_type[c['name']] for c in p['categories'] if c['name'] in category_type}
    for t in place_types: tally[t] += 1
  for t in types: type_counts[t].append(tally[t])

# Write each column with a single assignment rather than updating the frame
# cell by cell inside the loops
for t in types: dataFrame['ct_'+t] = type_counts[t]

dataFrame.head()

Unnamed: 0,postalCode,Borough,Neighbourhood,Latitude,Longitude,category,ct_Restaurants,ct_Other business,ct_Leissure,ct_Shops,ct_Personal and health care,ct_Supplies,ct_Shop
0,M3A,North York,Parkwoods,43.753259,-79.329656,0,0,11,2,1,0,2,0
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,4,31,4,4,1,1,0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,13,10,12,5,3,7,0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,0,2,11,3,23,3,1,0
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494,0,21,5,16,4,3,1,2


In [None]:
# Next step: classify the neighbourhoods by the number of places of each
# ct_* type they contain.
# The idea is to group neighbourhoods by the kind of services they mostly
# feature. For example, supermarkets should dominate in residential areas,
# leisure and restaurants are more typical near tourist areas, and other
# businesses in business-oriented neighbourhoods.
# The variables (X) are therefore the ct_* columns, each divided by its own
# maximum so that every column tops out at 1.
X = dataFrame[['ct_'+t for t in types]].apply(lambda counts: counts/max(counts))
X.head()

Unnamed: 0,ct_Restaurants,ct_Other business,ct_Leissure,ct_Shops,ct_Personal and health care,ct_Supplies,ct_Shop
0,0.0,0.34375,0.095238,0.038462,0.0,0.2,0.0
1,0.114286,0.96875,0.190476,0.153846,0.090909,0.1,0.0
2,0.371429,0.3125,0.571429,0.192308,0.272727,0.7,0.0
3,0.057143,0.34375,0.142857,0.884615,0.272727,0.1,0.0
4,0.6,0.15625,0.761905,0.153846,0.272727,0.1,0.5


In [None]:
# We perform the k-means clustering
from sklearn.cluster import KMeans
kclusters = 5

# random_state fixes the centroid seeding. n_init is passed explicitly because
# its default is not the same in every scikit-learn release, and a different
# number of initialisations can change the resulting labels.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(X)
kmeans.labels_[0:10] 

array([3, 1, 2, 4, 0, 3, 1, 1, 1, 2], dtype=int32)

In [None]:
# Now, we add kmeans.labels_ to the main dataframe.
# The labels are shifted by one for map plotting: cluster colours then start
# at color_groups[1], and index 0 ('black') stays with the unclustered map
# that was drawn with category 0.
dataFrame['category'] = kmeans.labels_ + 1

do_map(dataFrame)