In [60]:
import os

import pandas as pd
#!pip install geocoder
#import geocoder
import requests
import folium

# Importing from the web
Import the data from the Wikipedia table at https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

I use `pd.read_html` to transform the HTML table into a dataframe.
The content of each cell is then transferred to the mapping function `getCellData` that splits the postal code, the borough and the neighbourhood name. An intermediate list (array) of dictionaries is used to create the final dataframe.

The dataframe is then cleaned by getting rid of the rows that contain undefined boroughs. 

In [37]:
origin_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# pandas can parse the HTML tables of a web page directly. We do not need
# the table laid out as a grid; the dataframe is only used as a parsing aid
# to reach every cell of the table.
# The table is flattened and a processing function cleans each cell's content.
# Postal codes are composed of three characters, which correspond to
# the first three characters of each cell.

# Cell processing function
def getCellData(x):
  """Split the text of one table cell into its postal-code fields.

  A cell is expected to look like ``'M5ADowntown Toronto(Regent Park / Harbourfront)'``
  or ``'M1ANot assigned'``: a three-character postal code, the borough name
  and, optionally, the neighbourhood names between parentheses.

  Parameters
  ----------
  x : str
    Raw text of the cell.

  Returns
  -------
  dict
    Always has the keys 'postalCode' and 'Borough'. The key 'Neighbourhood'
    is only present when the cell holds a parenthesised neighbourhood list;
    in that list '/' separators are turned into commas.
  """
  text = x.strip()
  record = {'postalCode': text[0:3]}
  rest = text[3:]

  opening = rest.find('(')
  closing = rest.find(')')
  if opening > -1 and closing > opening:
    # Strip the borough so optional spaces around it do not leak into the value
    record['Borough'] = rest[:opening].strip()
    record['Neighbourhood'] = rest[opening+1:closing] \
      .replace('/', ',').replace(' ,', ',').strip()
  else:
    # No neighbourhood list: everything after the postal code is the borough.
    # Stripping keeps values such as 'Not assigned' comparable downstream.
    record['Borough'] = rest.strip()
  return record

# From the web to pandas: flatten the table into one entry per cell, drop
# empty cells and apply the processing function to every remaining cell.
# Series.map is used because DataFrame.applymap is deprecated in recent
# pandas releases.
postal_code_records = pd.read_html(origin_url)[0] \
  .stack() \
  .dropna() \
  .map( getCellData ) \
  .tolist()

# List of dicts to dataframe
postal_codes = pd.DataFrame.from_records(postal_code_records)

# Drop rows with unassigned borough.
# .copy() makes the result an independent frame, so the column assignment
# below does not act on a view of the unfiltered frame.
postal_codes = postal_codes[ postal_codes['Borough'] != 'Not assigned' ].copy()

# Set the unassigned neighbourhood to be the same as the borough.
# fillna matches rows by index. Assigning a ['Borough'] frame into a
# ['Neighbourhood'] selection through .loc aligns on column labels as well
# and would write NaN instead of the borough.
postal_codes['Neighbourhood'] = \
  postal_codes['Neighbourhood'].fillna(postal_codes['Borough'])

# Shape & show
print("Shape:", postal_codes.shape)
postal_codes.head()

Shape: (103, 3)


Unnamed: 0,postalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government


# Getting the geolocation code

The latitude/longitude of each postal code can be obtained either from the `geocoder` module (which failed for me) or from a pre-built CSV file (which worked).

In [None]:
# Iterate each postal code
# NOTE: this cell needs the `geocoder` package; its install/import lines in
# the first cell are commented out.
MAX_GEOCODER_ATTEMPTS = 5  # bounded, so a failing service cannot loop forever

locations = []
for pc in postal_codes['postalCode']:
  print("Getting location for postal code {}...".format(pc))
  loc = None
  for _ in range(MAX_GEOCODER_ATTEMPTS):
    g = geocoder.google('{}, Toronto, Ontario'.format(pc))
    loc = g.latlng
    if loc:
      break

  # Exactly one entry per postal code, appended once the retries are over,
  # so `locations` stays aligned with the postal codes.
  if loc:
    locations.append({"lat": loc[0], "lon": loc[1]})
  else:
    locations.append({"lat": None, "lon": None})

## Since geocoder did not work for me I use the next cell

In [38]:
#!wget "http://cocl.us/Geospatial_data" -O pre_loc.csv
# Load the pre-computed coordinates and give the key column the same name
# it has in the postal code dataframe.
pre_loc = (
  pd.read_csv('pre_loc.csv')
    .rename(columns={'Postal Code': 'postalCode'})
)
pre_loc.head()

Unnamed: 0,postalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Producing the dataframe with the list of neighbourhoods and their locations

The two dataframes are merged using the values of the column "postalCode" as the matching key.

In [39]:
# Nice!
# Now, we merge the two dataframes on their shared 'postalCode' column.
# Only postal codes present in both frames are kept (inner join).
# `postal_codes` is left untouched so this cell, and any cell that reads
# `postal_codes`, can be re-run.
dataFrame = postal_codes.merge(pre_loc, on='postalCode', how='inner')
dataFrame.head()

Unnamed: 0,postalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# Setting up FourSquare API

In [None]:
# /!\ /!\ /!\ /!\ /!\ 
# SECRET HANDLING
# /!\ /!\ /!\ /!\ /!\ 
# The Foursquare API key is read from the FOURSQUARE_APIKEY environment
# variable so that the secret is never stored in the notebook itself.
# Any key that was ever written in this cell must be treated as leaked and
# should be revoked in the Foursquare developer console.
FOURSQUARE_APIKEY = os.environ.get('FOURSQUARE_APIKEY', '')
if not FOURSQUARE_APIKEY:
  print("WARNING: FOURSQUARE_APIKEY is not set; Foursquare API requests are expected to fail.")

In [None]:
# Function to ease queries to the FourSquare API
# Note this is for FourSquare API v3, and needs the API key sent via header 
# instead of user id and user secret sent via GET parameter.
def foursquare_request(endp, id, query=None):
  """Send a GET request to the Foursquare v3 API and return the decoded JSON.

  Parameters
  ----------
  endp : str
    First path segment after ``/v3/`` (for example ``'places'``).
  id : str
    Second path segment (for example ``'search'``).
  query : dict, optional
    Query-string parameters. They are handed to ``requests`` so that keys
    and values are URL-encoded instead of being concatenated verbatim.

  Returns
  -------
  The JSON body of the response, as decoded by ``Response.json()``.
  """
  url = 'https://api.foursquare.com/v3/'+endp+'/'+id
  r = requests.get(
    url,
    params=query or None,
    headers={"accept": "application/json", "Authorization": FOURSQUARE_APIKEY},
    timeout=30,  # requests waits indefinitely when no timeout is given
  )
  return r.json()

# I will store requested data in my local drive in order to reduce
# future identical requests to the API
from os.path import exists
import json
def cache_foursquare_request(endp, id, query):
  """Return the result of a Foursquare request, using a JSON file as cache.

  The cache file name is built from the endpoint, the id and every
  key/value pair of `query`, so identical requests map to the same file.
  When that file exists its content is returned; otherwise the API is
  queried through `foursquare_request` and the answer is written to the file.

  Parameters
  ----------
  endp : str
    First path segment of the API request.
  id : str
    Second path segment of the API request.
  query : dict
    Query parameters of the request.

  Returns
  -------
  The decoded JSON data, from the cache file or from the API.
  """
  cache_dir = './drive/MyDrive/$TMP/'
  fname = 'FOURSQUARE$'+endp+'-'+id+'?'
  for k, v in query.items(): fname += '&'+str(k)+'='+str(v)
  fname = cache_dir+fname
  if exists(fname):
    print("From file")
    with open(fname) as f:
      data = json.load(f)
  else:
    print("From API & to file")
    data = foursquare_request(endp, id, query)
    # Create the cache directory on first use; without it the write below
    # fails after the API call has already been made.
    os.makedirs(cache_dir, exist_ok=True)
    with open(fname, 'w') as outfile:
      json.dump(data, outfile)

  return data

In [None]:
# Smoke test: look for restaurants near one of my favourite locations
cache_foursquare_request(
  'places',
  'search',
  dict(query='restaurant', ll='40.9726091,-5.6698419', radius=500, limit=5),
)

With the API tools set up, we can continue by plotting the Toronto neighbourhoods

In [None]:
# I add the column category to the dataset.
# The map function reads this column to choose each marker's colour. Every
# row starts in category 0 until a real grouping is computed; an existing
# 'category' column is left as it is, so the cell can be re-run safely.
if 'category' not in dataFrame.columns:
  dataFrame = dataFrame.assign(category=0)
dataFrame.head()

In [83]:
color_groups = ['black', 'red', 'blue', 'yellow', 'green', 'purple'] # For later use

# I use a similar function to the one described along the course
def do_map(df):
  """Build a folium map with one circle marker per row of `df`.

  The map is centred on the midpoint of the latitude/longitude bounding box
  of the rows. Each marker carries the row's 'Neighbourhood' as popup text
  and is coloured with the `color_groups` entry selected by its category.

  Parameters
  ----------
  df : pandas.DataFrame
    Needs the columns 'Latitude', 'Longitude' and 'Neighbourhood'. The
    column 'category' is optional: without it every row uses category 0.
    Categories beyond the palette size wrap around.

  Returns
  -------
  folium.Map
    The map with all the markers added.
  """
  if len(df) > 0:
    center = [
      float(df['Latitude'].max() + df['Latitude'].min()) / 2,
      float(df['Longitude'].max() + df['Longitude'].min()) / 2,
    ]
  else:
    # No rows: use a neutral centre instead of NaN coordinates
    center = [0.0, 0.0]

  if 'category' in df.columns:
    categories = df['category']
  else:
    categories = [0] * len(df)

  # Named `fmap` so the built-in `map` is not shadowed
  fmap = folium.Map(location=center, zoom_start=11)
  for lat, lng, name, cat in zip(
    df['Latitude'],
    df['Longitude'],
    df['Neighbourhood'],
    categories ):
      color = color_groups[int(cat) % len(color_groups)]
      folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=folium.Popup(name, parse_html=True),
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.4,
        parse_html=False).add_to(fmap)
  return fmap

do_map(dataFrame)

In [62]:
# Save the merged table to a CSV file, without writing the index as a column.
dataFrame.to_csv('./drive/MyDrive/$TMP/postal_codes.csv', index=False)

In [None]:
# Reload the table from the CSV file and preview its first rows.
dataFrame = pd.read_csv('./drive/MyDrive/$TMP/postal_codes.csv')
dataFrame.head()

In [69]:
# Preview the first rows of the working dataframe.
dataFrame.head()

Unnamed: 0,postalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [77]:
# NOTE(review): looks like a leftover scratch check of the built-in max(); confirm it can be removed.
max(1,2)

2