# Toronto segmenting and clustering

## Phase 1 - scraping and formatting postal code data to neighbourhood dataframe

In [1]:
#importing necessary libraries
import pandas as pd
import numpy as np

In [2]:
# the method to process the list is quite straightforward:
# 1. need to read the original html into pandas dataframe, it will create a list of 'table' elements from the html
# 2. need to take the first element (the table with the codes)
# 3. need to filter out items where the borough is not set, I'll use simple dataframe filter for that
# 4. nedd to update neighbourhood to the borough value where the latter is set, I'll use numpy.where for that. this gives a warning, but is sufficient for now
# 5. need to combine rows with same postcode, using groupby with 2 columns (postcode, borough) for that and apply.join

In [3]:
orig_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [4]:
#the source html has 3 tables, we need only the first one
orig_table = orig_tables[0]
print(orig_table.shape)
orig_table.head()

(287, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# no borough is not needed
df = orig_table
df = df[df["Borough"] != "Not assigned"]
print(df.head())
df.shape

  Postcode           Borough     Neighbourhood
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
5      M6A        North York  Lawrence Heights
6      M6A        North York    Lawrence Manor


(210, 3)

In [6]:
#updating not set neighborhood values
df["Neighbourhood"] = np.where(df["Neighbourhood"] == "Not assigned", df["Borough"],df["Neighbourhood"])
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [7]:
#grouping by postcode
print(df.shape)
df2 = df
df_clean = pd.DataFrame( df2.groupby(['Postcode','Borough'])["Neighbourhood"].apply(','.join))
print(df_clean.head())

(210, 3)
                                             Neighbourhood
Postcode Borough                                          
M1B      Scarborough                         Rouge,Malvern
M1C      Scarborough  Highland Creek,Rouge Hill,Port Union
M1E      Scarborough       Guildwood,Morningside,West Hill
M1G      Scarborough                                Woburn
M1H      Scarborough                             Cedarbrae


In [8]:
df_clean = df_clean.reset_index()
print('the resulting dataframe size is {}'.format(df_clean.shape))
df_clean.head()

the resulting dataframe size is (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Phase 2 - reading location data for neighbourhoods

In [9]:
# due to the unreliability of the api to get the data use the below flag to govern the source 
readAPI = True #set to false to read the csv directly from project's url
numRetries = 3 #the geocoder API tends to end up in some eternal loop, we need to break after a given number of retries

#may need to install geocoder - see https://anaconda.org/conda-forge/geocoder
#!conda install -c conda-forge geocoder 
import geocoder # import geocoder

In [10]:
#the below tries to call the API, but admittedly, even the demo/ simple examples from https://geocoder.readthedocs.io/api.html don't work. 
#this whole library seems to be broken 
#creating a custom exception so we can handle that the API does not return anything
class GeocoderError(Exception):
    """Raised when the geocoder API returns nothing"""
    pass

In [11]:
#dataframe for coordinates -> 3 columns, Postal code, latitude, longitude. will be filled by either API or csv read

In [12]:
#TODO: remove
readAPI = False

if (readAPI == True):

    # initialize your variable to None
    lat_lng_coords = None
    try:
        
        #TODO: the below code would not work for the whole dataframe, but the API simply does not work... below is just a stub to test it
        
        tries = 1
        # loop until you get the coordinates or need to exit the loop, as this API does not seem to work
        while(lat_lng_coords is None and tries<= numRetries):
            g = geocoder.google('{}, Toronto, Ontario'.format("M3A"))
            lat_lng_coords = g.latlng
            tries = tries+1

        if (lat_lng_coords is None):
           raise GeocoderError

    except GeocoderError:
        print("Geocoder API returned nothing, falling back to using csv")
        readAPI = False

if (readAPI == False):
    #either we did not try or we did not suceed using the geocoder
    #use csv file - download, process
    
    #!wget -q -O 'geospatial_coordinates.csv' https://cocl.us/Geospatial_data
    #print('Data downloaded!')

    df_coordinates = pd.read_csv('https://cocl.us/Geospatial_data/geospatial_coordinates.csv')
    print(df_coordinates.head())
    print(df_coordinates.shape)
    print('Data downloaded!')

    

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476
(103, 3)
Data downloaded!


In [13]:
#combining the cleaned neighbourhood frame with the coordinates
#renaming Postal Code to Postcode to use as an id in merging
df_coordinates.rename(columns = {"Postal Code" : "Postcode"}, inplace=True)
df_coordinates.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
#merging them together
df_merged = pd.merge(df_clean, df_coordinates, on='Postcode')
print(df_merged.shape)
df_merged

(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


## Phase 3 - neighbourhood clustering

In [15]:
# this part will essentially reuse the new york sample in the following way:
# 1. explore the foursquare API and get the top10 features for lat / long
# 2. do a hot one and explode items
# 3. cluster 'hoods
# 4. display clusters on map

#importing libraries
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [16]:
#creating a map of greater toronto and visualizing our boroughs and 'hoods
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [22]:
# create map of Torono using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Borough'], df_merged['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [25]:
#foursquare api connection
{
    "tags": [
        "remove_input",
    ]
}
CLIENT_ID = 'LOSYA4ISECIWNJ4OJA3JX5AWLLJNX3EPWLKFQV3AUB2RLJJE' # your Foursquare ID
CLIENT_SECRET = 'IEXMRGUFI2OB5OWQI14WAGWMUA3RVHCOTCSYKF5SOR4RDR1C' # your Foursquare Secret
VERSION = '20191130' # Foursquare API version
