# Segmenting and Clustering Neighborhoods in Toronto
### Part 1

In [1]:
import os
import requests
import json

from dotenv import load_dotenv
import pandas as pd




## Section 1: Scrape Toronto neighbourhoods and postal codes

As the Wikipedia page contains a single table, it is straightforward to load it with the `read_html()` function in Pandas. The function returns a list of DataFrames, one per table found (a single item here), so we take the first element of that list.

In [2]:
# Source page listing the Canadian postal codes that start with "M".
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list with one DataFrame per parsed table; keep the first.
tables = pd.read_html(URL)
df_neighborhoods = tables[0]
df_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Filter out the rows whose Borough is 'Not assigned'.

In [3]:
# Keep only the postal codes that have a borough, then renumber the rows
# from 0 so the index has no gaps left by the removed rows.
has_borough = df_neighborhoods['Borough'].ne('Not assigned')
df_neighborhoods = df_neighborhoods.loc[has_borough].reset_index(drop=True)
df_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# (rows, columns) of the table once the 'Not assigned' boroughs are removed.
df_neighborhoods.shape

(103, 3)

## Section 2 - Merge geospatial coordinates

The simplest option seems to be to use the Geospatial_Coordinates CSV file and join the two tables on the Postal Code column with the Pandas merge function.

First, having downloaded the file, we can load it into a dataframe.

In [4]:
# The coordinates file lives in ../data, resolved against the working directory.
data_dir = os.path.abspath('../data')
path = os.path.join(data_dir, 'Geospatial_Coordinates.csv')

df_geodata = pd.read_csv(path)
df_geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


And make the merge.

In [5]:
# Inner join on the shared 'Postal Code' column: each neighbourhood row gains
# the Latitude/Longitude of its postal code.
df_complete = pd.merge(
    df_neighborhoods,
    df_geodata,
    on='Postal Code',
    how='inner',
)
df_complete.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Check the number of rows; it should still be 103.

In [6]:
# An inner merge silently drops postal codes that have no coordinates (and
# duplicates rows when a key repeats), so verify the row count is unchanged
# instead of only eyeballing the displayed shape.
assert len(df_complete) == len(df_neighborhoods), (
    f'merge changed the row count: {len(df_neighborhoods)} -> {len(df_complete)}'
)
df_complete.shape

(103, 5)

In [7]:
# Persist the merged table under ../data; the row index is not written out.
out_dir = os.path.abspath('../data')
path = os.path.join(out_dir, 'TorontoNeighborhoods.csv')
df_complete.to_csv(path, index=False)

## Section 3.1 - Compile venue data for neighborhoods

This next cell loads the Foursquare API credentials from environment variables saved on my local machine. This keeps the credentials confidential: they are not published to the GitHub repository, for example. Anyone who clones the repository can easily supply their own credentials in their local environment.

In [19]:
# Load the Foursquare credentials from the environment (load_dotenv also picks
# up a local .env file when one exists) so they never appear in the notebook.
load_dotenv()
CLIENT_ID = os.getenv("CLIENT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")

# Fail fast: without this check a missing variable ends up as the literal text
# "None" in the URL below and only surfaces later as an HTTP error.
_missing = [name for name, value in (("CLIENT_ID", CLIENT_ID),
                                     ("CLIENT_SECRET", CLIENT_SECRET))
            if not value]
if _missing:
    raise EnvironmentError(
        'Missing Foursquare credential(s): ' + ', '.join(_missing) +
        '. Set them in the environment or in a .env file.')

VERSION = '20180605'  # sent to the API as the `v` query parameter
LIMIT = 100           # sent to the API as the `limit` query parameter

# Common prefix of every venues/explore request; the per-neighbourhood
# parameters (ll, radius, limit) are appended by the caller.
BaseURL = ('https://api.foursquare.com/v2/venues/explore?' +
           f'client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&' +
           f'v={VERSION}')

### Get venues for each neighborhood using the Foursquare API

In [90]:
def getVenues(names, lats, longs, radius=500, timeout=30):
    """Collect the Foursquare venues found around each neighborhood.

    One ``venues/explore`` request is sent per (name, lat, long) triple, using
    the module-level ``BaseURL`` and ``LIMIT``. A progress percentage is
    printed on a single, continuously overwritten line.

    Parameters
    ----------
    names, lats, longs : iterable
        Neighborhood names and their coordinates, consumed in parallel.
        If the lengths differ, the shortest one decides how many are queried.
    radius : int, optional
        Search radius forwarded to the API as ``radius`` (metres, per the
        Foursquare documentation).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``Neighborhood``,
        ``Neighborhood_lat``, ``Neighborhood_long``, ``Venue``, ``Category``,
        ``Venue_lat`` and ``Venue_long``. The columns are present even when
        no venue was returned. ``Category`` is ``None`` for a venue that has
        no category attached.

    Raises
    ------
    requests.HTTPError
        If a response status code is not 200.
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    columns = ['Neighborhood', 'Neighborhood_lat', 'Neighborhood_long',
               'Venue', 'Category', 'Venue_lat', 'Venue_long']
    neighbor_venues = []

    # Materialise the triples once so the progress total matches the number of
    # requests actually made, and so plain iterables (not only Series) work.
    targets = list(zip(names, lats, longs))
    total = len(targets)

    # iterate through the neighborhoods grouped by postal codes
    for count, (name, lat, long) in enumerate(targets, start=1):
        url = BaseURL + f'&ll={lat},{long}&radius={radius}&limit={LIMIT}'

        # GET response, and make sure it is valid (status_code == 200)
        results = requests.get(url, timeout=timeout)
        if results.status_code != 200:
            raise requests.HTTPError(
                f'HTTP response code was {results.status_code}',
                response=results)

        # Update what percentage of neighborhoods processed and print
        print(f'\r{round(count / total * 100, 2)}% neighborhoods downloaded', end='')

        # Breakdown the JSON response to what we want; tolerate a response
        # that carries no recommendation group at all.
        groups = results.json()['response'].get('groups') or []
        venues = groups[0]['items'] if groups else []
        for venue in venues:
            ven = venue['venue']
            # A venue may come back without any category; do not crash on it.
            categories = ven.get('categories') or []
            neighbor_venues.append({
                'Neighborhood': name,
                'Neighborhood_lat': lat,
                'Neighborhood_long': long,
                'Venue': ven['name'],
                'Category': categories[0]['name'] if categories else None,
                'Venue_lat': ven['location']['lat'],
                'Venue_long': ven['location']['lng'],
            })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(neighbor_venues, columns=columns)

In [92]:
# One API query per row of the merged table, using the default search radius.
venues_df = getVenues(
    names=df_complete['Neighbourhood'],
    lats=df_complete['Latitude'],
    longs=df_complete['Longitude'],
)


100.0% neighborhoods downloaded

In [93]:
# Preview the first rows: one row per venue, tagged with its neighbourhood.
venues_df.head()

Unnamed: 0,Neighborhood,Neighborhood_lat,Neighborhood_long,Venue,Category,Venue_lat,Venue_long
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,Park,43.751976,-79.33214
1,Parkwoods,43.753259,-79.329656,Sun Life,Construction & Landscaping,43.75476,-79.332783
2,Parkwoods,43.753259,-79.329656,Variety Store,Food & Drink Shop,43.751974,-79.333114
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,Hockey Arena,43.723481,-79.315635
4,Victoria Village,43.725882,-79.315572,Portugril,Portuguese Restaurant,43.725819,-79.312785


In [94]:
# (number of venue rows collected, number of columns)
venues_df.shape

(2110, 7)

In [95]:
# Venue count per neighbourhood name. Rows that share a name are pooled into
# one group, and a neighbourhood with no venue rows does not appear at all.
per_neighborhood = venues_df.groupby('Neighborhood')
per_neighborhood['Venue'].count()

Neighborhood
Agincourt                                           4
Alderwood, Long Branch                              8
Bathurst Manor, Wilson Heights, Downsview North    21
Bayview Village                                     4
Bedford Park, Lawrence Manor East                  23
                                                   ..
Willowdale, Willowdale East                        34
Willowdale, Willowdale West                         5
Woburn                                              4
Woodbine Heights                                    6
York Mills West                                     2
Name: Venue, Length: 95, dtype: int64

In [96]:
# Number of distinct venue categories across all neighbourhoods.
venues_df['Category'].nunique()

272

In [98]:
# Number of venue rows recorded under each category.
by_category = venues_df.groupby('Category')
categories = by_category['Neighborhood'].count()
categories

Category
Accessories Store      1
Airport                2
Airport Food Court     1
Airport Gate           1
Airport Lounge         2
                      ..
Warehouse Store        1
Wine Bar               9
Wings Joint            1
Women's Store          3
Yoga Studio           14
Name: Neighborhood, Length: 272, dtype: int64

### Save the neighborhood venues dataframe

This way, we will not need to re-query the venues from the API in the future.

In [131]:
# Cache the venue table as CSV under ../data; the row index is not written out.
out_dir = os.path.abspath('../data')
path = os.path.join(out_dir, 'TorontoVenues.csv')
venues_df.to_csv(path, index=False)