## Week 3 - CAPSTONE: IBM Data Science Professional Certificate
# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import folium
from geopy.geocoders import MapBox
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# config file containing API KEYS
from project_config import *

## SECTION 1: Zipcode Data
### Scrape the Data

Pull the data from the table on the Wikipedia Page by requesting the page and parsing the HTML.  We will use the popular requests package to make our get request and use BeautifulSoup to parse the HTML.  BeautifulSoup has an easy API to traverse the HTML nodes and search based on HTML tags and attributes.

#### Make the get request

In [37]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# bound the wait so a stalled connection cannot hang the notebook
r = requests.get(url, timeout=30)
# stop here on a 4xx/5xx answer instead of parsing an error page further down
r.raise_for_status()
# print the server response to confirm we got a good 200 code
print(r) 

<Response [200]>


#### Parse the page

In [38]:
# parse the page and get a list of all the tables on the page
# parse the raw HTML of the response into a BeautifulSoup tree;
# the tables themselves are pulled out of it in the next cell
page = BeautifulSoup(r.text, 'html.parser')


#### Find the table with the data

In [39]:
# collect every <table> element on the page so the right one can be identified
tables = page.find_all('table')
print(f'There are {len(tables)} HTML tables on the page')

There are 5 HTML tables on the page


Since there is more than one table, let's pull the header cells (HTML tag **th**) of each table to find the correct one.

In [40]:
# list the header cells (<th>) found in each table; an empty list means the table has none
[table.find_all('th') for table in tables]

[[<th>Postcode</th>,
  <th>Borough</th>,
  <th>Neighbourhood
  </th>],
 [],
 [],
 [<th class="navbox-title" style="font-size:110%"><a href="/wiki/Postal_codes_in_Canada" title="Postal codes in Canada">Canadian postal codes</a>
  </th>],
 []]

From the python list above we can see that the first table on the page is the one we want to scrape.

#### Parse the table

Loop over every row in the table to get the text from each cell and map it to the column name.  Then convert the records to a pandas DataFrame for cleaning.

In [41]:
# all <tr> elements of the first table; row 0 holds the column headings
zipcode_table = tables[0].find_all('tr')
header = [heading.get_text(strip=True) for heading in zipcode_table[0].find_all('th')]

# one {column name: cell text} record per data row (the header row is skipped);
# the records are produced lazily and consumed once by the DataFrame constructor
zipcodes = (
    {column: td.get_text(strip=True) for column, td in zip(header, tr.find_all('td'))}
    for tr in zipcode_table[1:]
)

# convert to pandas dataframe
zip_df = pd.DataFrame(zipcodes)
zip_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Clean the Dataframe

Replace the placeholder "Not assigned" with the new pandas.NA singleton so that it is treated as a missing value.  Any postal code without a Borough will be dropped, and any missing Neighbourhood will be replaced with the Borough value.

In [42]:
# convert the "Not assigned" placeholder to pandas.NA so the dropna/fillna
# calls in the next cell treat those cells as missing
zip_df = zip_df.replace("Not assigned", pd.NA)

In [43]:
# keep only the rows that have a Borough, then let the Borough name stand in
# for any Neighbourhood that is still missing
zip_df = (
    zip_df
    .dropna(subset=['Borough'])
    .assign(Neighbourhood=lambda frame: frame['Neighbourhood'].fillna(value=frame['Borough']))
)

### Transform the Dataframe

Combine all duplicated postal codes by forming a comma-separated list of every Neighbourhood.  This assumes that there is one Borough name per postal code.

In [44]:
# For each (postal code, borough) pair, join the distinct neighbourhood names into a
# single comma separated string. Selecting the Neighbourhood column before aggregating
# means the function only ever sees that column (not the grouping keys), and the
# result keeps its column name, so no manual renaming is needed after reset_index().
zip_df = (
    zip_df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(lambda names: ", ".join(names.unique()))
    .reset_index()
)
zip_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Print Shape of the Dataframe

In [45]:
# (rows, columns) of the transformed frame: one row per (Postcode, Borough) group
zip_df.shape

(103, 3)

## SECTION 2: Latitude and Longitude

Create a function that gets the latitude and longitude for a "city, state, zipcode, country" address from the MapBox geocoder, using the geopy package.

In [3]:
# MapBox geocoder client; MAPBOX_API_KEY is expected to come from project_config
locator = MapBox(api_key=MAPBOX_API_KEY)
# wrap the geocode call so consecutive calls are at least one second apart, to avoid being blocked
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

In [4]:
def zip_to_coords(zipcode: str, city: str, state: str, country: str):
    """ Get the latitude and longitude of the given zipcode using the geocode function above

    The query sent to the geocoder is the string "city, state, zipcode, country".

    Params:
        zipcode (str): postal code to look up
        city (str): city the postal code belongs to
        state (str): state / province of the city
        country (str): country of the city

    Returns:
        pd.Series of two values (latitude, longitude).  Both values are NaN when the
        geocoder returns no location, so one failed lookup leaves a gap in the
        coordinate columns instead of aborting the whole DataFrame.apply run.
    """
    address = f'{city}, {state}, {zipcode}, {country}'
    location = geocode(address)
    # the geocoder hands back None when it has no result for the query
    if location is None:
        return pd.Series((float('nan'), float('nan')))
    return pd.Series((location.latitude, location.longitude))

Create a new column in the zipcode dataframe for the latitude and longitude

***Note:*** *Only run this cell below if you haven't already saved the CSV as this will take a few minutes to search for all the coordinates and will consume your MapBox resources*

In [5]:
# requires zip_df from Section 1; apply() calls zip_to_coords once per postal code and the
# two-element Series it returns are spread over the new Latitude and Longitude columns
zip_df[['Latitude', 'Longitude']] = zip_df['Postcode'].apply(zip_to_coords, args=('Toronto', 'Ontario', 'Canada'))
zip_df.head()

NameError: name 'zip_df' is not defined

In [48]:
# export dataframe as a csv so the geocoding above does not have to be repeated;
# Section 3 reads this file back with index_col=0
zip_df.to_csv('week3_data.csv')

## SECTION 3: Cluster and Map

In [6]:
# load the saved csv (if it was created) so the scraping and geocoding code above can be skipped;
# the first csv column is used as the index
zip_df = pd.read_csv('week3_data.csv', index_col=0)
zip_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.808241,-79.220533
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.78,-79.19
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.77,-79.19
3,M1G,Scarborough,Woburn,43.78,-79.23
4,M1H,Scarborough,Cedarbrae,43.78,-79.25


Get the coordinates of Toronto, Ontario, Canada to create the folium map centered on the city.

In [7]:
# geocode the city itself; its latitude/longitude are used as the centre of the map below
tor_loc = geocode('Toronto, Ontario, Canada')
print(f"{tor_loc} is located at {tor_loc.latitude}, {tor_loc.longitude}")

Toronto, Ontario, Canada is located at 43.6529, -79.3849


### Map of Toronto

I am choosing to exclude Downtown Toronto as the Postal Codes in that area are very dense, so this allows me to use Postal Codes that are much more evenly spaced out.  See below for Downtown Toronto in Blue.

In [8]:
tor_map = folium.Map(location=[tor_loc.latitude, tor_loc.longitude], zoom_start=12)

# boolean mask: True for every postal code outside the Downtown Toronto borough
exclude_downtown = zip_df['Borough']!='Downtown Toronto'

# (row mask, outline colour, fill colour) for each group of markers.  The groups are
# added in this order, so the blue Downtown markers are added after the orange ones.
marker_groups = [
    # Postal markers for everything but Downtown Toronto
    (exclude_downtown, '#E97900', '#F7BB2D'),
    # Postal markers for Downtown Toronto (~ is the element-wise inverse of a boolean mask)
    (~exclude_downtown, '#0071e9', '#56b3ff'),
]

for mask, outline_color, fill_color in marker_groups:
    for mark in zip_df.loc[mask, :].itertuples():
        label = f"{mark.Borough}, {mark.Postcode} ({mark.Neighbourhood})"
        # parse_html=True so the label text is escaped before it is put in the popup
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [mark.Latitude, mark.Longitude],
            radius=5,
            popup=label,
            color=outline_color,
            fill=True,
            fill_color=fill_color,
            fill_opacity=0.7,
            # NOTE(review): parse_html looks like a Popup option rather than a
            # CircleMarker one; kept as-is, confirm it can be dropped
            parse_html=False
        ).add_to(tor_map)
tor_map

In [17]:
# New Dataframe without Downtown Toronto (reuses the exclude_downtown mask built in the map cell)
tor_df = zip_df.loc[exclude_downtown,:]
tor_df.shape

(84, 5)

### Foursquare API

Create a function to access the Foursquare "venues/explore" endpoint and parse the JSON into records to put into a dataframe.

In [10]:
def parse_results(results: dict, postcode: str):
    """ Parses the JSON results returned from the https://api.foursquare.com/v2/venues/explore endpoint

    Params:
        results (dict): decoded JSON body; venues are read from
            results['response']['groups'][0]['items'][i]['venue']
        postcode (str): postal code used to label every returned record

    Returns:
        list of dicts, one per venue, with the keys venue_id, Name, Category, Latitude,
        Longitude, Distance and Postcode.  Category is the name of the venue's first
        category, or None when the venue has no categories.  The list is empty when
        the response contains no groups or no items.
    """
    groups = results['response']['groups']
    # nothing to parse when the response carries no groups at all
    if not groups:
        return []

    # Create the list of all the places
    places = list()

    # only the first group is read; each of its items wraps one venue object
    for item in groups[0]['items']:
        venue = item['venue']
        categories = venue['categories']
        places.append({
            'venue_id': venue['id'],
            'Name': venue['name'],
            # a venue can come back with an empty categories list
            'Category': categories[0]['name'] if categories else None,
            'Latitude': venue['location']['lat'],
            'Longitude': venue['location']['lng'],
            'Distance': venue['location']['distance'],
            'Postcode': postcode
        })

    return places

def explore_venues(postcode: str, lat: float, lon: float, section: str = None, radius: int = 1000, limit: int = 100,
                   timeout: float = 10):
    """ Access the Foursquare API venues/explore endpoint
    Params:
        postcode (str): Postal code to label each place
        lat (float): latitude of search
        lon (float): longitude of search
        section (str): category to search, choose from:
            [food, drinks, coffee, shops, arts, outdoors, sights, trending, nextVenues, topPicks]
            Any other value (including None) is not sent to the API.
        radius (int): radius from the coordinates to search
        limit (int): limit the number of results returned
        timeout (float): seconds to wait for the server before giving up; without it
            a stalled connection would block the caller's loop indefinitely
    Returns:
        The list of place records built by parse_results, or None when the server
        answers with a status code other than 200 (the status code and response
        body are printed in that case).  Callers must handle the None case.
    Raises:
        Whatever requests.get raises for connection problems or an expired timeout.
    """

    # sections accepted for the 'section' query parameter
    valid_sections = ('food', 'drinks', 'coffee', 'shops', 'arts', 'outdoors', 'sights', 'trending', 'nextVenues', 'topPicks')

    # Parameter definition for the endpoint
    params = {
        'client_id': FOURSQUARE_CLIENT_ID,
        'client_secret': FOURSQUARE_CLIENT_SECRET,
        'v': '20200305',
        'll': f"{lat},{lon}",
        'radius': radius,
        'limit': limit
    }

    # if there is a valid section provided add it to the parameters
    if section in valid_sections:
        params['section'] = section

    # endpoint to access
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    # GET request, bounded by the timeout
    r = requests.get(endpoint, params=params, timeout=timeout)

    # check to make sure we get a good 200 status code
    if r.status_code != 200:
        print(r.status_code)
        print(r.text)
        return None

    return parse_results(r.json(), postcode)

For my analysis I am going to search for all places with section="food" within a 2 km radius of each postal code center.  I will loop over every postal code, make a call to the Foursquare API for each one, and then convert the records into a DataFrame for analysis.

In [11]:
# collect the "food" venue records of every postal code into one flat list
food = list()
for postcode in tor_df.itertuples():
    venues = explore_venues(postcode.Postcode, postcode.Latitude, postcode.Longitude, section='food', radius=2000)
    # explore_venues returns None (after printing the error) when a request fails;
    # skip that postal code instead of letting food.extend(None) raise a TypeError
    if venues is not None:
        food.extend(venues)
len(food)

5255

In [31]:
# one row per venue record collected above
# NOTE(review): the recorded output shows 8 columns (a 'Sub-Category' column appears in the
# merged frame below), but parse_results as written produces 7 keys - presumably the output
# comes from an earlier version of the function; re-run to confirm
food_df = pd.DataFrame(food)
food_df.shape

(5255, 8)

In [33]:
# Merge the Borough and Neighborhood data into the food data;
# a left join on Postcode keeps every venue row of food_df
tor_food = food_df.merge(tor_df[['Postcode', 'Borough', 'Neighbourhood']], how='left', on="Postcode")
tor_food.head()

Unnamed: 0,venue_id,Name,Category,Sub-Category,Latitude,Longitude,Distance,Postcode,Borough,Neighbourhood
0,4c68119de1da1b8d45179fc3,Subway,Sandwich Place,,43.806961,-79.221476,161,M1B,Scarborough,"Rouge, Malvern"
1,4bf0817e24f020a11c33684f,Pizza Pizza,Pizza Place,,43.806613,-79.221243,189,M1B,Scarborough,"Rouge, Malvern"
2,4cb9e2d84495721e640c4d7a,Pizza Hut,Pizza Place,,43.808326,-79.220616,11,M1B,Scarborough,"Rouge, Malvern"
3,4d18ffb61356a093b0d1e682,KFC,Fast Food Restaurant,,43.806812,-79.220786,160,M1B,Scarborough,"Rouge, Malvern"
4,4b147cd4f964a520c5a323e3,Subway,Sandwich Place,,43.811607,-79.243405,1875,M1B,Scarborough,"Rouge, Malvern"


In [34]:
# shape of the array of distinct Category values, i.e. how many different categories were found
tor_food['Category'].unique().shape

(118,)

In [38]:
# columns to store as categorical dtype
cat_list = ['Category', 'Postcode', 'Borough', 'Neighbourhood'] 
# columns to store as string dtype
text_list = ['venue_id', 'Name']

# build one column -> dtype mapping and convert all listed columns in a single astype call
dtype_by_column = {
    **dict.fromkeys(cat_list, 'category'),
    **dict.fromkeys(text_list, 'string'),
}
tor_food = tor_food.astype(dtype_by_column)
tor_food.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5255 entries, 0 to 5254
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   venue_id       5255 non-null   string  
 1   Name           5255 non-null   string  
 2   Category       5255 non-null   category
 3   Sub-Category   0 non-null      object  
 4   Latitude       5255 non-null   float64 
 5   Longitude      5255 non-null   float64 
 6   Distance       5255 non-null   int64   
 7   Postcode       5255 non-null   category
 8   Borough        5255 non-null   category
 9   Neighbourhood  5255 non-null   category
dtypes: category(4), float64(2), int64(1), object(1), string(2)
memory usage: 320.5+ KB


In [41]:
# one-hot encode Category: append one indicator column per category value to tor_food
# NOTE(review): tor_food is overwritten with the wider frame, so re-running this cell would
# try to join the same indicator columns a second time - presumably that fails on the
# overlapping column names; confirm, or restart the kernel before re-running
tor_food = tor_food.join(pd.get_dummies(tor_food['Category']))
tor_food.shape

(5255, 128)