## Segmenting and Clustering Neighborhoods in Toronto

This notebook is part of the Coursera IBM Data Science capstone project. It analyzes and clusters the neighborhoods of Toronto.

### Import libraries

In [21]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# NOTE(review): pandas also exposes this function as pd.json_normalize; the
# pandas.io.json import path is believed to be deprecated in pandas >= 1.0 -
# confirm against the installed version.
from pandas.io.json import json_normalize # transform a JSON file into a pandas dataframe
# lift the display limits on the number of columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # HTTP client used to download web pages and call web APIs
from bs4 import BeautifulSoup # used to parse data from a website
import lxml # parser backend requested by name when building the BeautifulSoup object

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means for the clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

# confirmation message once every import above has succeeded
print('Libraries imported.')

Libraries imported.


### Step 1: Parse table from website

We can define a function to parse the url and search for table content

In [2]:
def parse_url_table(url):
    """Download a web page and return its first HTML table as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    pandas.DataFrame
        One column per ``<th>`` cell found in the table and one row per
        ``<tr>`` that holds at least one ``<td>`` cell. Header and cell
        text have trailing newlines stripped; all values are strings.

    Raises
    ------
    IndexError
        If the page contains no ``<table>`` element.
    """
    # parse url
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # only the first table on the page is read
    table = soup.find_all("table")[0]

    # find column names
    col_names = [th.get_text().rstrip("\n") for th in table.find_all('th')]

    # read table content; header-only rows have no <td> cells and are skipped
    rows = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        if cols:
            rows.append([col.get_text().rstrip("\n") for col in cols])

    # Build the frame once from the collected rows. Appending row by row
    # copies the whole frame on every iteration, and DataFrame.append is no
    # longer available in current pandas releases.
    return pd.DataFrame(rows, columns=col_names)

Get table from the Wikipedia page

In [3]:
# Wikipedia list of Canadian postal codes beginning with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# scrape the first table of that page into a DataFrame
df = parse_url_table(url)
# preview the first rows of the raw table
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Step 2: Clean the table

Define a function to clean the table

In [4]:
def clean_table(df):
    """Return a cleaned copy of the postal-code table.

    The input frame is left untouched. Cleaning steps:

    1. Drop rows whose ``Borough`` is ``'Not assigned'``.
    2. Where ``Neighborhood`` is missing (NaN) or ``'Not assigned'``, use
       the row's ``Borough`` as the neighborhood name.
    3. Replace the ``' / '`` separator in ``Neighborhood`` with ``', '``.
    4. Renumber the index from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least ``Borough`` and ``Neighborhood`` columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned table with a fresh ``RangeIndex``.
    """
    # drop rows with 'Not assigned' Borough; copy so that the assignments
    # below act on an independent frame rather than a slice of the input
    df = df[df.Borough != 'Not assigned'].copy()

    # set missing / 'Not assigned' Neighborhood to the same name as Borough
    # (a single .loc call is required: chained indexing would only modify a
    # temporary copy and the change would be lost)
    unassigned = df.Neighborhood.isna() | (df.Neighborhood == 'Not assigned')
    df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

    # clean Neighborhood, change ' / ' to ', '
    df['Neighborhood'] = [name.replace(' / ', ', ') for name in df['Neighborhood']]

    return df.reset_index(drop=True)

Clean the pandas DataFrame

In [5]:
# replace the raw scraped table with its cleaned version
df = clean_table(df)
# preview the first rows of the cleaned table
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Let's check the shape of the table

In [6]:
# (number of rows, number of columns) of the cleaned table
df.shape

(103, 3)

### Step 3: Get Geographic Coordinates for boroughs

Let's define a function to get the geographic coordinates for any given postal code

In [7]:
# download the geographic coordinates CSV from the link; the lookup function
# below reads its 'Postal Code', 'Latitude' and 'Longitude' columns
geo_code = pd.read_csv('http://cocl.us/Geospatial_data')

def get_geo_post(postal_code):
    """Look up the coordinates recorded for a postal code in ``geo_code``.

    Parameters
    ----------
    postal_code : str
        Value to match against the ``'Postal Code'`` column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(latitude, longitude)`` arrays holding one entry per matching
        row; both arrays are empty when nothing matches.
    """
    # select the matching rows once, then read both coordinate columns
    matching_rows = geo_code.loc[geo_code['Postal Code'] == postal_code]
    return matching_rows['Latitude'].values, matching_rows['Longitude'].values

Use the above function to get the latitude/longitude for each postal code

In [8]:
# get geographic coordinates for each postal code (row) with get_geo_post
coordinates = [get_geo_post(postal_code) for postal_code in df['Postal Code']]

# add two new columns to the table, addressed by name rather than by
# position so the cell does not depend on the column order; get_geo_post
# returns arrays, so keep the first match and fall back to NaN when a
# postal code has no entry in the lookup table
df['Latitude'] = [lat[0] if len(lat) else np.nan for lat, _ in coordinates]
df['Longitude'] = [lng[0] if len(lng) else np.nan for _, lng in coordinates]

# check lat/lon in the table
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Step 4: Select boroughs in Toronto only and plot the boroughs in a map

Let's use geopy to get the geographical coordinates of Toronto first.

In [9]:
# free-text address handed to the geocoding service
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)

# geocode() gives back None when the service finds no match; fail with a
# clear message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

# these coordinates are used later as the centre of the map
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In this work, the main interest is in the neighborhoods of the Toronto area, so let's first pull the Toronto rows out of the table.

In [10]:
# boolean flag per row: True when the borough name contains 'Toronto'
selected_list = [('Toronto' in name) for name in df['Borough']]

# keep the flagged rows and renumber them from 0 in one chained expression
tor_neighborhoods = df[selected_list].reset_index(drop=True)
tor_neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


##### Create a map of Toronto with neighborhoods superimposed on top.

In [11]:
# base map centred on the Toronto latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# columns consumed by each marker, in the order they are unpacked below
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']

# add one circle marker per row, with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(*(tor_neighborhoods[column] for column in marker_columns)):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.6,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# display the finished map
map_toronto

### Step 5: Explore the venues in the first borough using the Foursquare API 

#### Define Foursquare Credentials and Version

In [13]:
# read the Foursquare credentials from a local JSON file so that the
# secrets are not written into the notebook itself
with open('Foursquare_credentials.json') as file:
    foursquare_id = json.load(file)

# pull the three required settings out of the parsed dictionary
CLIENT_ID, CLIENT_SECRET, VERSION = (
    foursquare_id[key] for key in ('CLIENT_ID', 'CLIENT_SECRET', 'VERSION')
)


#### Information of the first borough in Toronto

In [16]:
# coordinates stored in the first row (index label 0) of the Toronto table
neighborhood_latitude = tor_neighborhoods.at[0, 'Latitude']
neighborhood_longitude = tor_neighborhoods.at[0, 'Longitude']

# display name in the form "<neighborhood>, in <borough>"
neighborhood_name = ', in '.join(
    [tor_neighborhoods.at[0, 'Neighborhood'], tor_neighborhoods.at[0, 'Borough']]
)

summary = 'Latitude and longitude values of {} are {}, {}.'.format(
    neighborhood_name, neighborhood_latitude, neighborhood_longitude
)
print(summary)

Latitude and longitude values of Regent Park, Harbourfront, in Downtown Toronto are 43.6542599, -79.3606359.


#### Get the top 100 venues that are in the first borough within 500 meters.

In [18]:
# Set up parameters and url for the Foursquare "explore" request
# LIMIT: value sent as the `limit` query parameter (number of venues requested)
LIMIT = 100
# radius: value sent as the `radius` query parameter (500 meters per the heading above)
radius= 500
# NOTE: this rebinds the `url` variable used earlier for the Wikipedia page.
# NOTE(review): the client id and secret are embedded in the query string,
# so avoid printing or logging `url`.
url='https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,VERSION,neighborhood_latitude,neighborhood_longitude,radius,LIMIT)

Retrieve the results from Foursquare.

In [29]:
# send the GET request and decode the JSON response body into Python objects
results = requests.get(url).json()

Define a function to get the category type for each venue

In [28]:
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Venue record holding its category list under ``'categories'`` or,
        for flattened records, under ``'venue.categories'``.

    Returns
    -------
    str or None
        The ``'name'`` of the first category, or ``None`` when the
        category list is empty.

    Raises
    ------
    KeyError
        If the record has neither of the two keys.
    """
    # Only a missing key should trigger the fallback; a bare ``except``
    # would also hide unrelated errors and interrupts.
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

Clean the json and structure it into a pandas dataframe.

In [31]:
# the venue records are read from the first group of the response
venues = results['response']['groups'][0]['items']

# flatten JSON; pd.json_normalize is the supported public entry point
# (the pandas.io.json import path is deprecated)
nearby_venues = pd.json_normalize(venues)

# filter columns; copy so the column assignment below acts on an
# independent frame rather than on a slice of the flattened table
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# filter the category for each row (keep only the first category's name)
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component, e.g. 'venue.location.lat' -> 'lat'
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

# number of venues that are returned by Foursquare
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

# print first 5 rows
nearby_venues.head()

44 venues were returned by Foursquare.


  after removing the cwd from sys.path.


Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149
3,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
4,Body Blitz Spa East,Spa,43.654735,-79.359874
