# This notebook will be used for the development of the Capstone Project
### *By Isaac Bautista*

In [1]:
# !conda install -c conda-forge folium=0.5.0 --yes
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

In [2]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


# Week 3 - Segmenting and Clustering Neighborhoods in Toronto

Retrieving the Wikipedia HTML document as plain text, extracting the Toronto table and finding all of its rows in HTML format

In [3]:
result = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
# result.text

html = result.text

soup = BeautifulSoup(html, 'lxml')

tables = soup.find_all('table')
toronto_table = tables[0]

toronto_table_trs = toronto_table.find_all('tr')
toronto_table_trs[0:5]

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>]

Cleaning the rows from unwanted characters to obtain an array of string rows

In [4]:
toronto_table_rows = []

toronto_table_ths = toronto_table_trs[0].find_all('th')
toronto_table_ths_str = str(toronto_table_ths)
toronto_table_head = BeautifulSoup(toronto_table_ths_str, 'lxml').get_text()
toronto_table_head = toronto_table_head.lstrip('[')
toronto_table_head = toronto_table_head.rstrip('\n]')
toronto_table_head = ','.join(toronto_table_head.split(', '))
toronto_table_rows.append(toronto_table_head)

for n_toronto_table_tr in toronto_table_trs:
    
    n_toronto_table_tr_tds = n_toronto_table_tr.find_all('td')
    if len(n_toronto_table_tr_tds) == 0:
        continue
    n_toronto_table_tr_tds_str = str(n_toronto_table_tr_tds)
    n_toronto_row = BeautifulSoup(n_toronto_table_tr_tds_str, 'lxml').get_text()
    n_toronto_row = n_toronto_row.lstrip('[')
    n_toronto_row = n_toronto_row.rstrip('\n]')
    n_toronto_row = ','.join(n_toronto_row.split(', '))
    toronto_table_rows.append(n_toronto_row)

toronto_table_rows[0:5]

['Postcode,Borough,Neighbourhood',
 'M1A,Not assigned,Not assigned',
 'M2A,Not assigned,Not assigned',
 'M3A,North York,Parkwoods',
 'M4A,North York,Victoria Village']

Creating the data frame

In [5]:
toronto_df01 = pd.DataFrame(toronto_table_rows)
toronto_df01 = toronto_df01[0].str.split(',', expand=True)

print('toronto_df01.shape:', toronto_df01.shape)
toronto_df01.head()

toronto_df01.shape: (289, 3)


Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


Moving up the first row as the data frame header

In [6]:
toronto_df02 = toronto_df01.rename(columns=toronto_df01.iloc[0])
toronto_df02 = toronto_df02.drop([0])
toronto_df02.reset_index(drop=True, inplace=True)
toronto_df02.rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}, inplace=True)
print('toronto_df02.shape:', toronto_df02.shape)
toronto_df02.head()

toronto_df02.shape: (288, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Removing the rows whose Borough value is Not assigned

In [7]:
toronto_df03 = toronto_df02.drop(toronto_df02[toronto_df02['Borough'] == 'Not assigned'].index)
toronto_df03.reset_index(drop=True, inplace=True)

print('toronto_df03.shape:', toronto_df03.shape)
toronto_df03.head()

toronto_df03.shape: (211, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Joining rows' neighbourhoods by grouping them by Postcode and Borough

In [8]:
toronto_df04 = toronto_df03.groupby(['PostalCode', 'Borough'], sort=False)['Neighborhood'].apply(', '.join).reset_index()

print('toronto_df04.shape:', toronto_df04.shape)
toronto_df04.head()

toronto_df04.shape: (103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


Assigning the Borough value to the Neighbourhood that is set to Not assigned for each row

In [9]:
toronto_df = toronto_df04

toronto_df.loc[toronto_df['Neighborhood'] == 'Not assigned', 'Neighborhood'] = toronto_df.loc[toronto_df['Neighborhood'] == 'Not assigned', 'Borough']

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [10]:
print('toronto_df.shape:', toronto_df.shape)

toronto_df.shape: (103, 3)


Retrieving Toronto geospatial data

In [11]:
toronto_geospatial_data_df = pd.read_csv('https://cocl.us/Geospatial_data')
print('toronto_geospatial_data_df.shape:', toronto_geospatial_data_df.shape)
toronto_geospatial_data_df.head()

toronto_geospatial_data_df.shape: (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the Toronto and Toronto geospatial dataframes

In [12]:
toronto_df = toronto_df.merge(toronto_geospatial_data_df, left_on='PostalCode', right_on='Postal Code')
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


Defining the Toronto location

In [13]:
toronto_address = 'Toronto, Ontario'
toronto_geolocator = Nominatim(user_agent='ny_explorer')
toronto_location = toronto_geolocator.geocode(toronto_address)
toronto_latitude = toronto_location.latitude
toronto_longitude = toronto_location.longitude

print(toronto_latitude, toronto_longitude)

43.653963 -79.387207


Creating the Toronto's map, showing each of its neighborhoods

In [14]:
toronto_map = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

for nLatitude, nLongitude, nBorough, nNeighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    nLabel = '{}, {}'.format(nNeighborhood, nBorough)
    nLabel = folium.Popup(nLabel, parse_html=True)
    folium.CircleMarker(
        [nLatitude, nLongitude],
        radius=5,
        popup=nLabel,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False
    ).add_to(toronto_map)

toronto_map

Retrieving all the Boroughs that contain the word "Toronto"

In [15]:
toronto_boroughs_df = toronto_df[toronto_df['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_boroughs_df

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",M5A,43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",M5B,43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,M5C,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,M4E,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,M5E,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,M5G,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,M6G,43.669542,-79.422564
7,M5H,Downtown Toronto,"Adelaide, King, Richmond",M5H,43.650571,-79.384568
8,M6H,West Toronto,"Dovercourt Village, Dufferin",M6H,43.669005,-79.442259
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",M5J,43.640816,-79.381752


Creating a map of Toronto showing only the "Toronto" neighborhoods

In [16]:
toronto_boroughs_map = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

for nLatitude, nLongitude, nNeighborhood in zip(toronto_boroughs_df['Latitude'], toronto_boroughs_df['Longitude'], toronto_boroughs_df['Neighborhood']):
    nLabel = folium.Popup(nNeighborhood, parse_html=True)
    folium.CircleMarker(
        [nLatitude, nLongitude],
        radius=5,
        popup=nLabel,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False
    ).add_to(toronto_boroughs_map)

toronto_boroughs_map

The next cell is hidden because it contains the credentials to access the Foursquare APIs

In [17]:
# The code was removed by Watson Studio for sharing.

This method retrieves 100 venues, in radius of 500 units, for each of the "Toronto" neighborhoods

In [18]:
radius = 500
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Retrieving 100 venues for each of the "Toronto" neighborhoods

In [19]:
toronto_venues_df = getNearbyVenues(names=toronto_boroughs_df['Neighborhood'], latitudes=toronto_boroughs_df['Latitude'], longitudes=toronto_boroughs_df['Longitude'])
toronto_venues_df.head()

Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
The Danforth West, Riverdale
Design Exchange, Toronto Dominion Centre
Brockton, Exhibition Place, Parkdale Village
The Beaches West, India Bazaar
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North, Forest Hill West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
Harbord, University of Toronto
Runnymede, Swansea
Moore Park, Summerhill East
Chinatown, Grange Park, Kensington Market


KeyError: 'groups'

Preparing the data to fit the KMeans model.

Transformating the data to set the venue categories as columns

In [None]:
# toronto_venues_df.groupby('Neighborhood').count()
toronto_ohe_df = pd.get_dummies(toronto_venues_df[['Venue Category']], prefix='', prefix_sep='')

toronto_ohe_df.pop('Neighborhood')

toronto_ohe_df['Neighborhood'] = toronto_venues_df['Neighborhood']

fixed_columns = [toronto_ohe_df.columns[-1]] + list(toronto_ohe_df.columns[:-1])
toronto_ohe_df = toronto_ohe_df[fixed_columns]

print('toronto_ohe_df.shape:', toronto_ohe_df.shape)
toronto_ohe_df.head()

Grouping data by neighborhood

In [None]:
toronto_neighborhoods_grouped_df = toronto_ohe_df.groupby('Neighborhood').mean().reset_index()

print('toronto_neighborhoods_grouped_df.shape:', toronto_neighborhoods_grouped_df.shape)
toronto_neighborhoods_grouped_df.head()

This method sorts venues in descending order

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Retrieving the top 10 venues for each neighborhood

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']

for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind + 1))

# create a new dataframe
toronto_neighborhood_venues_sorted_df = pd.DataFrame(columns=columns)
toronto_neighborhood_venues_sorted_df['Neighborhood'] = toronto_neighborhoods_grouped_df['Neighborhood']

for ind in np.arange(toronto_neighborhoods_grouped_df.shape[0]):
    toronto_neighborhood_venues_sorted_df.iloc[ind, 1:] = return_most_common_venues(toronto_neighborhoods_grouped_df.iloc[ind, :], num_top_venues)

toronto_neighborhood_venues_sorted_df.head()

Fitting the KMeans model.

In [None]:
k_clusters = 4

toronto_neighborhoods_to_cluster_df = toronto_neighborhoods_grouped_df.drop('Neighborhood', 1)

kmeans_model = KMeans(n_clusters=k_clusters, random_state=0).fit(toronto_neighborhoods_to_cluster_df)

kmeans_model.labels_

Merging the cluster labels with the top 10 venues by neighborhood

In [None]:
toronto_neighborhood_venues_sorted_df.insert(0, 'Cluster Labels', kmeans_model.labels_)

toronto_merged = toronto_boroughs_df

toronto_merged = toronto_merged.join(toronto_neighborhood_venues_sorted_df.set_index('Neighborhood'), on='Neighborhood')

print('toronto_merged.shape:', toronto_merged.shape)
toronto_merged.head()

Creating a map of Toronto showing its neighborhoods clustered together using colors for each cluster

In [None]:
toronto_clusters_map = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

x = np.arange(k_clusters)
ys = [i + x + (i * x)**2 for i in range(k_clusters)]

colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []

for n_latitude, n_longitude, n_neighborhood, n_cluster_label in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    n_label = folium.Popup(str(n_neighborhood) + ' Cluster ' + str(n_cluster_label), parse_html=True)
    folium.CircleMarker(
        [n_latitude, n_longitude],
        radius=5,
        popup=n_label,
        color=rainbow[n_cluster_label - 1],
        fill=True,
        fill_color=rainbow[n_cluster_label - 1],
        fill_opacity=0.7
    ).add_to(toronto_clusters_map)

toronto_clusters_map