# Capstone Project Week 3: Segmenting and Clustering Neighborhoods in Toronto

### Step 0. Import necessary packages.

In [1]:
import pandas as pd
import numpy as np
# !pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests

### Step 1. Scrape the table from the Wiki page, and convert the table into Pandas DataFrame format.

In [2]:
# Wiki url
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
source = requests.get(url).text

soup = BeautifulSoup(source, 'lxml')

In [4]:
wiki_table = soup.find('table', class_='wikitable sortable')
# print(table1.prettify())

In [5]:
# scrape the table from the wiki page
tabs = []
for tr in wiki_table.find_all('tr')[1:]:
    tds = tr.find_all('td')
    tabs.append({'PostalCode': tds[0].text,
                 'Borough': tds[1].text,
                 'Neighbourhood': tds[2].text.rstrip() # remove '\n' at the end
                })

In [6]:
# convert to pandas DataFrame
df_tb = pd.DataFrame(tabs, columns=['PostalCode', 'Borough', 'Neighbourhood'])

### Step 2. Data cleaning process

Incldues: 
    * remove cells with a borough that is 'Not assigned'
    * set the neighborhood the same as the borough, if a cell has borough but neighborhood is 'Not assigned'
    * merge the neighborhoods that have the same Postal Code

In [7]:
# Ignore cells with a borough that is Not assigned.
df_tb_assigned = df_tb[df_tb['Borough']!='Not assigned']
# df_tb_assigned

In [8]:
# set the neighborhood the same as the borough, if a cell has borough but neighborhood is 'Not assigned'
df_tb_assigned2 = df_tb_assigned
df_tb_assigned2.loc[df_tb_assigned['Neighbourhood']=='Not assigned', 'Neighbourhood'] = df_tb_assigned.loc[df_tb_assigned['Neighbourhood']=='Not assigned', 'Borough']
# df_tb_assigned2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
# merge the neighborhoods that have the same Postal Code
gp = df_tb_assigned2.groupby('PostalCode').groups
updated_table = []
for key, value in gp.items():
    updated_table.append({'PostalCode': key,
                             'Borough': df_tb_assigned2.loc[value[0], 'Borough'],
                             'Neighbourhood': ', '.join(df_tb_assigned2.loc[value,'Neighbourhood'])})

In [10]:
final_updated_table = pd.DataFrame(updated_table, columns=['PostalCode', 'Borough', 'Neighbourhood'])

In [11]:
final_updated_table.shape

(103, 3)

### Step 3. Add latitude and longitude coordinates to the DataFrame

Since the "geocoder" package is not available, here I used the geospatial table provided in the assignment page.
URL: https://cocl.us/Geospatial_data

In [12]:
geospatial_data_ref =pd.read_csv("https://cocl.us/Geospatial_data", index_col="Postal Code")

In [13]:
final_table_geocode = final_updated_table.join(geospatial_data_ref, on='PostalCode')

Testing if the result is correct: (Postal code M5G)

In [14]:
final_table_geocode[final_table_geocode['PostalCode']=='M5G']

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
57,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [15]:
final_table_geocode

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Step 4. Explore and cluster the neighborhoods in Toronto. Check only boroughs that contain the word Toronto and see if the boroughs can be clustered by their geocodes.

In [16]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

Extract the neighborhood data where Borough contains "Toronto":

In [17]:
toronto_neighborhood = final_table_geocode[final_table_geocode['Borough'].str.contains('Toronto', regex=False)]
print('{} Postal Codes and {} unique boroughs are associated with Toronto.'.format(toronto_neighborhood.shape[0], len(toronto_neighborhood['Borough'].unique())))

38 Postal Codes and 4 unique boroughs are associated with Toronto.


In [18]:
print(toronto_neighborhood.shape)
toronto_neighborhood.head()

(38, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Visualize the Toronto neighbourhoods in a map:

In [19]:
# create map of Toronto using latitude and longitude values
latitude = toronto_neighborhood.loc[toronto_neighborhood['PostalCode']=='M4E','Latitude']
longitude = toronto_neighborhood.loc[toronto_neighborhood['PostalCode']=='M4E','Longitude']

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=100)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_neighborhood['Latitude'], toronto_neighborhood['Longitude'], toronto_neighborhood['Borough'], toronto_neighborhood['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Check how many neighbourhoods (Postal Codes) in each borough.

In [20]:
toronto_neighborhood.groupby('Borough').count()

Unnamed: 0_level_0,PostalCode,Neighbourhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,18,18,18,18
East Toronto,5,5,5,5
West Toronto,6,6,6,6


In [21]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_neighborhood[['Latitude', 'Longitude']]

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 2, 2, 2, 2, 2, 2], dtype=int32)




Add cluster labels into the original DataFrame so that we can check if the clustering method can map the borough just based on their geo locations.

In [22]:
toronto_neighborhood.insert(0, 'Cluster Labels', kmeans.labels_)

In [23]:
toronto_neighborhood

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighbourhood,Latitude,Longitude
37,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,0,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,0,M4M,East Toronto,Studio District,43.659526,-79.340923
44,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,2,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


Visualize the clusters in a map:

In [24]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_neighborhood['Latitude'], toronto_neighborhood['Longitude'], toronto_neighborhood['Neighbourhood'], toronto_neighborhood['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

This is the end of the assignment.