# Segmentation and Clustering Neighbourhoods in Toronto

## This notebook will mainly be used for the Capstone project.

### Week 1: Capstone Project

#### Import the libraries.

In [1]:
# Import the necessary libraries.

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None) # do not truncate columns when displaying a dataframe
pd.set_option('display.max_rows', None) # do not truncate rows when displaying a dataframe

import numpy as np # library to handle data in a vectorized manner

#### Print the supplied sentence.

In [2]:
# Emit the greeting required by the Week 1 assignment.
greeting = "Hello Capstone Project Course!"
print(greeting)

Hello Capstone Project Course!


### Week 3: Capstone Project

#### Read the data from Wikipedia

In [3]:
# Parse every HTML table found on the Wikipedia page into a list of dataframes.

wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data = pd.read_html(wiki_url)

#### Isolate the required table.

In [4]:
# Count how many tables were parsed from the page.

table_count = len(data)
print(table_count)

3


In [5]:
# Preview the first parsed table to confirm it is the postal-code table.
# Only the first rows are shown: display.max_rows is set to None in the import
# cell, so a bare `data[0]` would render every row of the table.

data[0].head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [6]:
# Use the first parsed table as the working dataframe.
# (pd.read_html already returns dataframes, so no conversion takes place here.)

df = data[0]

In [7]:
# Show the first five rows of the working dataframe.
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Create a dataframe called "filtered" by dropping the "Not assigned" entries under "Borough".

In [8]:
# Keep only the rows whose Borough is something other than "Not assigned".

has_borough = df["Borough"].ne("Not assigned")
filtered = df[has_borough]
filtered.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Combine neighbourhoods that have the same postal code.

In [9]:
# Collapse the table to one row per postal code: keep the first borough seen and
# join the neighbourhood names with ", ". reset_index() turns "Postal Code" back
# into a regular column so the layout matches the assignment dataframe.

aggregations = {"Borough": "first", "Neighborhood": ", ".join}
per_postal_code = filtered.groupby("Postal Code").agg(aggregations)
combined = per_postal_code.reset_index()
combined.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Print the number of rows and columns in the dataframe.

In [10]:
# Display the (rows, columns) pair of the combined dataframe.

row_count, column_count = combined.shape
(row_count, column_count)

(103, 3)

#### Read the coordinates data.

In [11]:
# Download the coordinates CSV from the link provided in the assignment.

coordinates_url = "http://cocl.us/Geospatial_data/Geospacial_coordinates.csv"
data2 = pd.read_csv(coordinates_url)

#### Create a dataframe from the new data.

In [12]:
# Refer to the loaded coordinates under a descriptive name.
# (data2 is already a dataframe; this binds a second name to it and does not copy it.)

coordinates = data2

In [13]:
# Show the first five rows of the coordinates dataframe.
coordinates.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Verify that the coordinates cover exactly the postal codes present in `combined`.
# Equal shapes alone would not prove that the join keys line up, so the keys
# themselves are compared before the two dataframes are merged.

assert set(coordinates["Postal Code"]) == set(combined["Postal Code"]), \
    "coordinates and combined do not cover the same postal codes"

# Display the shape for comparison with that of `combined`.
coordinates.shape

(103, 3)

#### Merge the two dataframes on the common column (Postal Code)

In [15]:
# Merge the two dataframes on their shared "Postal Code" column.
# how="inner" spells out pandas' default join type; validate="one_to_one" makes the
# merge raise if a postal code is duplicated on either side instead of silently
# multiplying rows.

merged = pd.merge(combined, coordinates, how="inner", on="Postal Code", validate="one_to_one")

In [16]:
# Show the first five rows of the merged dataframe.
merged.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Analysis

##### Isolate boroughs that contain the word "Toronto".

In [17]:
# Keep only the rows whose borough name contains "Toronto" and renumber them from 0.
in_toronto = merged["Borough"].str.contains("Toronto")
toronto_data = merged[in_toronto].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


##### Use one-hot encoding.

In [18]:
# One-hot encode the Neighborhood column and attach each row's Borough to it
# (assign appends Borough as the last column).
neighborhood_dummies = pd.get_dummies(toronto_data['Neighborhood'])
toronto_onehot = neighborhood_dummies.assign(Borough=toronto_data['Borough'])

# Reorder the columns so that Borough comes first, followed by the indicator columns.
borough_first = ['Borough'] + toronto_onehot.columns[:-1].tolist()
toronto_onehot = toronto_onehot[borough_first]

toronto_onehot.head()

Unnamed: 0,Borough,Berczy Park,"Brockton, Parkdale Village, Exhibition Place","Business reply mail Processing Centre, South Central Letter Processing Plant Toronto","CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",Central Bay Street,Christie,Church and Wellesley,"Commerce Court, Victoria Hotel",Davisville,Davisville North,"Dufferin, Dovercourt Village","First Canadian Place, Underground city","Forest Hill North & West, Forest Hill Road Park","Garden District, Ryerson","Harbourfront East, Union Station, Toronto Islands","High Park, The Junction South","India Bazaar, The Beaches West","Kensington Market, Chinatown, Grange Park",Lawrence Park,"Little Portugal, Trinity","Moore Park, Summerhill East","North Toronto West, Lawrence Park","Parkdale, Roncesvalles","Queen's Park, Ontario Provincial Government","Regent Park, Harbourfront","Richmond, Adelaide, King",Rosedale,Roselawn,"Runnymede, Swansea",St. James Town,"St. James Town, Cabbagetown",Stn A PO Boxes,Studio District,"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park","The Annex, North Midtown, Yorkville",The Beaches,"The Danforth West, Riverdale","Toronto Dominion Centre, Design Exchange","University of Toronto, Harbord"
0,East Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,East Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,East Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,East Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,Central Toronto,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
# Average every indicator column within each borough, then turn the Borough
# index back into a regular column.

borough_means = toronto_onehot.groupby('Borough').mean()
toronto_grouped = borough_means.reset_index()
toronto_grouped

Unnamed: 0,Borough,Berczy Park,"Brockton, Parkdale Village, Exhibition Place","Business reply mail Processing Centre, South Central Letter Processing Plant Toronto","CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",Central Bay Street,Christie,Church and Wellesley,"Commerce Court, Victoria Hotel",Davisville,Davisville North,"Dufferin, Dovercourt Village","First Canadian Place, Underground city","Forest Hill North & West, Forest Hill Road Park","Garden District, Ryerson","Harbourfront East, Union Station, Toronto Islands","High Park, The Junction South","India Bazaar, The Beaches West","Kensington Market, Chinatown, Grange Park",Lawrence Park,"Little Portugal, Trinity","Moore Park, Summerhill East","North Toronto West, Lawrence Park","Parkdale, Roncesvalles","Queen's Park, Ontario Provincial Government","Regent Park, Harbourfront","Richmond, Adelaide, King",Rosedale,Roselawn,"Runnymede, Swansea",St. James Town,"St. James Town, Cabbagetown",Stn A PO Boxes,Studio District,"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park","The Annex, North Midtown, Yorkville",The Beaches,"The Danforth West, Riverdale","Toronto Dominion Centre, Design Exchange","University of Toronto, Harbord"
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0
1,Downtown Toronto,0.052632,0.0,0.0,0.052632,0.052632,0.052632,0.052632,0.052632,0.0,0.0,0.0,0.052632,0.0,0.052632,0.052632,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632,0.052632,0.052632,0.0,0.0,0.052632,0.052632,0.052632,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632
2,East Toronto,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.2,0.0,0.0
3,West Toronto,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Import necessary libraries for clustering.

In [20]:
import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors

from sklearn.cluster import KMeans # import k-means from clustering stage

##### Cluster the data by borough.

In [21]:
# set number of clusters
kclusters = 5

# Feature matrix for k-means: the one-hot indicator columns without the Borough label.
# `columns=` replaces the positional axis argument of `drop('Borough', 1)`, which
# pandas 2.0 no longer accepts.
# NOTE(review): despite its name, this frame is built from the per-row `toronto_onehot`,
# not from the borough-level `toronto_grouped` — confirm that this is intended.
toronto_grouped_clustering = toronto_onehot.drop(columns='Borough')

# run k-means clustering
# n_init is pinned to the historical default of 10 so that results do not depend on
# the scikit-learn version (newer releases changed the default).
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

##### Install necessary libraries for plotting.

In [22]:
# Install the pinned folium release from conda-forge, then import it.
# NOTE(review): the recorded run of this cell was interrupted (CondaError: KeyboardInterrupt),
# so confirm that folium is actually installed before running the plotting cell.
!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

Solving environment: / ^C
failed

CondaError: KeyboardInterrupt



##### Plot Clusters.

In [None]:
# Attach the k-means label of every row to its coordinates and neighbourhood name.
# `toronto_grouped_clustering` only holds the one-hot indicator columns, so the columns
# needed for plotting are taken from `toronto_data`, whose rows are in the same order
# as the rows k-means was fitted on.
toronto_clustered = toronto_data.assign(**{'Cluster Labels': kmeans.labels_})

# centre the map on the mean position of the plotted points
latitude = toronto_clustered['Latitude'].mean()
longitude = toronto_clustered['Longitude'].mean()

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one circle marker per row to the map
for lat, lon, poi, cluster in zip(toronto_clustered['Latitude'], toronto_clustered['Longitude'], toronto_clustered['Neighborhood'], toronto_clustered['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    marker_color = rainbow[cluster]  # cluster labels are 0-based, so they index the palette directly
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters