# Segmenting and Clustering Neighborhoods in Toronto
### By: Hardeep Dhaliwal

#### Create a Beautiful Soup object for parsing the saved HTML file of the Wikipedia page

In [40]:
#imports
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Read the saved Wikipedia page as raw bytes. The context manager closes the
# file handle as soon as the read has finished.
with open("List of postal codes of Canada_ M - Wikipedia.html", "rb") as readf:
    contents = readf.read()

# Parse the page contents with the lxml backend.
soup = BeautifulSoup(contents, "lxml")

#### Find the postal-code table in the HTML file, identified by its class `wikitable sortable`, and assign it to the variable `table`

In [41]:
# Take the first <table> element matched by the class string "wikitable sortable".
table = soup.find('table', class_='wikitable sortable')

#### Parse the `td` cells of the table into (postal code, borough, neighborhood) rows, skip rows whose borough is 'Not assigned', combine the neighborhoods that share a postal code and borough, and write the result to the csv file neighborhoods.csv

In [42]:
# Collect the table cells into a dict keyed by "<postal code>-<borough>" whose
# value is the comma-joined list of neighborhoods for that key.
items = table.findAll('td')
results = {}
temp = []
for item in items:
    text = item.renderContents()
    text = str(text.strip(), 'utf-8')
    if '<a' in text:
        # The cell text is wrapped in a link: keep what follows the first '>'
        # and drop the trailing "</a" left over from splitting on '>'.
        parts = text.split('>')
        text = parts[1][:-3]
    temp.append(text)
    # Every three consecutive cells form one row.
    if len(temp) == 3:
        if temp[1] != 'Not assigned':
            name = "%s-%s" % (temp[0], temp[1])
            if name in results:
                results[name] += ',%s' % temp[2]
            else:
                results[name] = temp[2]
        temp = []

# Write one semicolon-separated row per key. The context manager guarantees
# the file is flushed and closed before any later cell reads it back, and the
# explicit encoding keeps the file independent of the platform default.
with open('neighborhoods.csv', 'w', encoding='utf-8') as outfile:
    for key, val in results.items():
        # Split on the first hyphen only, so a hyphen inside the second field
        # does not break the two-way unpacking.
        post, boro = key.split('-', 1)
        if val == 'Not assigned':
            # No neighborhood listed: fall back to the borough name.
            val = boro
        outfile.write("%s;%s;%s\n" % (post, boro, val))

#### Create a dataframe, df, and read the csv contents into the dataframe with the three correct column names and output the size of the dataframe

In [43]:
# neighborhoods.csv is semicolon-delimited and has no header row, so the
# three column names are supplied here.
df = pd.read_csv(
    'neighborhoods.csv',
    sep=';',
    header=None,
    names=['PostalCode', 'Borough', 'Neighborhood'],
)
print(f'df shape is {df.shape}')

df shape is (103, 3)


In [44]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


#### Read the contents of Geospatial_Coordinates into geo_data dataframe and print the first five rows of the dataframe

In [45]:
# Load the coordinates file and preview its first five rows.
geo_data = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
geo_data.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Create a new dataframe and merge the contents of df and geo_data into that dataframe without overlapping the postal code column

In [46]:
# Index the coordinates by 'Postal Code' and join them onto df through its
# 'PostalCode' column, so the result carries a single postal-code column.
coordinates_by_code = geo_data.set_index('Postal Code')
merged_data = df.join(coordinates_by_code, on='PostalCode')
merged_data.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


#### Import folium for map visualization and add markers to the map for each neighborhood

In [48]:
import folium
# Centre the map on the first row's coordinates. The values are looked up by
# column name so the result does not depend on the column order of
# merged_data.
first_row = merged_data.iloc[0]
latitude, longitude = first_row['Latitude'], first_row['Longitude']

# create map for Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, using the neighborhood name as the popup text
for lat, lng, label in zip(merged_data['Latitude'], merged_data['Longitude'], merged_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_toronto)

map_toronto

#### Cluster the neighborhoods with k-means based on their latitude and longitude

In [50]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# cluster on the two coordinate columns only
toronto_grouped_clustering = merged_data[['Latitude', 'Longitude']]

# run k-means clustering; n_init is set explicitly because its library default
# differs between scikit-learn releases, and relying on the default can change
# the resulting labels from one environment to another
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[:10]

array([4, 4, 2, 3, 2, 1, 0, 4, 4, 2])

#### Add cluster labels column to the dataframe

In [51]:
# Work on a copy so that merged_data keeps its original columns and this cell
# can be re-run: inserting into the same frame a second time raises because
# the 'Cluster Labels' column would already exist.
toronto_data = merged_data.copy()

# add clustering labels as the first column
toronto_data.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_data.head()  # 'Cluster Labels' is now the first column

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


#### Visualize the clusters on the map

In [52]:
# visualize the resulting clusters

import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map, colored by cluster label
for lat, lon, poi, cluster in zip(toronto_data['Latitude'],
                                  toronto_data['Longitude'],
                                  toronto_data['Neighborhood'],
                                  toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index the palette directly by the label instead of relying on
    # negative-index wraparound for label 0.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters