# Segmenting and Clustering Neighborhoods in Toronto

## Import libraries

In [137]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import folium
import requests
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


## Parse the HTML page into pandas DataFrames

In [23]:
# Fetch the Wikipedia page and parse each HTML table it contains into a DataFrame.
postal_codes_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
dfs = pd.read_html(postal_codes_url)

In [24]:
# read_html returns a list of DataFrames (one per table found on the page);
# keep the first one, which is the table used in the rest of this notebook.
df = dfs[0]

In [74]:
# Replace the column labels, by position, with the names used throughout the
# rest of the notebook.
df.columns = ["Postcode", "Borough", "Neighborhood"]

## Drop rows whose Borough is "Not assigned"

In [75]:
# Only process the cells that have an assigned borough; rows whose Borough is
# "Not assigned" are discarded.
# .copy() makes the filtered result an independent DataFrame, so later column
# assignments on df do not trigger pandas' SettingWithCopyWarning.
df = df[df["Borough"] != "Not assigned"].copy()

In [76]:
# Preview the first 10 rows remaining after the borough filter.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


## If a Neighborhood is "Not assigned", replace it with the value of its Borough

In [77]:
# Wherever the Neighborhood is the placeholder 'Not assigned', fall back to
# the Borough of the same row; every other Neighborhood is kept unchanged.
is_unassigned = df["Neighborhood"] == "Not assigned"
df["Neighborhood"] = df["Neighborhood"].mask(is_unassigned, df["Borough"])

In [78]:
# Preview the first 10 rows after the Neighborhood fallback.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


## Group by Postcode and join the Neighborhood values, separated by commas

In [83]:
# Collapse df to one row per Postcode, concatenating that postcode's
# Neighborhood names into a single ", "-separated string.
joined_neighborhoods = df.groupby("Postcode")["Neighborhood"].agg(", ".join)
nb = joined_neighborhoods.reset_index()

In [84]:
# Display the grouped DataFrame (one row per Postcode).
nb

Unnamed: 0,Postcode,Neighborhood
0,M1B,"Rouge, Malvern"
1,M1C,"Highland Creek, Rouge Hill, Port Union"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae
...,...,...
98,M9N,Weston
99,M9P,Westmount
100,M9R,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [85]:
# Attach the Borough of each Postcode to the grouped neighborhoods.
# Only the distinct (Postcode, Borough) pairs of df are merged in. This avoids
# multiplying rows by every neighborhood of a postcode, and avoids depending on
# the auto-generated "Neighborhood_y" suffix and a positional column rename.
# The resulting columns are Postcode, Neighborhood, Borough.
borough_by_postcode = df[["Postcode", "Borough"]].drop_duplicates()
nb = nb.merge(borough_by_postcode, on="Postcode")

## Print shape of DataFrame

In [86]:
# (number of rows, number of columns) of the merged DataFrame.
nb.shape

(103, 3)

## Read the coordinates CSV

In [87]:
# Load the per-postal-code coordinates table from a CSV file located relative
# to the working directory; it is joined to nb on its "Postal Code" column below.
coordinates = pd.read_csv("Geospatial_Coordinates.csv")

In [88]:
# Join the coordinates onto nb by postal code, then discard the now-redundant
# "Postal Code" key column (it duplicates "Postcode").
with_coordinates = nb.merge(coordinates, left_on="Postcode", right_on="Postal Code")
nb = with_coordinates.drop(columns="Postal Code")

In [89]:
# Preview the first rows of nb, now including the coordinate columns.
nb.head()

Unnamed: 0,Postcode,Neighborhood,Borough,Latitude,Longitude
0,M1B,"Rouge, Malvern",Scarborough,43.806686,-79.194353
1,M1C,"Highland Creek, Rouge Hill, Port Union",Scarborough,43.784535,-79.160497
2,M1E,"Guildwood, Morningside, West Hill",Scarborough,43.763573,-79.188711
3,M1G,Woburn,Scarborough,43.770992,-79.216917
4,M1H,Cedarbrae,Scarborough,43.773136,-79.239476
