# Segmenting and Clustering Neighborhoods in Toronto

### Author: Hannah Solorzano

***

In [2]:

from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import pandas as pd
import numpy as np
import sys
!{sys.executable} -m pip install folium
import folium
print("finished")

finished


### Part 1

#### Read in List of Postal Codes

In [3]:

# Wikipedia page whose title is "List of postal codes of Canada: M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list of DataFrames, one per table it finds on the page.
# The first table is used, with its first row taken as the column header.
page_tables = pd.read_html(url, header=0)
df = page_tables[0]

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Clean Dataframe

- Remove rows whose Borough is 'Not assigned'
- Where a Postal Code appears more than once, combine its Neighborhood names into a single comma-separated value
- If the Borough has a valid value but the Neighborhood is not assigned, use the name of the Borough as the name of the Neighborhood
- Remove duplicate entries


In [4]:
# Clean the raw postal-code table:
#   1. keep only rows whose Borough is assigned,
#   2. where the Neighborhood is missing or 'Not assigned', use the Borough name,
#   3. drop exact duplicate rows.
# The cleaned column is assigned back through .assign() rather than mutated
# with inplace=True on `df.Neighborhood`, which is chained assignment and is
# not guaranteed to update `df` (it has no effect under pandas copy-on-write).
df = (
    df[df['Borough'] != 'Not assigned']
    .assign(
        Neighborhood=lambda frame: frame['Neighborhood'].mask(
            frame['Neighborhood'].isna() | frame['Neighborhood'].eq('Not assigned'),
            frame['Borough'],
        )
    )
    .drop_duplicates()
)

# One row per Postal Code. sort=False keeps the postal codes in order of first
# appearance, and Series.unique() keeps neighborhood names in order of first
# appearance, so the joined string is the same on every run (joining a set()
# does not guarantee that). A single groupby also avoids rescanning the whole
# frame once per postal code.
df2 = (
    df.groupby('Postal Code', sort=False, as_index=False)
    .agg({
        'Borough': 'first',
        'Neighborhood': lambda names: ', '.join(names.unique()),
    })
)

df2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Part 2

#### Assign Latitude and Longitude

In [5]:
# Table of coordinates keyed by postal code, downloaded as CSV.
dfll = pd.read_csv("http://cocl.us/Geospatial_data")

# Attach the coordinates to each postal code. The join key is named explicitly
# so the merge does not silently change if the two frames ever share another
# column name. (Calling set_index() without keeping its return value does
# nothing, so no index manipulation is needed before the merge.)
toronto_data = pd.merge(df2, dfll, on="Postal Code")
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Part 3

#### Fetch all rows where the Borough name contains 'Toronto'

In [6]:
# Keep the rows whose Borough name contains the literal text 'Toronto'.
# An explicit copy is taken because df3 is modified later in the notebook
# (a column is inserted into it); the copy keeps those changes independent of
# toronto_data and avoids chained-assignment warnings.
df3 = toronto_data[toronto_data['Borough'].str.contains('Toronto', regex=False)].copy()

In [7]:
# Display the subset of toronto_data whose Borough contains 'Toronto'.
df3

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


#### Map the Neighborhoods of Each Part of Toronto

In [8]:
# Base map with a fixed initial centre and zoom level.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# Add one circle marker per row of df3; the popup text is
# "<neighborhood>, <borough>".
for lat, lng, borough, neighbourhood in zip(
        df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighborhood']):
    # parse_html is an option of folium.Popup, so it is set here only; it is
    # not a CircleMarker option and is no longer passed to the marker.
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

#### Create the Clusters Using K Means Clustering

In [13]:
# Number of clusters to fit.
k = 5

# Cluster on the coordinates only. The feature columns are selected by name
# (instead of dropping the non-feature columns) so that a 'Cluster Labels'
# column left over from an earlier run can never leak into the features.
# This also avoids the positional `axis` argument of DataFrame.drop, which
# newer pandas versions no longer accept.
toronto_clustering = df3[['Latitude', 'Longitude']]

# n_init is pinned because its default value differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones); fixing it together
# with random_state keeps the labels reproducible across installations.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clustering)

# DataFrame.insert raises if the column already exists, so remove any labels
# from a previous run first; this makes the cell safe to re-run.
df3 = df3.drop(columns='Cluster Labels', errors='ignore')
df3.insert(0, 'Cluster Labels', kmeans.labels_)

In [14]:
# Display df3, which now has 'Cluster Labels' as its first column.
df3

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


####  Map the Clusters of Each Neighborhood

In [15]:
# create map
# Take the cluster count from the fitted model so the palette always has
# exactly one colour per cluster.
k = kmeans.n_clusters
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# set color scheme for the clusters: k evenly spaced colours from the
# rainbow colormap, converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to k-1, so they index the palette directly.
    # (Indexing with cluster-1 sent label 0 to index -1 and only worked
    # through negative-index wraparound.)
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters