# Segmenting and Clustering Neighborhoods in Toronto

## PART 1

###  Libraries

In [47]:
import numpy as np
import pandas as pd 
import requests 
from bs4 import BeautifulSoup 

### Wikipedia Scrape

In [48]:
# Download the Wikipedia list of Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html.parser')

# Column accumulators, filled by the table-parsing loop in the next cell.
postalcode = []
borough = []
neighborhood = []

In [49]:
# Start from empty lists so that re-running this cell does not append the
# same rows a second time.
postalcode, borough, neighborhood = [], [], []

# Walk every row of the first table on the page. Rows with fewer than three
# <td> cells are skipped, which also guards the cells[1] / cells[2] lookups.
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if len(cells) >= 3:
        postalcode.append(cells[0].text.rstrip('\n'))
        borough.append(cells[1].text.rstrip('\n'))
        neighborhood.append(cells[2].text.rstrip('\n'))

### Create dataframe

In [50]:
# Assemble the three scraped lists into one table, one column per list,
# and preview the first rows.
df = pd.DataFrame(
    data=dict(
        postalcode=postalcode,
        borough=borough,
        neighborhood=neighborhood,
    )
)
df.head(n=5)

Unnamed: 0,postalcode,borough,neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Drop unassigned boroughs and group neighborhoods by postal code

In [51]:
# Keep only rows whose borough is assigned, then collapse rows that share a
# postal code and borough into one row with comma-separated neighborhoods.
df = (
    df.loc[df["borough"].ne("Not assigned")]
    .reset_index(drop=True)
    .groupby(by=["postalcode", "borough"], as_index=False)
    .agg(lambda names: ", ".join(names))
)

In [52]:
# Postal codes to extract, in the order they should appear in the result.
postcodes = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
column_names = ["postalcode", "borough", "neighborhood"]

# Gather the matching slices and concatenate them once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies it on
# every iteration. Codes with no match simply contribute an empty slice.
df_final = pd.concat(
    [df[df["postalcode"] == postcode] for postcode in postcodes],
    ignore_index=True,
)[column_names]

In [53]:
# Show the (rows, columns) shape of df as the cell output.
df.shape

(103, 3)

## PART 2

### Load geospatial coordinates

In [54]:
# Read the coordinates table from the CSV file next to the notebook and
# preview its first rows.
geo_coordinates = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
geo_coordinates.head(n=5)

Unnamed: 0,postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge coordinates into the neighborhood table

In [55]:
# Attach the coordinate columns to df by postal code. The left join keeps
# every row of df even when geo_coordinates has no matching postal code.
#
# Coordinate columns left over from an earlier run of this cell are dropped
# first; otherwise a re-run would produce suffixed duplicates
# (e.g. Latitude_x / Latitude_y) and later df['Latitude'] lookups would fail.
geo_value_columns = [col for col in geo_coordinates.columns if col != "postalcode"]
df = df.drop(columns=geo_value_columns, errors="ignore").merge(
    geo_coordinates, on="postalcode", how="left"
)

In [56]:
# Postal codes to extract, in the order they should appear in the result.
postcodes = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]
column_names = ["postalcode", "borough", "neighborhood"]

# Put the identifying columns first and keep every other df column after
# them in its existing order (no alphabetical re-sorting of columns).
extra_columns = [col for col in df.columns if col not in column_names]

# Gather the matching slices and concatenate them once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies it on
# every iteration. Codes with no match simply contribute an empty slice.
df_final = pd.concat(
    [df[df["postalcode"] == postcode] for postcode in postcodes],
    ignore_index=True,
)[column_names + extra_columns]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


## PART 3

###  Libraries

In [57]:
import folium
from sklearn.cluster import KMeans

### Cluster and map

In [58]:
# Pair the two coordinate columns into a two-column array, one row per df row.
X, Y = df['Latitude'], df['Longitude']
Z = np.column_stack((X, Y))

In [59]:
# One marker color per cluster. The cluster count is derived from this list
# so every label produced below is a valid index into it.
colors = ['red', 'green', 'blue', 'yellow']

# n_init is pinned explicitly because its default changed in newer
# scikit-learn releases; together with random_state this keeps the
# clustering the same across library versions.
kmeans = KMeans(n_clusters=len(colors), n_init=10, random_state=0).fit(Z)
clusters = kmeans.labels_
df['cluster'] = clusters

In [60]:
# Base map centred on the given latitude/longitude.
map_toronto = folium.Map(location=[43.65, -79.4], zoom_start=12)

# Draw one circle per df row, filled with the color of its cluster.
# The loop variables deliberately avoid the name `borough`: reusing it would
# overwrite the module-level `borough` list built by the scraping cells and
# break them on a re-run.
for lat, lng, borough_name, cluster_id in zip(
    df['Latitude'], df['Longitude'], df['borough'], df['cluster']
):
    label = folium.Popup(borough_name, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=colors[cluster_id],
        fill_opacity=0.7,
    ).add_to(map_toronto)

In [61]:
# Display the map as the cell output.
map_toronto