# Segmenting and Clustering Neighborhoods in Toronto

### Imports

In [25]:
import requests
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# !conda install -c conda-forge geopy --yes  
# Uncomment the line above if you get 'No module named 'geopy'' error.
from geopy.geocoders import Nominatim

# !conda install -c conda-forge folium=0.5.0 --yes 
# Uncomment the line above if you get 'No module named 'folium'' error.
import folium

## Part 1

### Web scraping

In [5]:
# Download the Wikipedia page that lists the "M" postal codes and parse its HTML.
# A timeout keeps the cell from hanging forever on a stalled connection.
mainPage = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page.
mainPage.raise_for_status()
soup = BeautifulSoup(mainPage.text, 'html.parser')

# The data of interest is the first table carrying the "wikitable sortable" classes.
results = soup.find('table', class_ = "wikitable sortable")
if results is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise ValueError("No 'wikitable sortable' table found on the page; the page layout may have changed.")

# Every data cell of that table is a "td" element; collect them all in document order.
columns = results.find_all('td')

### Separating data

In [6]:
# Flatten the scraped cells into their stripped text, in document order.
# At this point the values of the three table columns are still interleaved.
data = [cell.text.strip() for cell in columns]

# The table has 3 columns, so the flat list repeats in groups of three:
# postal code, borough, neighborhood. Taking every third item from offsets
# 0, 1 and 2 de-interleaves it into one list per column.
post_code = data[0::3]
borough = data[1::3]
neighborhood = data[2::3]

### Creating the dataframe

In [9]:
# Assemble the three parallel lists into a single frame, one named column per list.
df = pd.DataFrame({
    'Postal Code': post_code,
    'Borough': borough,
    'Neighborhood': neighborhood,
})

# Turn the "Not assigned" placeholder into a missing value, drop the rows that
# have no borough, and renumber the remaining rows from 0.
df = (
    df.replace("Not assigned", np.nan)
      .dropna(subset=["Borough"], axis=0)
      .reset_index(drop=True)
)

print('The shape of the dataframe is: ' , df.shape)
df.head(10)

The shape of the dataframe is:  (103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Part 2

### Adding Latitude and Longitude values from another CSV file.

In [8]:
# Read the coordinates CSV shared on Google Drive.
# The sharing link has the form .../file/d/<file id>/view?..., so the file id is
# the second-to-last path segment; it is appended to the direct-download endpoint.
url = 'https://drive.google.com/file/d/14Z3IxmxHhxo6SIqwgOqmTF7QEXcCrOtF/view?usp=sharing'
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
coor = pd.read_csv(path)

# Attach the coordinates to each postal code. This is pandas' default inner join,
# so only postal codes present in both frames are kept.
df_merge = pd.merge(df, coor, on='Postal Code')
df_merge.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Part 3

### Finding geographical coordinates of Toronto.

In [10]:
# Look up the coordinates of the city; they are used later to centre the maps.
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service has no match for the query;
    # fail with a clear message instead of an AttributeError on .latitude.
    raise ValueError('Geocoding returned no result for address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Creating a map using the markers

In [56]:
# Creating a map of Toronto centred on the geocoded latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Adding one circle marker per row of df_merge, with a "neighborhood, borough" popup.
# The loop variables are deliberately NOT called `borough` / `neighborhood`:
# those names hold the scraped column lists used to build the dataframe, and
# overwriting them here would break a later re-run of that cell.
for lat, lng, row_borough, row_neighborhood in zip(df_merge['Latitude'], df_merge['Longitude'], df_merge['Borough'], df_merge['Neighborhood']):
    label = '{}, {}'.format(row_neighborhood, row_borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html is a Popup option; it is passed to CircleMarker
    # unchanged from the original - confirm that CircleMarker ignores it.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# Last expression of the cell: renders the map.
map_toronto

### Clustering

In [52]:
# Setting the cluster number as 5 and working on a real copy of the merged
# dataframe: adding the label column below must not modify df_merge, otherwise
# re-running this cell would feed the old labels into the model and then fail
# on inserting a column that already exists.
k = 5
clustered = df_merge.copy()

# Fitting the K-Means model on the coordinates only. The two feature columns are
# selected by name, so no other column can leak into the clustering.
clustering = clustered[['Latitude', 'Longitude']]
# n_init is pinned to 10 so the number of initialisations does not depend on the
# default of the installed scikit-learn version.
kmeans = KMeans(n_clusters = k, random_state = 0, n_init = 10).fit(clustering)

# Adding cluster labels to the dataframe as the first column
clustered.insert(0, 'Cluster Labels', kmeans.labels_)
clustered

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,0,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,1,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,2,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...,...
98,3,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,2,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,0,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,3,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


### Creating a cluster map

In [57]:
# Creating a new map centred on Toronto
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Setting one colour per cluster: k evenly spaced samples of the rainbow
# colormap, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Adding markers to the map, coloured by cluster.
# The cluster label is used directly as the index into the palette
# (label i -> rainbow[i]), rather than label - 1, which only worked for
# label 0 through negative-index wraparound.
for lat, lon, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the map.
map_clusters