# Segmenting and Clustering Neighborhoods in Toronto

# -- PART I -- 
---

In [2]:
import pandas as pd 

## Scraping Table from Wikipedia

In [3]:
# Download the Wikipedia page of "M" postal codes and keep the first table
# that pandas parses out of it.
df = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
)[0]

In [4]:
# Show the first rows of the scraped table followed by its (rows, columns) shape.
print(df.head(), df.shape, sep='\n')

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
(287, 3)


## Cleaning Table

In [5]:
# Keep only the rows whose borough is not the 'Not assigned' placeholder,
# then show the first rows and the new (rows, columns) shape.
df = df.loc[df['Borough'] != 'Not assigned']
print(df.head(), df.shape, sep='\n')

  Postcode           Borough     Neighbourhood
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
5      M6A        North York  Lawrence Heights
6      M6A        North York    Lawrence Manor
(210, 3)


## Combining Neighborhoods

In [6]:
# Collapse rows that share a postcode and borough into a single row whose
# remaining string columns are comma-joined. sort=False keeps the groups in
# order of first appearance instead of sorting by key.
df = (
    df.groupby(['Postcode', 'Borough'], sort=False)
      .agg(','.join)
      .reset_index()
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


## Checking for Unassigned Neighbourhoods 

In [7]:
# List every row whose neighbourhood is still the 'Not assigned' placeholder.
df.loc[df['Neighbourhood'] == 'Not assigned']

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M9A,Queen's Park,Not assigned


###### Luckily there's just one unassigned Neighbourhood, so let's just change that one 

In [8]:
# Give every unassigned neighbourhood the name of its own borough.
# A blanket df.replace("Not assigned", "Queen's Park") rewrites the placeholder
# in every column with one hard-coded name, which is only correct while exactly
# one row (the Queen's Park one) is affected. Filling from the Borough column
# gives the same result for that row and stays correct if the source table
# changes.
df = df.assign(
    Neighbourhood=df['Neighbourhood'].where(
        df['Neighbourhood'] != 'Not assigned',  # keep values that are assigned
        df['Borough'],                          # otherwise fall back to the borough
    )
)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


## Checking number of rows 

In [9]:
# (rows, columns) of the cleaned table: one row per postcode/borough pair
# after the neighbourhoods were combined.
df.shape

(103, 3)

# -- PART II -- 
---

## Let's get geocoder ready

In [10]:
# Install the geocoder package from the conda-forge channel (--yes answers the
# confirmation prompt automatically).
# NOTE(review): the following cells load coordinates from a CSV instead, so
# this install is presumably no longer needed; confirm before removing.
!conda install -c conda-forge geocoder --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.4 MB

The following NEW packages will be INSTALLED:

    geocoder:        1.38.1-py_1       conda-forge
    ratelim:         0.1.6-py_2        conda-forge

The following packages will be UPDATED:

    

## The geocoder while-loop was taking too long, so let's load the coordinates from the CSV file instead

In [11]:
# Read the CSV of coordinates and rename its 'Postal Code' column so it
# shares the 'Postcode' key used by df.
link = "http://cocl.us/Geospatial_data"
geodata = pd.read_csv(link).rename(columns={'Postal Code': 'Postcode'})

# Attach the coordinate columns to each postcode row (pd.merge defaults to an
# inner join, so only postcodes present in both tables are kept).
df_geo = pd.merge(df, geodata, on='Postcode')
df_geo.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


# -- PART III -- 
---

## Let's get some required libraries

In [None]:
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
# Install the mapping and geocoding packages imported in the next cell;
# folium is pinned to version 0.5.0, geopy is left unpinned.
!conda install -c conda-forge folium=0.5.0 --yes
!conda install -c conda-forge geopy --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.3.1               |             py_0          25 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         673 KB

The following NEW packages will be INSTALLED:

    altair:  4.0.1-py_0 conda-forge
    branca:  0.3.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Downloading and Extracting Packages
branca-0.3.1         | 25 KB     | #####

In [None]:
from geopy.geocoders import Nominatim
import folium

## Keeping only the boroughs whose name contains "Toronto"

In [None]:

# Restrict the geocoded table to boroughs whose name contains 'Toronto'.
# Note that this rebinds df: from here on it refers to the Toronto-only subset.
df = df_geo.loc[df_geo['Borough'].str.contains('Toronto')]
df.head()

In [None]:
# Remove the descriptive text columns so that only the remaining (coordinate)
# columns are passed to the clustering step.
lista = ['Postcode', 'Borough', 'Neighbourhood']
# Use the explicit columns= keyword: passing the axis positionally
# (df.drop(lista, 1)) was deprecated in pandas 1.3 and raises TypeError
# from pandas 2.0 onwards.
dropped = df.drop(columns=lista)

In [None]:
# Preview the table left after dropping Postcode, Borough and Neighbourhood;
# this is the input given to KMeans below.
dropped.head()

## Clustering with K Means

In [None]:
# set number of clusters
kclusters = 5

# run k-means clustering on the columns kept in `dropped`.
# random_state fixes the centroid seeding; n_init is pinned to 10 (the
# long-standing default) because scikit-learn 1.4 changed the default to
# 'auto', which would otherwise give different clusters on newer versions.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(dropped)

# check cluster labels generated for the first ten rows in the dataframe
kmeans.labels_[0:10]

In [None]:
# add clustering labels
# Drop any 'Cluster Labels' column left behind by a previous run of this cell:
# DataFrame.insert raises ValueError when the column already exists, so
# without this the cell fails the second time it is executed.
df = df.drop(columns=['Cluster Labels'], errors='ignore')
# Put the k-means label of each row in the first column.
df.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
# Preview the Toronto rows with their 'Cluster Labels' column in first position.
df.head()

## Creating the map

In [None]:
import numpy as np

In [None]:
# Look up the coordinates used to centre the map.
address = 'Toronto'
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a
    # clear message instead of an AttributeError on the lines below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

# One hex colour per cluster, sampled evenly across the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row, coloured by its cluster label and carrying
# a "neighbourhood, borough" popup.
for lat, lng, borough, neighborhood, cluster in zip(df['Latitude'], df['Longitude'],
                                           df['Borough'], df['Neighbourhood'], df['Cluster Labels']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # KMeans labels run from 0 to kclusters-1, so they index the colour list
    # directly; the previous rainbow[cluster-1] only worked for label 0 via
    # negative-index wraparound.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto