## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import folium
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import requests # library to handle requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors

# ===================== Section - 1 ========================

#### Scraping Toronto Postal Data from Wiki

In [2]:
# Scrape the Toronto postal code table from Wikipedia. The URL carries an
# explicit `oldid`, which presumably pins one revision of the article so the
# table layout does not change between runs.
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969'

# pd.read_html returns a list of DataFrames (one per HTML table on the page);
# the postal code table is the first entry, so no extra DataFrame wrapping is needed.
data = pd.read_html(url)
toronto_data = data[0]

In [3]:
# List the distinct values found in the "Borough" column.
toronto_data.loc[:, 'Borough'].unique()

array(['Not assigned', 'North York', 'Downtown Toronto', 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Toronto/York', 'Mississauga'], dtype=object)

#### Drop the records whose "Borough" value is "Not assigned"

In [4]:
# Keep only the rows whose "Borough" is something other than "Not assigned",
# then renumber the index from 0 so it is contiguous again.
has_borough = toronto_data['Borough'].ne("Not assigned")
toronto_data = toronto_data.loc[has_borough].reset_index(drop=True)

In [5]:
# Preview the first 20 rows of the cleaned table.
toronto_data.head(n=20)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Spot-check a single postal code.
toronto_data.loc[toronto_data['Postal Code'].eq('M1B')]

Unnamed: 0,Postal Code,Borough,Neighbourhood
6,M1B,Scarborough,"Malvern, Rouge"


#### Verify the shape of the data

In [7]:
# (rows, columns) of the table after the "Not assigned" boroughs were removed.
toronto_data.shape

(103, 3)

In [8]:
from geopy.geocoders import Nominatim  # module to convert an address into latitude and longitude values

# Geocode the city itself; `latitude` / `longitude` are reused further down to
# centre the folium map.
address = "Toronto"
geolocator = Nominatim(user_agent="myGeocoder")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; fail with a clear
    # message instead of an AttributeError on the attribute reads below.
    raise ValueError(f"Could not geocode address: {address!r}")

latitude = location.latitude
longitude = location.longitude
print(location.address)
print(location.longitude)
print(location.latitude)

Toronto, Golden Horseshoe, Ontario, Canada
-79.3839347
43.6534817


# ==================== SECTION: 2 ======================= 

#### Because the coordinates returned by the geocoder are not accurate enough, the provided Geospatial_Coordinates data is imported instead, as suggested.

In [9]:
# Load the postal-code coordinate table from the CSV next to the notebook.
coord_csv = './Geospatial_Coordinates.csv'
geo_coord = pd.read_csv(coord_csv)
geo_coord.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge both the DataFrames.

In [10]:
# Join the coordinate table onto the postal code table by "Postal Code".
# No `how` is given, so pandas' default join type applies.
df = pd.merge(
    toronto_data,
    geo_coord,
    left_on="Postal Code",
    right_on="Postal Code",
)

df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [11]:
# Spot-check a single postal code in the merged table.
df.loc[df['Postal Code'].eq("M5G")]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


#### Select all the rows whose "Borough" contains the word "Toronto".

In [12]:
# Literal (non-regex) substring match on the borough name; the matching rows
# are re-indexed from 0.
is_toronto_borough = df['Borough'].str.contains('Toronto', regex=False)
toronto_df = df[is_toronto_borough].reset_index(drop=True)

toronto_df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


#### Apply One hot encoding for "Postal Code" and "Neighbourhood"

In [13]:
# Build indicator columns for the two categorical fields and place them
# next to the original columns (column-wise concat).
# NOTE(review): the displayed output suggests each postal code gets its own
# indicator column, so these features may add little clustering signal — confirm.
postal_dummies = pd.get_dummies(toronto_df["Postal Code"])
neighbourhood_dummies = pd.get_dummies(toronto_df["Neighbourhood"])
toronto_onehot = pd.concat(
    [toronto_df, postal_dummies, neighbourhood_dummies],
    axis=1,
)

#### Drop the columns "Postal Code", "Borough", "Neighbourhood"

In [14]:
# Remove the original text columns now that they are one-hot encoded.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution no longer raises KeyError for
# columns that are already gone.
toronto_onehot = toronto_onehot.drop(
    columns=["Postal Code", "Borough", "Neighbourhood"],
    errors="ignore",
)

toronto_onehot.head()

Unnamed: 0,Latitude,Longitude,M4E,M4K,M4L,M4M,M4N,M4P,M4R,M4S,M4T,M4V,M4W,M4X,M4Y,M5A,M5B,M5C,M5E,M5G,M5H,M5J,M5K,M5L,M5N,M5P,M5R,M5S,M5T,M5V,M5W,M5X,M6G,M6H,M6J,M6K,M6N,M6P,M6R,M6S,M7A,M7Y,Berczy Park,"Brockton, Parkdale Village, Exhibition Place","Business reply mail Processing Centre, South Central Letter Processing Plant Toronto","CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",Central Bay Street,Christie,Church and Wellesley,"Commerce Court, Victoria Hotel",Davisville,Davisville North,"Dufferin, Dovercourt Village","First Canadian Place, Underground city","Forest Hill North & West, Forest Hill Road Park","Garden District, Ryerson","Harbourfront East, Union Station, Toronto Islands","High Park, The Junction South","India Bazaar, The Beaches West","Kensington Market, Chinatown, Grange Park",Lawrence Park,"Little Portugal, Trinity","Moore Park, Summerhill East","North Toronto West, Lawrence Park","Parkdale, Roncesvalles","Queen's Park, Ontario Provincial Government","Regent Park, Harbourfront","Richmond, Adelaide, King",Rosedale,Roselawn,"Runnymede, Swansea","Runnymede, The Junction, Weston-Pellam Park, Carlton Village",St. James Town,"St. James Town, Cabbagetown",Stn A PO Boxes,Studio District,"Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park","The Annex, North Midtown, Yorkville",The Beaches,"The Danforth West, Riverdale","Toronto Dominion Centre, Design Exchange","University of Toronto, Harbord"
0,43.65426,-79.360636,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,43.662301,-79.389494,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,43.657162,-79.378937,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,43.651494,-79.375418,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,43.676357,-79.293031,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [15]:
# Use every column of the one-hot frame as a clustering feature.
features = toronto_onehot.columns

In [16]:
#features = ['Latitude', 'Longitude', 'M4E', 'M4K', 'M4L', 'M4M', 'M4N', 'M4P','M4R', 'M4S', 'M4T', 'M4V', 'M4W', 'M4X', 'M4Y', 'M5A', 'M5B', 'M5C','M5E', 'M5G', 'M5H', 'M5J', 'M5K', 'M5L', 'M5N', 'M5P', 'M5R', 'M5S','M5T', 'M5V', 'M5W', 'M5X', 'M6G', 'M6H', 'M6J', 'M6K', 'M6N', 'M6P','M6R', 'M6S', 'M7A', 'M7Y']

#### Scaling all the features

In [17]:
# Standardise the selected feature columns: fit the scaler on them, then
# transform the same frame.
scaler = StandardScaler()
feature_frame = toronto_onehot[features]
X = scaler.fit(feature_frame).transform(feature_frame)
X[:5]

array([[-0.56874932,  0.80095905, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
         6.244998  , -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815, -0.16012815, -0.16012815, -0.16012815, -0.16012815,
        -0.16012815,  6.244998  , -0.16012815, -0.1

# ===================== Section : 3 ========================

#### Cluster the Neighborhood
- Checked several cluster counts with the elbow method and picked K = 3, which gave the best result.

In [18]:
# Number of clusters to fit.
k_clusters = 3

# `algorithm='auto'` is omitted on purpose: that value is deprecated (and later
# removed) in scikit-learn, and leaving it out selects the library default.
# A fixed random_state makes the centroid initialisation, and therefore the
# cluster labels, reproducible across kernel restarts.
neigh_cluster = KMeans(
    n_clusters=k_clusters,
    init='k-means++',
    n_init=12,
    random_state=0,
)
neigh_cluster.fit(X)

KMeans(n_clusters=3, n_init=12)

In [19]:
# Peek at the cluster labels assigned to the first ten rows.
neigh_cluster.labels_[:10]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 2])

In [20]:
#toronto_onehot.insert(0, "Cluster Labels", neigh_cluster.labels_)

#### Add the cluster labels column to the DataFrame

In [21]:
# Put the cluster label in the first column of toronto_df.
# DataFrame.insert raises ValueError if the column already exists, so drop a
# stale copy first; this keeps the cell safe to re-run.
if "Cluster Labels" in toronto_df.columns:
    toronto_df = toronto_df.drop(columns="Cluster Labels")
toronto_df.insert(0, "Cluster Labels", neigh_cluster.labels_)

In [22]:
# Preview the first five rows, now including the cluster label column.
toronto_df.head(n=5)

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,0,M4E,East Toronto,The Beaches,43.676357,-79.293031


#### Visualize the cluster labels on a map of Toronto.

In [23]:
# create map centred on the geocoded Toronto coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, evenly spaced
# along matplotlib's rainbow colormap
rainbow = [
    colors.rgb2hex(rgba)
    for rgba in cm.rainbow(np.linspace(0, 1, k_clusters))
]

# add one circle marker per neighbourhood, coloured by its cluster label
for lat, lon, poi, cluster in zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df["Neighbourhood"],
    toronto_df['Cluster Labels'],
):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so index the palette directly. The previous
    # `cluster - 1` sent cluster 0 to index -1 and only worked through
    # negative-index wrap-around.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters