## Applied Data Science Capstone Project
### Week 3 assignment: Toronto neighbourhoods segmentation and clustering
#### part 3

In [1]:
# Import libraries

# Pandas
import pandas as pd

# Numpy
import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


In [2]:
# The code was removed by Watson Studio for sharing.

In [3]:
# Load the Toronto neighbourhoods table saved by the previous exercise.
from project_lib import Project

# PROJECT_ID and PROJECT_TOKEN are not defined in this cell; presumably they
# come from the hidden credentials cell above -- confirm before a fresh run.
project = Project(
    project_id=PROJECT_ID,
    project_access_token=PROJECT_TOKEN,
)

# Retrieve the CSV from the project as a file-like object.
file_nhood = project.get_file("toronto_neighbourhoods2.csv")

# Rewind to the start of the stream, then parse it into a pandas DataFrame.
file_nhood.seek(0)
df_nhood = pd.read_csv(file_nhood)

# Preview the first rows.
df_nhood.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#### Create a map of Toronto with neighborhoods superimposed on top.

We use the geocoder to get the coordinates of Toronto.
To create a geocoder instance we must supply a user_agent; we name ours <em>to_explorer</em>, as shown below.

In [4]:
# Geocode the city to obtain the coordinates used to centre the maps.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)

# geocode() returns None when no match is found; fail here with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [5]:
# Create a map of Toronto centred on the geocoded latitude and longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per row of df_nhood; the popup text is
# "<neighbourhood>, <borough>".
for lat, lng, borough, neighborhood in zip(df_nhood['Latitude'], df_nhood['Longitude'], df_nhood['Borough'], df_nhood['Neighborhood']):
    # parse_html=True makes the popup treat the label as plain text, so
    # names containing quotes or commas are rendered literally.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Display the map as the cell output.
map_toronto

Next, we cluster the neighbourhoods by geographic proximity.

In [18]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two columns explicitly keeps
# every non-spatial column (e.g. the 'Unnamed: 0' row index read from the CSV)
# out of the distance computation, and avoids the positional `axis` argument
# to DataFrame.drop, which newer pandas versions no longer accept.
df_nhood_clustering = df_nhood[['Latitude', 'Longitude']]

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_nhood_clustering)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([4, 4, 2, 3, 2, 1, 0, 4, 4, 2], dtype=int32)

Now we add the cluster labels to the dataframe.

In [21]:
# Work on an independent copy: plain assignment would only alias df_nhood, so
# the insert below would modify df_nhood itself and fail when the cell is
# re-run (the 'Cluster Labels' column would already exist).
df_nhood_clustered = df_nhood.copy()
# Put the k-means label of each row in the first column of the copy.
df_nhood_clustered.insert(0, 'Cluster Labels', kmeans.labels_)

Now let's visualise the clusters on a map:

In [22]:
# Create a map centred on the geocoded latitude and longitude.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Colour scheme: one hex colour per cluster, evenly spaced along the
# matplotlib rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per row, coloured by the row's cluster label.
for lat, lon, poi, cluster in zip(df_nhood_clustered['Latitude'], df_nhood_clustered['Longitude'], df_nhood_clustered['Neighborhood'], df_nhood_clustered['Cluster Labels']):
    # Index cluster - 1 means label 0 picks the last colour (index -1);
    # every label still maps to a distinct entry of the palette.
    marker_color = rainbow[cluster - 1]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

# Display the map as the cell output.
map_clusters