# Segmenting and Clustering Neighbourhoods in Toronto

## Part 1

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


import requests
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#!conda install -c conda-forge folium=0.5.0 
import folium

print('Libraries imported.')

Libraries imported.


### Scraping the neighbourhood data from a webpage

In [2]:
url_path = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the webpage
webPage = requests.get(url_path) 

# Get HTML code
html_code = webPage.text         

# Locate index for beginning of the table
table_start = html_code.find('<table class="wikitable sortable">')     

# Locate index for ending of the table
table_end = html_code.find('</table>')    

# HTML table extracted
toronto_data_table = html_code[table_start:table_end]         

# From HTML to Pandas data frame
toronto_df = pd.read_html(toronto_data_table, header = 0)[0]                                            

print("All table information has been scraped succesfully!")
toronto_df.head()

All table information has been scraped succesfully!


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
toronto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287 entries, 0 to 286
Data columns (total 3 columns):
Postcode         287 non-null object
Borough          287 non-null object
Neighbourhood    287 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


### Data processing and cleaning of the dataframe

Remove any rows with an unassigned Borough

In [4]:
for i in range(len(toronto_df)):    
    if "Not assigned" in toronto_df.loc[i, "Borough"]:
        toronto_df = toronto_df.drop([i], axis = 0)
    else:
        toronto_df = toronto_df
toronto_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210 entries, 2 to 285
Data columns (total 3 columns):
Postcode         210 non-null object
Borough          210 non-null object
Neighbourhood    210 non-null object
dtypes: object(3)
memory usage: 6.6+ KB


Combine rows with more than one neighbourhood.

In [5]:
# Join rows with the same "Postcode" with a comma between        
join_rows = toronto_df.groupby("Postcode")["Neighbourhood"].apply(lambda x: ", ".join(x))

# Remove duplicates so that the joined rows and dataframe have the same shape.
toronto_df.drop_duplicates(["Postcode"],inplace = True)  

# Join the new row and the dataframe.
toronto_df1 = toronto_df.join(join_rows, on = "Postcode", lsuffix='_1')   

# Drop the column of neighbourhood that is not useful anymore.
toronto_df1.drop(columns = ["Neighbourhood_1"], inplace = True)

# Reset the index of the dataframe.
toronto_df1.reset_index(drop = True, inplace = True)         

toronto_df1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


Replacing the name of unassigned neighbourhoods with their borough name

In [6]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
count = 0
for i in range(len(toronto_df1)):    
    if (toronto_df1.loc[i, "Borough"] != "Not assigned") and (toronto_df1.loc[i, "Neighbourhood"] == "Not assigned"):
        print("Index : ", i)
        count = count + 1
        toronto_df1.loc[i, "Neighbourhood"] == toronto_df1.loc[i, "Borough"]
    else:
        count = count 
print("Total number of replacements: ", count)

Index :  5
Total number of replacements:  1


In [None]:
print("Data has been cleaned!.\nShape of dataframe: "+ str(toronto_df1.shape))

## Part 2

### Segmentation and Analysis

In [7]:
import geocoder # import geocoder

In [13]:
# Create a copy of the processed dataframe.
toronto_df_copy1 = toronto_df1.copy()
toronto_df_copy1["Latitude"] = np.nan
toronto_df_copy1["Longitude"] = np.nan
toronto_df_copy1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 5 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
Latitude         0 non-null float64
Longitude        0 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.1+ KB


#### Use geopy library to get the latitude and longitude values of Toronto.

In [32]:
# Assigning latitude and longitude values to the corresponding neighbourhoods.
for i in range(len(toronto_df_copy1)):
    postal_code = toronto_df_copy1.loc[i, "Postcode"]
    address = '{}, Toronto, Ontario'.format(postal_code)
    geolocator = Nominatim(user_agent = "toronto_explorer")
    location = geolocator.geocode(address)
    toronto_df_copy1.loc[i, "Latitude"] = location.latitude
    toronto_df_copy1.loc[i, "Longitude"] = location.longitude
    print('The geograpical coordinate of {}, Toronto are {}, {}.'.format(toronto_df_copy1.loc[i, "Postcode"], location.latitude, location.longitude))
toronto_df_copy1

The geograpical coordinate of M3A, Toronto are 43.653963, -79.387207.


AttributeError: 'NoneType' object has no attribute 'latitude'

Due to the instability of the geocoder locator, we were unable to obtain all the lats and longs of each neighbourhood and hence, used the csv file.

#### Request latitude and longitude from neighbourhoods

In [33]:
# the we use the geospatial coordinates csv to obtain the coordinates.    
# Read the csv_file into a dataframe.     
geospatial_df = pd.read_csv("http://cocl.us/Geospatial_data", index_col = 0)  

# Create a copy of the processed dataframe.
toronto_df_copy = toronto_df1.copy()

# Join the two dataframes together.
toronto_on_df = toronto_df_copy.join(geospatial_df, on = "Postcode")                                
toronto_on_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
