# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np

Get the data from Wikipedia: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

Read the first table with the class attribute containing `sortable`.

In [2]:
# Source page for the Toronto ("M") postal codes.
POSTAL_CODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Parse the tables matching the `sortable` class attribute and keep the first one.
canada_postal_codes = pd.read_html(
    POSTAL_CODES_URL,
    attrs={'class': 'sortable'},
)[0]
canada_postal_codes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Normalise the column names, drop the postal codes whose borough is
# 'Not assigned', and renumber the remaining rows from 0.
canada_postal_codes = (
    canada_postal_codes
    .rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)
canada_postal_codes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Group data by `PostalCode`. For each group:

1. Join the neighborhood names with commas to form the new Neighborhood column.
2. Use the group's (single) borough name for the new Borough column.

Join two columns by `PostalCode` into a new DataFrame.

In [4]:
# Collapse the table to one row per postal code.
grouped = canada_postal_codes.groupby('PostalCode')

# Aggregate both columns in a single pass so the result columns are named
# directly, instead of merging two unnamed frames and renaming the
# suffix-generated '0_x' / '0_y' labels afterwards.
#   Borough      -> 'first' takes the first non-null borough of the group
#   Neighborhood -> all neighborhood names of the group, comma-separated
canada_postal_codes_cleaned = (
    grouped
    .agg({'Borough': 'first', 'Neighborhood': ','.join})
    .reset_index()  # turn the PostalCode index back into a regular column
)
canada_postal_codes_cleaned.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Locate the rows whose Neighborhood is 'Not assigned' and set their Neighborhood to the borough name.

In [5]:
# Show the postal codes whose neighborhood is still the 'Not assigned' placeholder.
canada_postal_codes_cleaned.loc[canada_postal_codes_cleaned['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Not assigned


In [6]:
# Replace every 'Not assigned' neighborhood with the borough name of the same
# row. Rows are selected by value rather than by a hard-coded row label, so the
# cell keeps working if the scraped table gains, loses, or reorders rows.
not_assigned = canada_postal_codes_cleaned['Neighborhood'] == 'Not assigned'
canada_postal_codes_cleaned.loc[not_assigned, 'Neighborhood'] = (
    canada_postal_codes_cleaned.loc[not_assigned, 'Borough']
)

In [7]:
# Display row 85 (the M7A entry found by the 'Not assigned' filter) to confirm
# that its Neighborhood now holds the borough name.
canada_postal_codes_cleaned.loc[85]

PostalCode               M7A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 85, dtype: object

In [8]:
# (rows, columns) of the cleaned table: one row per postal code and the three
# columns PostalCode, Borough, Neighborhood.
canada_postal_codes_cleaned.shape

(103, 3)

In [9]:
from geopy.geocoders import Nominatim # geocoder.google denied the requests, so geopy is used instead
# Shared geocoder instance; getLatLng() below sends its queries through it.
geolocator = Nominatim(user_agent="ny_explorer")

def getLatLng(postal_code):
    """Geocode a Toronto postal code with the module-level ``geolocator``.

    The query sent to the geocoder is ``'<postal_code>, Toronto, Ontario'``.

    Args:
        postal_code: Value formatted into the geocoding query.

    Returns:
        ``[latitude, longitude]`` taken from the geocoding result, or
        ``[None, None]`` when the geocoder returns a falsy result (no match).
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    match = geolocator.geocode(query)
    if not match:
        # Nothing found: keep the two-element shape so callers can still
        # unpack the result into latitude / longitude.
        return [None, None]
    return [match.latitude, match.longitude]

In [10]:
def _geocode_row(row):
    """Geocode one table row; return its coordinates as a labelled Series."""
    coordinates = getLatLng(row.at['PostalCode'])
    return pd.Series(coordinates, index=['Latitude', 'Longitude'])

# One geocoder lookup per row of the cleaned table.
latlng = canada_postal_codes_cleaned.apply(_geocode_row, axis=1)
latlng

Unnamed: 0,Latitude,Longitude
0,43.653963,-79.387207
1,43.653963,-79.387207
2,,
3,43.644903,-79.381836
4,,
5,,
6,,
7,,
8,,
9,,


In [11]:
# CSV with one latitude/longitude pair per postal code.
GEOSPATIAL_DATA_URL = 'http://cocl.us/Geospatial_data'

lat_lng_df = pd.read_csv(GEOSPATIAL_DATA_URL)
lat_lng_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Use the same key column name as the cleaned postal-code table so the two
# frames can be merged on it.
lat_lng_df = lat_lng_df.rename(columns={'Postal Code': 'PostalCode'})
lat_lng_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Attach the coordinates to each postal code. The default (inner) join keeps
# only the postal codes present in both frames.
toronto_df = pd.merge(
    canada_postal_codes_cleaned,
    lat_lng_df,
    on='PostalCode',
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
