# Notebook for Segmenting and Clustering Neighborhoods in Toronto

### Download and Import all required libraries 

In [64]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift pandas' display truncation so frames are rendered with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): pandas 1.0 deprecated this import path in favour of `pandas.json_normalize`;
# confirm it still resolves on the pandas version this notebook targets.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means for the clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from bs4 import BeautifulSoup # HTML parser used below to read the Wikipedia table
import io # in-memory text buffer used below to hand downloaded CSV text to pandas

print('Libraries imported.')


Libraries imported.


### Download data

In [18]:
# Scrape the Toronto ("M") postal-code table from Wikipedia.
wiki_url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Bound the request and fail loudly on an HTTP error, so an error page is never
# parsed as if it were the article.
wiki_response=requests.get(wiki_url, timeout=30)
wiki_response.raise_for_status()
raw_data=wiki_response.text

toronto_data=BeautifulSoup(raw_data, 'html.parser')

# find() returns the first table carrying the "wikitable" class, or None if there is none.
postal_code_table=toronto_data.find('table', class_='wikitable')
if postal_code_table is None:
    # Surface a missing table here rather than as an AttributeError on the next line.
    raise ValueError('No "wikitable" table found at ' + wiki_url)
neighbors_rows=postal_code_table.find_all('tr')

# Preview the header row plus the first few data rows.
neighbors_rows[0:5]


[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>]

In [20]:
# Turn every table row into a list of its first three cell texts
# (postcode, borough, neighbourhood). The first entry is the header row.
neighbors_info=[]
for row in neighbors_rows:
    # Read the <th>/<td> cells themselves instead of splitting the row text on
    # newlines: the split only works while every cell sits on its own line of the
    # page markup, and it leaves stray whitespace in the values.
    cells=row.find_all(['th', 'td'], recursive=False)
    data=[cell.get_text().strip() for cell in cells[:3]]
    neighbors_info.append(data)

neighbors_info[0:10]

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned']]

### Add Column names as per the assignment instructions

In [27]:
# Column labels required by the assignment, in table order.
columns = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
columns

['PostalCode', 'Borough', 'Neighborhood']

In [28]:
# Everything after the scraped header row is data; label it with the assignment's
# column names rather than the header text taken from the page.
body_rows = neighbors_info[1:]
neighbors_df = pd.DataFrame(data=body_rows, columns=columns)
neighbors_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [29]:
# (rows, columns) of the scraped table before any cleansing.
neighbors_df.shape

(288, 3)

### Data Cleansing

#### Remove rows whose Borough is "Not assigned"

In [37]:
# Index labels of every row whose Borough is the "Not assigned" placeholder.
borough_not_assigned = neighbors_df['Borough'] == 'Not assigned'
na_boroughs_indexes = neighbors_df.index[borough_not_assigned]
print('Removing', na_boroughs_indexes.size, 'not assigned Borough rows')

Removing 77 not assigned Borough rows


In [38]:
# Drop the rows whose Borough was flagged as "Not assigned".
# `na_boroughs_indexes` holds index *labels*, so they are passed to drop() directly;
# indexing `neighbors_df.index` with them would treat them as positions, which is only
# correct while the index is still the default 0..n-1.
# errors='ignore' together with rebinding (instead of inplace=True) keeps this cell safe
# to re-run once the rows are already gone.
neighbors_df = neighbors_df.drop(index=na_boroughs_indexes, errors='ignore')
neighbors_df.shape

(211, 3)

In [39]:
# Preview the frame after the drop; surviving rows keep their original index labels.
neighbors_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


#### Replace "Not assigned" Neighborhood values with the Borough name

In [40]:
# Index labels of rows whose Neighborhood is the "Not assigned" placeholder, and their count.
neighborhood_not_assigned = neighbors_df['Neighborhood'] == 'Not assigned'
na_neighbors_indexes = neighbors_df.index[neighborhood_not_assigned]
na_neighbors_indexes.size

1

In [44]:
# Show the rows that still carry the "Not assigned" Neighborhood placeholder.
neighbors_df.loc[neighbors_df['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Not assigned


In [45]:
# Where the Neighborhood is the "Not assigned" placeholder, fall back to the Borough name.
# A single .loc assignment is used instead of per-row chained assignment
# (frame[column][label] = value): chained assignment raises SettingWithCopyWarning and
# does not write back to the frame when pandas Copy-on-Write is enabled.
neighborhood_not_assigned = neighbors_df['Neighborhood'] == 'Not assigned'
neighbors_df.loc[neighborhood_not_assigned, 'Neighborhood'] = neighbors_df.loc[neighborhood_not_assigned, 'Borough']

# Sanity check: number of rows still carrying the placeholder (0 when the fix-up worked).
int((neighbors_df['Neighborhood'] == 'Not assigned').sum())

0

In [46]:
# Spot-check the first ten rows after the Neighborhood fix-up.
neighbors_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### Combine Neighborhoods that share a Postal Code into a single row of comma-separated values

In [49]:
# Distinct-value counts per column before grouping.
# dropna=False keeps a missing value counted as its own distinct entry.
postal_code_count = neighbors_df['PostalCode'].nunique(dropna=False)
borough_count = neighbors_df['Borough'].nunique(dropna=False)
neighborhood_count = neighbors_df['Neighborhood'].nunique(dropna=False)

print('Unique Postal Codes = ', postal_code_count)
print('Unique Boroughs = ', borough_count)
print('Unique Neighborhoods = ', neighborhood_count)

Unique Postal Codes =  103
Unique Boroughs =  11
Unique Neighborhoods =  209


In [52]:
# Group the rows by postal code, then collapse each group's neighbourhood names
# into one ', '-separated string.
ndf_groupby_postalcode = neighbors_df.groupby('PostalCode')
neighborhoods_by_code = ndf_groupby_postalcode['Neighborhood']
ngp_comma_separated = neighborhoods_by_code.apply(', '.join)
ngp_comma_separated[0:10]

PostalCode
M1B                                     Rouge, Malvern
M1C             Highland Creek, Rouge Hill, Port Union
M1E                  Guildwood, Morningside, West Hill
M1G                                             Woburn
M1H                                          Cedarbrae
M1J                                Scarborough Village
M1K        East Birchmount Park, Ionview, Kennedy Park
M1L                    Clairlea, Golden Mile, Oakridge
M1M    Cliffcrest, Cliffside, Scarborough Village West
M1N                        Birch Cliff, Cliffside West
Name: Neighborhood, dtype: object

In [56]:
# One Borough per postal code. first() always yields the group's first value, whereas
# set(x).pop() returns an arbitrary member and so could differ between runs if a postal
# code were ever listed under more than one Borough.
grouped_boroughs = ndf_groupby_postalcode['Borough'].first()

# The rows below are paired purely by position, so both Series must list the same
# postal codes in the same order.
assert grouped_boroughs.index.equals(ngp_comma_separated.index), \
    'Borough and Neighborhood groupings are not aligned'

# Columns come out as 0, 1, 2 (postal code, borough, joined neighbourhoods).
neighbors_df_grouped=pd.DataFrame(list(zip(grouped_boroughs.index, grouped_boroughs, ngp_comma_separated)))

neighbors_df_grouped.head(10)

Unnamed: 0,0,1,2
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [59]:
# Overwrite the grouped frame's column labels, by position, with the assignment's column names.
neighbors_df_grouped.columns=columns
neighbors_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [61]:
# Distinct-value counts per column after grouping.
# dropna=False keeps a missing value counted as its own distinct entry.
grouped_postal_code_count = neighbors_df_grouped['PostalCode'].nunique(dropna=False)
grouped_borough_count = neighbors_df_grouped['Borough'].nunique(dropna=False)
grouped_neighborhood_count = neighbors_df_grouped['Neighborhood'].nunique(dropna=False)

print('Unique Postal Codes = ', grouped_postal_code_count)
print('Unique Boroughs = ', grouped_borough_count)
print('Unique Neighborhoods = ', grouped_neighborhood_count)

Unique Postal Codes =  103
Unique Boroughs =  11
Unique Neighborhoods =  103


### Fetch Location Data for all rows

In [68]:
# Download the postal-code -> latitude/longitude CSV.
url="http://cocl.us/Geospatial_data"

# Bound the request and fail loudly on an HTTP error, so an error page is never
# handed to the CSV parser as if it were data.
geo_response=requests.get(url, timeout=30)
geo_response.raise_for_status()
content=geo_response.content

In [70]:
# Decode the downloaded bytes as UTF-8 and parse the text as CSV from an in-memory buffer.
csv_text = content.decode('utf-8')
csv_buffer = io.StringIO(csv_text)
location_data = pd.read_csv(csv_buffer)
location_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [72]:
# Give the key column the same name as in neighbors_df_grouped. Renaming by name rather
# than overwriting all labels by position keeps Latitude/Longitude correctly labelled even
# if the CSV's column order changes, and is a no-op when the cell is re-run.
location_data = location_data.rename(columns={'Postal Code': 'PostalCode'})

# Attach coordinates to each postal code.
# how='inner' is the pandas default, spelled out because it drops codes that have no
# coordinates; validate raises MergeError if either side repeats a postal code instead
# of silently duplicating rows.
df = pd.merge(
    neighbors_df_grouped,
    location_data,
    on='PostalCode',
    how='inner',
    validate='one_to_one',
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [73]:
# (rows, columns) of the merged frame: one row per postal code that matched in both inputs.
df.shape

(103, 5)