 ### Week3 Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [56]:
import numpy as np 
import pandas as pd 
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests 
print('Libraries imported.')

Libraries imported.


In [19]:
wikipedia_page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(wikipedia_page.content, 'lxml')
wikipedia_table = soup.find('table', class_ = 'wikitable sortable')

header = wikipedia_table.tbody.find('tr')
column_names = ['Postcode', 'Borough', 'Neighbourhood']
df = pd.DataFrame(columns=column_names)

In [20]:
for tr in header.find_next_siblings('tr'):
    tds = tr.find_all('td')
    df = df.append({'Postcode': tds[0].text,
                    'Borough': tds[1].text,
                    'Neighbourhood': tds[2].text}, ignore_index=True)
df.head

<bound method NDFrame.head of     Postcode           Borough  \
0        M1A      Not assigned   
1        M2A      Not assigned   
2        M3A        North York   
3        M4A        North York   
4        M5A  Downtown Toronto   
5        M5A  Downtown Toronto   
6        M6A        North York   
7        M6A        North York   
8        M7A      Queen's Park   
9        M8A      Not assigned   
10       M9A         Etobicoke   
11       M1B       Scarborough   
12       M1B       Scarborough   
13       M2B      Not assigned   
14       M3B        North York   
15       M4B         East York   
16       M4B         East York   
17       M5B  Downtown Toronto   
18       M5B  Downtown Toronto   
19       M6B        North York   
20       M7B      Not assigned   
21       M8B      Not assigned   
22       M9B         Etobicoke   
23       M9B         Etobicoke   
24       M9B         Etobicoke   
25       M9B         Etobicoke   
26       M9B         Etobicoke   
27       M1C      

In [21]:
# =============================================================================
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned
# =============================================================================
    
df['Neighbourhood'] = df['Neighbourhood'].apply(lambda x: x.strip())
df = df[df['Borough']!='Not assigned']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [22]:
# =============================================================================
# Combined neighborhoods into one row separated with a comma
# =============================================================================
df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [23]:
# =============================================================================
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
# =============================================================================
df.loc[df['Neighbourhood']=='Not assigned', 'Neighbourhood'] = df['Borough']
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [24]:
df.shape

(103, 3)

In [25]:
df_geo = pd.read_csv ('http://cocl.us/Geospatial_data')
df_geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [26]:
print(df.columns)
print(df_geo.columns)

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')
Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')


In [27]:
df.set_index('Postcode')
df_geo.set_index('Postal Code')
df2 = df.join(df_geo).drop('Postal Code', 1)
df2

df2.shape

(103, 5)

In [28]:
toronto_data = df2[df2['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.rename(columns={'Neighbourhood':'Neighborhood'}, inplace=True)
toronto_data

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049
