# Introduction

Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto


In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Import the Data

Get the wiki page with the table that needs to be extracted

In [5]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)

In [6]:
soup = BeautifulSoup(r.text, 'lxml')
table = soup.tbody
rows = table.find_all('tr')

# Column Headers and Row Extraction

Get the column headers from the html file and put them as the header for the data frame.
Then extract each rows data and put in in the data frame.  Remove the newline at the end of each row.

In [7]:
columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]
print (columns)

['Postcode', 'Borough', 'Neighbourhood']


In [8]:
df = pd.DataFrame(columns=columns)

In [10]:
for i in range (1, len(rows)):
    tds = rows[i].find_all('td')
    
    values = [tds[0].text, tds[1].text, tds[2].text.replace('\n','')]
    
    df = df.append(pd.Series(values, index=columns), ignore_index=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
569,M8Z,Etobicoke,Mimico NW
570,M8Z,Etobicoke,The Queensway West
571,M8Z,Etobicoke,Royal York South West
572,M8Z,Etobicoke,South of Bloor


# Remove Borough == 'Not assigned"

In [11]:
df = df[~df.Borough.str.contains("Not assigned")]
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [12]:
df.Neighbourhood = df.Borough.where(df.Neighbourhood=='Not assigned', df.Neighbourhood)


# Combine postcodes

Combine the Postcodes when there are multiple 'Neighbourhoods' that share the same Postcode

In [13]:
temp_df = df.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(list)

In [14]:
temp_df

Postcode  Borough    
M1B       Scarborough                     [Rouge, Malvern, Rouge, Malvern]
M1C       Scarborough    [Highland Creek, Rouge Hill, Port Union, Highl...
M1E       Scarborough    [Guildwood, Morningside, West Hill, Guildwood,...
M1G       Scarborough                                     [Woburn, Woburn]
M1H       Scarborough                               [Cedarbrae, Cedarbrae]
                                               ...                        
M9N       York                                            [Weston, Weston]
M9P       Etobicoke                                 [Westmount, Westmount]
M9R       Etobicoke      [Kingsview Village, Martin Grove Gardens, Rich...
M9V       Etobicoke      [Albion Gardens, Beaumond Heights, Humbergate,...
M9W       Etobicoke                                 [Northwest, Northwest]
Name: Neighbourhood, Length: 103, dtype: object

In [15]:
temp_df = temp_df.sample(frac=1).reset_index()
temp_df['Neighbourhood'] = temp_df['Neighbourhood'].str.join(',')

In [16]:
temp_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M5A,Downtown Toronto,"Harbourfront,Harbourfront"
1,M6A,North York,"Lawrence Heights,Lawrence Manor,Lawrence Heigh..."
2,M8W,Etobicoke,"Alderwood,Long Branch,Alderwood,Long Branch"
3,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar..."
4,M5C,Downtown Toronto,"St. James Town,St. James Town"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North,Th..."
99,M2K,North York,"Bayview Village,Bayview Village"
100,M4K,East Toronto,"The Danforth West,Riverdale,The Danforth West,..."
101,M1X,Scarborough,"Upper Rouge,Upper Rouge"


In [17]:
temp_df.shape

(103, 3)

# Start of Geocoder Section

Could not get geocoder to work, so going to import the postal codes and merge the data frames

In [27]:
postal = pd.read_csv("Geospatial_Coordinates.csv", sep=',')
postal.rename(columns={'Postal Code':'Postcode'}, inplace=True)

In [28]:
df_loc = pd.merge(temp_df, postal, how='outer', on='Postcode')

In [29]:
df_loc

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront,Harbourfront",43.654260,-79.360636
1,M6A,North York,"Lawrence Heights,Lawrence Manor,Lawrence Heigh...",43.718518,-79.464763
2,M8W,Etobicoke,"Alderwood,Long Branch,Alderwood,Long Branch",43.602414,-79.543484
3,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar...",43.650943,-79.554724
4,M5C,Downtown Toronto,"St. James Town,St. James Town",43.651494,-79.375418
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North,Th...",43.653654,-79.506944
99,M2K,North York,"Bayview Village,Bayview Village",43.786947,-79.385975
100,M4K,East Toronto,"The Danforth West,Riverdale,The Danforth West,...",43.679557,-79.352188
101,M1X,Scarborough,"Upper Rouge,Upper Rouge",43.836125,-79.205636
