# Introduction

Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto


In [60]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Import the Data

Get the wiki page with the table that needs to be extracted

In [61]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests applies no timeout by default; set one so a stalled connection
# cannot hang the notebook indefinitely.
r = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page below.
r.raise_for_status()

In [62]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(r.text, 'lxml')
# Take the first <tbody> element in the document and gather all of its <tr> rows.
table = soup.find('tbody')
rows = table.find_all('tr')

# Column Headers and Row Extraction

Get the column headers from the html file and put them as the header for the data frame.
Then extract each row's data and put it in the data frame. Remove the newline at the end of each row.

In [63]:
# The <th> cells of the first table row supply the column labels;
# newline characters are removed from each label.
columns = [
    header.get_text().replace('\n', '')
    for header in rows[0].find_all('th')
]
print(columns)

['Postcode', 'Borough', 'Neighbourhood']


In [64]:
# Start from an empty DataFrame whose columns are the scraped header labels.
df = pd.DataFrame(columns=columns)

In [65]:
# Collect one record per data row (rows[0] is the header row), then build the
# DataFrame in a single call. Growing a frame with DataFrame.append inside a
# loop copies the whole frame on every iteration, and DataFrame.append was
# removed in pandas 2.0.
records = []
for row in rows[1:]:
    tds = row.find_all('td')
    # Only the third cell has its newline characters stripped, as before.
    records.append([tds[0].text, tds[1].text, tds[2].text.replace('\n', '')])

df = pd.DataFrame(records, columns=columns)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West
285,M8Z,Etobicoke,South of Bloor


# Remove rows where Borough == 'Not assigned'

In [82]:
# Keep only rows whose Borough is assigned. An exact comparison is used rather
# than str.contains, which performs a regex substring search and would yield
# NaN (breaking the ~ negation) for any missing Borough value.
df = df[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Not assigned
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [90]:
# Where a row's Neighbourhood is 'Not assigned', use its Borough as the
# Neighbourhood; all other rows keep their existing Neighbourhood.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned',
    df['Borough'],
)


# Combine postcodes

Combine rows that share the same Postcode into a single row, joining their 'Neighbourhoods' into one comma-separated value.

In [126]:
# For every (Postcode, Borough) pair, gather its Neighbourhood values into a list.
temp_df = (
    df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(list)
)

In [127]:
# Display the grouped result: a Series of neighbourhood lists indexed by (Postcode, Borough).
temp_df

Postcode  Borough    
M1B       Scarborough                                     [Rouge, Malvern]
M1C       Scarborough             [Highland Creek, Rouge Hill, Port Union]
M1E       Scarborough                  [Guildwood, Morningside, West Hill]
M1G       Scarborough                                             [Woburn]
M1H       Scarborough                                          [Cedarbrae]
                                               ...                        
M9N       York                                                    [Weston]
M9P       Etobicoke                                            [Westmount]
M9R       Etobicoke      [Kingsview Village, Martin Grove Gardens, Rich...
M9V       Etobicoke      [Albion Gardens, Beaumond Heights, Humbergate,...
M9W       Etobicoke                                            [Northwest]
Name: Neighbourhood, Length: 103, dtype: object

In [128]:
# Shuffle the grouped rows and turn the (Postcode, Borough) index back into
# ordinary columns. A fixed random_state makes the shuffled order identical on
# every run; without it the row order changed on each execution.
temp_df = (
    temp_df
    .sample(frac=1, random_state=42)
    .reset_index()
)
# Collapse each list of neighbourhoods into a single comma-separated string.
# NOTE: this cell is not safe to run twice in a row. On already-joined strings,
# str.join would insert commas between individual characters. Re-run the
# groupby cell first.
temp_df['Neighbourhood'] = temp_df['Neighbourhood'].str.join(',')

In [129]:
# Display the combined table: Postcode, Borough, and the comma-joined Neighbourhood string.
temp_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M6L,North York,"Downsview,North Park,Upwood Park"
1,M3L,North York,Downsview West
2,M7A,Downtown Toronto,Queen's Park
3,M6B,North York,Glencairn
4,M6S,West Toronto,"Runnymede,Swansea"
...,...,...,...
98,M3J,North York,"Northwood Park,York University"
99,M2L,North York,"Silver Hills,York Mills"
100,M5L,Downtown Toronto,"Commerce Court,Victoria Hotel"
101,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw..."


In [130]:
# Report (rows, columns): one row per (Postcode, Borough) group and the three columns produced by reset_index.
temp_df.shape

(103, 3)