# Neighbourhood Segmentation and Clustering

Install and import the required packages.

In [41]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup
import pandas as pd



Download the Wikipedia list of Canadian postal codes beginning with "M" as an HTML file for scraping.

In [42]:
# Fetch the Wikipedia page with wget; -O writes it to canada_postcodes.html,
# which is the file the dataframe-building cell opens.
!wget -O canada_postcodes.html https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

--2020-01-25 10:47:17--  https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
Resolving en.wikipedia.org (en.wikipedia.org)... 208.80.154.224, 2620:0:861:ed1a::1
Connecting to en.wikipedia.org (en.wikipedia.org)|208.80.154.224|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78804 (77K) [text/html]
Saving to: ‘canada_postcodes.html’


2020-01-25 10:47:17 (1.07 MB/s) - ‘canada_postcodes.html’ saved [78804/78804]



Create a dataframe from the downloaded HTML file.

In [43]:
# Build the initial dataframe with three columns: PostalCode, Borough, and Neighbourhood.
header = ['PostalCode','Borough','Neighbourhood']

# Parse the downloaded page. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale.
with open('canada_postcodes.html', 'r', encoding='utf-8') as html_doc:
    soup = BeautifulSoup(html_doc, 'html.parser')

# One list of cell texts per <tr> of the first <tbody> in the document.
# Splitting the row text on newlines leaves empty strings between cells,
# which are discarded. The table's own header row is kept as the first entry
# (index 0); the cleaning cell removes it.
rows = [
    [cell for cell in tr.text.split('\n') if cell != '']
    for tr in soup.tbody.find_all('tr')
]

# Fail loudly if any row does not split into exactly one value per column,
# instead of letting pandas pad short rows with missing values.
bad_rows = [row for row in rows if len(row) != len(header)]
assert not bad_rows, 'rows with unexpected cell count: {}'.format(bad_rows[:3])

# Construct the dataframe once from all rows rather than appending row by row.
df = pd.DataFrame(rows, columns=header)
print(df.shape, "initial dataframe")

df.head()

(288, 3) initial dataframe


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


 Clean and format the data.

In [44]:
# Clean the scraped table and collapse it to one row per postcode.
# NOTE: this cell rebinds `df`, so it must be run exactly once after the
# scraping cell; the index-based header drop below assumes row 0 is still the
# table's header row.

# drop the header row (index 0) and rows without borough data;
# new frames are created instead of mutating the scraped one in place
assigned = df.drop(index=0)
assigned = assigned[assigned['Borough'] != 'Not assigned']
print(assigned.shape, "useless rows dropped")

# create one row per postcode and list the corresponding neighbourhoods,
# joined with ', ' (PostalCode and Borough become the index at this point)
grouped = assigned.groupby(['PostalCode','Borough'])[['Neighbourhood']].agg(', '.join)
print(grouped.shape, "grouped by postcode")

# format the dataframe: move the group keys back to columns, in header order
df = grouped.reset_index()[header]

# fill missing neighbourhood data with the borough value (vectorised; builds a
# new frame, so no write goes through a column-sliced intermediate)
unnamed = df['Neighbourhood'] == 'Not assigned'
df = df.assign(Neighbourhood=df['Neighbourhood'].mask(unnamed, df['Borough']))

df.head()

(210, 3) useless rows dropped
(103, 1) grouped by postcode


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [45]:
# print entire dataframe
#with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None, 'display.max_colwidth', None):
#    print(df)

In [46]:
# report the (rows, columns) shape of the cleaned dataframe
print('{} final dataframe'.format(df.shape))

(103, 3) final dataframe
