# Segmenting and Clustering Neighborhoods in Toronto

## Scrape the wiki page 

In [89]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [90]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error response instead of
# letting the following cells parse an error page.
wiki = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
wiki.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(wiki.content, 'html.parser')

In [91]:
# Collect every <table> element whose class attribute is "wikitable sortable"
tables = soup.find_all('table', attrs={'class': "wikitable sortable"})

In [92]:
# Read the text of each header cell (<th>) in the first matched table
column_names = [th.text for th in tables[0].find_all('th')]

In [99]:
# Read the text of each data cell (<td>) in the first matched table
contents = [td.text for td in tables[0].find_all('td')]

In [103]:
# Flatten the text of every <td> cell, across all matched tables, into a
# single list (cells are appended in the order they are encountered).
values = []
for table in tables:
    values.extend(cell.get_text() for cell in table.select('td'))

In [55]:
# The table has 3 columns, so dividing the flat cell count by 3 gives the
# number of rows needed when reshaping the list into a table.
len(values)/3   # 288 rows when this cell was last run

In [76]:
# Reshape the flat list of cell texts into a 2-D array with one row per table
# row and 3 columns. Passing -1 lets NumPy infer the row count from
# len(values); a hard-coded row count would raise ValueError as soon as the
# scraped page gains or loses a row.
data = np.reshape(values, (-1, 3))

In [None]:
# Column labels for the scraped table, in left-to-right order
header_list = [
    'Postcode',
    'Borough',
    'Neighbourhood',
]

In [196]:
# Build the DataFrame from the reshaped array, labelling columns with header_list
df = pd.DataFrame(data, columns=header_list)

In [197]:
# Trim trailing newline characters from the Neighbourhood text, using the
# vectorised string accessor rather than a per-element lambda.
df['Neighbourhood'] = df['Neighbourhood'].str.rstrip('\n')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [199]:
# Discard every row whose Borough holds the 'Not assigned' placeholder
df = df.drop(index=df.index[df['Borough'] == 'Not assigned'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [200]:
# Count the rows whose Neighbourhood is still the 'Not assigned' placeholder
int(df['Neighbourhood'].eq('Not assigned').sum())

1

In [201]:
# Where the Neighbourhood is the 'Not assigned' placeholder, use the row's
# Borough instead; all other rows keep their Neighbourhood unchanged.
df['Neighbourhood'] = df['Neighbourhood'].mask(df['Neighbourhood'] == 'Not assigned', df['Borough'])

In [202]:
# Re-count the 'Not assigned' neighbourhoods to confirm none remain
int(df['Neighbourhood'].eq('Not assigned').sum())

0

In [203]:
# Group the rows by (Postcode, Borough) so that each postal code yields one
# group, however many neighbourhoods it contains.
group = df.groupby(['Postcode', 'Borough'])

# Collect the distinct neighbourhoods of each group into one array per group.
# Selecting the column and calling the built-in unique() replaces a Python
# lambda passed to groupby.apply, which recent pandas releases warn about when
# the function also receives the grouping columns.
df2 = group['Neighbourhood'].unique()

In [204]:
# Turn the grouped Series into a flat DataFrame: the group keys become
# ordinary columns and the values land in a column named 'Neighbourhood'.
df2 = df2.rename('Neighbourhood').reset_index()
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]


In [205]:
# Inspect the Python type stored in the first Neighbourhood cell
print(type(df2['Neighbourhood'].loc[0]))

<class 'numpy.ndarray'>


In [206]:
# Collapse each array of neighbourhood names into one comma-separated string
df2 = df2.assign(Neighbourhood=df2['Neighbourhood'].str.join(', '))

In [207]:
# Preview the first five rows after joining the neighbourhood names
df2.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [209]:
# Dimensions of the grouped table as (rows, columns)
df2.shape

(103, 3)