# Segmenting and Clustering Neighborhoods in Toronto

Import Libraries

In [27]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Get the postal-code data from the Wikipedia page

In [28]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the notebook on an HTTP error instead of letting
# the following lines parse an error page.
res = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
# The first <table> on the page is used as the postal-code table.
table = soup.find_all('table')[0]
# read_html returns a list of DataFrames. The HTML is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in pandas.
df = pd.read_html(StringIO(str(table)))
print(df[0].to_json(orient='records'))

[{"Postcode":"M1A","Borough":"Not assigned","Neighborhood":"Not assigned"},{"Postcode":"M2A","Borough":"Not assigned","Neighborhood":"Not assigned"},{"Postcode":"M3A","Borough":"North York","Neighborhood":"Parkwoods"},{"Postcode":"M4A","Borough":"North York","Neighborhood":"Victoria Village"},{"Postcode":"M5A","Borough":"Downtown Toronto","Neighborhood":"Harbourfront"},{"Postcode":"M6A","Borough":"North York","Neighborhood":"Lawrence Heights"},{"Postcode":"M6A","Borough":"North York","Neighborhood":"Lawrence Manor"},{"Postcode":"M7A","Borough":"Downtown Toronto","Neighborhood":"Queen's Park"},{"Postcode":"M8A","Borough":"Not assigned","Neighborhood":"Not assigned"},{"Postcode":"M9A","Borough":"Queen's Park","Neighborhood":"Not assigned"},{"Postcode":"M1B","Borough":"Scarborough","Neighborhood":"Rouge"},{"Postcode":"M1B","Borough":"Scarborough","Neighborhood":"Malvern"},{"Postcode":"M2B","Borough":"Not assigned","Neighborhood":"Not assigned"},{"Postcode":"M3B","Borough":"North York","Ne

Convert the scraped table into a pandas DataFrame

In [29]:
# Column labels for the scraped table, in the order the cells appear in a row.
column_names = [
    'Postalcode',
    'Borough',
    'Neighborhood',
]
# Empty frame carrying only the schema; the rows are filled in afterwards.
df = pd.DataFrame(data=None, columns=column_names)

In [30]:
# Gather the stripped text of every <td> in each table row and keep only the
# rows that have one value per column (presumably this is what skips the
# header row -- verify against the page markup).
# The rows are collected in a plain list and the DataFrame is built once:
# appending with df.loc[len(df)] is quadratic, and re-running the cell would
# append every row a second time.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(column_names):
        rows.append(row_data)

df = pd.DataFrame(rows, columns=column_names)

In [31]:
# Preview the first five rows of the scraped table.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Cleaning of data

Remove rows whose borough is "Not assigned"

In [32]:
# Keep only the rows that have a real borough. The explicit .copy() makes the
# result an independent frame, so later in-place edits do not go through
# pandas' "setting on a copy of a slice" path.
df = df.loc[df['Borough'] != 'Not assigned'].copy()

In [33]:
# Where a row has a borough but no neighborhood, use the borough name as the
# neighborhood. The previous form, df[mask] = df['Borough'], assigned the
# Borough Series to the whole selected rows instead of to the Neighborhood
# column. Here only Neighborhood is replaced, and only on the masked rows.
# assign() builds a new frame, so the cell is safe to re-run.
unassigned = df['Neighborhood'] == 'Not assigned'
df = df.assign(Neighborhood=df['Neighborhood'].mask(unassigned, df['Borough']))
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [34]:
# For each postal code, join its neighborhood names into one comma-separated
# string, then turn the result into a two-column frame whose value column is
# named 'Neighborhood_joined'.
neighborhoods_by_code = df.groupby('Postalcode')['Neighborhood'].apply(', '.join)
temp_df = (
    neighborhoods_by_code
    .reset_index()
    .rename(columns={'Neighborhood': 'Neighborhood_joined'})
)

In [35]:
# Attach the joined neighborhood string to every original row by matching on
# the postal code (inner join).
df_merge = df.merge(temp_df, how='inner', on='Postalcode')

In [36]:
# Remove the per-row 'Neighborhood' column from df_merge in place.
df_merge.drop(columns=['Neighborhood'], inplace=True)

In [37]:
# Collapse fully identical rows in place, keeping the first occurrence of each.
df_merge.drop_duplicates(keep='first', inplace=True)

In [38]:
# Rename the 'Neighborhood_joined' column to 'Neighborhood' in place.
joined_to_final = {'Neighborhood_joined': 'Neighborhood'}
df_merge.rename(columns=joined_to_final, inplace=True)

In [39]:
# Preview the first five rows of the merged table.
df_merge.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
5,M7A,Downtown Toronto,Queen's Park


In [40]:
# Display the (rows, columns) dimensions of the merged table.
df_merge.shape

(103, 3)