# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

## Scraping Data From Wikipedia

In [2]:
# Wikipedia page that the postal-code table is scraped from
website = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# (4xx/5xx) so an error page is never parsed as if it were the table.
response = requests.get(website, timeout=30)
response.raise_for_status()
# Raw HTML of the page, as text
source = response.text

In [3]:
# Parse the downloaded HTML with the lxml parser
soup = BeautifulSoup(source, 'lxml')
# Locate the table whose class attribute is 'wikitable sortable'
table = soup.find('table', {'class': 'wikitable sortable'})
# find() returns None when nothing matches; fail with a clear message instead of
# an AttributeError on the next line.
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found in the downloaded page")
# Flatten the table to plain text and split it into one list entry per line
content = table.get_text().split('\n')

In [4]:
# Peek at the first ten entries of the flattened table text
content[:10]

['',
 '',
 'Postcode',
 'Borough',
 'Neighbourhood',
 '',
 '',
 'M1A',
 'Not assigned',
 'Not assigned']

In [5]:
# Keep only the non-empty strings produced by the split
content = [entry for entry in content if entry]

In [6]:
# First ten entries after the empty strings were removed
content[:10]

['Postcode',
 'Borough',
 'Neighbourhood',
 'M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A']

In [7]:
# The flattened table text repeats in groups of three entries:
# postcode, borough, neighbourhood (the first group holds the column headers).
# A length that is not a multiple of three means the columns are misaligned,
# so stop here rather than build columns of unequal length.
if len(content) % 3 != 0:
    raise ValueError(
        'Expected the table text to split into groups of 3 entries, got '
        + str(len(content)) + ' entries'
    )

# Take every third entry, starting at offsets 0, 1 and 2, to build the columns
postcode = content[0::3]
borough = content[1::3]
neighborhood = content[2::3]

In [8]:
# Show the first five entries of each column list, one list per line
print(postcode[:5], borough[:5], neighborhood[:5], sep='\n')

['Postcode', 'M1A', 'M2A', 'M3A', 'M4A']
['Borough', 'Not assigned', 'Not assigned', 'North York', 'North York']
['Neighbourhood', 'Not assigned', 'Not assigned', 'Parkwoods', 'Victoria Village']


In [9]:
# Build the dataframe in one step: the first entry of each list is the column
# header, the remaining entries are that column's values.
df = pd.DataFrame({
    postcode[0]: postcode[1:],
    borough[0]: borough[1:],
    neighborhood[0]: neighborhood[1:],
})
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
# Drop, in place, every row where both Borough and Neighbourhood are 'Not assigned'
df.drop(
    index=df.loc[
        df['Borough'].eq('Not assigned') & df['Neighbourhood'].eq('Not assigned')
    ].index,
    inplace=True,
)

In [11]:
# Renumber the rows 0..n-1 in place, discarding the old index labels
df.index = pd.RangeIndex(len(df))

In [12]:
# Check whether any row still has 'Not assigned' as its Borough
df.loc[df['Borough'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood


In [13]:
# Rows whose Neighbourhood is still 'Not assigned'
df.loc[df['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood
6,M7A,Queen's Park,Not assigned


In [14]:
# A row with a borough but no neighbourhood takes the borough's name as its
# neighbourhood. Write through df.loc rather than calling an in-place method on
# the df.Neighbourhood Series: under pandas Copy-on-Write the latter leaves df
# unchanged. Copying from Borough also avoids hard-coding a single borough name.
missing_neighbourhood = df['Neighbourhood'] == 'Not assigned'
df.loc[missing_neighbourhood, 'Neighbourhood'] = df.loc[missing_neighbourhood, 'Borough']

In [15]:
# Confirm that no row has 'Not assigned' as its Neighbourhood any more
df.loc[df['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood


In [16]:
# First ten rows of the cleaned dataframe
df.iloc[:10]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [20]:
# One row per postcode: join the neighbourhood names of each postcode with commas
df_final = (
    df.groupby('Postcode')['Neighbourhood']
    .agg(','.join)
    .reset_index()
)

In [21]:
# Add the 'Borough' column to the per-postcode frame.
# The (Postcode, Borough) pairs are de-duplicated BEFORE merging so the merge
# does not first multiply every postcode by its number of neighbourhoods.
borough_by_postcode = df[['Postcode', 'Borough']].drop_duplicates()
df_final = (
    df_final
    # Drop a Borough column left by a previous run of this cell, so re-running
    # it does not produce Borough_x / Borough_y columns.
    .drop(columns='Borough', errors='ignore')
    .merge(borough_by_postcode, on='Postcode', how='inner')
    .reset_index(drop=True)
)

In [22]:
# Reorder the columns to Postcode, Borough, Neighbourhood and sort the rows by Postcode
df_final = df_final.loc[:, ['Postcode', 'Borough', 'Neighbourhood']].sort_values('Postcode')

In [23]:
# First five rows of the grouped dataframe
df_final.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Getting the Geographical Coordinates of the Neighborhoods

In [25]:
# Location of the CSV file holding one latitude/longitude pair per postal code
web_coord = 'http://cocl.us/Geospatial_data'
# Read the CSV straight from the URL into a dataframe
coord = pd.read_csv(filepath_or_buffer=web_coord)

In [26]:
# First five rows of the coordinates table
coord.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [32]:
# Give the key column the same name as in df_final ('Postcode') so the two
# frames can be merged on it; the rename is applied to coord in place.
coord.rename({'Postal Code': 'Postcode'}, axis='columns', inplace=True)
coord.head(5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [39]:
# Attach latitude and longitude to each postcode row by joining on 'Postcode'
df_coord = pd.merge(df_final, coord, on='Postcode')
df_coord.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848
