# Segmenting and Clustering Neighborhoods in Toronto

#### Import Libraries

In [292]:
from bs4 import BeautifulSoup
import requests
import urllib
import urllib.request

#### Convert url to HTML, parse into lists

In [293]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'



r = requests.get(url)
soup = BeautifulSoup(r.content,'lxml')
#print(soup.prettify())



In [294]:
table = soup.find('table', class_ = 'wikitable sortable')
#print(match.prettify())

header = table.tbody.tr.th.text
print(header)


Postcode


In [295]:
for table in soup.find_all('table', class_ = 'wikitable sortable'):
    header = table.tbody.tr.text
    print(header)
header = header.rstrip('\n')
header = header.lstrip('\n')
header = header.replace('\n', ',')
header_list = header.split(',')
header_list


Postcode
Borough
Neighbourhood



['Postcode', 'Borough', 'Neighbourhood']

In [296]:
entries_list = []
count = 0
for table in soup.find_all('table', class_ = 'wikitable sortable'):
    entry = table.tbody.text
    entry = entry.rstrip('\n')
    entry = entry.rstrip(' ')
    entry = entry.lstrip('\n')
    entry = entry.lstrip(' ')
    entry = entry.replace('\n', ',')
    
    entry_list.append(entry)
    entry_list = entry.split(',')
    print()
    




In [297]:
#clean it up a bit
for element in entry_list:
    try:
        entry_list.remove('')
    except ValueError:
        pass
#entry_list
del entry_list[0:3] # we already have these in header_list
entry_list[0:10]

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A']

#### Convert data to pandas dataframe

In [298]:
import pandas as pd
import numpy as np

In [299]:
df_toronto = pd.DataFrame(data = np.array(entry_list).reshape(289,3), columns = header_list)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Clean,Segment DataFrame

#### Drop entries with no Borough

In [300]:
df_toronto.replace('Not assigned', np.nan, inplace=True)
df_toronto = df_toronto.dropna(subset=['Borough'])
print(df_toronto.shape)
df_toronto.head(10)

(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### Replace 'Not assigned' neighbourhood with Borough value

In [301]:
df_toronto.fillna(0, inplace=True)
df_toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,0
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [302]:
mask = df_toronto['Neighbourhood'] == 0
df_toronto.loc[mask, 'Neighbourhood'] = df_toronto['Borough']
df_toronto['Neighbourhood'] = df_toronto['Neighbourhood'].mask(mask, df_toronto['Borough'])
print(df_toronto.shape)
df_toronto.head(10)


(212, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### Group by Postcode

In [303]:
df_toronto_grouped = df_toronto.groupby(['Postcode', 'Borough']).agg({'Neighbourhood':lambda x: ', '.join(x)}).reset_index()
df_toronto_grouped.head(10) #only shows Scarborough because dataframe sorted by postcode

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [304]:
#If you want to see more of the dataframe to verify it is consistent with the desired solution on the coursera page
df_toronto_grouped

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Shape

In [305]:
df_toronto_grouped.shape

(103, 3)