# Segmenting and Clustering Neighborhoods in Toronto

### Import libraries

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scrape the postal code table from Wikipedia

In [2]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of parsing an error page.
web = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
web.raise_for_status()
soup = BeautifulSoup(web.text, 'html.parser')

# Locate the sortable wikitable; fail here with a clear message if the page
# layout changed, rather than letting a later cell choke on None.
table = soup.find('table', class_ = 'wikitable sortable')
if table is None:
    raise ValueError("Could not find a 'wikitable sortable' table on the page")

In [3]:
# Parse the scraped <table> element into a DataFrame, using its first row as the header.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated in recent pandas releases; a file-like object is
# accepted by old and new versions alike.
df_wiki = pd.read_html(StringIO(str(table)), header = 0)[0]
df_wiki.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Clean Table

#### Ignore cells with a borough that is Not assigned.

In [4]:
# Keep only the rows whose borough is something other than "Not assigned".
df_wiki = df_wiki.loc[df_wiki["Borough"].ne("Not assigned")]

In [5]:
# Display the table after the rows with an unassigned borough were removed.
df_wiki

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West
282,M8Z,Etobicoke,Mimico NW
283,M8Z,Etobicoke,The Queensway West
284,M8Z,Etobicoke,Royal York South West


#### Join neighbourhoods that share the same postcode

In [6]:
# Collapse rows sharing a Postcode/Borough pair into one row, concatenating the
# remaining column values into a single comma-separated string.
df_wiki = (
    df_wiki
    .groupby(["Postcode", "Borough"], as_index=False)
    .agg(lambda names: ", ".join(names))
)

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [7]:
# Where the neighbourhood is "Not assigned", use the borough name instead.
# The previous loop assigned to the rows yielded by iterrows(); those rows are
# copies, so the writes are not guaranteed to reach df_wiki. A vectorised mask
# replaces the values in the column itself.
unassigned = df_wiki["Neighbourhood"] == "Not assigned"
df_wiki = df_wiki.assign(
    Neighbourhood=df_wiki["Neighbourhood"].mask(unassigned, df_wiki["Borough"])
)

In [8]:
# Display the cleaned table: one row per Postcode/Borough pair after the groupby.
df_wiki

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


#### Shape of dataframe

In [11]:
# (rows, columns) of the cleaned table.
df_wiki.shape

(103, 3)