## Segmenting and Clustering Neighborhoods in Toronto: Part I

This notebook downloads and cleans the data used in the clustering project.

### Import libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup # used to parse data from website

### Parse table from website

Define a function that downloads the page at a URL and parses its first HTML table into a DataFrame.

In [96]:
def parse_url_table(url):
    """Download a web page and return its first HTML table as a DataFrame.

    Column names are taken from the table's ``<th>`` cells, and every
    ``<tr>`` that contains at least one ``<td>`` becomes one row. Cell text
    has trailing newlines removed and is otherwise kept verbatim, so all
    values are strings.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pandas.DataFrame
        One row per data row of the table, with a default integer index.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    IndexError
        If the page contains no ``<table>`` element.
    ValueError
        If a data row's cell count differs from the number of header cells.
    """
    # Fail fast on a hung connection or an HTTP error instead of silently
    # parsing an error page as if it were the data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Only the first table on the page is used.
    tables = soup.find_all("table")
    if not tables:
        raise IndexError(f"no <table> element found at {url!r}")
    table = tables[0]

    col_names = [th.get_text().rstrip("\n") for th in table.find_all('th')]

    # Collect plain lists and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    rows = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            # header-only (or empty) row: nothing to record
            continue
        values = [cell.get_text().rstrip("\n") for cell in cells]
        if len(values) != len(col_names):
            raise ValueError(
                f"table row has {len(values)} cells but the header has "
                f"{len(col_names)} columns"
            )
        rows.append(values)

    return pd.DataFrame(rows, columns=col_names)

Get the postal code table from the Wikipedia page

In [127]:
# Wikipedia page "List of postal codes of Canada: M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# download the page and parse its first table into a DataFrame
df = parse_url_table(url)
# preview the first rows of the raw table
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Define a function to clean the table

In [128]:
def clean_table(df):
    """Return a cleaned copy of the postal code table.

    Cleaning steps, in order:

    1. Drop rows whose ``Borough`` is ``'Not assigned'``.
    2. Where ``Neighborhood`` is missing (NaN/None, blank, or the literal
       ``'Not assigned'``), use the row's ``Borough`` instead.
    3. Replace the ``' / '`` separator inside ``Neighborhood`` with ``', '``.
    4. Renumber the index from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``Borough`` and ``Neighborhood``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and a fresh ``RangeIndex``.
    """
    # .copy() gives an independent frame, so the column assignment below
    # neither touches the caller's data nor triggers SettingWithCopyWarning.
    cleaned = df.loc[df["Borough"] != "Not assigned"].copy()

    neighborhood = cleaned["Neighborhood"]
    # Scraped cells are usually empty strings rather than NaN, so treat
    # every spelling of "no value" as missing.
    is_missing = (
        neighborhood.isna()
        | neighborhood.astype(str).str.strip().isin(["", "Not assigned"])
    )
    # Assign through a single indexing step; the chained form
    # df[mask].loc[...] = ... writes to a temporary copy and is lost.
    neighborhood = neighborhood.mask(is_missing, cleaned["Borough"])

    # regex=False: ' / ' is a literal separator, not a pattern.
    cleaned["Neighborhood"] = neighborhood.str.replace(" / ", ", ", regex=False)

    return cleaned.reset_index(drop=True)

Clean the table

In [129]:
# rebind df to the cleaned table (the raw frame is no longer reachable under this name)
df = clean_table(df)
# preview the first rows of the cleaned table
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [131]:
# (number of rows, number of columns) of the cleaned table
df.shape

(103, 3)