# Segmenting and Clustering Neighborhoods in Toronto

In [3]:
#Import necessary Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M"
# and locate the first <table> element on it.
zip_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(zip_url, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
bs_source = response.text
# The page is HTML, so use an HTML parser; the strict 'xml' parser is meant for XML documents.
soup = BeautifulSoup(bs_source, 'html.parser')
soup_table = soup.find('table')
if soup_table is None:
    # find() returns None when nothing matches; stop with a clear message
    # rather than an AttributeError in a later cell.
    raise ValueError("No <table> element found at " + zip_url)

In [5]:
# Target layout of the dataframe: three columns holding the postal code,
# the borough, and the neighborhood.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
# Start from an empty frame that already carries those column labels.
df = pd.DataFrame(data=None, columns=column_names)

In [6]:
# Extract the stripped text of every <td> cell, one list per <tr> row of the table.
table_rows = [
    [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    for tr_cell in soup_table.find_all('tr')
]
# Keep only rows with exactly three cells (one value per column) and build the
# frame in a single step. Constructing it once avoids growing the DataFrame row
# by row, and re-running this cell no longer appends duplicate rows to df.
df = pd.DataFrame(
    [row_data for row_data in table_rows if len(row_data) == 3],
    columns=column_names,
)

In [7]:
# Preview the first rows of the scraped table to sanity-check the extraction.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [13]:
# Drop every row whose borough is the placeholder 'Not assigned';
# only rows with a real borough are kept for further processing.
df = df.loc[df['Borough'].ne('Not assigned')]

In [17]:
# If a row has a borough but a 'Not assigned' neighborhood, use the borough as the neighborhood.
# Only the 'Neighborhood' column is replaced: Series.where keeps the existing value where the
# condition holds and takes the borough otherwise. Using a boolean mask directly as
# `df[mask] = ...` would address whole rows (every column), not just 'Neighborhood'.
df = df.assign(
    Neighborhood=df['Neighborhood'].where(
        df['Neighborhood'] != 'Not assigned',
        df['Borough'],
    )
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [20]:
# Several neighborhoods can share one postal code. Group the rows by postal code
# and join each group's neighborhood strings with a comma, producing a helper
# frame with one row per postal code and the joined text in 'Neighborhood_Joined'.
temp_df = (
    df.groupby('PostalCode')['Neighborhood']
    .agg(','.join)
    .reset_index()
    .rename(columns={'Neighborhood': 'Neighborhood_Joined'})
)
temp_df.head()

Unnamed: 0,PostalCode,Neighborhood_Joined
0,M1B,"Malvern, Rouge"
1,M1C,"Rouge Hill, Port Union, Highland Creek"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [24]:
# Join the per-postal-code neighborhood strings back onto the original rows,
# replace the per-row 'Neighborhood' column with the joined one, and remove
# the rows that become identical as a result.
merged_df = (
    df.merge(temp_df, on='PostalCode')
    .drop(columns=['Neighborhood'])
    .drop_duplicates()
    .rename(columns={'Neighborhood_Joined': 'Neighborhood'})
)

merged_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [25]:
# Report the dimensions of the final dataframe. `.shape` is an attribute (not a method)
# that evaluates to a (rows, columns) tuple; the first element is the number of rows.
merged_df.shape

(103, 3)