# Notebook 1: Segmenting and Clustering Neighbourhoods in Toronto

In [108]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Scraping the Wikipedia page and creating a dataframe

In [109]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not let a stalled connection hang the notebook
)
response.raise_for_status()  # fail here on an HTTP error instead of parsing an error page
url = response.text  # NOTE: despite its name, `url` holds the page's HTML text

# Parse the HTML and take the first <table> element of the document
# (presumably the postal-code table -- verify if the page layout changes).
page = BeautifulSoup(url, 'lxml')
table = page.find('table')  # must be called on `page`; no `soup` object exists in this notebook
if table is None:
    raise ValueError('No <table> element found on the page')

# Empty frame with the three target columns.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=column_names)

## Keeping only the table rows that have a value for all three columns

In [110]:
# Extract the stripped text of every <td> cell, one list per <tr> row.
table_rows = [
    [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    for tr_cell in table.find_all('tr')
]

# Keep only rows with exactly three cells (one per column); rows without
# <td> cells, such as a header row built from <th>, yield an empty list and are skipped.
# Building the frame once avoids growing it row by row and keeps this cell
# idempotent: re-running it no longer appends duplicate rows.
df = pd.DataFrame(
    [row_data for row_data in table_rows if len(row_data) == 3],
    columns=column_names,
)

## Previewing the resulting dataframe

In [111]:
# Show the first five scraped rows.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Dropping the rows that do not have an assigned Borough

In [112]:
# Row labels of every entry whose Borough is the placeholder 'Not assigned'.
unassigned_borough = df['Borough'].eq('Not assigned')
index = df.loc[unassigned_borough].index

# Remove those rows from df in place; the remaining rows keep their original labels.
df.drop(index=index, inplace=True)

## Previewing the resulting dataframe

In [113]:
# Show the first five rows that remain after the drop.
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Replacing "Not assigned" neighborhoods with the name of their borough

In [114]:
# Rows whose Neighborhood is the placeholder 'Not assigned'.
unnamed_neighborhood = df['Neighborhood'].eq('Not assigned')

# Overwrite the placeholder with the Borough value of the same row
# (the right-hand Series is aligned to the selected rows by index label).
df.loc[unnamed_neighborhood, 'Neighborhood'] = df['Borough']

## Previewing the resulting dataframe

In [115]:
# Show the first ten rows after filling in the missing neighborhood names.
df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


## Combining postal code areas that have more than one neighborhood, separating the names with commas

In [117]:
# One group per (PostalCode, Borough) pair, kept in order of first appearance (sort=False).
group_keys = ['PostalCode', 'Borough']
join_with_commas = ', '.join

# Concatenate the remaining column's values within each group, separated by ', '.
df1 = df.groupby(group_keys, sort=False).agg(join_with_commas)

# Turn the group keys back into ordinary columns with a fresh 0..n-1 index.
df2 = df1.reset_index()

## Previewing the resulting dataframe

In [118]:
# Show the first fifteen rows of the combined frame.
df2.head(n=15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Finding the number of rows in the resulting dataframe

In [119]:
# (number of rows, number of columns) of the combined frame.
(len(df2.index), len(df2.columns))

(103, 3)