# Segmenting and Clustering Neighborhoods in Toronto


### Step 1 : Getting the data from the Wikipedia page

Using the BeutifulSoup library, we extract the data and create a dataframe using the pandas library

In [12]:
# Import the libraries 
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [13]:
# set the URL containing the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text
Canada_data = BeautifulSoup(source, 'lxml')

After retrieving the data from the Wikipedia page, we construct the proper Dataframe and append the data.

In [14]:
# Create the Dataframe 
column_names = ['Postcode','Borough','Neighborhood']
toronto_df = pd.DataFrame(columns = column_names)

# Scrape the Wikipedia page to find Postalcode, Borough & Neighborhood 
content = Canada_data.find('div', class_='mw-parser-output')
table = content.table.tbody
postalcode = 0
borough = 0
neighborhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            i = i + 1
        elif i == 1:
            borough = td.text
            i = i + 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
    toronto_df = toronto_df.append({'Postcode': postcode,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

In [15]:
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M9Z,0,0
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


As we can observe, many values do not exist and are not assigned. We move to the next step where we are going to 'clean' our dataframe.

In [20]:
# Cleaning our data..

toronto_df = toronto_df[toronto_df.Borough!='Not assigned']
toronto_df = toronto_df[toronto_df.Borough!= 0]
toronto_df.reset_index(drop = True, inplace = True)
i = 0
for i in range(0,toronto_df.shape[0]):
    if toronto_df.iloc[i][2] == 'Not assigned':
        toronto_df.iloc[i][2] = toronto_df.iloc[i][1]
        i = i+1
                                 
toronto_data = toronto_df.groupby(['Postcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
toronto_data.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In the following steps we will remove the cells that have a "Not assigned" borough, joing neighborhoods that have the same postal code area and assign Neighborhoods same as boroughs where needed.

In [31]:
toronto_data = toronto_data.dropna()
empty = 'Not assigned'
toronto_data = toronto_data[(toronto_data.Postcode != empty ) & (toronto_data.Borough != empty) & (toronto_data.Neighborhood != empty)]

toronto_data.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [24]:
def neighborhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighborhood'].tolist()))
                    
grp = toronto_data.groupby(['Postcode', 'Borough'])
toronto_data = grp.apply(neighborhood_list).reset_index(name='Neighborhood')

In [29]:
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [30]:
print(toronto_data.shape)

(103, 3)
