# __Segmenting and Clustering Neighborhoods in Toronto__
## __PART 1: Creating the dataframe from the wikipedia page__

__Before we get the data and start exploring it, let's import the libraries that we will need.__

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Lift pandas' display truncation so frames are rendered with every column and row.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)

__First, we need to download the raw text data from the Wikipedia page__

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting an
# error page be parsed as if it were the real data.
response = requests.get(wikipedia_link, timeout=30)
response.raise_for_status()
raw_wikipedia_page = response.text

# The page is HTML, so parse it with an HTML parser. 'html.parser' ships with
# Python; the 'xml' feature is meant for XML documents and requires lxml.
soup = BeautifulSoup(raw_wikipedia_page, 'html.parser')

__We find the table in the page and extract the data inside__

In [3]:
# Take the first <table> of the page; it is expected to hold the postal-code data.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

# One list per output column; the lists are filled in parallel, one entry per row.
Postalcode    = []
Borough       = []
Neighbourhood = []

for tr_cell in table.find_all('tr'):
    td_cells = tr_cell.find_all('td')
    if not td_cells:   # rows without any <td> (e.g. a header row) carry no data
        continue

    # Only the first three cells of a row are used. Every value is stripped,
    # because scraped cell text can carry surrounding whitespace or newlines
    # that would break exact comparisons such as `Borough != 'Not assigned'`.
    row = [td_cell.text.strip() for td_cell in td_cells[:3]]

    # Rows with fewer than three cells keep the 'inan' placeholder for the
    # missing values.
    row += ['inan'] * (3 - len(row))

    P_tmp, B_tmp, N_tmp = row
    Postalcode.append(P_tmp)
    Borough.append(B_tmp)
    Neighbourhood.append(N_tmp)

In [4]:
# Assemble the three scraped lists into a single table.
scraped_columns = {
    'Postal Code': Postalcode,
    'Borough': Borough,
    'Neighbourhood': Neighbourhood,
}
df = pd.DataFrame(scraped_columns)
print('The shape of the table with all Not Assigned Values: ', df.shape)

# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
print('The shape of the table without Not Assigned Values: ', df.shape)

The shape of the table with all Not Assigned Values:  (289, 3)
The shape of the table without Not Assigned Values:  (212, 3)


__The dataframe still has neighbourhood values that are 'Not assigned', so we need to copy the borough name into the neighbourhood for those rows__

In [5]:
# Where the neighbourhood is 'Not assigned', fall back to the borough of the
# same row. Series.mask aligns both Series on the index and returns a new
# column, so this relies neither on in-place mutation of a column taken from a
# filtered frame nor on passing a Series as the replacement value to `replace`.
unassigned = df['Neighbourhood'] == 'Not assigned'
df = df.assign(Neighbourhood=df['Neighbourhood'].mask(unassigned, df['Borough']))

# Collapse rows that share a postal code and borough into a single row whose
# neighbourhoods are joined into one comma-separated string.
df = (
    df.groupby(['Postal Code', 'Borough'])['Neighbourhood']
      .agg(', '.join)
      .reset_index()
)

In [6]:
# Show the dataframe as the cell's output.
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


__It is easier to save the dataframe to a .csv file in order to use it in other notebooks__

In [7]:
# Write the table to disk without the index column so that the file holds
# only the data columns.
csv_path = 'Toronto_Part_1.csv'
df.to_csv(csv_path, index=False)

In [8]:
# Report the size of the final table: first the row count, then (rows, columns).
row_count = df.shape[0]
print('The number of rows of the dataframe is', row_count)
print(df.shape)

The number of rows of the dataframe is 103
(103, 3)
