# This notebook will be used for the development of the Capstone Project
### *By Isaac Bautista*

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Smoke test: confirms the kernel executes cells.
greeting = 'Hello Capstone Project Course!'
print(greeting)

Hello Capstone Project Course!


# Week 3 - Segmenting and Clustering Neighborhoods in Toronto

Retrieving the Wikipedia HTML document as plain text, extracting the Toronto table and finding all of its rows in HTML format

In [11]:
# Download the Wikipedia page that lists the postal codes starting with "M".
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout prevents the cell from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as data.
result = requests.get(WIKI_URL, timeout=30)
result.raise_for_status()

html = result.text

soup = BeautifulSoup(html, 'lxml')

# The postal-code table is taken to be the first <table> of the document.
tables = soup.find_all('table')
toronto_table = tables[0]

# Every <tr> of that table (header row included), still as HTML tags.
toronto_table_trs = toronto_table.find_all('tr')
toronto_table_trs[0:5]

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>]

Stripping the HTML markup and unwanted characters from the rows to obtain a list of comma-separated row strings

In [12]:
def _cells_to_csv_row(cells):
    """Return the text of each cell, stripped of surrounding whitespace, joined by ','.

    Parameters
    ----------
    cells : iterable of bs4 tags
        The <th> or <td> tags of a single table row.

    Returns
    -------
    str
        One comma-separated string, e.g. 'M3A,North York,Parkwoods'.
    """
    # Reading the text straight from each tag avoids turning the tag list
    # into a string, re-parsing it and trimming '[', ']' and '\n' by hand.
    return ','.join(cell.get_text().strip() for cell in cells)


# The header comes from the <th> cells of the first row.
toronto_table_head = _cells_to_csv_row(toronto_table_trs[0].find_all('th'))
toronto_table_rows = [toronto_table_head]

for n_toronto_table_tr in toronto_table_trs:
    n_toronto_table_tr_tds = n_toronto_table_tr.find_all('td')
    # Rows without <td> cells (such as the header row) carry no data.
    if not n_toronto_table_tr_tds:
        continue
    toronto_table_rows.append(_cells_to_csv_row(n_toronto_table_tr_tds))

# NOTE: a cell whose own text contains a comma would still be split into
# extra fields by any later step that splits these strings on ','.
toronto_table_rows[0:5]

['Postcode,Borough,Neighbourhood',
 'M1A,Not assigned,Not assigned',
 'M2A,Not assigned,Not assigned',
 'M3A,North York,Parkwoods',
 'M4A,North York,Victoria Village']

Creating the data frame

In [5]:
# Load the row strings as a single column, then split that column on ','
# so that every field gets a column of its own.
raw_rows_column = pd.DataFrame(toronto_table_rows)[0]
toronto_df01 = raw_rows_column.str.split(',', expand=True)

print(f'toronto_df01.shape: {toronto_df01.shape}')
toronto_df01.head()

toronto_df01.shape: (289, 3)


Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


Promoting the first row to be the data frame header

In [6]:
# The first row holds the column titles: use its values as the column
# labels, drop that row, and renumber the remaining rows from 0.
header_labels = toronto_df01.iloc[0]
toronto_df02 = (
    toronto_df01
    .rename(columns=header_labels)
    .drop(index=[0])
    .reset_index(drop=True)
)

print(f'toronto_df02.shape: {toronto_df02.shape}')
toronto_df02.head()

toronto_df02.shape: (288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Removing the rows whose Borough value is Not assigned

In [7]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the surviving rows from 0.
has_borough = toronto_df02['Borough'] != 'Not assigned'
toronto_df03 = toronto_df02[has_borough].reset_index(drop=True)

print(f'toronto_df03.shape: {toronto_df03.shape}')
toronto_df03.head()

toronto_df03.shape: (211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Joining rows' neighbourhoods by grouping them by Postcode and Borough

In [8]:
# Collect the Neighbourhood values of every (Postcode, Borough) pair,
# keeping the groups in order of first appearance (sort=False) ...
neighbourhoods_by_area = toronto_df03.groupby(
    ['Postcode', 'Borough'], sort=False
)['Neighbourhood']

# ... and merge each group's names into one ', '-separated string.
toronto_df04 = neighbourhoods_by_area.apply(', '.join).reset_index()

print(f'toronto_df04.shape: {toronto_df04.shape}')
toronto_df04.head()

toronto_df04.shape: (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


For every row whose Neighbourhood is set to Not assigned, replacing the Neighbourhood with that row's Borough value

In [9]:
# Work on a copy: a plain assignment would only alias toronto_df04, so the
# update below would silently modify toronto_df04 as well.
toronto_df = toronto_df04.copy()

# Rows whose Neighbourhood is still the 'Not assigned' placeholder.
unassigned_neighbourhood = toronto_df['Neighbourhood'] == 'Not assigned'

# For those rows, fall back to the row's own Borough value.
toronto_df.loc[unassigned_neighbourhood, 'Neighbourhood'] = toronto_df.loc[unassigned_neighbourhood, 'Borough']

toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [10]:
# Final size of the cleaned table as (rows, columns).
print(f'toronto_df.shape: {toronto_df.shape}')

toronto_df.shape: (103, 3)
