# Notebook to scrape Toronto Neighbourhoods

In [1]:
from bs4 import BeautifulSoup # library to parse and scrape HTML pages
#import urllib # library for opening URLs (not used; requests fetches the page instead)
import requests # library to handle HTTP requests
import re # library to handle regular expressions
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
# Display every column and every row of a DataFrame (None removes the truncation limit)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files

## Extract the HTML page from Wikipedia

In [2]:
# Wikipedia page listing the Toronto ("M") postal codes with their boroughs and neighbourhoods
URL_page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Download the page; the timeout (seconds) keeps the cell from hanging forever on a stalled connection
URL_request = requests.get(URL_page, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of parsing an error page in the cells below
URL_request.raise_for_status()

In [3]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend
soup = BeautifulSoup(URL_request.content, 'html.parser')

# The postal-code data is read from the first <table> carrying the 'wikitable' class
name_table = soup.find('table', attrs={'class': 'wikitable'})

# find() returns None when nothing matches; stop with a clear message rather than
# letting the next line fail with an opaque AttributeError on None
if name_table is None:
    raise ValueError("No <table class='wikitable'> found in the downloaded page; the page layout may have changed")

table_head = name_table.find_all('th') # header cells: ['Postcode', 'Borough', 'Neighbourhood']
table_rows = name_table.find_all('tr') # every row of the table, header row included

## Populate the dataframe

In [4]:
# Column names are the whitespace-stripped text of each <th> header cell
column_names = [header_cell.text.strip() for header_cell in table_head]

In [5]:
# Build the dataframe from the table body.
#
# Each data row of the table is expected to look like:
#   <tr>
#       <td>Postcode value</td>
#       <td>Borough value</td>
#       <td>Neighbourhood value</td>
#   </tr>

# One list of stripped cell texts per <tr>; the first <tr> is skipped
# (presumably the header row, whose cells are <th> rather than <td>)
rows_list = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in table_rows[1:]
]

# Create the dataframe in one go from the collected rows
neighborhoods_df = pd.DataFrame(rows_list, columns=column_names)

## Clean and prepare the dataframe

In [6]:
# Drop every row whose borough is 'Not assigned'
has_borough = neighborhoods_df['Borough'] != 'Not assigned'
neighborhoods_df = neighborhoods_df.loc[has_borough]

# Collapse to one row per (Postcode, Borough) pair, joining that pair's
# neighbourhoods into a single comma-separated string
neighborhoods_df = (
    neighborhoods_df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)

# If a row has a borough but a 'Not assigned' neighbourhood, use the borough name instead.
# A single .loc[mask, column] assignment writes into the frame itself; the chained form
# df['col'][mask] = ... may write to a temporary copy (SettingWithCopyWarning) and does
# not update the frame under pandas Copy-on-Write.
no_neighbourhood = neighborhoods_df['Neighbourhood'] == 'Not assigned'
neighborhoods_df.loc[no_neighbourhood, 'Neighbourhood'] = neighborhoods_df.loc[no_neighbourhood, 'Borough']

In [8]:
# Preview the first five rows of the cleaned frame
neighborhoods_df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# (rows, columns) of the cleaned frame: one row per (Postcode, Borough) group
neighborhoods_df.shape

(103, 3)