# Scraping data for Toronto

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup # web scraping library
import requests

In [9]:
# Wikipedia page listing the "M" postal codes with their borough and neighbourhood
scrape_target = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(scrape_target, timeout=30)
response.raise_for_status()
website_html = response.text
print(website_html[:154])

# Parse the raw HTML so the table can be located by tag/class in the next cell
soup = BeautifulSoup(website_html, 'lxml')
print(soup.prettify()[:166])

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>


In [10]:
# Locate the postal-code table. find() returns None when nothing matches, so fail
# with a clear message here instead of an AttributeError on the following line.
postcode_table = soup.find('table', {'class': 'wikitable sortable'})
if postcode_table is None:
    raise ValueError("No 'wikitable sortable' table found - the page layout may have changed")

# Flat list of every <td> cell in the table (find_all is the current name of the
# legacy findAll alias); the cells are regrouped into rows in the next cell.
table_elements = postcode_table.find_all('td')
print(table_elements[:10])

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
</td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
</td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td>, <td>M4A</td>]


In [11]:
# Regroup the flat list of <td> cells into rows of three consecutive cells
# (postcode, borough, neighbourhood).
CELLS_PER_ROW = 3

# A cell count that is not a multiple of three means the table layout is not what
# this notebook expects; report that explicitly rather than hitting an IndexError
# part-way through the grouping.
if len(table_elements) % CELLS_PER_ROW != 0:
    raise ValueError(
        'Expected the number of table cells to be a multiple of %d, got %d'
        % (CELLS_PER_ROW, len(table_elements))
    )

split_table_elements = [
    table_elements[start:start + CELLS_PER_ROW]
    for start in range(0, len(table_elements), CELLS_PER_ROW)
]
print(split_table_elements[:3])

[[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
</td>], [<td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
</td>], [<td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td>]]


In [12]:
# Every entry of split_table_elements is one table row made of three <td> tags:
#   1st - Postcode column
#   2nd - Bourough column
#   3rd - Neighbourhood column
# .text pulls the visible text out of each tag and .strip() removes the
# surrounding whitespace / trailing newline.
data_table = [
    [postcode_td.text.strip(), bourough_td.text.strip(), neighbourhood_td.text.strip()]
    for postcode_td, bourough_td, neighbourhood_td in split_table_elements
]

data_table[:5]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront']]

In [13]:
# Build the raw frame with its column labels in one step. Passing columns= to the
# constructor (rather than assigning df.columns afterwards) also works when
# data_table is empty, where the two-step form fails with a length mismatch.
df = pd.DataFrame(data_table, columns=['Postcode', 'Bourough', 'Neighbourhood'])
df.head()

Unnamed: 0,Postcode,Bourough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


# Cleaning up

In [14]:
# Keep only rows whose Bourough is not "Not assigned".
# reset_index(drop=True) renumbers the rows from 0 and returns a new frame; it
# replaces the earlier in-place reset_index / drop('index') pair, which mutated a
# filtered slice of the original frame.
df = df.loc[df.Bourough != 'Not assigned'].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Bourough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [15]:
# Rows whose Neighbourhood is "Not assigned" take the Bourough name instead
missing_neighbourhoods = df.Neighbourhood.values == 'Not assigned'  # boolean row filter

# Perform the assignment with a single .loc call. The chained form
# df.Neighbourhood[mask] = ... may write to a temporary copy: it triggers
# SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
df.loc[missing_neighbourhoods, 'Neighbourhood'] = df.loc[missing_neighbourhoods, 'Bourough']
df.head(10)

Unnamed: 0,Postcode,Bourough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


# Data frame build-up

In [16]:
# Collapse df to one row per postcode:
#   - Bourough: the first value seen for that postcode
#   - Neighbourhood: all neighbourhoods of the postcode joined into one
#     comma-separated string
# sort=False keeps the postcodes in order of first appearance, matching the row
# order of df. A single groupby replaces the row-by-row DataFrame.append loop,
# which re-copied the frame on every iteration and no longer exists in
# pandas >= 2.0.
final_df = (
    df.groupby('Postcode', sort=False)
      .agg({'Bourough': 'first', 'Neighbourhood': ', '.join})
      .reset_index()  # move Postcode from the index back to the first column
)
final_df.head(10)

Unnamed: 0,Postcode,Bourough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [17]:
# (rows, columns) of the aggregated frame; final_df holds one row per unique postcode
final_df.shape


(103, 3)