### This notebook presents two ways of scraping table data from a website using Python.  
Here, we write the output dataframe to a Dataiku dataset because the data is used in Dataiku afterwards. If you do not use Dataiku, you can simply write the output to a CSV file with pandas (`DataFrame.to_csv`) instead.

In [4]:
import csv
import re
import shutil
import time
import urllib.request
from io import BytesIO
from urllib.error import HTTPError
from urllib.request import Request, urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [23]:
import numpy as np

In [5]:
# URL of the page to scrape; both methods below read the first <table> found on it.
site = 'https://minmed.sg/vaccinations/'

### Method 1: Using pandas only

In [31]:
### We get an error without indicating the User Agent in the headers
# The failure is the point of this cell, so the HTTPError is caught and printed
# instead of being left to abort a "Restart & Run All".
try:
    with urlopen(site) as resp:
        html = resp.read()
    # Wrap the raw bytes in a file-like object: passing literal HTML to
    # read_html is deprecated in recent pandas versions.
    data = pd.read_html(BytesIO(html))[0]
except HTTPError as err:
    print(f"{type(err).__name__}: {err}")

HTTPError: HTTP Error 403: Forbidden

In [0]:
# Request headers carrying a browser-like User-Agent; the request without it was rejected with 403 above.
hdr = {'User-Agent': 'Mozilla/5.0'}

In [27]:
# Build the request with the User-Agent header attached.
# `Request` and `urlopen` are the names imported from urllib.request at the top
# of the notebook (the bare `urllib` name is not bound by that import).
req = Request(site, headers=hdr)
# NOTE(review): this response is opened only so it can be inspected in the next
# cell and is never closed; the actual read below re-opens `req` in a `with` block.
response = urlopen(req)

In [28]:
response

<http.client.HTTPResponse at 0x13ab98518>

In [29]:
### Now it works after adding in the headers
# `req` carries the User-Agent header; the context manager closes the connection.
with urlopen(req) as resp:
    html = resp.read()

# read_html returns one DataFrame per <table> on the page; [0] keeps the first.
# The bytes are wrapped in BytesIO because passing literal HTML is deprecated in pandas.
data = pd.read_html(BytesIO(html))[0]

In [30]:
data

Unnamed: 0,Vaccinations,N,G,M,P
0,Flu Jab (Influvac Tetra),$48.15,$12.00,$0.00,$0.00
1,Prevnar 13,$200.05,$63.00,$31.00,$16.00
2,Pneumovax 23,$93.05,$40.00,$20.00,$10.00
3,Boostrix,$58.85,$15.00,$0.00,$0.00
4,Hepatitis B (Engerix B Adult),$55.64,$17.00,$0.00,$0.00
5,"Mmr-II Measles,Mumps,Rubella",$70.00,$35.00,$18.00,$9.00
6,Infanrix IPV/HIB (5-in-1),$107.00,N.A,N.A,N.A
7,Infanrix Hexa (6-in-1),$128.40,N.A,N.A,N.A
8,Chicken Pox (Varivax),$96.30,$45.00,$23.00,$11.00
9,Hepatitis A (Paed 720),$107.00,N.A,N.A,N.A


### Method 2: Using BeautifulSoup

In [6]:
# Send the browser-like User-Agent with the request (it was previously defined
# here but never passed on), bound the wait with a timeout, and stop on HTTP
# errors so an error page is never parsed as if it were the real content.
hdr = {'User-Agent': 'Mozilla/5.0'}
bookpage = requests.get(site, headers=hdr, timeout=30)
bookpage.raise_for_status()
soup = BeautifulSoup(bookpage.text, "html.parser")

In [0]:
# print(soup.prettify())

In [10]:
soup.find('table')

<table class="ea-advanced-data-table ea-advanced-data-table-static ea-advanced-data-table-261a82cb ea-advanced-data-table-sortable ea-advanced-data-table-searchable" data-id="261a82cb"><thead><tr><th>Vaccinations</th><th>N</th><th>G</th><th><p>M</p></th><th style="width: 151px">P</th></tr></thead><tbody><tr><td><p>Flu Jab (Influvac Tetra)</p></td><td>$48.15 </td><td><p>$12.00</p></td><td><p>$0.00</p></td><td><p>$0.00</p></td></tr><tr><td>Prevnar 13</td><td><p>$200.05</p></td><td>$63.00 </td><td><p>$31.00</p></td><td>$16.00 </td></tr><tr><td>Pneumovax 23</td><td>$93.05 </td><td>$40.00 </td><td><p>$20.00</p></td><td>$10.00 </td></tr><tr><td>Boostrix</td><td><p>$58.85</p></td><td><p>$15.00</p></td><td><p>$0.00</p></td><td><p>$0.00</p></td></tr><tr><td>Hepatitis B (Engerix B Adult)</td><td>$55.64 </td><td><p>$17.00</p></td><td><p>$0.00</p></td><td><p>$0.00</p></td></tr><tr><td>Mmr-II Measles,Mumps,Rubella</td><td><p>$70.00</p></td><td><p>$35.00</p></td><td><p>$18.00</p></td><td><p>$9.00</p

In [12]:
# Locate the first <table> on the page and collect its <tr> rows
# (the header row comes first, followed by one row per vaccination).
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line.
    raise ValueError(f"No <table> element found on {site}")
rows = table.find_all('tr')

In [13]:
rows

[<tr><th>Vaccinations</th><th>N</th><th>G</th><th><p>M</p></th><th style="width: 151px">P</th></tr>,
 <tr><td><p>Flu Jab (Influvac Tetra)</p></td><td>$48.15 </td><td><p>$12.00</p></td><td><p>$0.00</p></td><td><p>$0.00</p></td></tr>,
 <tr><td>Prevnar 13</td><td><p>$200.05</p></td><td>$63.00 </td><td><p>$31.00</p></td><td>$16.00 </td></tr>,
 <tr><td>Pneumovax 23</td><td>$93.05 </td><td>$40.00 </td><td><p>$20.00</p></td><td>$10.00 </td></tr>,
 <tr><td>Boostrix</td><td><p>$58.85</p></td><td><p>$15.00</p></td><td><p>$0.00</p></td><td><p>$0.00</p></td></tr>,
 <tr><td>Hepatitis B (Engerix B Adult)</td><td>$55.64 </td><td><p>$17.00</p></td><td><p>$0.00</p></td><td><p>$0.00</p></td></tr>,
 <tr><td>Mmr-II Measles,Mumps,Rubella</td><td><p>$70.00</p></td><td><p>$35.00</p></td><td><p>$18.00</p></td><td><p>$9.00</p></td></tr>,
 <tr><td>Infanrix IPV/HIB (5-in-1)</td><td>$107.00 </td><td><p>N.A</p></td><td><p>N.A</p></td><td><p>N.A</p></td></tr>,
 <tr><td>Infanrix Hexa (6-in-1)</td><td>$128.40 </td><t

In [14]:
rows[1]

<tr><td><p>Flu Jab (Influvac Tetra)</p></td><td>$48.15 </td><td><p>$12.00</p></td><td><p>$0.00</p></td><td><p>$0.00</p></td></tr>

In [15]:
rows[1].find_all('td')

[<td><p>Flu Jab (Influvac Tetra)</p></td>,
 <td>$48.15 </td>,
 <td><p>$12.00</p></td>,
 <td><p>$0.00</p></td>,
 <td><p>$0.00</p></td>]

In [18]:
rows[1].find_all('td')[1].get_text().strip()

'$48.15'

In [19]:
# Accumulator for the scraped rows: one (vaccination, n, g, m, p) tuple per table row.
data = []

In [21]:
# Rebuild `data` from scratch inside this cell so that re-running it never
# appends the same rows a second time.
data = []
for row in rows[1:]:  # rows[0] is the <th> header row
    cells = row.find_all('td')
    # The first five <td> cells are the vaccine name followed by the N, G, M and P
    # price columns; get_text() flattens nested <p> tags and strip() drops padding.
    vaccination, n, g, m, p = (cell.get_text().strip() for cell in cells[:5])
    data.append((vaccination, n, g, m, p))
        

In [22]:
data

[('Flu Jab (Influvac Tetra)', '$48.15', '$12.00', '$0.00', '$0.00'),
 ('Prevnar 13', '$200.05', '$63.00', '$31.00', '$16.00'),
 ('Pneumovax 23', '$93.05', '$40.00', '$20.00', '$10.00'),
 ('Boostrix', '$58.85', '$15.00', '$0.00', '$0.00'),
 ('Hepatitis B (Engerix B Adult)', '$55.64', '$17.00', '$0.00', '$0.00'),
 ('Mmr-II Measles,Mumps,Rubella', '$70.00', '$35.00', '$18.00', '$9.00'),
 ('Infanrix IPV/HIB (5-in-1)', '$107.00', 'N.A', 'N.A', 'N.A'),
 ('Infanrix Hexa (6-in-1)', '$128.40', 'N.A', 'N.A', 'N.A'),
 ('Chicken Pox (Varivax)', '$96.30', '$45.00', '$23.00', '$11.00'),
 ('Hepatitis A (Paed 720)', '$107.00', 'N.A', 'N.A', 'N.A'),
 ('Hepatitis A (Havrix 1440)', '$114.49', '$114.49', '$114.49', '$114.49'),
 ('Twinrix', '$128.40', '$128.40', '$128.40', '$128.40'),
 ('Tetanus Jab (Tetavax)', '$32.10', '$32.10', '$32.10', '$32.10'),
 ('Zostavax', '$304.95', '$304.95', '$304.95', '$304.95'),
 ('Gardasil 9', '$224.70', '$224.70', '$224.70', '$224.70'),
 ('Rotarix', '$141.24', '$141.24', '$

In [24]:
# Build the frame straight from the list of row tuples; pandas accepts a list of
# records directly, so no intermediate NumPy array is needed.
df = pd.DataFrame(data)

In [25]:
# Descriptive names for the five scraped columns, in the same order as the
# page's header row (Vaccinations, N, G, M, P).
df.columns = [
    'vaccinations',                          # Vaccinations
    'public_rate',                           # N
    'chas_green',                            # G
    'merdeka_generation_chas_blue_orange',   # M
    'pioneer_generation',                    # P
]

In [26]:
df

Unnamed: 0,vaccinations,public_rate,chas_green,merdeka_generation_chas_blue_orange,pioneer_generation
0,Flu Jab (Influvac Tetra),$48.15,$12.00,$0.00,$0.00
1,Prevnar 13,$200.05,$63.00,$31.00,$16.00
2,Pneumovax 23,$93.05,$40.00,$20.00,$10.00
3,Boostrix,$58.85,$15.00,$0.00,$0.00
4,Hepatitis B (Engerix B Adult),$55.64,$17.00,$0.00,$0.00
5,"Mmr-II Measles,Mumps,Rubella",$70.00,$35.00,$18.00,$9.00
6,Infanrix IPV/HIB (5-in-1),$107.00,N.A,N.A,N.A
7,Infanrix Hexa (6-in-1),$128.40,N.A,N.A,N.A
8,Chicken Pox (Varivax),$96.30,$45.00,$23.00,$11.00
9,Hepatitis A (Paed 720),$107.00,N.A,N.A,N.A


In [49]:
df.shape[0]

17

In [53]:
# Single positional lookup (row 0, column 0). The chained form df.iloc[0][0]
# indexes a string-labelled Series with an integer, which pandas has deprecated.
df.iloc[0, 0]

'Flu Jab (Influvac Tetra)'

In [32]:
import dataiku
from dataiku import pandasutils as pdu

In [33]:
# Get a handle to the current project
client = dataiku.api_client()
project = client.get_project(dataiku.default_project_key())

In [35]:
# Name of the Dataiku dataset that receives the price table (`df`).
output_name = 'cost'

In [36]:
# Get a managed-dataset creation helper for `output_name`, point it at the
# "filesystem_managed" store, then create the dataset.
# NOTE(review): presumably create() fails when the dataset already exists, which
# would make this cell non-re-runnable — confirm against the Dataiku API.
builder = project.new_managed_dataset_creation_helper(output_name)
builder.with_store_into("filesystem_managed")
builder.create() 



<dataikuapi.dss.dataset.DSSDataset at 0x10dd105c0>

In [38]:
# Write recipe outputs: store the price DataFrame in the `output_name` dataset.
# NOTE(review): write_with_schema presumably also sets the dataset schema from the frame — confirm.
dataiku.Dataset(output_name).write_with_schema(df)

17 rows successfully written (d9izmpghaR)


### Getting the description of vaccinations

In [0]:
# soup.find_all('p')

In [62]:
soup.find_all('p')[59].get_text().strip()

'Hepatitis A (Havrix 1440)'

In [45]:
soup.find_all('p')[60].get_text().strip()

'ADULTRequires 2 doses, 6 mth intervals.\nHepatitis A vaccine (Havrix, Vaqta) is used to prevent hepatitis A, a type of liver disease that is caused by the hepatitis A virus (HAV). Hepatitis A is usually spread when a person ingests fecal matter from contact with food, drinks, or objects which have been contaminated by feces or stool of an HAV-infected person.'

In [47]:
soup.find_all('p')[62].get_text().strip()

'ADULTRequires 3 doses, 0, 1, 6 mth intervals. No. of doses required depends on blood test results.\nENGERIX-B is a vaccine indicated for immunization against infection caused by all known subtypes of hepatitis B virus.'

In [48]:
soup.find_all('p')[64].get_text().strip()

'ADULTRequires 3 doses, 0, 1, 6 mth intervals.TWINRIX is the only dual hepatitis A and B vaccine . It’s given as a series of doses (injections) by a healthcare professional. TWINRIX is used in adults, adolescents, children, and infants to prevent hepatitis A and hepatitis B diseases.'

In [71]:
# Accumulator for (vaccination name, description) pairs scraped from the page's <p> tags.
desc_table = []

In [72]:
# Parse the <p> tags once instead of re-scanning the whole document twice per iteration.
paragraphs = soup.find_all('p')

# Positions found by inspecting the page above: paragraph 59 holds the first
# vaccination name, each name is immediately followed by its description, and
# 18 such name/description pairs are read.
first_name_idx = 59
n_vaccinations = 18

# Rebuild the list inside this cell so re-running it does not duplicate entries.
desc_table = []
for i in range(n_vaccinations):
    name_idx = first_name_idx + i * 2
    vaccination = paragraphs[name_idx].get_text().strip()
    desc = paragraphs[name_idx + 1].get_text().strip()
    desc_table.append((vaccination, desc))

In [73]:
desc_table

[('Hepatitis A (Havrix 1440)',
  'ADULTRequires 2 doses, 6 mth intervals.\nHepatitis A vaccine (Havrix, Vaqta) is used to prevent hepatitis A, a type of liver disease that is caused by the hepatitis A virus (HAV). Hepatitis A is usually spread when a person ingests fecal matter from contact with food, drinks, or objects which have been contaminated by feces or stool of an HAV-infected person.'),
 ('Hepatitis B (Engerix B Adult)',
  'ADULTRequires 3 doses, 0, 1, 6 mth intervals. No. of doses required depends on blood test results.\nENGERIX-B is a vaccine indicated for immunization against infection caused by all known subtypes of hepatitis B virus.'),
 ('Twinrix',
  'ADULTRequires 3 doses, 0, 1, 6 mth intervals.TWINRIX is the only dual hepatitis A and B vaccine . It’s given as a series of doses (injections) by a healthcare professional. TWINRIX is used in adults, adolescents, children, and infants to prevent hepatitis A and hepatitis B diseases.'),
 ('Tetanus Jab (Tetavax)',
  'ADULTThi

In [74]:
# Build the frame straight from the list of (name, description) tuples; the
# intermediate NumPy array is not needed.
df2 = pd.DataFrame(desc_table)

In [75]:
# Name the two columns: the vaccination name and its free-text description.
df2.columns = ['vaccinations','description']

In [76]:
# Same steps as for the price table, this time for a dataset named 'desc':
# get a creation helper, target the "filesystem_managed" store, create the dataset.
# NOTE(review): presumably create() fails when 'desc' already exists — confirm before re-running.
builder = project.new_managed_dataset_creation_helper('desc')
builder.with_store_into("filesystem_managed")
builder.create() 
# Write recipe outputs: store the descriptions DataFrame in the 'desc' dataset.
dataiku.Dataset('desc').write_with_schema(df2)

18 rows successfully written (l3EbOl6NmA)


References:   
https://thecleverprogrammer.com/2022/03/10/scrape-table-from-a-website-using-python/  
https://blog.devgenius.io/8-steps-in-scraping-a-table-from-a-website-using-python-e9eca91c9779  
https://stackoverflow.com/questions/47029280/python-3-add-custom-headers-to-urllib-request-request  