# House of Representatives HTML Extractor

## PART 1: Preliminaries

Import Dependencies and load the HTML source

In [1]:
from bs4 import BeautifulSoup
import re

with open("House of Representatives.html", encoding = "utf-8") as HR:
	soup = BeautifulSoup(HR, 'html.parser')

The code cell below is the upper "\<div\>" tag

In [2]:
div_tags = soup.body.find_all('div', recursive = False)
b = div_tags[1]

c contains the next hierarchy to the list

In [3]:
for i in b.div.find_all('div', recursive = False):
    if i['class'] == ['col-md-8']:
        c = i

d now contains the list while `lis` now contains all items with `<div class = "panel-body">`. This `div` tag now contains the data.

In [4]:
div_names = []

for j in c:
    if j.name == 'div':
        div_names.append(j)

d = div_names[1]
lis1 = d.find_all("div", class_ = "panel-heading")
lis2 = d.find_all("div", class_ = "panel-body")
len(lis1), len(lis2)

(13972, 13972)

The following cells below represents instances of a `lis` entry

In [5]:
test_var = lis2[0].find_all('p')

for i in test_var:
    print(i.text)

code = lis1[0].text
code

Significance: National
Date Filed: 2019-07-01
Full Title: AN ACT INSTITUTIONALIZING A NATIONAL VALUES, ETIQUETTE, AND MORAL UPRIGHTNESS PROGRAM
Principal Author/s: 1. CAYETANO, ALAN PETER "COMPAÑERO" S. 2. MADRONA. ELEANDRO JESUS F. 3. UNGAB, ISIDRO T.
Date Read: 2019-07-23
Primary Referral: BASIC EDUCATION AND CULTURE
Bill Status: Substituted by HB05829
Mother Bill Status: Republic Act RA11476 enacted on 2020-06-25


'HB00001'

In [6]:
test_var = lis2[1].find_all('p')

for i in test_var:
    print(i.text)

code = lis1[1].text
code

Significance: National
Date Filed: 2019-07-01
Full Title: AN ACT CREATING THE DEPARTMENT OF OVERSEAS FILIPINO WORKERS (OFW) AND FOREIGN EMPLOYMENT, DEFINING ITS POWERS AND FUNCTIONS, APPROPRIATING FUNDS THEREFOR, RATIONALIZING THE ORGANIZATION AND FUNCTIONS OF GOVERNMENT AGENCIES RELATED TO MIGRATION, AND FOR OTHER PURPOSES
Principal Author/s: 1. CAYETANO, ALAN PETER "COMPAÑERO" S. 2. CAYETANO, MARIA LAARNI 3. DUTERTE, PAOLO Z.
Date Read: 2019-07-23
Primary Referral: GOVERNMENT REORGANIZATION
Bill Status: Substituted by HB05832
Mother Bill Status: Republic Act RA11641 enacted on 2021-12-30


'HB00002'

## PART 2: Define the `extract_data` Function

In [7]:
!pip install more_itertools



You should consider upgrading via the 'C:\Users\james\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [49]:
features = ['Significance',
 'Date Filed',
 'Full Title',
 'Principal Author/s',
 'Date Read',
 'Primary Referral',
 'Bill Status',
 'Mother Bill Status',
 'Date Approved on Second Reading',
 'Date Approved on Third Reading',
 'Senate Bill Counterpart',
 'Date Acted Upon By the President',
 'Republic Act No.']

def extract_data(code, tag, lis):
    entry = dict()
    entry['ID'] = code.text
    test_var = tag.find_all('p')
    
    
    for j in lis:
        flag = 0
        for i in test_var:
            temp = i.text
            keys = temp[:temp.index(':')]
            values = temp[temp.index(':')+2:]
            if j == keys and j == 'Principal Author/s':
                temp_lis = re.split(r'\d+\.\s*', values)
                congs = list(set([x.strip() for x in temp_lis])-{''})
                entry[j] = ';'.join(congs).strip()
                entry['Author Count'] = len(congs)
                flag = 1
                break
            elif j == keys:
                entry[j] = values
                flag = 1
                break
        if flag == 0:
            entry[j] = None

    return entry

In [50]:
extract_data(lis1[0], lis2[0], features)

{'ID': 'HB00001',
 'Significance': 'National',
 'Date Filed': '2019-07-01',
 'Full Title': 'AN ACT INSTITUTIONALIZING A NATIONAL VALUES, ETIQUETTE, AND MORAL UPRIGHTNESS PROGRAM',
 'Principal Author/s': 'UNGAB, ISIDRO T.;CAYETANO, ALAN PETER "COMPAÑERO" S.;MADRONA. ELEANDRO JESUS F.',
 'Author Count': 3,
 'Date Read': '2019-07-23',
 'Primary Referral': 'BASIC EDUCATION AND CULTURE',
 'Bill Status': 'Substituted by HB05829',
 'Mother Bill Status': 'Republic Act RA11476 enacted on 2020-06-25',
 'Date Approved on Second Reading': None,
 'Date Approved on Third Reading': None,
 'Senate Bill Counterpart': None,
 'Date Acted Upon By the President': None,
 'Republic Act No.': None}

In [51]:
extract_data(lis1[13971], lis2[13971], features)

{'ID': 'RPT1446',
 'Significance': None,
 'Date Filed': '2022-01-31',
 'Full Title': 'MOTU PROPRIO INQUIRY, IN AID OF LEGISLATION, ON THE REPORT OF THE COMMISSION ON AUDIT REGARDING THE ALLEGED OVERPRICED MEDICAL AND HEALTH EQUIPMENT BY THE DEPARTMENT OF HEALTH UTILIZED FOR THE COVID-19 RESPONSE',
 'Principal Author/s': 'GOOD GOVERNMENT AND PUBLIC ACCOUNTABILITY',
 'Author Count': 1,
 'Date Read': '2022-01-31',
 'Primary Referral': 'GOOD GOVERNMENT AND PUBLIC ACCOUNTABILITY',
 'Bill Status': 'Pending With Rules (Included in OB on 2022-01-31)',
 'Mother Bill Status': None,
 'Date Approved on Second Reading': None,
 'Date Approved on Third Reading': None,
 'Senate Bill Counterpart': None,
 'Date Acted Upon By the President': None,
 'Republic Act No.': None}

In [52]:
extract_data(lis1[10], lis2[10], features)

{'ID': 'HB00011',
 'Significance': 'Local',
 'Date Filed': '2019-07-01',
 'Full Title': 'AN ACT CREATING THE MEGA CEBU DEVELOPMENT AUTHORITY, PRESCRIBING ITS FUNCTIONS AND DUTIES, AND PROVIDING FUNDS THEREFOR',
 'Principal Author/s': 'DEL MAR, RAUL V.',
 'Author Count': 1,
 'Date Read': '2019-07-23',
 'Primary Referral': 'GOVERNMENT ENTERPRISES AND PRIVATIZATION',
 'Bill Status': 'Pending with the Committee on GOVERNMENT ENTERPRISES AND PRIVATIZATION since 2020-08-26',
 'Mother Bill Status': None,
 'Date Approved on Second Reading': None,
 'Date Approved on Third Reading': None,
 'Senate Bill Counterpart': None,
 'Date Acted Upon By the President': None,
 'Republic Act No.': None}

## PART 3: Extracting the Data Per Se

In [53]:
our_data = []

for i in range(13972):
    our_data.append(extract_data(lis1[i], lis2[i], features))

In [13]:
our_data[13971]

{'ID': 'RPT1446',
 'Significance': None,
 'Date Filed': '2022-01-31',
 'Full Title': 'MOTU PROPRIO INQUIRY, IN AID OF LEGISLATION, ON THE REPORT OF THE COMMISSION ON AUDIT REGARDING THE ALLEGED OVERPRICED MEDICAL AND HEALTH EQUIPMENT BY THE DEPARTMENT OF HEALTH UTILIZED FOR THE COVID-19 RESPONSE',
 'Principal Author/s': ';GOOD GOVERNMENT AND PUBLIC ACCOUNTABILITY',
 'Author Count': None,
 'Date Read': '2022-01-31',
 'Primary Referral': 'GOOD GOVERNMENT AND PUBLIC ACCOUNTABILITY',
 'Bill Status': 'Pending With Rules (Included in OB on 2022-01-31)',
 'Mother Bill Status': None,
 'Date Approved on Second Reading': None,
 'Date Approved on Third Reading': None,
 'Senate Bill Counterpart': None,
 'Date Acted Upon By the President': None,
 'Republic Act No.': None}

In [54]:
our_data[2667]

{'ID': 'HB02674',
 'Significance': 'Local',
 'Date Filed': '2019-07-24',
 'Full Title': 'AN ACT ESTABLISHING A DISTRICT BRANCH OF THE LAND TRANSPORTATION OFFICE (LTO) IN THE MUNICIPALITY OF LABO IN THE 1ST DISTRICT OF CAMARINES NORTE, AND APPROPRIATING FUNDS THEREFOR',
 'Principal Author/s': 'TALLADO, JOSEFINA B.;UNGAB, ISIDRO T.;SARMIENTO, EDGAR S.',
 'Author Count': 3,
 'Date Read': '2019-07-31',
 'Primary Referral': 'TRANSPORTATION',
 'Bill Status': 'House agreed to the amendments of the Senate on 2022-02-02',
 'Mother Bill Status': None,
 'Date Approved on Second Reading': '2019-12-18',
 'Date Approved on Third Reading': '2020-02-04',
 'Senate Bill Counterpart': None,
 'Date Acted Upon By the President': None,
 'Republic Act No.': None}

In [55]:
our_data[6942]

{'ID': 'HB06952',
 'Significance': 'National',
 'Date Filed': '2020-06-03',
 'Full Title': 'AN ACT PROVIDING FOR COVID-19 RESPONSE AND RECOVERY INTERVENTIONS AND PROVIDING MECHANISM TO ACCELERATE THE RECOVERY AND BOLSTER THE RESILIENCY OF THE PHILIPPINE ECONOMY, PROVIDING FUNDS THEREFOR, AND FOR OTHER PURPOSES',
 'Principal Author/s': 'DEFENSOR,, MICHAEL T.;VILLAFUERTE, LUIS RAYMUND "LRAY" JR F.;ROMUALDEZ, FERDINAND MARTIN G.;SY-ALVARADO, "KUYA" JOSE ANTONIO R.',
 'Author Count': 4,
 'Date Read': '2020-06-03',
 'Primary Referral': 'RULES',
 'Bill Status': 'Substituted by HB06953',
 'Mother Bill Status': 'Republic Act RA11494 enacted on 2020-09-11',
 'Date Approved on Second Reading': None,
 'Date Approved on Third Reading': None,
 'Senate Bill Counterpart': None,
 'Date Acted Upon By the President': None,
 'Republic Act No.': None}

## PART 4: Abstract

Define a function `get_abstract` that fetches the abstract of a particular bill. <br/>
<b> NOTE </b>: This will not be implemented yet

In [16]:
# import requests

# url = 'https://www.congress.gov.ph/legisdocs/fetch_history.php'
# payload = {'rowid': '#HB00001-18'}

# def get_abstract(url, payload):
#     x = requests.post(url, data = payload)
#     response_html = BeautifulSoup(x.text, 'html.parser')
#     tr = response_html.find_all('tr')[2]
    
#     return tr.td.text[11:]

In [17]:
#do not run this cell more than once. This will append 'Abstract' more than once.
# features.append('Abstract')

In [18]:
# for i in range(13972):
#     payload['rowid'] = '#' + lis1[i].text + '-18'
#     our_data[i]['Abstract'] = get_abstract(url, payload)
#     print(i)

Sample data after implementing the feature:

In [19]:
# our_data[1]

In [20]:
# our_data[13971]

## PART 5: Exporting the Data

In [56]:
import csv

csv_file = "hor_bills.csv"

try:
    with open(csv_file, 'w', encoding = 'utf-8', newline = '') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['ID'] + features + ['Author Count'])
        writer.writeheader()
        for data in our_data:
            writer.writerow(data)
except IOError:
    print("I/O Error!")