In [1]:
import pandas as pd
import requests
import json
import xml.etree.ElementTree as ET
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Bill-type codes; each one is used as the last path segment of a listing URL.
bill_types = [
    'hr', 's',
    'hjres', 'sjres',
    'hconres', 'sconres',
    'hres', 'sres',
]
# Congress numbers, kept as strings because they are substituted into URLs
# and used as dictionary keys.
congresses = [str(number) for number in range(113, 118)]
# Listing URL template; the placeholders are filled with (congress, bill type).
base_url = 'https://www.govinfo.gov/bulkdata/json/BILLSTATUS/{}/{}'

In [None]:
# Scrape the summary, official title and full text of every bill listed in the
# BILLSTATUS bulk-data index, for each (congress, bill type) pair.
#
# Result layout:
#     bills[congress][bill_type][bill_num] -> {'summary': ..., 'title': ..., 'text': ...}
# A record may be partial (e.g. only 'summary') when a later step fails for
# that bill; such bills are still counted as errors.

# Seconds to wait on each HTTP request, so one stalled connection cannot hang
# the whole (very long) run.
REQUEST_TIMEOUT = 30

bills = {}
# NOTE(review): the browser is intentionally not quit here in case later cells
# still use `driver`; call driver.quit() once scraping is finished.
driver = webdriver.Chrome()

for congress in congresses:
    bills[congress] = {}
    for bill_type in bill_types:
        error_count = 0
        bills[congress][bill_type] = {}

        # get list of bills for that congress and type; the JSON listing is
        # read from the <pre> element of the page the browser loads
        url = base_url.format(congress, bill_type)
        driver.get(url)
        pre = driver.find_element(By.TAG_NAME, "pre").text
        j = json.loads(pre)

        # iterate through bills
        total_bills = len(j['files'])
        print('-------------------------------------')
        print(f"Starting on {total_bills} {bill_type} bills from the {congress}th congress")
        for bill_counter, file in enumerate(j['files'], start=1):
            # Reset per bill so the error message never reports a previously
            # processed bill (or raises NameError when the first bill fails).
            bill_num = None
            link = None
            try:
                # get bill summary
                link = file['link']
                r = requests.get(link, timeout=REQUEST_TIMEOUT)
                r.raise_for_status()
                # Parse the raw bytes so the XML declaration, not the HTTP
                # charset guess, decides how the document is decoded.
                metadata = ET.fromstring(r.content)
                bill_num = metadata[0][0].text
                summary = metadata[0].find('summaries')[0][0].find('text').text
                bills[congress][bill_type][bill_num] = {}
                bills[congress][bill_type][bill_num]['summary'] = summary

                # get title
                text_url = metadata[0].find('textVersions')[0].find('formats')[0][0].text
                r = requests.get(text_url, timeout=REQUEST_TIMEOUT)
                r.raise_for_status()
                text_page = ET.fromstring(r.content)
                title = text_page[1].find('official-title').text
                bills[congress][bill_type][bill_num]['title'] = title

                # get full text
                form = text_page.find('form')
                bill_text = ''.join(form.itertext())
                bills[congress][bill_type][bill_num]['text'] = bill_text
            except Exception as exc:
                # Best-effort scrape: skip bills that cannot be fetched or
                # parsed. `Exception` (not a bare except) keeps Ctrl-C /
                # kernel interrupt working during the run.
                error_count += 1
                print(
                    f'Bypassed an error; last bill num was {bill_num} '
                    f'(link: {link}; {type(exc).__name__}: {exc})'
                )

            if bill_counter % 100 == 0:
                print(f'processed {bill_counter} bills')

        print('-------------------------------------')
        # An empty listing would otherwise raise ZeroDivisionError.
        error_pct = round(error_count / total_bills * 100, 2) if total_bills else 0.0
        print(f"{error_count} errors out of {total_bills} bills")
        print(f"Error rate: {error_pct}%")

        print(f'Bypassed {error_count} total errors collecting {bill_type} bills from the {congress}th congress')
            

-------------------------------------
Starting on 5886 hr bills from the 113th congress
processed 100 bills
processed 200 bills
processed 300 bills
Bypassed an error; last bill num was 1067
processed 400 bills
processed 500 bills
processed 600 bills
processed 700 bills
processed 800 bills
processed 900 bills
Bypassed an error; last bill num was 4818
Bypassed an error; last bill num was 2866
processed 1000 bills
processed 1100 bills
processed 1200 bills
processed 1300 bills
Bypassed an error; last bill num was 3422
processed 1400 bills
processed 1500 bills
processed 1600 bills
processed 1700 bills
processed 1800 bills
Bypassed an error; last bill num was 4779
Bypassed an error; last bill num was 4785
Bypassed an error; last bill num was 4794
Bypassed an error; last bill num was 4776
processed 1900 bills
processed 2000 bills
Bypassed an error; last bill num was 947
processed 2100 bills
processed 2200 bills
Bypassed an error; last bill num was 4819
processed 2300 bills
processed 2400 bill