This notebook crawls the GovInfo website (operated by the U.S. Government Publishing Office). The crawler downloads and extracts all bill full texts, summaries, and bill statuses for the U.S. Congress that are available on GovInfo. At the time of writing, bill texts and summaries cover the 113th through the 118th Congress, inclusive; bill status data also includes earlier Congresses. The notebook should take under 20 minutes to run.

In [11]:
import requests
import json
import os
import zipfile

In [12]:
# Crawl the GovInfo bulk-data JSON index for BILLS and save every zip archive
# to ./data/bills/<congress>/<session>/<bill type>/.

# (connect, read) timeouts in seconds. requests waits indefinitely when no
# timeout is given, so a single stalled connection would hang the notebook.
REQUEST_TIMEOUT = (10, 300)

# Headers sent on every request made through the session; the JSON index is
# requested with an explicit 'Accept: application/json'.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'application/json'
}
s = requests.Session()
s.headers.update(headers)

bills = s.get('https://www.govinfo.gov/bulkdata/json/BILLS/', timeout=REQUEST_TIMEOUT)
bills.raise_for_status()  # Fail with the HTTP error rather than a JSON decode error
congresses = bills.json()['files']

for congress in congresses:
    if not congress['name'].isnumeric():
        continue  # Skip folders like /resources/
    print('Getting', congress['name'])
    response = s.get(congress['link'], timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    sessions = response.json()['files']

    for session in sessions:
        print('Getting', session['name'])
        response = s.get(session['link'], timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        bill_types = response.json()['files']

        for bill_type in bill_types:
            print('Getting', bill_type['name'])
            response = s.get(bill_type['link'], timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            bills = response.json()['files']

            for bill in bills:
                if bill.get('mimeType') != 'application/zip':
                    continue  # Only the zip archives are downloaded
                dest_dir = f"./data/bills/{congress['name']}/{session['name']}/{bill_type['name']}/"
                os.makedirs(dest_dir, exist_ok=True)
                # Reuse the session (pooled connection) for the archive itself,
                # overriding Accept because the payload is a zip, not JSON.
                response = s.get(
                    bill['link'],
                    headers={'Accept': '*/*'},
                    timeout=REQUEST_TIMEOUT,
                )
                if response.status_code == 200:
                    with open(dest_dir + bill['name'], 'wb') as f:
                        f.write(response.content)
                else:
                    # A failed archive is reported and skipped; the crawl continues.
                    print(f"Failed to download. HTTP Status Code: {response.status_code}")

Getting 118
Getting 1
Getting sres
Getting hres
Getting hconres
Getting hr
Getting hjres
Getting s
Getting sjres
Getting sconres
Getting 117
Getting 2
Getting s
Getting sconres
Getting sres
Getting hr
Getting hres
Getting hjres
Getting sjres
Getting hconres
Getting 1
Getting sconres
Getting sres
Getting hres
Getting hr
Getting hjres
Getting hconres
Getting s
Getting sjres
Getting 116
Getting 2
Getting hr
Getting hres
Getting hjres
Getting sjres
Getting s
Getting sres
Getting sconres
Getting hconres
Getting 1
Getting hres
Getting sres
Getting sjres
Getting hconres
Getting hjres
Getting hr
Getting s
Getting sconres
Getting 115
Getting 2
Getting s
Getting hconres
Getting hjres
Getting hr
Getting sres
Getting hres
Getting sconres
Getting sjres
Getting 1
Getting sconres
Getting hconres
Getting hjres
Getting hres
Getting hr
Getting sres
Getting s
Getting sjres
Getting 114
Getting 2
Getting hr
Getting s
Getting hres
Getting hconres
Getting sres
Getting hjres
Getting sjres
Getting sconres
Gett

In [13]:
# Crawl the GovInfo bulk-data JSON index for BILLSTATUS and save every zip
# archive to ./data/BILLSTATUS/<congress>/<bill type>/.

# (connect, read) timeouts in seconds. requests waits indefinitely when no
# timeout is given, so a single stalled connection would hang the notebook.
REQUEST_TIMEOUT = (10, 300)

# Headers sent on every request made through the session; the JSON index is
# requested with an explicit 'Accept: application/json'.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'application/json'
}
s = requests.Session()
s.headers.update(headers)

statuses = s.get('https://www.govinfo.gov/bulkdata/json/BILLSTATUS/', timeout=REQUEST_TIMEOUT)
statuses.raise_for_status()  # Fail with the HTTP error rather than a JSON decode error
congresses = statuses.json()['files']

for congress in congresses:
    if not congress['name'].isnumeric():
        continue  # Only numerically named (per-Congress) folders are crawled
    print('Getting', congress['name'])
    response = s.get(congress['link'], timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    bill_types = response.json()['files']

    for bill_type in bill_types:
        print('Getting', bill_type['name'])
        response = s.get(bill_type['link'], timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        statuses = response.json()['files']

        for status in statuses:
            if status.get('mimeType') != 'application/zip':
                continue  # Only the zip archives are downloaded
            dest_dir = f"./data/BILLSTATUS/{congress['name']}/{bill_type['name']}/"
            os.makedirs(dest_dir, exist_ok=True)
            # Reuse the session (pooled connection) for the archive itself,
            # overriding Accept because the payload is a zip, not JSON.
            response = s.get(
                status['link'],
                headers={'Accept': '*/*'},
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                with open(dest_dir + status['name'], 'wb') as f:
                    f.write(response.content)
            else:
                # A failed archive is reported and skipped; the crawl continues.
                print(f"Failed to download. HTTP Status Code: {response.status_code}")

Getting 118
Getting sres
Getting hr
Getting hconres
Getting hjres
Getting hres
Getting s
Getting sjres
Getting sconres
Getting 117
Getting sconres
Getting s
Getting sres
Getting hres
Getting hconres
Getting hjres
Getting hr
Getting sjres
Getting 116
Getting hconres
Getting hres
Getting hjres
Getting sjres
Getting hr
Getting sres
Getting s
Getting sconres
Getting 115
Getting sres
Getting sconres
Getting sjres
Getting hres
Getting s
Getting hconres
Getting hjres
Getting hr
Getting 114
Getting sres
Getting hconres
Getting sconres
Getting hres
Getting hr
Getting s
Getting sjres
Getting hjres
Getting 113
Getting s
Getting hjres
Getting hconres
Getting hres
Getting sconres
Getting sjres
Getting sres
Getting hr
Getting 112
Getting hconres
Getting hres
Getting hr
Getting hjres
Getting sres
Getting s
Getting sjres
Getting sconres
Getting 111
Getting hres
Getting s
Getting hr
Getting sres
Getting hjres
Getting sconres
Getting hconres
Getting sjres
Getting 110
Getting hr
Getting sres
Getting hres

In [14]:
# Crawl the GovInfo bulk-data JSON index for BILLSUM and save every zip
# archive to ./data/billsummaries/<congress>/<bill type>/.

# (connect, read) timeouts in seconds. requests waits indefinitely when no
# timeout is given, so a single stalled connection would hang the notebook.
REQUEST_TIMEOUT = (10, 300)

# Headers sent on every request made through the session; the JSON index is
# requested with an explicit 'Accept: application/json'.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'application/json'
}
s = requests.Session()
s.headers.update(headers)

summaries = s.get('https://www.govinfo.gov/bulkdata/json/BILLSUM/', timeout=REQUEST_TIMEOUT)
summaries.raise_for_status()  # Fail with the HTTP error rather than a JSON decode error
congresses = summaries.json()['files']

for congress in congresses:
    if not congress['name'].isnumeric():
        continue  # Only numerically named (per-Congress) folders are crawled
    print('Getting', congress['name'])
    response = s.get(congress['link'], timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    bill_types = response.json()['files']

    for bill_type in bill_types:
        print('Getting', bill_type['name'])
        response = s.get(bill_type['link'], timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        summaries = response.json()['files']

        for summary in summaries:
            if summary.get('mimeType') != 'application/zip':
                continue  # Only the zip archives are downloaded
            dest_dir = f"./data/billsummaries/{congress['name']}/{bill_type['name']}/"
            os.makedirs(dest_dir, exist_ok=True)
            # Reuse the session (pooled connection) for the archive itself,
            # overriding Accept because the payload is a zip, not JSON.
            response = s.get(
                summary['link'],
                headers={'Accept': '*/*'},
                timeout=REQUEST_TIMEOUT,
            )
            if response.status_code == 200:
                with open(dest_dir + summary['name'], 'wb') as f:
                    f.write(response.content)
            else:
                # A failed archive is reported and skipped; the crawl continues.
                print(f"Failed to download. HTTP Status Code: {response.status_code}")

Getting 118
Getting sres
Getting hjres
Getting hr
Getting hconres
Getting hres
Getting sjres
Getting s
Getting sconres
Getting 117
Getting hr
Getting hjres
Getting hres
Getting sres
Getting hconres
Getting s
Getting sconres
Getting sjres
Getting 116
Getting hr
Getting hres
Getting hjres
Getting hconres
Getting sjres
Getting sres
Getting s
Getting sconres
Getting 115
Getting hres
Getting s
Getting hr
Getting hconres
Getting hjres
Getting sconres
Getting sjres
Getting sres
Getting 114
Getting hr
Getting s
Getting hconres
Getting hres
Getting hjres
Getting sconres
Getting sres
Getting sjres
Getting 113
Getting hconres
Getting hres
Getting hjres
Getting hr
Getting sconres
Getting s
Getting sjres
Getting sres


In [15]:
def unzip_all_in_directory(directory):
    """Extract every ``.zip`` file found under *directory*.

    The tree is walked with ``os.walk``; each archive whose name ends in
    ``.zip`` is unpacked into the folder that contains it, and its path is
    printed afterwards. The archives themselves are left in place. Errors
    raised by ``zipfile`` (for example on a corrupt archive) are not caught.
    """
    for folder, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith(".zip"):
                continue

            archive_path = os.path.join(folder, filename)
            # Unpack next to the archive rather than into a separate tree.
            with zipfile.ZipFile(archive_path, 'r') as archive:
                archive.extractall(folder)
            print(f"Extracted: {archive_path}")

In [16]:
# Unpack every zip archive under ./data into the folder that contains it.
unzip_all_in_directory('./data')

Extracted: ./data\bills\113\1\hconres\BILLS-113-1-hconres.zip
Extracted: ./data\bills\113\1\hjres\BILLS-113-1-hjres.zip
Extracted: ./data\bills\113\1\hr\BILLS-113-1-hr.zip
Extracted: ./data\bills\113\1\hres\BILLS-113-1-hres.zip
Extracted: ./data\bills\113\1\s\BILLS-113-1-s.zip
Extracted: ./data\bills\113\1\sconres\BILLS-113-1-sconres.zip
Extracted: ./data\bills\113\1\sjres\BILLS-113-1-sjres.zip
Extracted: ./data\bills\113\1\sres\BILLS-113-1-sres.zip
Extracted: ./data\bills\113\2\hconres\BILLS-113-2-hconres.zip
Extracted: ./data\bills\113\2\hjres\BILLS-113-2-hjres.zip
Extracted: ./data\bills\113\2\hr\BILLS-113-2-hr.zip
Extracted: ./data\bills\113\2\hres\BILLS-113-2-hres.zip
Extracted: ./data\bills\113\2\s\BILLS-113-2-s.zip
Extracted: ./data\bills\113\2\sconres\BILLS-113-2-sconres.zip
Extracted: ./data\bills\113\2\sjres\BILLS-113-2-sjres.zip
Extracted: ./data\bills\113\2\sres\BILLS-113-2-sres.zip
Extracted: ./data\bills\114\1\hconres\BILLS-114-1-hconres.zip
Extracted: ./data\bills\114\1\

In [17]:
def delete_zip_files(directory):
    """Delete every file under *directory* whose name ends in ``.zip``.

    The tree is walked with ``os.walk``; each matching file is removed with
    ``os.remove`` and its path is printed. Other files and all directories
    are left untouched.
    """
    for folder, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith('.zip'):
                continue
            zip_path = os.path.join(folder, filename)
            os.remove(zip_path)
            print(f"Deleted {zip_path}")

# Remove every .zip file under ./data.
delete_zip_files('./data')

Deleted ./data\bills\113\1\hconres\BILLS-113-1-hconres.zip
Deleted ./data\bills\113\1\hjres\BILLS-113-1-hjres.zip
Deleted ./data\bills\113\1\hr\BILLS-113-1-hr.zip
Deleted ./data\bills\113\1\hres\BILLS-113-1-hres.zip
Deleted ./data\bills\113\1\s\BILLS-113-1-s.zip
Deleted ./data\bills\113\1\sconres\BILLS-113-1-sconres.zip
Deleted ./data\bills\113\1\sjres\BILLS-113-1-sjres.zip
Deleted ./data\bills\113\1\sres\BILLS-113-1-sres.zip
Deleted ./data\bills\113\2\hconres\BILLS-113-2-hconres.zip
Deleted ./data\bills\113\2\hjres\BILLS-113-2-hjres.zip
Deleted ./data\bills\113\2\hr\BILLS-113-2-hr.zip
Deleted ./data\bills\113\2\hres\BILLS-113-2-hres.zip
Deleted ./data\bills\113\2\s\BILLS-113-2-s.zip
Deleted ./data\bills\113\2\sconres\BILLS-113-2-sconres.zip
Deleted ./data\bills\113\2\sjres\BILLS-113-2-sjres.zip
Deleted ./data\bills\113\2\sres\BILLS-113-2-sres.zip
Deleted ./data\bills\114\1\hconres\BILLS-114-1-hconres.zip
Deleted ./data\bills\114\1\hjres\BILLS-114-1-hjres.zip
Deleted ./data\bills\114\1