# Retrieve Congressional Speeches (GovInfo)

This notebook fetches speeches from the Congressional Record via the GovInfo API.

- Set a date range and your `GOVINFO_API_KEY` (env var or prompt).
- The code paginates through Daily Edition issues (CREC), fetches granules, downloads XML/TXT, and extracts individual `<speaking>` blocks.
- Output is written to JSONL (one speech per line), with optional CSV export.

Note: The full history of the Congressional Record is very large. Start with a small date range, confirm the results, then scale up.

In [None]:
# Configuration and imports
import os, time, sys, json, re
from datetime import date, timedelta
from typing import Iterator, Dict, Any, List, Optional, Tuple

try:
    import requests
except ModuleNotFoundError:
    # Install requests if missing (uncomment if running in a clean kernel)
    # %pip install requests
    raise

# Root URL that every API path used in this notebook is appended to.
BASE = "https://api.govinfo.gov"

# Prefer the environment variable so the key never has to be typed into the
# notebook; surrounding whitespace (e.g. a trailing newline) is dropped.
API_KEY = (os.getenv('GOVINFO_API_KEY') or '').strip()
if not API_KEY:
    try:
        # getpass keeps the key out of the visible cell output.
        from getpass import getpass
        API_KEY = getpass('Enter GOVINFO_API_KEY: ').strip()
    except Exception:
        # Best-effort fallback to a plain prompt when getpass is unusable.
        API_KEY = input('Enter GOVINFO_API_KEY: ').strip()

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let an empty key slip through.
if not API_KEY:
    raise RuntimeError('A GOVINFO_API_KEY is required. Get one from https://api.govinfo.gov/docs/')

# Tuning
RATE_DELAY = 0.2  # seconds between API calls to be polite
PAGE_SIZE = 100   # max page size supported by GovInfo

def _get(path: str, params: Optional[Dict[str, Any]] = None, stream: bool = False):
    """GET ``BASE + path`` with the API key attached, retrying transient errors.

    Args:
        path: Path appended to ``BASE`` (must start with ``/``).
        params: Extra query parameters. The caller's dict is copied, never
            mutated, and ``api_key`` is added to the copy.
        stream: Passed through to ``requests.get``.

    Returns:
        The ``requests`` response of the first attempt that is not a
        retryable status and passes ``raise_for_status()``.

    Raises:
        requests.HTTPError: For a non-retryable error status, or when the
            last attempt still returns 429/502/503/504.
    """
    query = dict(params or {})
    query['api_key'] = API_KEY

    retryable = (429, 502, 503, 504)
    max_attempts = 6
    for attempt in range(max_attempts):
        r = requests.get(BASE + path, params=query, timeout=60, stream=stream)
        is_last = attempt == max_attempts - 1
        if r.status_code in retryable and not is_last:
            # Release the connection before backing off (matters for stream=True).
            r.close()
            # Exponential backoff capped at 10 seconds.
            time.sleep(min(2 ** attempt, 10))
            continue
        # On the last attempt a retryable status falls through to here and is
        # raised immediately instead of sleeping once more for nothing.
        r.raise_for_status()
        return r

def iter_crec_packages(start_date: str, end_date: str, page_size: int = PAGE_SIZE) -> Iterator[Dict[str, Any]]:
    """Yield Daily Edition (CREC) packages issued between dates (inclusive).

    Uses the GovInfo *published* service, which filters by issue date. The
    collections service filters by last-modified timestamp and requires the
    start date in the URL path, so it does not fit a "speeches between these
    dates" query.

    Args:
        start_date: First issue date, ``YYYY-MM-DD``.
        end_date: Last issue date, ``YYYY-MM-DD``.
        page_size: Number of packages requested per page.

    Yields:
        One package dict per item in each page's ``packages`` list.
    """
    from urllib.parse import parse_qs, urlparse

    # GovInfo uses cursor pagination: the first page is requested with '*',
    # and each response's `nextPage` URL carries the cursor for the next one.
    offset_mark = '*'
    while True:
        resp = _get(f'/published/{start_date}/{end_date}', params={
            'collection': 'CREC',
            'pageSize': page_size,
            'offsetMark': offset_mark,
        }).json()
        items = resp.get('packages') or []
        if not items:
            break
        yield from items

        next_page = resp.get('nextPage')
        if not next_page:
            break
        marks = parse_qs(urlparse(next_page).query).get('offsetMark')
        # Stop if the cursor is missing or did not advance, to avoid looping
        # forever on the same page.
        if not marks or marks[0] == offset_mark:
            break
        offset_mark = marks[0]
        time.sleep(RATE_DELAY)

def iter_granules(package_id: str, page_size: int = PAGE_SIZE) -> Iterator[Dict[str, Any]]:
    """Yield every granule record of ``package_id``, one page at a time.

    Pages are requested with increasing ``offset`` until a page comes back
    empty or shorter than ``page_size``.
    """
    endpoint = f'/packages/{package_id}/granules'
    position = 0
    while True:
        query = {
            'pageSize': page_size,
            'offset': position,
        }
        payload = _get(endpoint, params=query).json()
        batch = payload.get('granules', []) or []
        if not batch:
            return
        yield from batch
        # A short page means there is nothing after it.
        if len(batch) < page_size:
            return
        position += page_size
        time.sleep(RATE_DELAY)

def get_granule_summary(package_id: str, granule_id: str) -> Dict[str, Any]:
    """Fetch the summary endpoint for one granule and return the decoded JSON."""
    response = _get(f'/packages/{package_id}/granules/{granule_id}/summary')
    return response.json()

def fetch_granule_text(package_id: str, granule_id: str) -> Tuple[Optional[str], Dict[str, Any]]:
    """Return ``(text, summary)`` for a granule.

    The granule summary is fetched first; its ``download`` mapping is then
    searched for a link, preferring XML and falling back to TXT/HTML.

    Returns:
        ``(None, summary)`` when the summary lists none of the known link
        keys, otherwise ``(body_text, summary)``.

    Raises:
        requests.HTTPError: If the summary or the content request fails.
    """
    summary = get_granule_summary(package_id, granule_id)
    dl = summary.get('download') or {}
    url = dl.get('xmlLink') or dl.get('txtLink') or dl.get('htmLink') or dl.get('htmlLink')
    if not url:
        return None, summary
    # Links served from the API origin need the API key like any other API
    # call. Links on other hosts are fetched as-is so the key is never sent
    # to a different origin.
    params = {'api_key': API_KEY} if url.startswith(BASE) else None
    r = requests.get(url, params=params, timeout=90)
    r.raise_for_status()
    return r.text, summary

def compact_whitespace(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to one space and trim the ends.

    Falsy input (``None``, ``''``) yields an empty string.
    """
    # Plain str.split()/join instead of a regex keeps the notebook JSON simple.
    if not s:
        return ''
    words = s.split()
    return ' '.join(words)
def extract_speeches_from_xml(xml_text: str) -> List[Dict[str, Any]]:
    """Parse CREC XML and extract <speaking> blocks.

    Args:
        xml_text: The document body as a string.

    Returns:
        A list of dicts with keys ``speaker``, ``bioguide_id`` and ``text``,
        one per non-empty ``<speaking>`` element (namespaces are ignored).
        If no ``<speaking>`` element yields text, all non-empty ``<p>``
        elements are joined with blank lines into a single record with empty
        ``speaker``/``bioguide_id``. Returns ``[]`` when the input is not
        well-formed XML or contains neither kind of element.
    """
    import xml.etree.ElementTree as ET

    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError:
        # Not well-formed XML: there is nothing to extract.
        return []

    def tagname(el):
        # ElementTree spells namespaced tags as '{uri}local'; keep 'local'.
        return el.tag.split('}')[-1]

    def flat_text(el):
        # All descendant text, with whitespace runs collapsed to one space.
        return ' '.join(''.join(el.itertext()).split())

    speeches: List[Dict[str, Any]] = []
    for node in root.iter():
        if tagname(node) != 'speaking':
            continue
        text = flat_text(node)
        if not text:
            continue
        attrs = node.attrib
        # Several attribute spellings are tried for each field.
        speaker = attrs.get('speaker') or attrs.get('speaker_name') or attrs.get('who') or ''
        bioguide = (attrs.get('bioGuideId') or attrs.get('bioguide_id') or
                    attrs.get('bioGuideID') or attrs.get('bioguideId') or '')
        speeches.append({
            'speaker': speaker,
            'bioguide_id': bioguide,
            'text': text,
        })

    if speeches:
        return speeches

    # Fallback: collect paragraphs into one unattributed block.
    paras = [t for t in (flat_text(n) for n in root.iter() if tagname(n) == 'p') if t]
    if paras:
        speeches.append({'speaker': '', 'bioguide_id': '', 'text': '\n\n'.join(paras)})

    return speeches

def parse_page_from_granule_id(granule_id: str) -> Optional[str]:
    """Extract the page token (e.g. ``PgS1234-2``) from a granule id.

    The letter after ``Pg`` identifies the section of the Record: ``S``
    (Senate), ``H`` (House), ``E`` (Extensions of Remarks) or ``D`` (Daily
    Digest). An optional ``-N`` suffix is kept as part of the token.

    Returns:
        The matched token, or ``None`` when ``granule_id`` is empty/``None``
        or contains no page token.
    """
    # Example: CREC-2024-09-05-pt1-PgS1234-2 -> PgS1234-2
    m = re.search(r'(Pg[SHED][0-9]+(?:-[0-9]+)?)', granule_id or '')
    return m.group(1) if m else None

# Reaching this line means the cell ran to the end: the API key was obtained
# and all helper functions above are defined.
print('Config ready.')


In [None]:
# Driver: set date range and output
# Tip: Start with a small range to validate, then expand.
END = date.today()
START = END - timedelta(days=1)  # change to desired range
start_date = START.isoformat()
end_date = END.isoformat()

# Output files are named after the date range so separate runs do not clash.
OUTPUT_DIR = 'data'
os.makedirs(OUTPUT_DIR, exist_ok=True)
JSONL_PATH = os.path.join(OUTPUT_DIR, f'speeches_{start_date}_to_{end_date}.jsonl')
CSV_PATH = os.path.join(OUTPUT_DIR, f'speeches_{start_date}_to_{end_date}.csv')

# Optional caps; None (or 0) means "no limit".
max_packages: Optional[int] = None   # e.g., 5 to limit during testing
max_granules_per_package: Optional[int] = None  # e.g., 50 to limit during testing

print(f'Fetching CREC packages from {start_date} to {end_date}...')
count_packages = 0
count_granules = 0
count_speeches = 0

# The JSONL file is opened once in write mode, so re-running the cell for the
# same date range overwrites the previous output.
with open(JSONL_PATH, 'w', encoding='utf-8') as out:
    for p in iter_crec_packages(start_date, end_date):
        package_id = p.get('packageId')
        pkg_date = p.get('dateIssued')
        if not package_id:
            continue
        count_packages += 1
        print(f'Package {count_packages}: {package_id} ({pkg_date})')
        granules_seen = 0
        for g in iter_granules(package_id):
            granule_id = g.get('granuleId')
            chamber = (g.get('granuleClass') or '').upper()  # HOUSE / SENATE / EXTENSIONS
            if not granule_id:
                continue
            granules_seen += 1
            if max_granules_per_package and granules_seen > max_granules_per_package:
                break

            try:
                text, summary = fetch_granule_text(package_id, granule_id)
            except Exception as e:
                # Best-effort: one bad granule must not abort the whole run.
                # HTTP error messages include the request URL, which can
                # contain the API key, so redact it before printing.
                print(f'  - Failed to fetch {granule_id}: {str(e).replace(API_KEY, "***")}')
                continue
            if not text:
                continue
            speeches = extract_speeches_from_xml(text)
            page = parse_page_from_granule_id(granule_id)
            title = (summary.get('title') or g.get('title') or '').strip()

            for sp in speeches:
                # Granule-level metadata first; the speech's own fields
                # (speaker, bioguide_id, text) are merged in last.
                rec = {
                    'date': pkg_date,
                    'package_id': package_id,
                    'granule_id': granule_id,
                    'chamber': chamber,
                    'page': page,
                    'title': title,
                    **sp,
                }
                # One JSON object per line (JSONL).
                out.write(json.dumps(rec, ensure_ascii=False) + '\n')
                count_speeches += 1

            # Only granules whose text was fetched are counted as processed.
            count_granules += 1
            if count_granules % 25 == 0:
                print(f'  - Processed {count_granules} granules, {count_speeches} speeches so far...')
            time.sleep(RATE_DELAY)

        if max_packages and count_packages >= max_packages:
            break

print(f'Done. Packages: {count_packages}, granules: {count_granules}, speeches: {count_speeches}.')
print(f'JSONL written: {JSONL_PATH}')


In [None]:
# Optional: convert JSONL to CSV for quick analysis
import csv, json

def jsonl_to_csv(jsonl_path: str, csv_path: str, field_order: Optional[List[str]] = None):
    """Convert a JSONL file (one JSON object per line) into a CSV file.

    Blank lines are skipped. Keys that are not listed in the column order are
    dropped, and missing keys become empty cells. When the input holds no
    records, a message is printed and no CSV file is written.

    Args:
        jsonl_path: Input file, read as UTF-8.
        csv_path: Output file, written as UTF-8.
        field_order: Column names in output order; a default order is used
            when omitted or empty.
    """
    with open(jsonl_path, 'r', encoding='utf-8') as src:
        records = [json.loads(raw) for raw in src if raw.strip()]

    if not records:
        print('No rows to write.')
        return

    # reasonable default order
    columns = field_order or [
        'date','chamber','speaker','bioguide_id','title','page','package_id','granule_id','text'
    ]

    with open(csv_path, 'w', newline='', encoding='utf-8') as dst:
        writer = csv.DictWriter(dst, fieldnames=columns, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(records)

    print(f'CSV written: {csv_path} ({len(records)} rows)')

# Uncomment to run after JSONL generation
# jsonl_to_csv(JSONL_PATH, CSV_PATH)
