In [None]:
from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel
import os
model = OpenAIServerModel(
    model_id="gpt-4o-mini",
    api_key=os.environ["OPENAI_API_KEY"],
)
agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model)
agent.run("Who is Brandon Eychaner?")

In [1]:
import logging 
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [11]:
"""This module includes methods and classes for collecting data from the Congress API."""
import logging 
import os
import requests
from urllib.parse import urlparse
from urllib.parse import parse_qs
import time

from tqdm import tqdm

from src.data_structures.bills import Bill, ResultType 

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Set up constants for the module
congress_api_key = os.environ["CONGRESS_API_KEY"]
BASE_URL = os.environ["CONGRESS_API_URL"]
BILL_ENDPOINT = "bill"
RESULT_LIMIT = 250
API_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0)',
    'accept': 'application/xml',
    "x-api-key": congress_api_key}
DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
RATE_LIMIT_CONSTANT = 5000 / 60 / 60 # 5000 requests per hour


def extract_offset(url: str) -> int:
    """
    Extract the offset from a URL.

    Args:
        url (str): The URL to extract the offset from.

    Returns:
        int: The offset value.
    """
    parsed_url = urlparse(url)
    offset = parse_qs(parsed_url.query)['offset'][0]
    if offset:
        offset = int(offset)
        return offset
    return -1


def retrieve_congress_bills(from_date: str, to_date: str, offset: int = 0) -> list[Bill]:
    """
    Retrieve bills from the Congress API within a date range.

    Args:
        from_date (str): The start date for the search in the format "YYYY-MM-DDTHH:MM:SSZ".
        to_date (str): The end date for the search in the format "YYYY-MM-DDTHH:MM:SSZ".
        offset (int): The offset for the search.

    Returns:
        list[Bill]: A list of Bill objects.

    Raises:
        HTTPError: An error occurred while making the request
    """
    params = {
        "api_key": congress_api_key,
        "format": ResultType.JSON,
        "limit": RESULT_LIMIT,
        "fromDateTime": from_date,
        "toDateTime": to_date
    }
    
    if offset > 0:
        params["offset"] = offset

    response = requests.get(BASE_URL + BILL_ENDPOINT, headers=API_HEADERS, params=params)
    
    if response.status_code == 200:
        data = response.json()
        bills = data.get("bills", [])
        try:
            prepared_bills = [Bill.model_validate(bill) for bill in bills]
        except: 
            print(f"Problem with the following metadata: {offset} offset, {from_date} from_date, {to_date} to_date")
            prepared_bills = []
            pass
        # Check if there is a next page
        if "next" in data.get("pagination", {}):
            offset = extract_offset(data["pagination"]["next"])
            return (prepared_bills, data, offset, data["pagination"]["count"])
        return (prepared_bills, data, -1, 0)
    else:
        response.raise_for_status()


def determine_wait(start_time: float, offset: int) -> float:
    """
    Determine the wait time based on the rate limit constant.

    Args:
        start_time (float): The start time of the request.
        offset (int): The offset for the request.
    """
    current_time = time.time()
    elapsed_time = current_time - start_time
    print(f"Elapsed time: {elapsed_time}")
    requests = max(RESULT_LIMIT, offset) / RESULT_LIMIT
    rate = elapsed_time / requests
    if rate < RATE_LIMIT_CONSTANT:
        wait_time = RATE_LIMIT_CONSTANT - rate
        print(f"Sleeping for {wait_time} seconds.")
        time.sleep(wait_time)


def gather_congress_bills(from_date: str, to_date: str) -> list[Bill]:
    """
    Gather all bills from the Congress API within a date range.
    
    Args:
        from_date (str): The start date for the search in the format "YYYY-MM-DDTHH:MM:SSZ".
        to_date (str): The end date for the search in the format "YYYY-MM-DDTHH:MM:SSZ".

    Returns:
        list[Bill]: A list of Bill objects.
    """
    start = time.time()
    bills = []
    responses = []
    offset = 0
    total_count = None
    pbar = None

    while offset != -1:

        result, response, offset, count = retrieve_congress_bills(from_date, to_date, offset)
        bills.extend(result)
        responses.append(response)
        if total_count is None:
            total_count = count
            pbar = tqdm(total=total_count, desc="Retrieving bills")
        pbar.update(len(result))
        determine_wait(start, offset) # Prevent rate limiting
    if pbar:
        pbar.close()
    return bills, responses


def get_bill_actions(bill: Bill):
    """
    Get the details of a specific bill.

    Args:
        congress (int): The Congress number.
        billType (str): The type of bill.
        billNumber (int): The bill number.

    Returns:
        Bill: A Bill object.
    """
    congress, billType, billNumber = bill.congress, bill.type, bill.number
    params = {
        "format": ResultType.JSON,
        
    }
    # Put 
    response = requests.get(f"{BASE_URL}{BILL_ENDPOINT}/{congress}/{billType.lower()}/{billNumber}/actions", headers=API_HEADERS, params=params)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        response.raise_for_status()

bills, responses = gather_congress_bills("2024-02-15T00:00:00Z", "2024-03-30T00:00:00Z")

Retrieving bills: 200it [00:00, 3711773.45it/s]

Elapsed time: 1.5199980735778809





In [52]:
"""
    CDG Client - An example client for the Congress.gov API.

    @copyright: 2022, Library of Congress
    @license: CC0 1.0
"""
from urllib.parse import urljoin

import requests


API_VERSION = "v3"
ROOT_URL = "https://api.congress.gov/"
RESPONSE_FORMAT = "json"


class _MethodWrapper:
    """ Wrap request method to facilitate queries.  Supports requests signature. """

    def __init__(self, parent, http_method):
        self._parent = parent
        self._method = getattr(parent._session, http_method)

    def __call__(self, endpoint, *args, **kwargs):  # full signature passed here
        response = self._method(
            urljoin(self._parent.base_url, endpoint), *args, **kwargs
        )
        # unpack
        if response.headers.get("content-type", "").startswith("application/json"):
            return response.json()
        else:
            return response.content


class CDGClient:
    """ A sample client to interface with Congress.gov. """

    def __init__(
        self,
        api_key,
        api_version=API_VERSION,
        response_format=RESPONSE_FORMAT,
        raise_on_error=True,
    ):
        self.base_url = urljoin(ROOT_URL, api_version) + "/"
        self._session = requests.Session()

        # do not use url parameters, even if offered, use headers
        self._session.params = {"format": response_format}
        self._session.headers.update({"x-api-key": api_key})

        if raise_on_error:
            self._session.hooks = {
                "response": lambda r, *args, **kwargs: r.raise_for_status()
            }

    def __getattr__(self, method_name):
        """Find the session method dynamically and cache for later."""
        method = _MethodWrapper(self, method_name)
        self.__dict__[method_name] = method
        return method

In [None]:
from datetime import datetime
from enum import StrEnum
from typing import List, Optional
from pydantic import BaseModel, HttpUrl

class Summary(BaseModel):
    actionDate: datetime
    actionDesc: str
    text: str
    updateDate: datetime
    versionCode: str

class Committee(BaseModel):
    name: str
    systemCode: str
    url: HttpUrl
    
class SourceSystem(BaseModel):
    name: str

class Action(BaseModel):
    actionDate: Optional[datetime] = None
    committees: Optional[List[Committee]] = None
    sourceSystem: SourceSystem
    text: str
    type: str
    actionCode: Optional[str] = None
    actionTime: Optional[datetime] = None

class Activity(BaseModel):
    date: datetime
    name: str

class Committee(BaseModel):
    activities: List[Activity]
    chamber: str
    name: str
    systemCode: str
    type: str
    url: HttpUrl

class LatestAction(BaseModel):
    actionDate: datetime
    text: str
    actionTime: Optional[datetime] = None

class Amendment(BaseModel):
    congress: int
    latestAction: LatestAction
    number: str
    purpose: str
    type: str
    updateDate: datetime
    url: HttpUrl

class RelationshipDetail(BaseModel):
    identifiedBy: str
    type: str

class BillMetadata(BaseModel):
    congress: int
    latestAction: LatestAction
    number: int
    relationshipDetails: Optional[List[RelationshipDetail]] = []
    title: str
    type: str
    url: HttpUrl

class Title(BaseModel):
    title: str
    titleType: str
    titleTypeCode: int
    updateDate: datetime
    billTextVersionCode: Optional[str] = None
    billTextVersionName: Optional[str] = None
    
class Format(BaseModel):
    type: str
    url: HttpUrl

class TextVersion(BaseModel):
    date: datetime
    formats: List[Format]
    type: str

class PolicyArea(BaseModel):
    name: str
    updateDate: Optional[datetime] = None

class LegislativeSubject(BaseModel):
    name: str
    updateDate: datetime

class Subjects(BaseModel):
    legislativeSubjects: List[LegislativeSubject]
    policyArea: PolicyArea

class CountUrl(BaseModel):
    count: int
    url: HttpUrl

class LatestAction(BaseModel):
    actionDate: datetime
    text: str

class Member(BaseModel):
    bioguideId: str
    firstName: str
    fullName: str
    lastName: str
    party: str
    state: str
    url: HttpUrl
    middleName: Optional[str]
    district: Optional[int]
    isOriginalCosponsor: Optional[bool] = None
    isByRequest: Optional[str]

class ChamberCode(StrEnum):
    house = "H"
    senate = "S"
    
class Bill(BaseModel):
    congress: int
    constitutionalAuthorityStatementText: str
    introducedDate: datetime
    latestAction: LatestAction
    number: str
    originChamber: str
    originChamberCode: ChamberCode
    policyArea: PolicyArea
    sponsors: List[Member]
    title: str
    type: str
    updateDate: datetime
    updateDateIncludingText: datetime

In [14]:
full_data = []

In [106]:
client = CDGClient(api_key=os.environ["CONGRESS_API_KEY"], response_format=RESPONSE_FORMAT)

additional_data = ['actions', 'amendments', 'committees', 'cosponsors', 'relatedbills', 'subjects', 'summaries', 'text', 'titles']

def get_additional_data(bill_data: dict) -> dict:
    """
    Get additional data for a bill.

    Args:
        bill_data (dict): The bill data.

    Returns:
        dict: The additional data.
    """
    congress = bill_data["congress"]
    bill_type = bill_data["type"].lower()
    bill_number = bill_data["number"]
    client = CDGClient(api_key=os.environ["CONGRESS_API_KEY"], response_format=RESPONSE_FORMAT)
    for s in additional_data:
        data = client.get(f"bill/{congress}/{bill_type}/{bill_number}/{s}")
        bill_data[s] = data
    return bill_data

for i, bill in enumerate(bills[0:1]): 
    start = time.time()
    congress = bill.congress
    bill_type = bill.type.lower()
    bill_number = bill.number
    # use requests args and kwargs below modify the request:
    bill_data = client.get(f"bill/{congress}/{bill_type}/{bill_number}")
    bill_data = get_additional_data(bill_data["bill"])

In [107]:
bill_data["amendments"]

{'amendments': [],
 'pagination': {'count': 0},
 'request': {'billNumber': '47',
  'billType': 'sjres',
  'billUrl': 'https://api.congress.gov/v3/bill/118/sjres/47?format=json',
  'congress': '118',
  'contentType': 'application/json',
  'format': 'json'}}

In [108]:
def retrieve_laws(start_date: str, end_date: str, congress: int, offset: int) -> dict:
    """
    Retrieve the details of a specific law.

    Args:
        start_date (str): The start date for the search in the format "YYYY-MM-DDTHH:MM:SSZ".
        end_date (str): The end date for the search in the format "YYYY-MM-DDTHH:MM:SSZ".
    Returns:
        dict: A dictionary of the law details.
    """
    params = {
        "api_key": congress_api_key,
        "format": ResultType.JSON,
    }
    API_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0)',
        'accept': 'application/xml',
        "x-api-key": congress_api_key}
    
    if offset > 0:
        params["offset"] = offset
    
    response = requests.get(f"{BASE_URL}law/{congress}", headers=API_HEADERS, params=params)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        response.raise_for_status()

laws = retrieve_laws("2024-02-15T00:00:00Z", "2024-03-30T00:00:00Z", 118, 0)

In [None]:
def retrieve_specific_law(congress: int, law_type: str, law_number: int) -> dict:
    """
    Retrieve the details of a specific law.

    Args:
        congress (int): The Congress number.
        law_type (str): The type of law.
        law_number (int): The law number.
    Returns:
        dict: A dictionary of the law details.
    """
    params = {
        "api_key": congress_api_key,
        "format": ResultType.JSON,
    }
    API_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:39.0)',
        'accept': 'application/xml',
        #"x-api-key": congress_api_key
        }
    
    response = requests.get(f"{BASE_URL}law/{congress}/{law_type}/{law_number}", headers=API_HEADERS, params=params)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        response.raise_for_status()

retrieve_specific_law("118", "hr", '5103')

HTTPError: 404 Client Error: Not Found for url: https://api.congress.gov/v3/law/118/hr/5103?api_key=hfHawlMleTdqUWQkARvg1x9Mnpj5LlaumfwszonK&format=json

In [121]:
client = CDGClient(api_key=os.environ["CONGRESS_API_KEY"], response_format=RESPONSE_FORMAT)

In [125]:
client.get("law/117/pub/108")

{'bill': {'actions': {'count': 74,
   'url': 'https://api.congress.gov/v3/bill/117/hr/3076/actions?format=json'},
  'amendments': {'count': 48,
   'url': 'https://api.congress.gov/v3/bill/117/hr/3076/amendments?format=json'},
  'cboCostEstimates': [{'description': 'As ordered reported by the House Committee on Oversight and Reform on May 13, 2021\n',
    'pubDate': '2021-07-14T17:27:00Z',
    'title': 'H.R. 3076, Postal Service Reform Act of 2021',
    'url': 'https://www.cbo.gov/publication/57356'},
   {'description': "As Posted on February 3, 2022,\nand as Amended by Amendment #1, the Manager's Amendment, as Posted on February 4, 2022\n",
    'pubDate': '2022-02-04T18:03:00Z',
    'title': 'Estimated Budgetary Effects of Rules Committee Print 117-32 for H.R. 3076, the Postal Service Reform Act of 2022',
    'url': 'https://www.cbo.gov/publication/57821'}],
  'committeeReports': [{'citation': 'H. Rept. 117-89',
    'url': 'https://api.congress.gov/v3/committee-report/117/HRPT/89?forma

In [None]:
client.get("law/118/hr")

HTTPError: 404 Client Error: Not Found for url: https://api.congress.gov/v3/law/118/hr?format=json

In [113]:
laws['bills'][0]

{'congress': 118,
 'latestAction': {'actionDate': '2025-01-04',
  'text': 'Became Public Law No: 118-229.'},
 'laws': [{'number': '118-229', 'type': 'Public Law'}],
 'number': '5103',
 'originChamber': 'House',
 'originChamberCode': 'H',
 'title': 'FISHES Act',
 'type': 'HR',
 'updateDate': '2025-02-01',
 'updateDateIncludingText': '2025-02-01',
 'url': 'https://api.congress.gov/v3/bill/118/hr/5103?format=json'}

In [131]:
b = BillMetadata(**laws['bills'][0])

In [142]:
a = [list(laws['bills'][i].keys()) for i in range(len(laws['bills']))]

# Flatten list, make set
set([item for sublist in a for item in sublist])

{'congress',
 'latestAction',
 'laws',
 'number',
 'originChamber',
 'originChamberCode',
 'title',
 'type',
 'updateDate',
 'updateDateIncludingText',
 'url'}

In [None]:
{'congress': 118,
 'latestAction': {'actionDate': '2025-01-04',
  'text': 'Became Public Law No: 118-229.'},
 'laws': [{'number': '118-229', 'type': 'Public Law'}],
 'number': '5103',
 'originChamber': 'House',
 'originChamberCode': 'H',
 'title': 'FISHES Act',
 'type': 'HR',
 'updateDate': '2025-02-01',
 'updateDateIncludingText': '2025-02-01',
 'url': 'https://api.congress.gov/v3/bill/118/hr/5103?format=json'}

# Turn that into a Law class
class LawMeta(BaseModel):
    number: str
    type: str

class Note(BaseModel):
    def __init__(self, text: str):
        self.text = ["\n".join(x["text"]) for x in text]

class Chamber(StrEnum):
    HOUSE = "House"
    SENATE = "Senate"

class LawType(StrEnum):
    PUBLIC = "Public Law"
    PRIVATE = "Private Law"

class LawMetadata(BaseModel):
    type: LawType
    number: str

class Law(BaseModel):
    congress: int
    latestAction: LatestAction
    laws: List[LawMeta]
    number: str
    originChamber: Chamber
    originChamberCode: ChamberCode
    title: str
    type: str
    updateDate: datetime
    updateDateIncludingText: datetime
    url: HttpUrl

class BillMetadata(BaseModel):
    congress: int
    latestAction: LatestAction
    number: int
    relationshipDetails: Optional[List[RelationshipDetail]] = []
    title: str
    type: str
    url: HttpUrl

class Bill(BaseModel):
    congress: int
    constitutionalAuthorityStatementText: str
    introducedDate: datetime
    latestAction: LatestAction
    laws: Optional[List[LawMeta]]
    number: str
    originChamber: str
    originChamberCode: ChamberCode
    policyArea: PolicyArea
    sponsors: List[Member]
    title: str
    type: str
    updateDate: datetime
    updateDateIncludingText: datetime
    notes: Optional[Note]

