## Getting Data

In [125]:
def full_name(name):
    """Convert a "Last, First[, Extra...]" name into "First Last [Extra...]".

    Parameters
    ----------
    name : str
        Name whose parts are separated by ", ".

    Returns
    -------
    str
        The parts joined by single spaces, with the first two parts swapped
        and any further parts kept in their original order. A name that
        contains no ", " separator is returned unchanged.
    """
    parts = name.split(', ')
    if len(parts) < 2:
        # Nothing to reorder. Indexing parts[1] here used to raise IndexError.
        return name
    # Plain list construction: no NumPy array is needed to join a few strings.
    return ' '.join([parts[1], parts[0], *parts[2:]])

In [134]:
import glob
import json
import pandas as pd
import numpy as np

# Define a function to extract data from a given JSON filepath
def extract_data_from_json(filepath):
    """Read one bill JSON file and return the fields used for the bills table.

    Parameters
    ----------
    filepath : str
        Path to a bill's JSON file. The file must contain the keys
        'short_title', 'official_title', 'summary', 'sponsor', 'cosponsors'
        and 'actions'.

    Returns
    -------
    dict
        'Short Title'      -- 'short_title', or 'official_title' when the
                              short title is empty.
        'Full Title'       -- 'official_title'.
        'Summary'          -- summary text, or NaN when there is no summary.
        'Sponsors'         -- sponsor name as "First Last", or NaN when there
                              is no sponsor.
        'Cosponsors'       -- list of cosponsor names as "First Last".
        'Last Action Date' -- 'acted_at' of the last entry in 'actions', or
                              None when there are no actions.

    Raises
    ------
    KeyError
        If one of the required keys is missing from the file.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding when reading it.
    with open(filepath, "r", encoding="utf-8") as file:
        data = json.load(file)

    full_title = data['official_title']
    short_title = data['short_title'] if data['short_title'] else full_title

    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
    summary = data['summary']['text'] if data['summary'] else np.nan
    sponsor = full_name(data['sponsor']['name']) if data['sponsor'] else np.nan
    cosponsors = [full_name(entry['name']) for entry in data['cosponsors']]
    last_action_date = data['actions'][-1]['acted_at'] if data['actions'] else None

    return {
        'Short Title': short_title,
        'Full Title': full_title,
        'Summary': summary,
        'Sponsors': sponsor,
        'Cosponsors': cosponsors,
        'Last Action Date': last_action_date,
    }

# Maps congress number -> that congress's bill records serialised as JSON text.
bills = {}
for congress in range(109, 118):
    # Match every .json file exactly three directory levels below
    # Bills/<congress>/. The pattern has no `**`, so glob's `recursive` flag
    # would have no effect and is omitted. sorted() keeps the record order
    # reproducible, because glob's own ordering depends on the filesystem.
    filepaths = sorted(glob.glob(f"Bills/{congress}/*/*/*.json"))

    # One record dict per bill file.
    data_list = [extract_data_from_json(filepath) for filepath in filepaths]

    # Tabulate the records for this congress.
    df_all = pd.DataFrame(data_list)

    # NOTE(review): to_json() without a path returns a JSON *string*, so each
    # value stored in `bills` is text rather than a list of records -- confirm
    # that downstream readers expect to decode it a second time.
    bills[congress] = df_all.to_json(orient="records")

In [253]:
import xmltodict
import re
# Every .xml file exactly three directory levels below Bills/118/. The pattern
# has no `**`, so glob's `recursive` flag would have no effect and is omitted;
# sorted() keeps the processing order reproducible across filesystems.
filepaths_118 = sorted(glob.glob("Bills/118/*/*/*.xml"))

def extract_data_from_json_118(filepath):
    """Read one 118th-Congress bill XML file and return the bills-table fields.

    Despite its name (kept so existing callers keep working), this function
    parses XML with xmltodict, not JSON. It reads the element found at
    billStatus/bill.

    Parameters
    ----------
    filepath : str
        Path to the bill's XML file.

    Returns
    -------
    dict
        'Short Title'      -- the bill's 'title'.
        'Full Title'       -- the same 'title' value.
        'Summary'          -- text of the first summary, or '' when the bill
                              has no summaries.
        'Sponsors'         -- name of the (first) sponsor as "First Last".
        'Cosponsors'       -- list of cosponsor names as "First Last"; empty
                              when the bill has no cosponsors.
        'Last Action Date' -- latestAction/actionDate, or None when empty.

    Raises
    ------
    KeyError
        If a required element (billStatus/bill, title, sponsors, latestAction)
        is missing.
    ValueError
        If a sponsor or cosponsor 'fullName' does not match the expected
        "<title> <name> [" layout.
    """
    with open(filepath) as xml_file:
        data = xmltodict.parse(xml_file.read())
    data = data['billStatus']['bill']

    # Captures the name between the member's title and the opening "[".
    sponsor_pattern = r'(?:Sen\.|Rep\.|Del\.|Resident Commissioner)\s(.*?)\s\['

    def as_list(node):
        # A repeated element arrives as a list and a single one as a bare
        # value (this function already relied on that); normalise to a list.
        return node if isinstance(node, list) else [node]

    def member_name(raw_full_name):
        # Pull the "Last, First" part out of the raw field and reorder it.
        match = re.search(sponsor_pattern, raw_full_name)
        if match is None:
            raise ValueError(
                f"Unrecognised member name format: {raw_full_name!r}"
            )
        return full_name(match.group(1))

    # Extract the relevant fields
    short_title = data['title']
    full_title = data['title']

    # NOTE(review): an element that is present but empty is assumed to parse
    # to None; treat that the same as a missing element instead of failing
    # when it is indexed.
    summaries = data.get('summaries')
    if summaries and summaries.get('summary'):
        summary = as_list(summaries['summary'])[0]['text']
    else:
        summary = ''

    # Use the first entry if more than one sponsor item is present.
    sponsor = member_name(as_list(data['sponsors']['item'])[0]['fullName'])

    cosponsor_node = data.get('cosponsors')
    if cosponsor_node and cosponsor_node.get('item'):
        cosponsors = [member_name(item['fullName']) for item in as_list(cosponsor_node['item'])]
    else:
        cosponsors = []

    last_action_date = data['latestAction']['actionDate'] or None

    return {
        'Short Title': short_title,
        'Full Title': full_title,
        'Summary': summary,
        'Sponsors': sponsor,
        'Cosponsors': cosponsors,
        'Last Action Date': last_action_date,
        }

    
# Parse every 118th-Congress file into one record dict per bill.
data_list_118 = list(map(extract_data_from_json_118, filepaths_118))

# Tabulate the records, then store them under key 118 serialised with
# orient="records".
df_all_118 = pd.DataFrame(data_list_118)
bills[118] = df_all_118.to_json(orient="records")

In [257]:
# Serialise the `bills` mapping and write it to bills.json in the working directory.
with open('bills.json', 'w') as out_file:
    out_file.write(json.dumps(bills))

# LLM Knowledge Graph Construction

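Data was downloaded in bulk from ProPublica's Congressional bulk-data store: https://www.propublica.org/datastore/dataset/congressional-data-bulk-legislation-bills. The data for each bill is stored in JSON format (this notebook reads the 118th Congress files as XML instead).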

In [46]:
from langchain.document_loaders import DirectoryLoader, JSONLoader

# Keyword arguments handed to each JSONLoader through `loader_kwargs`.
json_kwargs = dict(jq_schema='.summary', text_content=False)

# Directory loader rooted at the current directory, restricted to the
# 117th-Congress JSON files and showing a progress bar while loading.
loader = DirectoryLoader(
    './',
    glob="Bills/117/*/*/*.json",
    show_progress=True,
    loader_cls=JSONLoader,
    loader_kwargs=json_kwargs,
)

In [51]:
# Load the documents through `loader` and keep the result in `docs`.
docs = loader.load()


  0%|                                                                                      | 0/12299 [00:00<?, ?it/s][A
  0%|                                                                             | 10/12299 [00:00<02:14, 91.04it/s][A
  0%|▏                                                                            | 20/12299 [00:00<03:00, 68.05it/s][A
  0%|▏                                                                            | 28/12299 [00:00<04:07, 49.53it/s][A
  0%|▏                                                                            | 35/12299 [00:00<04:07, 49.60it/s][A
  0%|▎                                                                            | 41/12299 [00:00<04:00, 50.94it/s][A
  0%|▎                                                                            | 48/12299 [00:00<03:40, 55.51it/s][A
  0%|▎                                                                            | 55/12299 [00:00<03:30, 58.04it/s][A
  1%|▍                         

In [3]:
# Report the container type, the number of loaded items and the type of one
# element, then list the attributes of that element. Indexing docs[0] requires
# `docs` to be non-empty.
print(f'docs is of type {type(docs)} and length {len(docs)}; each element is of type {type(docs[0])}')
print(f'elements of type {type(docs[0])} have the following attributes: {dir(docs[0])}')

docs is of type <class 'list'> and length 12299; each element is of type <class 'langchain.schema.document.Document'>
elements of type <class 'langchain.schema.document.Document'> have the following attributes: ['Config', '__abstractmethods__', '__annotations__', '__class__', '__class_vars__', '__config__', '__custom_root_type__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__exclude_fields__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_validators__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__include_fields__', '__init__', '__init_subclass__', '__iter__', '__json_encoder__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__post_root_validators__', '__pre_root_validators__', '__pretty__', '__private_attributes__', '__reduce__', '__reduce_ex__', '__repr__', '__repr_args__', '__repr_name__', '__repr_str__', '__rich_repr__', '__schema_cache__', '__setattr__', '__setstate__', '__signature__', '__sizeof__', '__slots__', '__str__

In [None]:
from human