In [43]:
'''
This is a slightly modified version of a script written for the hyperdemocracy 2023-06 workshop session.
This notebook grabs a Hugging Face dataset containing info on U.S. Congress bills, does some inspection and processing,
and creates a simple graph using the networkx module.
'''

from datasets import load_dataset
import pandas as pd
from bs4 import BeautifulSoup
import re
import openai
import networkx as nx

In [4]:
def load_assembly_records(
    ds_name="hyperdemocracy/us-congress-bills",
    process=True,
    strip_html=True,
    remove_empty_body=True,
    col_order=None
) -> pd.DataFrame:
    '''
    Load the "train" split of a Hugging Face dataset into a pandas DataFrame
    and optionally clean it up.

    Parameters
    ----------
    ds_name : str
        Dataset identifier passed to ``load_dataset``.
    process : bool
        If True, split every value of the ``id`` column with ``split_key`` and
        store the three pieces in new ``congress_num``, ``legis_class`` and
        ``legis_num`` columns.
    strip_html : bool
        If True, replace each string in ``summary_text`` with the plain text
        BeautifulSoup's ``html.parser`` extracts from it. Non-string values
        (e.g. a missing summary) are left untouched.
    remove_empty_body : bool
        If True, drop rows whose ``text`` or ``summary_text`` is missing or
        an empty string.
    col_order : list of str, optional
        Column names to place first, in the given order. All other columns
        follow in their original relative order.

    Returns
    -------
    pd.DataFrame
        The loaded (and optionally processed) records.

    Raises
    ------
    ValueError
        If ``col_order`` names a column that is not in the frame, or names
        the same column more than once.
    '''
    ds = load_dataset(ds_name, split="train")  # Load the train split of the dataset
    df = ds.to_pandas()  # convert to pd dataframe

    if process:
        # Parse each id exactly once and assign whole columns in one go;
        # writing cell by cell through df.loc inside iterrows is far slower.
        parsed_ids = [split_key(bill_id) for bill_id in df['id']]
        for position, column in enumerate(('congress_num', 'legis_class', 'legis_num')):
            df[column] = [parts[position] for parts in parsed_ids]

    if remove_empty_body:
        # A missing value compares unequal to '', so `!= ''` alone would keep
        # it; test for missing and empty explicitly.
        has_text = df['text'].notna() & (df['text'] != '')
        has_summary = df['summary_text'].notna() & (df['summary_text'] != '')
        # .copy() so the column assignment below does not write into a slice
        df = df[has_text & has_summary].copy()

    if strip_html:
        # The bill's text is already in the "text" column, so only the
        # summary is run through BeautifulSoup here.
        df['summary_text'] = df['summary_text'].apply(
            lambda x: BeautifulSoup(x, 'html.parser').get_text() if isinstance(x, str) else x
        )
        # The dataset already carries a congress gov url, so none is built here.

    # Reorder columns: the requested ones first, in the passed order
    if col_order is not None:
        leading = []
        for col in col_order:
            if col not in df.columns:
                raise ValueError(f"Column {col} not in dataframe.")
            if col in leading:
                raise ValueError(f"Column {col} listed more than once in col_order.")
            leading.append(col)
        # Keep the remaining columns in their original order so the result
        # is deterministic (iterating a set would not be).
        trailing = [col for col in df.columns if col not in leading]
        df = df[leading + trailing]

    return df

def url_from_key(key):
    """Return congress.gov url from key.

    The key is split with ``split_key`` into a congress number, a
    legislation class and a legislation number. The class is looked up
    case-insensitively in the table below.

    Raises KeyError if the legislation class is not one of the known classes.
    """
    # TODO add assembled url builder option here as well
    url_map = {
        "HR": "house-bill",
        "HCONRES": "house-concurrent-resolution",
        "HRES": "house-resolution",
        "HJRES": "house-joint-resolution",
        "S": "senate-bill",
        "SCONRES": "senate-concurrent-resolution",
        "SRES": "senate-resolution",
        "SJRES": "senate-joint-resolution",
    }
    congress_num, legis_class, legis_num = split_key(key)
    url_legis_class = url_map[legis_class.upper()]

    # The URL uses an English ordinal ("101st", "102nd", "103rd", "118th"),
    # so the suffix has to be derived from the number instead of always "th".
    number = int(congress_num)
    if 10 <= number % 100 <= 20:
        # 11th, 12th, 13th ... are irregular: always "th"
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")

    url = f"https://www.congress.gov/bill/{congress_num}{suffix}-congress/{url_legis_class}/{legis_num}"
    return url

def split_key(key):
    """
    Split a bill key into (congress_num, legis_class, legis_num).

    The key must start with one or more digits, followed by one or more
    non-digit characters, followed by one or more digits, for example
    "118HR1234" -> ("118", "HR", "1234"). Anything after the second run of
    digits is ignored. All three parts are returned as strings.

    Raises ValueError if the key does not start with that pattern.

    TODO: add a link explaining this notation and variable names
    """
    # Raw string: in a plain literal "\d" is an invalid escape sequence,
    # which current Python versions warn about.
    match = re.match(r"(\d+)(\D+)(\d+)", key)
    if match is None:
        # Without this check the failure would surface as an opaque
        # AttributeError on None.groups().
        raise ValueError(
            f"Cannot split key {key!r}: expected digits, then non-digits, then digits."
        )
    congress_num, legis_class, legis_num = match.groups()
    return congress_num, legis_class, legis_num

In [5]:
# Name of the dataset to inspect
ds_name = "hyperdemocracy/us-congress-bills"

# Fetch only the train split of that dataset
ds = load_dataset(ds_name, split="train")

# Work with the records as a pandas dataframe from here on
df = ds.to_pandas()

In [36]:
'''
First, find out which columns hold values that are not what they are supposed to be.
To know this, we want to see if any column contains more than one datatype.
'''

# Map every cell to its Python type, column by column. Series.map is used
# because DataFrame.applymap is deprecated in recent pandas releases.
df_types = df.apply(lambda column: column.map(type))
for c in df_types.columns:
    print(c, df_types[c].unique())

  df_types = df.applymap(type)


id [<class 'str'>]
title [<class 'str'>]
congress [<class 'int'>]
type [<class 'str'>]
number [<class 'int'>]
origin_chamber [<class 'str'>]
sponsors [<class 'numpy.ndarray'>]
cosponsors [<class 'numpy.ndarray'>]
congress_gov_url [<class 'str'>]
govtrack_url [<class 'str'>]
summary_text [<class 'str'> <class 'NoneType'>]
summary_meta [<class 'dict'> <class 'NoneType'>]
subjects [<class 'numpy.ndarray'>]
policy_area [<class 'str'> <class 'NoneType'>]
bill [<class 'dict'>]
metadata_xml [<class 'str'>]
text_type [<class 'str'> <class 'NoneType'>]
text_date [<class 'str'> <class 'NoneType'>]
text_url [<class 'str'> <class 'NoneType'>]
text_xml [<class 'str'> <class 'NoneType'>]
text [<class 'str'> <class 'NoneType'>]


In [35]:
'''
From the output above, we know summary, policy area, text type and some other columns may be empty.
Now to do some inspecting, looking at entries with None in them.

From the output below, we know a majority of bills don't have a summary. They do, however, have text available.
'''

# isna() is the vectorized way to select missing values (None included)
df_none_summary_text = df[df['summary_text'].isna()]
print(f"No. of bills without summaries: {len(df_none_summary_text)}")

No. of bills without summaries: 5268


19      2023-03-23T04:00:00+00:00
21      2023-03-28T04:00:00+00:00
24      2023-03-30T04:00:00+00:00
26      2023-04-13T04:00:00+00:00
27      2023-04-25T04:00:00+00:00
                  ...            
9267    2023-09-07T04:00:00+00:00
9268    2023-09-07T04:00:00+00:00
9328    2023-03-01T05:00:00+00:00
9333    2023-03-02T05:00:00+00:00
9336    2023-03-07T05:00:00+00:00
Name: text_date, Length: 5268, dtype: object

In [33]:
'''
Checking bills without text shows that 71 bills don't have their text scanned yet. At least, that's the assumption as to why
the text fields of these bills are empty. Without text, naturally, there is no summary for these bills.
'''

# A bill counts as "without text" when its text_type is missing;
# isna() is the vectorized way to select missing values (None included).
df_no_text = df[df['text_type'].isna()]
print(f"Number of bills without text: {len(df_no_text)}")

Number of bills without text: 71


In [1]:
'''
Do dataframe processing, same as in the py notebook.
Needs df (loaded above) and split_key to be defined before this cell runs.
'''

# Parse every id once, then fill each new column in a single assignment;
# writing cell by cell through df.loc inside iterrows is far slower.
parsed_ids = [split_key(bill_id) for bill_id in df['id']]
df['congress_num'] = [parts[0] for parts in parsed_ids]
df['legis_class'] = [parts[1] for parts in parsed_ids]
df['legis_num'] = [parts[2] for parts in parsed_ids]

# Filter out all bills without text or without a summary.
# .copy() keeps later column assignments (e.g. re-running this cell)
# from writing into a slice of the unfiltered frame.
df = df[df['text_type'].notna() & df['summary_text'].notna()].copy()


NameError: name 'df' is not defined

In [45]:

'''
Build a graph from df: every bill becomes a "record" node, every sponsor a
"person" node, and each bill is linked to its sponsors by a "sponsor" edge
'''

G = nx.Graph()
for _, record in df.iterrows():
    bill_id = record['id']
    # Bill node, labelled with the bill title
    G.add_node(bill_id, kind="record", name=record["title"])

    # Sponsor nodes are keyed by bioguideId, so a person sponsoring several
    # bills ends up as a single node with one edge per bill
    for sponsor in record['sponsors']:
        person_id = sponsor['bioguideId']
        G.add_node(person_id, name_tag=sponsor['fullName'], kind="person")
        G.add_edge(bill_id, person_id, kind='sponsor')

    # Make cosponsor nodes TODO


