In [1]:
import pandas as pd
import os
from search_engine import SearchEngine

# Search engine instance used by the retrieval calls further down
engine = SearchEngine()

# Display at most 10 rows, but never truncate columns, when showing frames
pd.options.display.max_rows = 10
pd.options.display.max_columns = None

In [2]:
# Load the match table
optimal = pd.read_csv('./crs_matches_with_congresses_12-7-23.csv')

# Strip newline characters out of the `match` values
optimal['match'] = optimal['match'].map(lambda name: name.replace('\n', ''))

# Derive the product number (the part of `path` before the first '.') and use
# it to look up each report's title
report_names = pd.read_csv('./SearchResults.csv')
optimal['ProductNumber'] = optimal['path'].map(lambda p: p.split('.')[0])
title_lookup = report_names[['ProductNumber', 'Title']]
optimal = optimal.merge(title_lookup, on='ProductNumber', how='inner')

# Show the 150 titles with the most rows, most frequent first
rows_per_title = optimal.groupby('Title').size()
rows_per_title.sort_values(ascending=False).head(150).index.values.tolist()

['Drug Pricing and Intellectual Property: The Legislative Landscape for the 117th Congress',
 "Changes in the House of Representatives' Initial Consideration of Regular Appropriations Measures, 113th-116th Congresses",
 'U.S. Sanctions: Legislation in the 117th Congress',
 'Unemployment Insurance: Legislative Issues in the 117th Congress, First Session',
 'Proposals for a COVID-19 Congressional Advisory Commission in the 117th Congress: A Comparative Analysis',
 'Tax Provisions in the Build Back Better Act: Rules Committee Print 117-18',
 'Immigration Legislation and Issues in the 117th Congress',
 'Workforce Innovation and Opportunity Act of 2022 (H.R. 7309)',
 'Senate Finance Committee Tax Provisions in the Build Back Better Act',
 'Department of Homeland Security Appropriations: FY2022',
 'Build Back Better Act (BBBA) Health Coverage Provisions: House-Passed and Senate-Released Language',
 'Alternative Fuels and Vehicles: Legislative Proposals',
 'Commander John Scott Hannon Veteran

In [3]:
# Map CRS report titles to manually created search queries.
# Keys are full report titles (they are joined against the `Title` column
# later on, so they must match it exactly); values are lists of query strings.
queries = {
    'Drug Pricing and Intellectual Property: The Legislative Landscape for the 117th Congress': [
        'drug pricing and intellectual property',
        'drug pricing',
        'intellectual property'
    ],
    'U.S. Sanctions: Legislation in the 117th Congress': [
        'us sanctions',
        'u.s. sanctions',
        'united states sanctions'
    ],
    'Unemployment Insurance: Legislative Issues in the 117th Congress, First Session': [
        'unemployment insurance'
    ],
    'Tax Provisions in the Build Back Better Act: Rules Committee Print 117-18': [
        'tax provisions build back better act',
        'build back better act',
        'build back better',
        'tax provisions in the build back better act, rules committee',
        'build back better rules committee'
    ],
    'Immigration Legislation and Issues in the 117th Congress': [
        'immigration'
    ],
    'Workforce Innovation and Opportunity Act of 2022 (H.R. 7309)': [
        'workforce innovation and opportunity act of 2022',
        'workforce innovation and opportunity act',
        'workforce innovation and opportunity',
        # NOTE(review): 'workplace' (not 'workforce') -- confirm this variant is intentional
        'workplace innovation',
        'H.R. 7309'
    ],
    'Department of Homeland Security Appropriations: FY2022': [
        'department of homeland security appropriations',
        'homeland security',
        'homeland security appropriations',
        'homeland security FY2022',
        'homeland security 2022'
    ],
    'Alternative Fuels and Vehicles: Legislative Proposals': [
        'alternative fuels and vehicles',
        'alternative fuels',
    ],
    'Voter Registration Records and List Maintenance for Federal Elections': [
        'voter registration',
        'voter registration records for federal election',
        'federal voter registration',
        'voter registration records and list maintenance'
    ],
    'Federal Firearms Law: Selected Developments in the Executive, Legislative, and Judicial Branches': [
        'firearms law',
        'federal firearms law'
    ],
    'Federal Research and Development (R&D) Funding: FY2022': [
        'federal r&d',
        'federal research and development',
        'r&d funding 2022'
    ],
    'Energy and Water Development: FY2022 Appropriations': [
        'energy and water development',
        'energy and water appropriations'
    ],
    'Advanced Research Projects Agency for Health (ARPA-H): Congressional Action and Selected Policy Issues': [
        'arpa-h',
        'advanced research projects agency for health',
        'arpa health'
    ],
    'Climate Change Adaptation: Department of Commerce': [
        'climate change adaptation',
        'climate change',
        'climate change adaptation department of commerce',
        'climate adaptation commerce department'
    ],
    'Marine Harmful Algal Blooms (HABs): Background, Statutory Authorities, and Issues for Congress': [
        'marine harmful algal blooms',
        'harmful algae',
        'marine harmful algal blooms statutory authorities'
    ],
    'Expanding Broadband: Potential Role of Municipal Networks to Address the Digital Divide': [
        'municipal networks digital divide',
        'municipal networks',
        'expanding broadband digital divide',
        'municipal broadband'
    ],
    # NOTE(review): this key ends with a trailing space -- presumably to match
    # the title exactly as it appears in the source data; confirm before "fixing".
    'Food Insecurity Among College Students: Background and Policy Options ': [
        'food insecurity',
        'food insecurity college students'
    ],
    'Dam Removal and the Federal Role': [
        'dam removal'
    ],
    'Capital Gains Taxes: An Overview of the Issues': [
        'capital gains taxes'
    ],
    'Pipeline Cybersecurity: Federal Programs': [
        'pipeline cybersecurity'
    ],
    "Puerto Rico's Public Debts: Accumulation and Restructuring": [
        'puerto rico debt',
        'puerto rico public debt',
        'puerto rico public debt restructuring'
    ]
    
}

In [4]:
# Flatten the title -> queries mapping into one (Title, Query) pair per query
data = []
for report_title, title_queries in queries.items():
    for query_text in title_queries:
        data.append((report_title, query_text))

manual = pd.DataFrame(data, columns=['Title', 'Query'])

In [5]:
# Preview the first five (Title, Query) pairs
manual.iloc[:5]

Unnamed: 0,Title,Query
0,Drug Pricing and Intellectual Property: The Le...,drug pricing and intellectual property
1,Drug Pricing and Intellectual Property: The Le...,drug pricing
2,Drug Pricing and Intellectual Property: The Le...,intellectual property
3,U.S. Sanctions: Legislation in the 117th Congress,us sanctions
4,U.S. Sanctions: Legislation in the 117th Congress,u.s. sanctions


In [6]:
# Build a key laid out like the one in the results dataframes:
# 'hr_' + last space-separated token of `match` + '_' + `congress_match`
bill_numbers = optimal['match'].map(lambda name: name.split(' ')[-1])
optimal['primary_key'] = 'hr_' + bill_numbers + '_' + optimal['congress_match']

# Attach the manually-generated queries by title and keep only the rows that
# received one. Note: this reassigns `optimal`, so running the cell a second
# time merges `manual` onto the already-merged frame again.
optimal = optimal.merge(manual, on='Title', how='left').dropna(subset=['Query'])

Next, go to congress.gov and enter each of these queries manually. Restrict the results to H.R. bills only and to the previous 10 congresses; otherwise there will be too many results to bulk download, because congress.gov limits bulk downloads to 5,000 results.

Search all fields, including bill text.

Even with those filters, the following queries return too many results on congress.gov to bulk download:
* federal r&d   # FAILED
* federal research and development  # FAILED
* homeland security   # FAILED
* immigration   # FAILED

In [7]:
# Report how many distinct queries remain, then list them one per line
query_column = optimal['Query']
print(query_column.nunique())

distinct_queries = query_column.unique().tolist()
for q in distinct_queries:
    print(q)

58
drug pricing and intellectual property
drug pricing
intellectual property
climate change adaptation
climate change
climate change adaptation department of commerce
climate adaptation commerce department
puerto rico debt
puerto rico public debt
puerto rico public debt restructuring
unemployment insurance
food insecurity
food insecurity college students
energy and water development
energy and water appropriations
alternative fuels and vehicles
alternative fuels
federal r&d
federal research and development
r&d funding 2022
pipeline cybersecurity
marine harmful algal blooms
harmful algae
marine harmful algal blooms statutory authorities
voter registration
voter registration records for federal election
federal voter registration
voter registration records and list maintenance
dam removal
firearms law
federal firearms law
tax provisions build back better act
build back better act
build back better
tax provisions in the build back better act, rules committee
build back better rules commit

In [8]:
import numpy as np

def calculate_map(retrieved_docs_df, relevant_docs):
    """Compute the average precision of a single ranked result list.

    Precision is recorded at the rank of every retrieved row whose
    ``primary_key`` is relevant; every relevant document that was never
    retrieved contributes a precision of 0. The mean of those values is
    returned.

    Parameters
    ----------
    retrieved_docs_df : pandas.DataFrame
        Ranked results, best first, with a ``primary_key`` column. Rank is
        taken from row *position*, so the frame's index labels are irrelevant
        (a filtered or re-indexed frame is scored correctly).
    relevant_docs : iterable
        Primary keys of the relevant documents (any iterable; coerced to a set).

    Returns
    -------
    float
        Average precision, or 0.0 when ``relevant_docs`` is empty (there is
        nothing to find, and averaging an empty list would yield nan).
    """
    relevant_docs = set(relevant_docs)
    if not relevant_docs:
        return 0.0

    retrieved_keys = retrieved_docs_df['primary_key'].tolist()

    rel_ret_at_k = 0
    precision = []
    # enumerate() gives the 1-based positional rank, independent of the index.
    for rank, key in enumerate(retrieved_keys, start=1):
        if key in relevant_docs:
            rel_ret_at_k += 1
            precision.append(rel_ret_at_k / rank)

    # Relevant documents that never appear in the ranking score 0.
    missing = relevant_docs - set(retrieved_keys)
    precision.extend([0] * len(missing))

    return np.mean(precision)

In [9]:
# Per-query metric accumulators. Every evaluated query appends exactly one
# value to each list, so the lists stay aligned and can be averaged afterwards.
bert_mrrs = []
congress_mrrs = []

bert_p50s = []
congress_p50s = []
bert_p500s = []
congress_p500s = []

bert_r50s = []
congress_r50s = []
bert_r500s = []
congress_r500s = []

bert_map = []
congress_map = []

SEARCH_DIR = './congress_gov_searches/'
MAX_RESULTS = 1000  # number of results kept from / requested of each system

# PARAMETERIZATION BUG THAT NEEDS TO BE ADDRESSED: these queries are skipped
# until it is fixed.
SKIPPED_QUERIES = ['arpa-h', 'r&d funding 2022']


def _relevant_in_top_k(ranked, relevant_docs, k):
    """Count the distinct relevant primary keys within the first k rows of `ranked`."""
    top_k = ranked[:k]
    return top_k[top_k.primary_key.isin(relevant_docs)].primary_key.nunique()


def _score_ranking(ranked, relevant_docs, n_relevant):
    """Score one ranked result frame against the relevant primary keys.

    Returns a dict with the keys 'mrr', 'p50', 'p500', 'r50', 'r500' and 'map'.
    When no relevant document appears anywhere in `ranked`, every metric is 0.
    `n_relevant` is the number of distinct relevant documents (recall denominator).
    """
    hits = ranked[ranked.primary_key.isin(relevant_docs)]
    if len(hits) == 0:
        return {'mrr': 0, 'p50': 0, 'p500': 0, 'r50': 0, 'r500': 0, 'map': 0}

    found_at_50 = _relevant_in_top_k(ranked, relevant_docs, 50)
    found_at_500 = _relevant_in_top_k(ranked, relevant_docs, 500)
    return {
        # The index label of the first hit is used as its 0-based rank; both
        # frames scored below keep their original 0-based row numbering.
        'mrr': 1 / (1 + hits.index.values[0]),
        'p50': found_at_50 / 50,
        'p500': found_at_500 / 500,
        'r50': found_at_50 / n_relevant,
        'r500': found_at_500 / n_relevant,
        'map': calculate_map(ranked, set(relevant_docs)),
    }


# os.listdir returns entries in arbitrary order and may include non-CSV
# entries; sort for a reproducible run and skip anything that is not a CSV.
result_files = sorted(
    name for name in os.listdir(SEARCH_DIR) if name.endswith('.csv')
)

for i, path in enumerate(result_files):
    query = path.split('.csv')[0]

    if query in SKIPPED_QUERIES:
        continue

    print(f'\nProcessing query {i}:', query)

    # The first two lines of each downloaded file are skipped before the header row.
    congress_gov = pd.read_csv(f'{SEARCH_DIR}{query}.csv', skiprows=2)
    # Keep the top results; .copy() so that adding a column below does not
    # write into a slice of the frame returned by read_csv.
    congress_gov = congress_gov[:MAX_RESULTS].copy()
    congress_gov['primary_key'] = 'hr_' + congress_gov['Legislation Number'] \
        .apply(lambda x: x.lower().split(' ')[1]) + '_' + congress_gov['Congress'].apply(lambda x: x.split(' ')[0])

    params = {
        'query': query,
        'number_to_return': MAX_RESULTS,
        'exact_match_strings': [],
        'date_range': {
            'start_year': 1970,
            'start_month': 1,
            'start_day': 1,
            'end_year': 2050,
            'end_month': 12,
            'end_day': 31
        },
        'get_sponsors': False,
        'chamber': 'any',
        'require_bipartisan': False
    }

    bert_result = engine.retrieve_summary(params)
    bert = pd.DataFrame(bert_result)
    # NOTE(review): the hard-coded 'th' suffix only forms a valid ordinal for
    # congress numbers that end in 'th' -- confirm it matches `congress_match`
    # for every congress in the data.
    bert['primary_key'] = 'hr_' + bert.bill_number.astype(str) + '_' + bert.congress.astype(str) + 'th'

    # Ground truth for this query
    optimal_this_query = optimal[optimal['Query'] == query]
    relevant_docs = optimal_this_query.primary_key.values
    n_relevant = optimal_this_query.primary_key.nunique()

    bert_scores = _score_ranking(bert, relevant_docs, n_relevant)
    congress_scores = _score_ranking(congress_gov, relevant_docs, n_relevant)

    bert_mrrs.append(bert_scores['mrr'])
    congress_mrrs.append(congress_scores['mrr'])

    bert_p50s.append(bert_scores['p50'])
    congress_p50s.append(congress_scores['p50'])
    bert_p500s.append(bert_scores['p500'])
    congress_p500s.append(congress_scores['p500'])

    bert_r50s.append(bert_scores['r50'])
    congress_r50s.append(congress_scores['r50'])
    bert_r500s.append(bert_scores['r500'])
    congress_r500s.append(congress_scores['r500'])

    bert_map.append(bert_scores['map'])
    congress_map.append(congress_scores['map'])


Processing query 0: advanced research projects agency for health
select ft.id as id 
 from full_texts ft  
         join (
            select ft_id, bm25(congress_bm25) as score
            from congress_bm25
            where congress_bm25 match ? 
            order by score
            limit ?
        ) as bm25_ft on ft.id = bm25_ft.ft_id 
 where ft.file_chamber = 'hr' 
 limit ? 
['project OR research OR advanc OR agenc OR health', '500', '500']
Get initial docs: 1.5150096416473389
Get precomputed embeddings: 0.45245885848999023
Clean query: 0.0
Embed query: 0.0458829402923584
Score docs: 0.01630687713623047
BERT rerank: 0.5417466163635254
Get remaining data: 0.22310209274291992
Get sponsors: 0.0

Processing query 1: alternative fuels and vehicles
select ft.id as id 
 from full_texts ft  
         join (
            select ft_id, bm25(congress_bm25) as score
            from congress_bm25
            where congress_bm25 match ? 
            order by score
            limit ?
       

In [10]:
import numpy as np

# One row per reported metric: (label, BERT per-query values, congress.gov per-query values)
metric_table = [
    ('MRR', bert_mrrs, congress_mrrs),
    ('P@50', bert_p50s, congress_p50s),
    ('P@500', bert_p500s, congress_p500s),
    ('R@50', bert_r50s, congress_r50s),
    ('R@500', bert_r500s, congress_r500s),
    ('MAP', bert_map, congress_map),
]

# Mean of every metric across queries, BERT first ...
for label, bert_values, congress_values in metric_table:
    print(f'BERT {label}:', np.mean(bert_values))

# ... then a blank separator line followed by the congress.gov figures
print()
for label, bert_values, congress_values in metric_table:
    print(f'CONGRESS {label}:', np.mean(congress_values))


BERT MRR: 0.1614843050513482
BERT P@50: 0.035
BERT P@500: 0.01903846153846154
BERT R@50: 0.14474621544378685
BERT R@500: 0.4170194789573528
BERT MAP: 0.04109991547255217

CONGRESS MRR: 0.07289855045796147
CONGRESS P@50: 0.03461538461538462
CONGRESS P@500: 0.024653846153846155
CONGRESS R@50: 0.1929614840788766
CONGRESS R@500: 0.46625704698187226
CONGRESS MAP: 0.05013851358506685
