In [2]:
# # LOAD ORIGINAL BILLS

import json
from tqdm import tqdm
import random

# Read the bill corpus from a JSON Lines file (one JSON object per line).
parsed_data = []
first_50 = []
# NOTE(review): `all` shadows the built-in all() for the rest of the kernel
# session and is never filled in the code visible here; it is kept only in
# case a later cell relies on it -- consider renaming it.
all = []

# IDs of the hand-picked bills the rest of the notebook works with.
bill_names = [
    "US_118_SRES_391",
    "US_118_SRES_479",
    "US_118_S_183",
    "US_118_SRES_215",
    "US_118_HR_4310",
    "US_118_HRES_364",
    "US_118_HRES_715",
    "US_118_SRES_205",
    "US_118_HRES_642",
    "US_118_HRES_589",
    "US_118_HR_4999",
    "US_118_HR_5649",
    "US_118_HR_6264",
    "US_118_HR_1925",
    "US_118_S_3060",
    "US_118_S_102",
    "US_118_HRES_262",
    "US_118_S_1885",
    "US_118_HR_6038",
    "US_118_HRES_511",
    "US_118_HR_3925",
    "US_118_HRES_432",
    "US_118_S_2837",
    "US_118_SRES_102",
    "US_118_HR_5679",
]
# Bill fields: 'billId', 'title', 'introducedDate', 'billText', 'crsSummary', 'briefSummary', 'verboseSummary'
chosen_bills = []
# The encoding is explicit so the read does not depend on the platform default.
with open("data.jsons", "r", encoding="utf-8") as file:
    for line in tqdm(file, desc="Reading bills"):
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        json_object = json.loads(line)
        parsed_data.append(json_object)

# Keep bills whose text has fewer than 1000 whitespace-separated words and
# mentions "youth" (case-insensitive).
short_bills = [
    bill
    for bill in parsed_data
    if len(bill["billText"].split()) < 1000 and "youth" in bill["billText"].lower()
]
# A set gives constant-time membership tests; the order of short_bills is preserved.
wanted_bill_ids = set(bill_names)
chosen_bills = [bill for bill in short_bills if bill["billId"] in wanted_bill_ids]
print("Read! ✅")
# Print the title and billId of up to 25 randomly chosen short bills.
# random.sample raises ValueError when asked for more items than exist, so the
# sample size is capped at the number of matching bills.
random_bills = random.sample(short_bills, min(25, len(short_bills)))
for bill in random_bills:
    print(f"- {bill['title']} ({bill['billId']})")

Reading bills: 0it [00:00, ?it/s]

Reading bills: 11902it [00:00, 29858.45it/s]


Read! ✅
- Calling on the Secretary of Education to work with stakeholders to immediately eliminate race-based Native logos, mascots, and names from State educational institutions, and calling on State educational institutions and national sports franchises to cease the unsanctioned use of such logos, mascots, and names. (US_118_HRES_589)
- Agricultural Access to Addiction and Mental Health Care Act (US_118_HR_4382)
- Reentry Resource Guide Act of 2023 (US_118_HR_4404)
- Supporting the designation of April 2023 as the "Month of the Military Child". (US_118_HRES_301)
- Learn and Serve America Reinvestment Act (US_118_HR_5679)
- Recognizing August 11, 2023, as the 50th anniversary of hip-hop. (US_118_HRES_618)
- Supporting the designation of a "Boy Scouts of America Day" in celebration of its 113th anniversary. (US_118_HRES_111)
- ACCESS Act (US_118_HR_6043)
- Stop Human Trafficking in School Zones Act (US_118_HR_30)
- Protect Vulnerable Immigrant Youth Act (US_118_HR_4285)
- Youth Poison

In [3]:
# Write a Markdown document containing the title, billId, brief summary
# ('briefSummary') and full text ('billText') of every bill in chosen_bills.

# The encoding is explicit so non-ASCII characters in the bill text are written
# the same way on every platform instead of depending on the locale default.
with open("bills.md", "w", encoding="utf-8") as file:
    for bill in chosen_bills:
        file.write(f"# {bill['title']} ({bill['billId']})\n")
        file.write(f"## Summary\n{bill['briefSummary']}\n")
        file.write(f"## Bill Text\n{bill['billText']}\n")

In [4]:
from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
import json

# Switch between regenerating embeddings and loading them from disk; in the
# code visible here it is only referenced by the commented-out loader call.
gen_embeddings_from_scratch = False

# Pull variables from a local .env file into the process environment, then
# build the OpenAI client from OPENAI_API_KEY (None when the variable is unset).
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# Maximum number of characters of a bill field sent to the embedding API.
# NOTE(review): slicing a str counts characters, not tokens; 8191 presumably
# mirrors the embedding model's input limit -- confirm the intended unit.
MAX_EMBEDDING_INPUT_CHARS = 8191


def _gen_bill_embeddings_by_field(bills, field):
    """Embed one text field of every bill.

    Args:
        bills: iterable of bill dicts with 'billId', 'title' and `field` keys.
        field: name of the dict key whose text is embedded.

    Returns:
        A list of (billId, title, embedding) tuples, in the order of `bills`.
    """
    bill_embeddings = []
    for bill in tqdm(bills, desc="Generating embeddings"):
        embedding = gen_embedding(bill[field][:MAX_EMBEDDING_INPUT_CHARS])
        bill_embeddings.append((bill["billId"], bill["title"], embedding))
    return bill_embeddings


def gen_bill_embeddings_by_bill_text(bills):
    """Return (billId, title, embedding) tuples built from each bill's 'billText'."""
    return _gen_bill_embeddings_by_field(bills, "billText")


def gen_bill_embeddings_by_bill_summary(bills):
    """Return (billId, title, embedding) tuples built from each bill's 'briefSummary'."""
    return _gen_bill_embeddings_by_field(bills, "briefSummary")

def gen_embedding(text):
    """Embed `text` with the "text-embedding-3-large" model.

    Sends one request through the module-level `client` and returns the first
    embedding in the response as a NumPy array.
    """
    result = client.embeddings.create(model="text-embedding-3-large", input=text)
    vector = result.data[0].embedding
    return np.array(vector)


def load_embeddings_from_file(path="bill_embeddings.jsons"):
    """Load bill embeddings from a JSON Lines file.

    Each non-blank line must be a JSON object with 'billId', 'title' and
    'embedding' keys.

    Args:
        path: file to read; defaults to "bill_embeddings.jsons" in the
            current working directory.

    Returns:
        A list of (billId, title, embedding) tuples in file order, where
        embedding is a NumPy array.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the expected keys.
    """
    bill_embeddings = []
    # The encoding is explicit so titles with non-ASCII characters load
    # identically on every platform.
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline) rather than failing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            bill_embeddings.append(
                (
                    record["billId"],
                    record["title"],
                    np.array(record["embedding"]),
                )
            )
    return bill_embeddings


# # or load them
# if not gen_embeddings_from_scratch:
#     bill_embeddings = load_embeddings_from_file()
#     print("Embeddings loaded! ✅")


# Compute cosine similarity
def cosine_sim(embedding1, embedding2):
    """Return the cosine similarity of two 1-D embedding vectors as a scalar."""
    # cosine_similarity works on 2-D inputs, so each vector is wrapped as a
    # single-row matrix and the only entry of the 1x1 result is returned.
    pairwise = cosine_similarity([embedding1], [embedding2])
    return pairwise[0][0]


def search_bills(query, bill_embeddings, parsed_data, top_k=5):
    """Rank bills by cosine similarity between `query` and their embeddings.

    Args:
        query: free-text search string; it is embedded with gen_embedding.
        bill_embeddings: iterable of (billId, title, embedding) tuples.
        parsed_data: iterable of bill dicts, each with a 'billId' key.
            Embeddings whose billId has no matching bill are ignored.
        top_k: maximum number of results to return (default 5).

    Returns:
        A list of at most `top_k` (bill, similarity) tuples, most similar first.
    """
    query_embedding = gen_embedding(query)

    # Index the bills once so each embedding is matched with a dict lookup
    # instead of a linear scan; setdefault keeps the first bill seen per billId.
    bills_by_id = {}
    for bill in parsed_data:
        bills_by_id.setdefault(bill["billId"], bill)

    similarities = []
    for bill_id, _title, embedding in bill_embeddings:
        if bill_id not in bills_by_id:
            continue
        similarity = cosine_sim(query_embedding, embedding)
        similarities.append((bills_by_id[bill_id], similarity))

    # Highest similarity first; the sort is stable, so ties keep input order.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    return similarities[:top_k]

In [6]:
# Generate embeddings for chosen_bills twice: once from the full bill text and
# once from the brief summary, so the two rankings can be compared.
billTextEmbeddings = gen_bill_embeddings_by_bill_text(chosen_bills)
billSummaryEmbeddings = gen_bill_embeddings_by_bill_summary(chosen_bills)

# Interactive search loop; typing "exit" ends it.
while True:
    query = input("What's your search?")
    if query == "exit":
        break
    # search_bills embeds the query itself, so no separate embedding call is
    # made here (an extra one would only be an unused API request).
    print("Searching...")
    print("By Bill Text")
    bill_text_results = search_bills(query, billTextEmbeddings, chosen_bills)
    # Iterate over the results actually returned; there may be fewer than five.
    for rank, (bill, _similarity) in enumerate(bill_text_results, start=1):
        print(f"{rank}. {bill['title']} ({bill['billId']})")
    print("By Bill Summary")
    bill_summary_results = search_bills(query, billSummaryEmbeddings, chosen_bills)
    for rank, (bill, _similarity) in enumerate(bill_summary_results, start=1):
        print(f"{rank}. {bill['title']} ({bill['billId']})")


Generating embeddings: 100%|██████████| 25/25 [00:07<00:00,  3.27it/s]
Generating embeddings: 100%|██████████| 25/25 [00:05<00:00,  4.36it/s]


Searching...
By Bill Text
1. Students Helping Young Students Act of 2023 (US_118_S_183)
2. Dr. William W. Sullivan TRIO Upward Bound Student Stipend Support Act of 2023 (US_118_HR_6264)
3. Building Youth Workforce Skills Act (US_118_HR_5649)
4. Supporting the teaching of climate change in schools. (US_118_HRES_262)
5. Learn and Serve America Reinvestment Act (US_118_HR_5679)
By Bill Summary
1. Students Helping Young Students Act of 2023 (US_118_S_183)
2. Building Youth Workforce Skills Act (US_118_HR_5649)
3. Supporting the teaching of climate change in schools. (US_118_HRES_262)
4. A resolution designating November 2023 as "National Homeless Children and Youth Awareness Month". (US_118_SRES_479)
5. Calling on the United States and international donors to prioritize investments in children and youth in development and humanitarian assistance policies, programs, and activities. (US_118_HRES_715)
Searching...
By Bill Text
1. Supporting the teaching of climate change in schools. (US_118_H