In [40]:
# # LOAD ORIGINAL BILLS

import json
from tqdm import tqdm
import random

# Read data from JSON
parsed_data = []
first_50 = []
# NOTE(review): `all` shadows the builtin all() for the rest of the kernel session and is
# not used by the visible cells. It is kept only in case a later cell relies on it;
# rename or remove it once that is confirmed.
all = []

# Bill names
bill_names = [
    "US_118_SRES_391",
    "US_118_SRES_479",
    "US_118_S_183",
    "US_118_SRES_215",
    "US_118_HR_4310",
    "US_118_HRES_364",
    "US_118_HRES_715",
    "US_118_SRES_205",
    "US_118_HRES_642",
    "US_118_HRES_589",
    "US_118_HR_4999",
    "US_118_HR_5649",
    "US_118_HR_6264",
    "US_118_HR_1925",
    "US_118_S_3060",
    "US_118_S_102",
    "US_118_HRES_262",
    "US_118_S_1885",
    "US_118_HR_6038",
    "US_118_HRES_511",
    "US_118_HR_3925",
    "US_118_HRES_432",
    "US_118_S_2837",
    "US_118_SRES_102",
    "US_118_HR_5679",
]
# Bill fields: 'billId', 'title', 'introducedDate', 'billText', 'crsSummary', 'briefSummary', 'verboseSummary'
chosen_bills = []
# Every non-blank line of data.jsons is parsed as one JSON object (one bill per line).
# The encoding is spelled out so the read does not depend on the platform default.
with open("data.jsons", "r", encoding="utf-8") as file:
    for line in tqdm(file, desc="Reading bills"):
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        json_object = json.loads(line)
        parsed_data.append(json_object)

# Keep bills whose text is shorter than 1000 words and mentions the word "youth".
short_bills = [
    bill
    for bill in parsed_data
    if len(bill["billText"].split()) < 1000 and "youth" in bill["billText"].lower()
]
# Of those, keep only the hand-picked bills listed in bill_names.
chosen_bill_ids = set(bill_names)
chosen_bills = [bill for bill in short_bills if bill["billId"] in chosen_bill_ids]
print("Read! ✅")

# Print the title and billId of up to 25 randomly chosen bills from the short bills list.
# random.sample raises ValueError when asked for more items than the population holds,
# so the sample size is capped at the number of available bills.
random_bills = random.sample(short_bills, min(25, len(short_bills)))
for bill in random_bills:
    print(f"- {bill['title']} ({bill['billId']})")

Reading bills: 11902it [00:00, 30820.06it/s]


Read! ✅
- Expressing support for America's Black workers and affirming the need to pass legislation to reduce inequalities and discrimination in the workforce. (US_118_HRES_182)
- Expanded Coverage for Former Foster Youth Act (US_118_S_2837)
- Find and Protect Foster Youth Act (US_118_S_1146)
- Expressing support for the designation of September 2023 as "National Kinship Care Month". (US_118_HRES_694)
- Youth Coastal Fishing Program Act of 2023 (US_118_S_1860)
- Agricultural Access to Substance Use Disorder Treatment and Mental Health Care Act of 2023 (US_118_S_3206)
- Reentry Resource Guide Act of 2023 (US_118_HR_4404)
- ALYSSA Act (US_118_HR_4999)
- Foster Care Stabilization Act of 2023 (US_118_S_102)
- Improving CARE for Youth Act (US_118_S_2556)
- Learn and Serve America Reinvestment Act (US_118_HR_5679)
- Expressing support for the goals of Sports Eye Safety Month by promoting the importance of playing sports with the proper protective eyewear. (US_118_HRES_364)
- ACADEMIC Act of 

In [41]:
# Write a markdown document with the title, billId, brief summary, and bill text
# for every bill in chosen_bills.

# The encoding is explicit so the write does not depend on the platform default
# (bill text may contain non-ASCII characters).
with open("bills.md", "w", encoding="utf-8") as file:
    for bill in chosen_bills:
        file.write(f"# {bill['title']} ({bill['billId']})\n")
        file.write(f"## Summary\n{bill['briefSummary']}\n")
        file.write(f"## Bill Text\n{bill['billText']}\n")

In [42]:
from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
import json

# Load environment variables via python-dotenv, then build the OpenAI client
# from the OPENAI_API_KEY variable (None is passed through if it is unset).
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def _embed_and_save(bills, field, output_path, desc, max_chars):
    """Embed ``bill[field]`` for each bill and append every result to ``output_path``.

    Shared implementation behind the two public ``gen_bill_embeddings_by_*`` functions.

    Args:
        bills: iterable of bill dicts; each must provide "billId", "title" and ``field``.
        field: key of the text to embed (e.g. "billText").
        output_path: JSON-lines file that receives one record per bill (append mode).
        desc: label shown on the tqdm progress bar.
        max_chars: the text is sliced to this many characters before embedding.

    Returns:
        List of ``(billId, title, embedding)`` tuples in input order, where
        ``embedding`` is whatever ``gen_embedding`` returns.
    """
    bill_embeddings = []
    # Opened once in append mode rather than once per bill. Note this creates the
    # file even when ``bills`` is empty.
    with open(output_path, "a", encoding="utf-8") as file:
        for bill in tqdm(bills, desc=desc):
            # The slice counts characters, not tokens.
            embedding = gen_embedding(bill[field][:max_chars])
            bill_embeddings.append((bill["billId"], bill["title"], embedding))

            # Persist each embedding as soon as it is computed.
            json.dump(
                {
                    "billId": bill["billId"],
                    "title": bill["title"],
                    "embedding": embedding.tolist(),
                },
                file,
            )
            file.write("\n")
            # Flush per record so finished work reaches disk even if a later call fails.
            file.flush()
    return bill_embeddings


def gen_bill_embeddings_by_bill_text(
    bills, output_path="bill_embedding_text_subset.jsons", max_chars=8191
):
    """Embed the ``billText`` of every bill and append the results to ``output_path``.

    Args:
        bills: iterable of bill dicts with "billId", "title" and "billText".
        output_path: JSON-lines file to append to.
        max_chars: number of leading characters of the text that are embedded.
            NOTE(review): 8191 presumably mirrors the embedding model's token
            limit, but this is a character count - confirm the intent.

    Returns:
        List of ``(billId, title, embedding)`` tuples.
    """
    return _embed_and_save(bills, "billText", output_path, "Bill text embed", max_chars)


def gen_bill_embeddings_by_bill_summary(
    bills, output_path="bill_embedding_text_summary.jsons", max_chars=8191
):
    """Embed the ``briefSummary`` of every bill and append the results to ``output_path``.

    Args:
        bills: iterable of bill dicts with "billId", "title" and "briefSummary".
        output_path: JSON-lines file to append to.
        max_chars: number of leading characters of the summary that are embedded.

    Returns:
        List of ``(billId, title, embedding)`` tuples.
    """
    return _embed_and_save(
        bills, "briefSummary", output_path, "Bill summary embed", max_chars
    )


def gen_embedding(text, model="text-embedding-3-large"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: the string to embed.
        model: name of the embedding model. Vectors produced by different models
            should not be compared with each other, so use the same model for
            stored bill embeddings and for queries.

    Returns:
        A numpy array built from the first item of the response's ``data`` list.
    """
    response = client.embeddings.create(
        input=text,
        model=model,
    )
    # Only the first returned item is used; one input string is sent per call.
    return np.array(response.data[0].embedding)


def load_embeddings_from_file(path):
    """Load bill embeddings from a JSON-lines file.

    Each non-blank line must be a JSON object with the keys "billId", "title"
    and "embedding" (a list of numbers).

    Args:
        path: location of the JSON-lines file.

    Returns:
        List of ``(billId, title, embedding)`` tuples in file order, with
        ``embedding`` converted to a numpy array.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the expected keys.
    """
    bill_embeddings = []
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            bill_embeddings.append(
                (
                    record["billId"],
                    record["title"],
                    np.array(record["embedding"]),
                )
            )
    return bill_embeddings


# # or load them (load_embeddings_from_file requires the path of the saved file)
# if not gen_embeddings_from_scratch:
#     bill_embeddings = load_embeddings_from_file("bill_embedding_text_subset.jsons")
#     print("Embeddings loaded! ✅")


# Cosine similarity between two embedding vectors
def cosine_sim(embedding1, embedding2):
    """Return the cosine similarity of two vectors as a single number."""
    # Each vector is wrapped in a list so it is passed as a one-row matrix;
    # the only entry of the resulting matrix is the value we want.
    similarity_matrix = cosine_similarity([embedding1], [embedding2])
    first_row = similarity_matrix[0]
    return first_row[0]


def search_bills(query, bill_embeddings, parsed_data, top_k=5):
    """Rank bills by cosine similarity between their embedding and the query's.

    Args:
        query: free-text search string; it is embedded with ``gen_embedding``.
        bill_embeddings: iterable of ``(billId, title, embedding)`` tuples.
        parsed_data: iterable of bill dicts, each with a "billId" key. Embeddings
            whose id has no matching bill here are ignored.
        top_k: maximum number of results to return (default 5).

    Returns:
        List of at most ``top_k`` ``(bill, similarity)`` tuples, most similar first.
    """
    query_embedding = gen_embedding(query)

    # Index the bills by id once instead of scanning parsed_data for every embedding.
    # setdefault keeps the first bill seen for a repeated id, matching a first-match scan.
    bills_by_id = {}
    for bill in parsed_data:
        bills_by_id.setdefault(bill["billId"], bill)

    similarities = []
    for bill_id, _title, embedding in bill_embeddings:
        bill = bills_by_id.get(bill_id)
        if bill is None:
            continue
        similarities.append((bill, cosine_sim(query_embedding, embedding)))

    # Sort by similarity, highest first
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    # Return the most relevant bills with their entire objects
    return similarities[:top_k]

In [43]:
gen_embeddings_from_scratch = True

TEXT_EMBEDDINGS_PATH = "bill_embedding_text_subset.jsons"
SUMMARY_EMBEDDINGS_PATH = "bill_embedding_text_summary.jsons"


def _print_results(heading, results):
    """Print ``heading`` followed by one ranked line per ``(bill, similarity)`` result."""
    print(heading)
    # Iterate over what was actually returned; there may be fewer than five results.
    for rank, (bill, similarity) in enumerate(results, start=1):
        print(f"{rank}. {bill['title']} ({bill['billId']})\t {similarity:.4f}")


# generate embeddings for chosen_bills
if gen_embeddings_from_scratch:
    # The generator functions append to these files, so stale copies are removed first.
    for embeddings_path in (TEXT_EMBEDDINGS_PATH, SUMMARY_EMBEDDINGS_PATH):
        if os.path.exists(embeddings_path):
            os.remove(embeddings_path)
    billTextEmbeddings = gen_bill_embeddings_by_bill_text(chosen_bills)
    billSummaryEmbeddings = gen_bill_embeddings_by_bill_summary(chosen_bills)
else:
    billTextEmbeddings = load_embeddings_from_file(TEXT_EMBEDDINGS_PATH)
    billSummaryEmbeddings = load_embeddings_from_file(SUMMARY_EMBEDDINGS_PATH)

# Interactive search loop: type "exit" to stop.
# NOTE(review): input() blocks "Run All"; consider moving the queries into a list.
while True:
    query = input("What's your search?")
    if query == "exit":
        break
    # Skip blank input rather than sending an empty string to the embedding call.
    if not query.strip():
        continue
    # search_bills embeds the query itself, so no separate gen_embedding call is
    # made here (the earlier extra call was an unused, billable API request).
    print(f"Searching for {query}...")
    bill_text_results = search_bills(query, billTextEmbeddings, chosen_bills)
    _print_results("By Bill Text", bill_text_results)
    bill_summary_results = search_bills(query, billSummaryEmbeddings, chosen_bills)
    _print_results("By Bill Summary", bill_summary_results)


Bill text embed: 100%|██████████| 25/25 [00:06<00:00,  4.09it/s]
Bill summary embed: 100%|██████████| 25/25 [00:07<00:00,  3.47it/s]


Searching for Students...
By Bill Text
1. Students Helping Young Students Act of 2023 (US_118_S_183)	 0.3080
2. Dr. William W. Sullivan TRIO Upward Bound Student Stipend Support Act of 2023 (US_118_HR_6264)	 0.2192
3. Building Youth Workforce Skills Act (US_118_HR_5649)	 0.1837
4. Supporting the teaching of climate change in schools. (US_118_HRES_262)	 0.1790
5. Youth Mental Health Research Act (US_118_S_3060)	 0.1769
By Bill Summary
1. Students Helping Young Students Act of 2023 (US_118_S_183)	 0.3004
2. Building Youth Workforce Skills Act (US_118_HR_5649)	 0.1875
3. Calling on the United States and international donors to prioritize investments in children and youth in development and humanitarian assistance policies, programs, and activities. (US_118_HRES_715)	 0.1678
4. Supporting the teaching of climate change in schools. (US_118_HRES_262)	 0.1668
5. A resolution designating November 2023 as "National Homeless Children and Youth Awareness Month". (US_118_SRES_479)	 0.1637
Searchin