In [1]:
import json
from tqdm import tqdm

# Read data from JSON
parsed_data = []
first_50 = []
all = []
# Bill fields: 'billId', 'title', 'introducedDate', 'billText', 'crsSummary', 'briefSummary', 'verboseSummary'
with open("data.jsons", "r") as file:
    for line in tqdm(file, desc="Reading bills"):
        json_object = json.loads(line)
        parsed_data.append(json_object)
    health_bills = [
        bill
        for bill in parsed_data
        if "health" in bill["title"].lower() and len(bill["billText"].split()) <= 500
    ]
    first_50 = health_bills[:50]
    all = [ bill for bill in parsed_data]
print('Read! ✅')

Reading bills: 11902it [00:00, 37455.80it/s]

Read! ✅





In [2]:
from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity

gen_embeddings_from_scratch = False

# load openAI API key
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def gen_embedding(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-large",
    )
    return np.array(response.data[0].embedding)


def gen_bill_embeddings(bills):
    bill_embeddings = []
    for bill in tqdm(bills, desc="Generating embeddings"):
        embedding = gen_embedding(bill["verboseSummary"])
        bill_embeddings.append((bill["billId"], bill["title"], embedding))
    return bill_embeddings


def load_embeddings_from_file():
    with open("bill_embeddings.jsons", "r") as file:
        bill_embeddings = []
        for line in file:
            json_object = json.loads(line)
            bill_embeddings.append(
                (
                    json_object["billId"],
                    json_object["title"],
                    np.array(json_object["embedding"]),
                )
            )
        return bill_embeddings


# either gen your embeddings
if gen_embeddings_from_scratch:
    bill_embeddings = gen_bill_embeddings(all)
    print("Embeddings generated! ✅")
    # output embeddings to file
    with open("bill_embeddings.jsons", "w") as file:
        for bill_id, title, embedding in bill_embeddings:
            json.dump(
                {"billId": bill_id, "title": title, "embedding": embedding.tolist()},
                file,
            )
            file.write("\n")

# or load them
if not gen_embeddings_from_scratch:
    bill_embeddings = load_embeddings_from_file()
    print("Embeddings loaded! ✅")

# Compute cosine similarity
def cosine_sim(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

# Search function
def search_bills(query, bill_embeddings):
    query_embedding = gen_embedding(query)
    similarities = [
        (bill_id, title, cosine_sim(query_embedding, embedding))
        for bill_id, title, embedding in bill_embeddings
    ]
    # Sort by similarity
    similarities.sort(key=lambda x: x[2], reverse=True)
    # Return the most relevant bill
    return similarities[0:5]


# Interactive search with top 5 ranking
while True:
    query = input("Enter a query: ")
    if query == "exit":
        break
    results = search_bills(query, bill_embeddings)
    print(f"Query: {query}")
    print("Top 5 results:")
    for i, (bill_id, title, similarity) in enumerate(results):
        print(f"{i+1}. {title} (similarity: {similarity:.2f})")

Embeddings loaded! ✅
Query: how is life my guy
Top 5 results:
1. Life.Gov Act (similarity: 0.18)
2. Fentanyl Trafficker Elimination Act (similarity: 0.16)
3. Honoring the life and legacy of Jack Trice. (similarity: 0.15)
4. Expressing congratulations to T.J. Hopkins for his promotion to the Cincinnati Reds. (similarity: 0.14)
5. Smarter Sentencing Act of 2023 (similarity: 0.14)
Query: 
Top 5 results:
1. Condemning antisemitism on college campuses. (similarity: 0.17)
2. Condemning antisemitism on college campuses. (similarity: 0.16)
3. Equality Act (similarity: 0.16)
4. SALT Deductibility Act (similarity: 0.16)
5. Reaffirming the support of the United States to our strongest ally in the region, Israel, and recognizing the authoritarian and extremist regime of the Islamic Republic of Iran as a threat to Israel, the region, the United States, and global stability. (similarity: 0.16)
