In [1]:
# LOAD ORIGINAL BILLS

import json
from tqdm import tqdm

# Read data from JSON (one JSON object per line).
# Bill fields: 'billId', 'title', 'introducedDate', 'billText', 'crsSummary', 'briefSummary', 'verboseSummary'
MAX_BILL_WORDS = 500  # a bill counts as "short" when billText has at most this many whitespace-separated words
SAMPLE_SIZE = 50  # number of short health bills kept in `first_50`

parsed_data = []
# Explicit UTF-8 so the bill text decodes the same way on every platform
# instead of depending on the locale's default encoding.
with open("data.jsons", "r", encoding="utf-8") as file:
    for line in tqdm(file, desc="Reading bills"):
        # Skip blank lines (e.g. a trailing newline); json.loads would raise on them.
        if not line.strip():
            continue
        json_object = json.loads(line)
        parsed_data.append(json_object)

# Filtering only needs the in-memory list, so it runs after the file is closed.
health_bills = [
    bill
    for bill in parsed_data
    if "health" in bill["title"].lower()
    and len(bill["billText"].split()) <= MAX_BILL_WORDS
]
first_50 = health_bills[:SAMPLE_SIZE]

# Shallow copy of every parsed bill.
# NOTE(review): the name `all` shadows the builtin all() for the rest of the
# kernel session. It is kept because other cells may read it — rename once
# those cells have been checked.
all = list(parsed_data)
print('Read! ✅')

Reading bills: 0it [00:00, ?it/s]

Reading bills: 11902it [00:00, 28106.36it/s]

Read! ✅





In [2]:
from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
import json

# When False, precomputed embeddings are loaded from disk further down in this
# cell instead of being regenerated.
gen_embeddings_from_scratch = False

# Load the OpenAI API key.
# load_dotenv() runs before the lookup so OPENAI_API_KEY can be supplied through
# a .env file (python-dotenv) as well as the real environment.
load_dotenv()
# os.environ.get returns None when the variable is unset, so a missing key is
# passed to OpenAI() as None rather than raising KeyError here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def gen_embedding(text):
    """Embed ``text`` with OpenAI's ``text-embedding-3-large`` model.

    The request goes through the module-level ``client``. The first embedding
    in the response is returned, converted to a NumPy array.
    """
    api_response = client.embeddings.create(
        model="text-embedding-3-large",
        input=text,
    )
    first_item = api_response.data[0]
    return np.array(first_item.embedding)


def load_embeddings_from_file(path="bill_embeddings.jsons"):
    """Load precomputed bill embeddings from a JSON-lines file.

    Every non-blank line is parsed as a JSON object, and its ``billId``,
    ``title`` and ``embedding`` fields are read.

    Args:
        path: Location of the JSON-lines file. Defaults to
            ``"bill_embeddings.jsons"``, the path this function always used.

    Returns:
        A list of ``(billId, title, embedding)`` tuples in file order, where
        ``embedding`` is the JSON value converted with ``np.array``.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the three fields.
    """
    bill_embeddings = []
    # Explicit UTF-8 so titles decode identically regardless of locale.
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            # Tolerate blank lines (such as a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            record = json.loads(line)
            bill_embeddings.append(
                (
                    record["billId"],
                    record["title"],
                    np.array(record["embedding"]),
                )
            )
    return bill_embeddings


# Load the cached embeddings from disk unless regeneration was requested.
# NOTE(review): nothing in this cell assigns `bill_embeddings` when
# gen_embeddings_from_scratch is True, so the search loop below would raise
# NameError unless another cell has already populated it — confirm.
if not gen_embeddings_from_scratch:
    bill_embeddings = load_embeddings_from_file()
    print("Embeddings loaded! ✅")


# Compute cosine similarity
def cosine_sim(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    ``cosine_similarity`` is called with each embedding wrapped in a
    one-element list, and the ``[0][0]`` entry of its result is returned.
    """
    pairwise = cosine_similarity([embedding1], [embedding2])
    first_row = pairwise[0]
    return first_row[0]


def search_bills(query, bill_embeddings, parsed_data, top_k=5):
    """Rank bills by embedding similarity to a free-text query.

    Args:
        query: Text to embed with ``gen_embedding``.
        bill_embeddings: Iterable of ``(bill_id, title, embedding)`` tuples.
        parsed_data: Iterable of bill dicts, each with a ``"billId"`` key.
        top_k: Maximum number of results to return. Defaults to 5, the
            previously hard-coded limit. Values below 1 yield an empty list.

    Returns:
        A list of ``(bill, similarity)`` tuples sorted by descending
        similarity, containing at most ``top_k`` entries. Embeddings whose id
        has no matching bill in ``parsed_data`` are skipped.
    """
    query_embedding = gen_embedding(query)

    # Index the bills by id once instead of rescanning parsed_data for every
    # embedding. setdefault keeps the first bill seen for a repeated id, which
    # is the one the original linear scan (with its break) matched.
    # Assumes billId values are hashable (e.g. JSON strings or numbers).
    bills_by_id = {}
    for bill in parsed_data:
        bills_by_id.setdefault(bill["billId"], bill)

    similarities = []
    for bill_id, _title, embedding in bill_embeddings:
        bill = bills_by_id.get(bill_id)
        if bill is None:
            # No bill carries this id; nothing to report for this embedding.
            continue
        similarities.append((bill, cosine_sim(query_embedding, embedding)))

    # Stable sort: ties keep their order of appearance in bill_embeddings.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    # Return the most relevant bills with their entire objects.
    return similarities[: max(top_k, 0)]


# Interactive search with top 5 ranking.
# Keeps prompting until the user types "exit"; `results` holds the most recent
# ranking once the loop ends.
results = []
while True:
    query = input("Enter a query: ")
    if query == "exit":
        break
    if not query.strip():
        # Nothing to embed, and str.count("") would report len(text) + 1
        # "occurrences", so prompt again instead of printing misleading counts.
        continue
    results = search_bills(query, bill_embeddings, parsed_data)
    # Lower-case the query once; the counts below are case-insensitive matches
    # of the whole query string, not of its individual words.
    needle = query.lower()
    print(f"Query: {query}")
    print("Top 5 results:")
    for rank, (bill, similarity) in enumerate(results, start=1):
        # The message previously ended with a stray ")" on its own line; it is
        # dropped here so each entry ends with a blank separator line.
        print(
            f"{rank}. {bill['title']} (similarity: {similarity:.2f}) \n"
            f"            number of occurrences of keyword in bill text: {bill['billText'].lower().count(needle)}\n"
            f"            number of occurrences of keyword in bill summary: {bill['crsSummary'].lower().count(needle)}\n"
            f"            number of occurrences of keyword in bill title: {bill['title'].lower().count(needle)}\n"
        )

Embeddings loaded! ✅
Query: school shooting gun violence
Top 5 results:
1. Identifying Mass Shooters Act (similarity: 0.52) 
            number of occurrences of keyword in bill text: 0
            number of occurrences of keyword in bill summary: 0
            number of occurrences of keyword in bill title: 0
)
2. Expressing support for the designation of June 2, 2023, as "National Gun Violence Awareness Day" and June 2023 as "National Gun Violence Awareness Month". (similarity: 0.51) 
            number of occurrences of keyword in bill text: 0
            number of occurrences of keyword in bill summary: 0
            number of occurrences of keyword in bill title: 0
)
3. Condemning the horrific shootings that occurred in Louisville, Kentucky, on April 10, 2023, in the Old National Bank building, and on April 15, 2023, in Chickasaw Park, honoring the memory of the victims of the attacks, expressing condolences and support to all those impacted by these tragedies, and reaffirming the

In [9]:
# generate embeddings for 

Health CARE Training Act
