This notebook is mostly run on SageMaker Studio. Running it locally is not recommended because it requires a lot of memory.

In [12]:
from datetime import datetime
import pandas as pd
import requests
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from InstructorEmbedding import INSTRUCTOR

import weaviate
import json
import os
from dotenv import load_dotenv


# Load variables from a .env file into the process environment.
load_dotenv()

# Timestamp of this run, formatted as YYYYMMDDHHMMSS.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

# Connection settings; each is None when the variable is not set.
weaviate_url = os.getenv('WEAVIATE_URL')
weaviate_api_key = os.getenv('WEAVIATE_API_KEY')
openai_key = os.getenv('OPENAI_API_KEY')

# The OpenAI key is sent to Weaviate as an extra request header.
openai_headers = {"X-OpenAI-Api-Key": openai_key}
weaviate_auth = weaviate.AuthApiKey(api_key=weaviate_api_key)

client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate_auth,
    additional_headers=openai_headers,
)

  from .autonotebook import tqdm as notebook_tqdm
            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm import tqdm

combined_df = pd.read_csv("complete_cleaned_full_text.csv")

model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L12-cos-v5')
# model = INSTRUCTOR('hkunlp/instructor-large')
# instruction = "Represent the legislation bills for retrieval:"

# Show the model's configured limit before overriding it.
print("Max Sequence Length:", model.max_seq_length)
model.max_seq_length = 512

# Embed each text column with one batched encode() call instead of one call
# per row: the encoder batches internally, which is much faster than
# encoding cell by cell from inside iterrows().
for field in ['BillText', 'statesummary', 'ShortBillName']:
    has_text = combined_df[field].notna()
    # Ensure every value handed to the encoder is a string.
    texts = [str(value) for value in combined_df.loc[has_text, field]]
    encoded = model.encode(texts, show_progress_bar=True).tolist() if texts else []
    remaining = iter(encoded)

    # Rows with a missing value get an empty list, in original row order.
    combined_df[f'{field}_vector'] = pd.Series(
        [next(remaining) if present else [] for present in has_text],
        index=combined_df.index,
        dtype='object',
    )

# NOTE: CSV stores the list-valued vector columns as text, so reading this
# file back with read_csv yields strings rather than lists.
combined_df.to_csv("combined_df.csv", index=False)

In [None]:
import numpy as np
import time

def clean_vector(vector):
    """Return ``vector`` as a plain list with NaN/Inf entries replaced by 0.

    Args:
        vector: A list, tuple or 1-D numpy array of numbers. Any other value
            (``None``, a string, a scalar, a multi-dimensional array) is
            treated as "no vector". Strings are not parsed, so a vector that
            was serialised to text yields an empty list.

    Returns:
        list: The elements of ``vector`` with every NaN or +/-Inf replaced by
        ``0``; an empty list when ``vector`` is not a supported sequence.

    Raises:
        TypeError: If an element is not numeric (raised by ``np.isfinite``).
    """
    if isinstance(vector, np.ndarray):
        # Accept arrays directly instead of silently discarding them.
        if vector.ndim != 1:
            return []
        vector = vector.tolist()
    elif isinstance(vector, tuple):
        vector = list(vector)
    elif not isinstance(vector, list):
        return []

    # isfinite is False for exactly NaN, +Inf and -Inf.
    return [x if np.isfinite(x) else 0 for x in vector]

# Sanitise each embedding column, replacing it with its cleaned version.
for vector_column in ('BillText_vector', 'statesummary_vector', 'ShortBillName_vector'):
    cleaned_column = combined_df[vector_column].apply(clean_vector)
    combined_df[vector_column] = cleaned_column
    
    
# Drop any existing "Legislation" class so it is created fresh below.
legislation_exists = client.schema.exists("Legislation")
if legislation_exists:
    client.schema.delete_class("Legislation")

# Class definition for pre-vectorized data: no server-side vectorizer, and the
# `generative-openai` module is configured for generative queries.
generative_modules = {"generative-openai": {}}
class_obj = {
    "class": "Legislation",
    "vectorizer": "none",
    "moduleConfig": generative_modules,
}

client.schema.create_class(class_obj)


from tqdm import tqdm
import time

# for pre-vectorized data
client.batch.configure(batch_size=100)

# Columns copied into each object's properties.
property_columns = [
    'BillID', 'StateCode', 'StateBillID', 'ShortBillName', 'Created',
    'SponsorParty', 'billtype', 'status', 'CommitteeCategories',
    'statesummary', 'BillText',
]

with client.batch as batch:
    # Wrap combined_df.iterrows() with tqdm for a progress bar
    for index, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
        try:
            bill_text_vector = clean_vector(row.get('BillText_vector', []))

            # Missing cells show up as float NaN, which is not valid JSON.
            # Leave those properties out rather than sending NaN.
            properties = {
                column: row[column]
                for column in property_columns
                if not (isinstance(row[column], float) and np.isnan(row[column]))
            }

            # An empty list means there is no usable embedding; pass None so
            # the object is added without a vector instead of a zero-length one.
            batch.add_data_object(
                properties,
                "Legislation",
                vector=bill_text_vector or None,
            )

        except Exception as e:
            # Log the error and skip this record
            print(f"Skipping record at index {index} due to error: {e}")



In [4]:
import pandas as pd
# Reload the saved frame from disk.
combined_csv_path = "../data_storage/legislation/combined_df.csv"
combined_df = pd.read_csv(combined_csv_path)

In [11]:
# Show the row(s) whose StateBillID is "SB728".
combined_df.loc[combined_df['StateBillID'].eq("SB728")]

Unnamed: 0,BillID,StateCode,StateBillID,ShortBillName,Created,SponsorParty,billtype,status,CommitteeCategories,statesummary,BillText,BillText_vector,statesummary_vector,ShortBillName_vector
4798,1652788,MO,SB728,Creates provisions relating to public elementa...,2023-12-07 03:20:37.230000000,R,Bill,In Committee,Education,"""AN ACT To amend chapters 161, 170, and 171, R...","""Missouri MO SB 728 MO SB728 MOSB728 MO SB 728...","[0.02366095595061779, -0.04649743437767029, -0...","[0.0017390374559909105, -0.004405404906719923,...","[0.002935373457148671, -0.001205891720019281, ..."
