In [1]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import weaviate.classes.config as wc
import os
import json


In [None]:
# !!!! INSERT YOUR KEYS AND URL HERE !!!!
#
# Connection settings. Each value is read from an environment variable so that
# secrets do not have to be saved inside the notebook:
#   WCD_URL        -> wcd_url        (Weaviate Cloud cluster URL)
#   WCD_API_KEY    -> wcd_api_key    (Weaviate Cloud API key)
#   COHERE_API_KEY -> cohere_api_key (Cohere API key)
# When a variable is unset the value falls back to '' (the previous default);
# you can still replace the fallback by hand, but avoid committing real keys.
wcd_url = os.environ.get('WCD_URL', '')
wcd_api_key = os.environ.get('WCD_API_KEY', '')
cohere_api_key = os.environ.get('COHERE_API_KEY', '')


In [3]:
# Open the connection to the Weaviate Cloud cluster.
# - wcd_url / wcd_api_key identify and authenticate against the cluster.
# - cohere_api_key is forwarded in the "X-Cohere-Api-Key" request header.
wcd_credentials = Auth.api_key(wcd_api_key)
cohere_headers = {"X-Cohere-Api-Key": cohere_api_key}

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,
    auth_credentials=wcd_credentials,
    headers=cohere_headers,
)

# Optional sanity check: client.is_ready() should return True.
# When you are done with the notebook, call client.close() to free up resources.

In [4]:
# Create the "Transcripts" collection, or reuse it when it already exists so
# this cell can be re-run safely (e.g. after Restart Kernel -> Run All).
# To apply a changed schema, delete the existing collection first with
# client.collections.delete("Transcripts") -- note that this also removes its data.
collection_name = "Transcripts"

if client.collections.exists(collection_name):
    print(f"Collection '{collection_name}' already exists; reusing it.")
    transcripts_collection = client.collections.get(collection_name)
else:
    transcripts_collection = client.collections.create(
        name=collection_name,
        properties=[
            wc.Property(name="text", data_type=wc.DataType.TEXT),
            wc.Property(name="start", data_type=wc.DataType.NUMBER),
            wc.Property(name="duration", data_type=wc.DataType.NUMBER),
            wc.Property(name="video_id", data_type=wc.DataType.TEXT),
            wc.Property(name="state", data_type=wc.DataType.TEXT)
        ],
        # Vectorizer module: Cohere text embeddings (uses the Cohere API key header)
        vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(),
        # Generative module: Cohere
        generative_config=wc.Configure.Generative.cohere()
    )

# Display the collection handle as the cell output.
transcripts_collection

<weaviate.collections.collection.sync.Collection at 0x10ab79d10>

In [5]:
# Import every transcript JSON file found under json_dir/<state>/ into the
# "Transcripts" collection. Each file is expected to hold a JSON list of dicts;
# the "text", "start" and "duration" keys are copied and the state name is taken
# from the directory name.
json_dir = "../States"

# Abort the whole import once more than this many files have failed to process.
max_file_errors = 10

# Fetch the collection handle once; it does not change between states.
transcripts = client.collections.get("Transcripts")
state_dirs = [d for d in os.listdir(json_dir) if os.path.isdir(os.path.join(json_dir, d))]

error_count = 0
import_aborted = False
for state in state_dirs:
    state_dir = os.path.join(json_dir, state)
    print(f"Processing state: {state}")

    # One dynamic batch per state; leaving the `with` block ends the batch.
    with transcripts.batch.dynamic() as batch:
        for filename in os.listdir(state_dir):
            if not filename.endswith('.json'):
                continue

            file_path = os.path.join(state_dir, filename)
            try:
                # Explicit UTF-8 so decoding does not depend on the platform default.
                # Opening inside the try means an unreadable file is counted as an
                # error instead of crashing the whole import.
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                if isinstance(data, list):
                    for item in data:
                        if isinstance(item, dict):
                            # NOTE(review): the schema declares a "video_id" property
                            # but nothing here populates it -- confirm where the id
                            # should come from (file name? a key in the JSON?).
                            batch.add_object({
                                "text": item.get("text", ""),
                                "start": item.get("start", 0),
                                "duration": item.get("duration", 0),
                                "state": state  # Add the state name from the directory
                            })
                        else:
                            print(f"Skipping invalid entry in {filename}: {item}")
                else:
                    print(f"Unexpected JSON structure in {filename}: {type(data)}")
            except Exception as e:
                print(f"Failed to process {filename}: {e}")
                error_count += 1
                if error_count > max_file_errors:
                    print("Batch import stopped due to excessive errors.")
                    # Remember the abort so the outer (state) loop stops as well;
                    # a bare `break` would only leave the per-file loop.
                    import_aborted = True
                    break

    # Report objects the batch could not import for this state.
    failed_objects = transcripts.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports for {state}: {len(failed_objects)}")
        print(f"First failed object: {failed_objects[0]}")

    if import_aborted:
        break

if import_aborted:
    print("Batch upload aborted before all states were processed.")
else:
    print("Batch upload completed.")


Processing state: Mississippi
Processing state: Arkansas
Processing state: Louisiana
Batch upload completed.
