In [3]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import weaviate.classes.config as wc
import os
import json
import re


In [None]:
'''!!!!INSERT YOUR KEYS AND URL HERE!!!!'''
from dotenv import load_dotenv

load_dotenv()

# wcd_url = os.getenv('WEAVIATE_URL')
# wcd_api_key = os.getenv('WEAVIATE_KEY')
# cohere_api_key = os.getenv('COHERE_KEY')

wcd_url = ''
wcd_api_key = ''
cohere_api_key = ''


In [73]:
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                                    
    auth_credentials=Auth.api_key(wcd_api_key),             
    headers={"X-Cohere-Api-Key": cohere_api_key},           
)

# print(client.is_ready())  # Should print: `True`

# client.close()  # Free up resources

In [15]:
# Do not run - Collection exists

client.collections.create(
    name="TranscriptsV2",
    properties=[
        wc.Property(name="text", data_type=wc.DataType.TEXT),
        wc.Property(name="start", data_type=wc.DataType.NUMBER),
        wc.Property(name="duration", data_type=wc.DataType.NUMBER),
        wc.Property(name="video_id", data_type=wc.DataType.TEXT),
        wc.Property(name="state", data_type=wc.DataType.TEXT)
    ],
    # Configure the vectorizer module
    vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(),
    # Configure the generative module
    generative_config=wc.Configure.Generative.cohere()
)

<weaviate.collections.collection.sync.Collection at 0x10c79df70>

In [9]:
# %%
def combine_segments(data, window_size=3, video_id="unknown"):
    """
    Groups transcript segments into larger chunks for better retrieval.
    
    Args:
        data (list): List of transcript segments (dict with 'text', 'start', 'duration').
        window_size (int): Number of consecutive segments to merge.
        video_id (str): Identifier for the video (YouTube video ID).
    
    Returns:
        list: List of combined transcript chunks.
    """
    combined_segments = []
    for i in range(0, len(data), window_size):
        window = data[i:i + window_size]

        # Filter out empty or single-word segments
        filtered_text = [seg["text"] for seg in window if len(seg["text"].split()) > 1]

        if filtered_text:
            combined_text = " ".join(filtered_text)  # Merge text from multiple segments
            combined_segments.append({
                "text": combined_text,
                "start": window[0]["start"],  # Start time of the first segment in the window
                "duration": sum(seg["duration"] for seg in window),  # Total duration of the window
                "video_id": video_id  # Ensure video_id is included
            })

    return combined_segments

# %%
# Function to extract video ID from filename
def extract_video_id(filename):
    # Common pattern for YouTube video IDs in filenames
    video_id_match = re.search(r'([a-zA-Z0-9_-]{11})', filename)
    if video_id_match:
        return video_id_match.group(1)
    return "unknown"

In [None]:
from datetime import datetime

# directory = "TX/combined"
directory = ''
os.listdir(directory)
for filename in os.listdir(directory):
    if filename.endswith('.json'):
        src = os.path.join(directory, filename)
        newname = "TEXAS_PSC_" + filename[:-16] + ".json"
        dst = os.path.join(directory, newname)
        print(f"Renaming\n  {src}\n→ {dst}")
        os.rename(src, dst)
    # print(filename[:-5])
print("Renaming Complete!")

Renaming
  TX/combined/2025-01-31_transcript.json
→ TX/combined/TEXAS_PSC_2025-01-31.json
Renaming
  TX/combined/2024-04-25_transcript.json
→ TX/combined/TEXAS_PSC_2024-04-25.json
Renaming
  TX/combined/2024-12-19_transcript.json
→ TX/combined/TEXAS_PSC_2024-12-19.json
Renaming
  TX/combined/2025-02-20_transcript.json
→ TX/combined/TEXAS_PSC_2025-02-20.json
Renaming
  TX/combined/2024-07-25_transcript.json
→ TX/combined/TEXAS_PSC_2024-07-25.json
Renaming
  TX/combined/2024-08-15_transcript.json
→ TX/combined/TEXAS_PSC_2024-08-15.json
Renaming
  TX/combined/2024-05-02_transcript.json
→ TX/combined/TEXAS_PSC_2024-05-02.json
Renaming
  TX/combined/2024-08-29_transcript.json
→ TX/combined/TEXAS_PSC_2024-08-29.json
Renaming
  TX/combined/2024-06-13_transcript.json
→ TX/combined/TEXAS_PSC_2024-06-13.json
Renaming
  TX/combined/2024-10-03_transcript.json
→ TX/combined/TEXAS_PSC_2024-10-03.json
Renaming
  TX/combined/2025-02-13_transcript.json
→ TX/combined/TEXAS_PSC_2025-02-13.json
Renaming
 

In [71]:
json_dir = '../../States/ArkansasComb/2'

print(f"Processing files in directory: {json_dir}")

# Get the transcripts collection
transcripts = client.collections.get("TranscriptsV2")

with transcripts.batch.dynamic() as batch:
    for filename in os.listdir(json_dir):
        if not filename.endswith(".json"):
            continue

        file_path = os.path.join(json_dir, filename)
        with open(file_path, "r") as f:
            try:
                data = json.load(f)
                if isinstance(data, list):
                    for item in data:
                        if isinstance(item, dict):
                            batch.add_object({
                                "text":     item.get("text", ""),
                                "start":    item.get("start", 0),
                                "duration": item.get("duration", 0),
                                "state": "ARK",
                                "video_id": filename[:-5]
                            })
                        else:
                            print(f"Skipping invalid entry in {filename}: {item}")
                else:
                    print(f"Unexpected JSON structure in {filename}: {type(data)}")
            except Exception as e:
                print(f"Failed to process {filename}: {e}")
                error_count += 1
                if error_count > 10:
                    print("Batch import stopped due to excessive errors.")
                    break

# After the with‐block, inspect any failures
failed = transcripts.batch.failed_objects
if failed:
    print(f"Number of failed imports: {len(failed)}")
    print(f"First failed object: {failed[0]}")

print("Batch upload completed.")
client.close()

Processing files in directory: ../../States/ArkansasComb/2
Batch upload completed.


In [None]:
from weaviate.classes.query import Filter

## Helper function that finds and lists count of objects under a specific key in the collection

collection = client.collections.get("TranscriptsV2")
state = "ARK"
resp = collection.aggregate.over_all(
    total_count=True,
    filters=Filter.by_property("state").equal(state)
) 

print(f"Total {state} objects:", resp.total_count)

client.close()


WeaviateClosedClientError: The `WeaviateClient` is closed. Run `client.connect()` to (re)connect!

In [None]:
## More complex filtering process. Helper function to check uploads

# build each individual filter
state_filter  = Filter.by_property("state").equal("NOLA")
docket_filter = Filter.by_property("video_id").like("Commission Hearing in Docket No")

compound = (
    Filter.by_property("state").equal("NOLA")
    & Filter.by_property("video_id").like("Commission Hearing in Docket No")
)


collection.data.delete_many(where=compound)


# fetch all matching objects (page through if more than `limit`)
resp = collection.query.fetch_objects(
    filters=compound,
    limit=10000
)



print(f"Found {len(resp.objects)} matching records:")
for obj in resp.objects:
    print(" •", obj.uuid, obj.properties["video_id"])



Found 0 matching records:


In [None]:
json_dir = "../../States/Done"

transcripts = client.collections.get("TranscriptsV2")
state_dirs = [d for d in os.listdir(json_dir) if os.path.isdir(os.path.join(json_dir, d))]


error_count = 0
for state in state_dirs:
    state_dir = os.path.join(json_dir, state)
    print(f"Processing state: {state}")

    # Get the transcripts collection
    transcripts = client.collections.get("TranscriptsV2")

    with transcripts.batch.dynamic() as batch:
        for filename in os.listdir(state_dir):
            if filename.endswith('.json'):
                file_path = os.path.join(state_dir, filename)
                with open(file_path, "r") as f:
                    try:
                        data = json.load(f)
                        if isinstance(data, list):
                            for item in data:
                                if isinstance(item, dict):
                                    batch.add_object({
                                        "text": item.get("text", ""),
                                        "start": item.get("start", 0),
                                        "duration": item.get("duration", 0),
                                        "state": state  # Add the state name from the directory
                                    })
                                else:
                                    print(f"Skipping invalid entry in {filename}: {item}")
                        else:
                            print(f"Unexpected JSON structure in {filename}: {type(data)}")
                    except Exception as e:
                        print(f"Failed to process {filename}: {e}")
                        error_count += 1
                        if error_count > 10:
                            print("Batch import stopped due to excessive errors.")
                            break

    failed_objects = transcripts.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports for {state}: {len(failed_objects)}")
        print(f"First failed object: {failed_objects[0]}")

print("Batch upload completed.")


Processing state: NOLA




Batch upload completed.


In [4]:
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

Current working directory: /Users/jackzemke/Desktop/Entergy/Entergy-AI/RAG


In [None]:
transcripts = client.collections.get("TranscriptsV2")


json_dir = "/Users/jackzemke/Desktop/Entergy/Entergy-AI/RAG"  # Root directory for parsers
transcript_dirs = [
    os.path.join(json_dir, "NOLA_transcripts")
]

window_size = 3  # Number of segments to combine
error_count = 0
for transcript_dir in transcript_dirs:
    # Extract state from directory name
    state_match = re.search(r'([A-Z]+)_PSC', os.path.basename(transcript_dir))
    # state = state_match.group(1) if state_match else "unknown"
    state = "NOLA"
    
    print(f"Processing directory: {transcript_dir}")
    
    with transcripts.batch.dynamic() as batch:
        for filename in os.listdir(transcript_dir):
            if filename.endswith('.json'):
                file_path = os.path.join(transcript_dir, filename)
                try:
                    with open(file_path, "r") as f:
                        data = json.load(f)
                        
                        # Extract video ID from filename
                        video_id = os.path.splitext(filename)[0]
                        
                        if isinstance(data, list):
                            # Combine segments for better context
                            combined_data = combine_segments(data, window_size, video_id)
                            
                            for item in combined_data:
                                if isinstance(item, dict):
                                    # Add state to the object
                                    item["state"] = state
                                    batch.add_object(item)
                                else:
                                    print(f"Skipping invalid entry in {filename}: {item}")
                        else:
                            print(f"Unexpected JSON structure in {filename}: {type(data)}")
                except Exception as e:
                    print(f"Failed to process {filename}: {e}")
                    error_count += 1
                    if error_count > 10:
                        print("Batch import stopped due to excessive errors.")
                        break

    failed_objects = transcripts.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports for {state}: {len(failed_objects)}")
        print(f"First failed object: {failed_objects[0]}")

print("Batch upload completed.")


Processing directory: /Users/petersapountzis/Desktop/tulane/spring2025/cmps4010/Entergy-AI/parsers/ARK_PSC_transcripts
Processing directory: /Users/petersapountzis/Desktop/tulane/spring2025/cmps4010/Entergy-AI/parsers/LA_PSC_transcripts
Processing directory: /Users/petersapountzis/Desktop/tulane/spring2025/cmps4010/Entergy-AI/parsers/MISS_PSC_transcripts
Batch upload completed.
