In [5]:
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import weaviate.classes.config as wc
import os
import json
import re


In [None]:
'''!!!!INSERT YOUR KEYS AND URL HERE!!!!'''
from dotenv import load_dotenv

load_dotenv()

wcd_url = os.getenv('WEAVIATE_URL')
wcd_api_key = os.getenv('WEAVIATE_KEY')
cohere_api_key = os.getenv('COHERE_KEY')


In [None]:
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcd_url,                                    
    auth_credentials=Auth.api_key(wcd_api_key),             
    headers={"X-Cohere-Api-Key": cohere_api_key},           
)

# print(client.is_ready())  # Should print: `True`

# client.close()  # Free up resources

In [15]:
# Do not run - Collection exists

client.collections.create(
    name="TranscriptsV2",
    properties=[
        wc.Property(name="text", data_type=wc.DataType.TEXT),
        wc.Property(name="start", data_type=wc.DataType.NUMBER),
        wc.Property(name="duration", data_type=wc.DataType.NUMBER),
        wc.Property(name="video_id", data_type=wc.DataType.TEXT),
        wc.Property(name="state", data_type=wc.DataType.TEXT)
    ],
    # Configure the vectorizer module
    vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(),
    # Configure the generative module
    generative_config=wc.Configure.Generative.cohere()
)

<weaviate.collections.collection.sync.Collection at 0x10c79df70>

In [9]:
# %%
def combine_segments(data, window_size=3, video_id="unknown"):
    """
    Groups transcript segments into larger chunks for better retrieval.
    
    Args:
        data (list): List of transcript segments (dict with 'text', 'start', 'duration').
        window_size (int): Number of consecutive segments to merge.
        video_id (str): Identifier for the video (YouTube video ID).
    
    Returns:
        list: List of combined transcript chunks.
    """
    combined_segments = []
    for i in range(0, len(data), window_size):
        window = data[i:i + window_size]

        # Filter out empty or single-word segments
        filtered_text = [seg["text"] for seg in window if len(seg["text"].split()) > 1]

        if filtered_text:
            combined_text = " ".join(filtered_text)  # Merge text from multiple segments
            combined_segments.append({
                "text": combined_text,
                "start": window[0]["start"],  # Start time of the first segment in the window
                "duration": sum(seg["duration"] for seg in window),  # Total duration of the window
                "video_id": video_id  # Ensure video_id is included
            })

    return combined_segments

# %%
# Function to extract video ID from filename
def extract_video_id(filename):
    # Common pattern for YouTube video IDs in filenames
    video_id_match = re.search(r'([a-zA-Z0-9_-]{11})', filename)
    if video_id_match:
        return video_id_match.group(1)
    return "unknown"

In [None]:
# json_dir = "../States"

# transcripts = client.collections.get("Transcripts")
# state_dirs = [d for d in os.listdir(json_dir) if os.path.isdir(os.path.join(json_dir, d))]


# error_count = 0
# for state in state_dirs:
#     state_dir = os.path.join(json_dir, state)
#     print(f"Processing state: {state}")

#     # Get the transcripts collection
#     transcripts = client.collections.get("Transcripts")

#     with transcripts.batch.dynamic() as batch:
#         for filename in os.listdir(state_dir):
#             if filename.endswith('.json'):
#                 file_path = os.path.join(state_dir, filename)
#                 with open(file_path, "r") as f:
#                     try:
#                         data = json.load(f)
#                         if isinstance(data, list):
#                             for item in data:
#                                 if isinstance(item, dict):
#                                     batch.add_object({
#                                         "text": item.get("text", ""),
#                                         "start": item.get("start", 0),
#                                         "duration": item.get("duration", 0),
#                                         "state": state  # Add the state name from the directory
#                                     })
#                                 else:
#                                     print(f"Skipping invalid entry in {filename}: {item}")
#                         else:
#                             print(f"Unexpected JSON structure in {filename}: {type(data)}")
#                     except Exception as e:
#                         print(f"Failed to process {filename}: {e}")
#                         error_count += 1
#                         if error_count > 10:
#                             print("Batch import stopped due to excessive errors.")
#                             break

#     failed_objects = transcripts.batch.failed_objects
#     if failed_objects:
#         print(f"Number of failed imports for {state}: {len(failed_objects)}")
#         print(f"First failed object: {failed_objects[0]}")

# print("Batch upload completed.")


Processing state: Mississippi
Processing state: Arkansas
Processing state: Louisiana
Batch upload completed.


In [13]:
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

Current working directory: /Users/petersapountzis/Desktop/tulane/spring2025/cmps4010/Entergy-AI/RAG


In [16]:
transcripts = client.collections.get("TranscriptsV2")


json_dir = "/Users/petersapountzis/Desktop/tulane/spring2025/cmps4010/Entergy-AI/parsers"  # Root directory for parsers
transcript_dirs = [
    os.path.join(json_dir, "ARK_PSC_transcripts"),
    os.path.join(json_dir, "LA_PSC_transcripts"),
    os.path.join(json_dir, "MISS_PSC_transcripts")
]

window_size = 3  # Number of segments to combine
error_count = 0
for transcript_dir in transcript_dirs:
    # Extract state from directory name
    state_match = re.search(r'([A-Z]+)_PSC', os.path.basename(transcript_dir))
    state = state_match.group(1) if state_match else "unknown"
    
    print(f"Processing directory: {transcript_dir}")
    
    with transcripts.batch.dynamic() as batch:
        for filename in os.listdir(transcript_dir):
            if filename.endswith('.json'):
                file_path = os.path.join(transcript_dir, filename)
                try:
                    with open(file_path, "r") as f:
                        data = json.load(f)
                        
                        # Extract video ID from filename
                        video_id = os.path.splitext(filename)[0]
                        
                        if isinstance(data, list):
                            # Combine segments for better context
                            combined_data = combine_segments(data, window_size, video_id)
                            
                            for item in combined_data:
                                if isinstance(item, dict):
                                    # Add state to the object
                                    item["state"] = state
                                    batch.add_object(item)
                                else:
                                    print(f"Skipping invalid entry in {filename}: {item}")
                        else:
                            print(f"Unexpected JSON structure in {filename}: {type(data)}")
                except Exception as e:
                    print(f"Failed to process {filename}: {e}")
                    error_count += 1
                    if error_count > 10:
                        print("Batch import stopped due to excessive errors.")
                        break

    failed_objects = transcripts.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports for {state}: {len(failed_objects)}")
        print(f"First failed object: {failed_objects[0]}")

print("Batch upload completed.")


Processing directory: /Users/petersapountzis/Desktop/tulane/spring2025/cmps4010/Entergy-AI/parsers/ARK_PSC_transcripts
Processing directory: /Users/petersapountzis/Desktop/tulane/spring2025/cmps4010/Entergy-AI/parsers/LA_PSC_transcripts
Processing directory: /Users/petersapountzis/Desktop/tulane/spring2025/cmps4010/Entergy-AI/parsers/MISS_PSC_transcripts
Batch upload completed.
