In [1]:
import weaviate
from weaviate.classes.init import Auth
import json
import os
import anthropic
import time
# from tqdm import tqdm  # For progress tracking
from anthropic import Anthropic
import dotenv

In [19]:
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

weaviate_url = os.getenv('WEAVIATE_URL')
weaviate_key = os.getenv('WEAVIATE_KEY')
openai_key = os.getenv('OPENAI_KEY')
anthropic_key = os.getenv('ANTHROPIC_KEY')
hugging_face_key = os.getenv('HUGGING_FACE_KEY')



In [24]:
# Setup clients

weaviate_client.close()  # ✅ Close the connection
weaviate_client = None   # ✅ Reset the client

weaviate_client = weaviate.connect_to_weaviate_cloud(
                cluster_url=weaviate_url, 
                auth_credentials=Auth.api_key(weaviate_key),
                headers={
        "X-OpenAI-Api-Key": openai_key,  # ✅ OpenAI key
        # "X-HuggingFace-Api-Key": hugging_face_key  # ✅ Alternative header
        "Authorization": f"Bearer hugging_face_key"  # ✅ Hugging Face API token
    })
claude_client = anthropic.Client(api_key=anthropic_key)

In [9]:
def combine_segments(data, window_size=3, video_id="unknown"):
    """
    Groups transcript segments into larger chunks for better retrieval.
    
    Args:
        data (list): List of transcript segments (dict with 'text', 'start', 'duration').
        window_size (int): Number of consecutive segments to merge.
        video_id (str): Identifier for the video (YouTube video ID).
    
    Returns:
        list: List of combined transcript chunks.
    """
    combined_segments = []
    for i in range(0, len(data), window_size):
        window = data[i:i + window_size]

        # Filter out empty or single-word segments
        filtered_text = [seg["text"] for seg in window if len(seg["text"].split()) > 1]

        if filtered_text:
            combined_text = " ".join(filtered_text)  # Merge text from multiple segments
            combined_segments.append({
                "text": combined_text,
                "start": window[0]["start"],  # Start time of the first segment in the window
                "duration": sum(seg["duration"] for seg in window),  # Total duration of the window
                "video_id": video_id  # Ensure video_id is included
            })

    return combined_segments

In [22]:
from weaviate.classes.config import Property, DataType, Configure


# Define schema for "LATranscript" collection
if "LATranscript" not in weaviate_client.collections.list_all():
    weaviate_client.collections.create(
        name="PSC_Youtube_Transcript",
        properties=[
            Property(name="text", data_type=DataType.TEXT),         
            Property(name="start", data_type=DataType.NUMBER),      
            Property(name="duration", data_type=DataType.NUMBER),   
            Property(name="video_id", data_type=DataType.TEXT),
            Property(name="state", data_type=DataType.TEXT), #added state to be able to toggle what we search from
        ],
        vectorizer_config=Configure.Vectorizer.text2vec_huggingface(  
            model="sentence-transformers/all-MiniLM-L6-v2"  
        )
    )
    print("Schema created successfully!")
else:
    print("Schema already exists.")

UnexpectedStatusCodeError: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name PSC_Youtube_Transcript already exists'}]}.

In [16]:
def upload_transcripts(transcripts, video_id, state=None):
    """
    Uploads preprocessed YouTube transcripts to Weaviate with a "state" field.
    
    Args:
        transcripts (list): List of transcript segments.
        video_id (str): YouTube video identifier.
        state (str): The state the transcript belongs to (default: None).
    """
    if state is None:
        raise ValueError("State cannot be None. Please provide a state (e.g., 'Louisiana').")

    combined_transcripts = combine_segments(transcripts, window_size=3, video_id=video_id)

    # ✅ Add "state" field to each transcript
    for segment in combined_transcripts:
        segment["state"] = state

    # ✅ Get the Weaviate collection object
    collection = weaviate_client.collections.get("PSC_Youtube_Transcript")

    try:
        # ✅ Use REST-based insert (avoids gRPC failures)
        for segment in combined_transcripts:
            collection.data.insert(segment)  # Insert one-by-one via REST
            
        print(f"✅ Uploaded {len(combined_transcripts)} Louisiana transcript segments to Weaviate!")

    except Exception as e:
        print("Error inserting transcripts:", e)

In [27]:
louisiana_transcripts = [
    {"text": "Energy policy discussions were intense this year.", "start": 102.4, "duration": 5.1},
    {"text": "The Louisiana PSC is focusing on renewables.", "start": 108.5, "duration": 4.8},
    {"text": "There will be new incentives for solar energy adoption.", "start": 113.3, "duration": 6.0},
    {"text": "Hurricane preparedness is a key agenda item.", "start": 119.5, "duration": 5.2},
    {"text": "Grid modernization efforts are underway.", "start": 124.7, "duration": 4.9},
]
upload_transcripts(louisiana_transcripts, video_id="LA_Transcript_2024", state="Louisiana")

Error inserting transcripts: Object was not added! Unexpected status code: 500, with response body: {'error': [{'message': 'update vector: failed with status: 429 error: TooManyRequests: Please log in or use a HF access token'}]}.


In [26]:
collection = weaviate_client.collections.get("PSC_Youtube_Transcript")
print(collection.config.get())  # ✅ Print schema configuration

_CollectionConfig(name='PSC_Youtube_Transcript', description=None, generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False, auto_tenant_creation=False, auto_tenant_activation=False), properties=[_Property(name='text', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=_PropertyVectorizerConfig(skip=False, vectorize_property_name=True), vectorizer='text2vec-huggingface'), _Property(name='start', description=None, data_type=<DataType.NUMBER: 'number'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_prop

In [34]:
def batch_import(segments, batch_size=20):  # Reduced batch size
    """Import data in smaller batches"""
    for i in range(0, len(segments), batch_size):
        batch = segments[i:i + batch_size]
        retries = 3
        while retries > 0:
            try:
                with weaviate_client.batch(batch_size=batch_size, dynamic=True) as batch_processor:
                    for segment in batch:
                        batch_processor.add_data_object(
                            data_object=segment,
                            class_name="LATranscript"
                        )
                print(f"Successfully imported batch {i//batch_size}")
                time.sleep(2)  # Longer delay between batches
                break
            except Exception as e:
                print(f"Error in batch {i//batch_size}: {str(e)}")
                retries -= 1
                time.sleep(10)  # Longer delay on error
                if retries == 0:
                    print(f"Failed to import batch after 3 attempts")
transcript_dir = '/Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/parsers/CLEANED_LA_PSC_transcripts'

# Process each file
for filename in os.listdir(transcript_dir):
    if filename.endswith('.json'):
        print(f"Processing {filename}")
        filepath = os.path.join(transcript_dir, filename)
        
        try:
            with open(filepath) as f:
                data = json.load(f)
            combined_data = combine_segments(data)
            print(f"Combined {len(data)} segments into {len(combined_data)} chunks")
            
            batch_import(combined_data)
            print(f"Successfully imported {filename}")
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

print("Import complete")

Processing Louisiana Public Service Commission Live Stream - September 2023.json
Combined 4860 segments into 1607 chunks


            Use the `client.batch.configure()` method to configure your batch process, and `client.batch` to enter the context manager.

            See https://weaviate.io/developers/weaviate/client-libraries/python for details.


Successfully imported batch 0
Successfully imported batch 1
Successfully imported batch 2
Successfully imported batch 3
Successfully imported batch 4
Successfully imported batch 5
Successfully imported batch 6
Successfully imported batch 7
Successfully imported batch 8
Successfully imported batch 9
Successfully imported batch 10
Successfully imported batch 11
Successfully imported batch 12
Successfully imported batch 13
Successfully imported batch 14
Successfully imported batch 15
Successfully imported batch 16
Successfully imported batch 17
Successfully imported batch 18
Successfully imported batch 19
Successfully imported batch 20
Successfully imported batch 21
Successfully imported batch 22
Successfully imported batch 23
Successfully imported batch 24
Successfully imported batch 25
Successfully imported batch 26
Successfully imported batch 27
Successfully imported batch 28
Successfully imported batch 29
Successfully imported batch 30
Successfully imported batch 31
Successfully impor

In [38]:
def search_transcripts(query, limit=5):
    try:
        result = weaviate_client.query.get(
            "LATranscript",
            ["text", "filename", "start"]
        ).with_near_text({
            "concepts": [query]
        }).with_limit(limit).do()

        print(f"\nResults for query: '{query}'")
        print("-" * 50)
        
        # Get unique results (avoid duplicates)
        seen_texts = set()
        for t in result['data']['Get']['LATranscript']:
            if t['text'] not in seen_texts:
                seen_texts.add(t['text'])
                print(f"\nFile: {t['filename']}")
                
                # Convert timestamp to minutes and seconds
                minutes = int(t['start'] // 60)
                seconds = int(t['start'] % 60)
                print(f"Timestamp: {minutes}:{seconds:02d}")
                
                print(f"Text: {t['text']}\n")
                print("-" * 50)

    except Exception as e:
        if "data" not in result:
            print("No results found")
            return
        print(f"Error: {str(e)}")

# Test specific queries
queries = [
    "rate increases",
    "customer complaints about Entergy",
    "renewable energy projects"
]

for query in queries:
    search_transcripts(query)


Results for query: 'rate increases'
--------------------------------------------------

File: Louisiana Public Service Commission Live Stream - August 2023.json
Timestamp: 67:46
Text: continue to increase and this is for the average last two decades of this Century under that higher scenario

--------------------------------------------------

File: Louisiana Public Service Commission Live Stream - January 2023.json
Timestamp: 76:32
Text: you you know there's been a series of Federal Reserve rate increases and that's driven rates quite a bit higher

--------------------------------------------------

File: Louisiana Public Service Commission Live Stream - February 2022.json
Timestamp: 121:02
Text: interest rate increases

--------------------------------------------------

Results for query: 'customer complaints about Entergy'
--------------------------------------------------

File: Louisiana Public Service Commission Live Stream - December 2022.json
Timestamp: 194:56
Text: Utilities

In [6]:
import weaviate
import anthropic
from anthropic import Anthropic
from rich.console import Console
from rich.panel import Panel
from rich.logging import RichHandler
import logging

class PSC_RAG:
    def __init__(self, weaviate_url, weaviate_key, anthropic_key):
        # Initialize Weaviate
        self.weaviate_client = weaviate.Client(
            url=weaviate_url,
            auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_key),
            additional_headers={
                "X-OpenAI-Api-Key": openai_key  # Still needed for embeddings
            }
        )
        
        # Initialize Anthropic
        self.claude = Anthropic(api_key=anthropic_key)
        
    def get_context(self, query, limit=5):
        result = self.weaviate_client.query.get(
            "LATranscript",
            ["text", "filename", "start"]
        ).with_near_text({
            "concepts": [query]
        }).with_limit(limit).do()
        
        contexts = []
        for r in result['data']['Get']['LATranscript']:
            contexts.append(f"From {r['filename']} at {int(r['start']//60)}:{int(r['start']%60):02d}: {r['text']}")
            
        return "\n".join(contexts)
    
    def ask(self, question):
        try:
            context = self.get_context(question)
            response = self.claude.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=1000,
                system="You are an expert in analyzing PSC meeting transcripts. Provide clear, specific answers based on the provided context.",
                messages=[{
                    "role": "user",
                    "content": f"""Based on these PSC meeting transcript excerpts, please answer the question and cite specific transcript dates and timestamps.
                    If you can't answer based on the provided context, say so.

                    Context:
                    {context}

                    Question: {question}"""
                }]
            )
            
            # Make sure we return a string
            if hasattr(response.content, 'text'):
                return response.content.text
            return str(response.content)
                
        except Exception as e:
            logger.error(f"Error processing question: {str(e)}")
            return "Sorry, I encountered an error processing your question."

# Usage
def main():
    console = Console()
    rag = PSC_RAG(
        weaviate_url=weaviate_url,
        weaviate_key=weaviate_key,
        anthropic_key=anthropic_key
    )
    
    while True:
        question = console.input("\n[bold cyan]Ask a question about PSC meetings (or 'quit' to exit):[/] ")

        # question = input("\nAsk a question about PSC meetings (or 'quit' to exit): ")
        if question.lower() == 'quit':
            break
            
        answer = rag.ask(question)
        # Extract just the text from the TextBlock
        if hasattr(answer, 'text'):
            answer_text = answer.text
        else:
            answer_text = str(answer)
            
        # Remove any TextBlock wrapper if present
        if answer_text.startswith('[TextBlock'):
            answer_text = answer_text.split('text=\'')[1].split('\', type=')[0]

        answer_text = answer_text.replace('\\n', '\n')

            
        console.print(Panel(
            answer_text,
            title="[bold blue]Answer[/]",
            border_style="blue",
            padding=(1, 2),
            expand=True
        ))
        
        console.print("=" * 80)


if __name__ == "__main__":
    main()

Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  self.weaviate_client = weaviate.Client(


NameError: name 'logger' is not defined