In [2]:
import weaviate
import json
import os
import anthropic
import time
from tqdm import tqdm  # For progress tracking
from anthropic import Anthropic
import dotenv

In [3]:
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

weaviate_url = os.getenv('WEAVIATE_URL')
weaviate_key = os.getenv('WEAVIATE_KEY')
openai_key = os.getenv('OPENAI_KEY')
anthropic_key = os.getenv('ANTHROPIC_KEY')



In [4]:
# Setup clients
weaviate_client = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_key), 
    additional_headers={"X-OpenAI-Api-Key": openai_key},
        timeout_config=(5, 60)  # (Connect timeout, Read timeout)
)
claude_client = anthropic.Client(api_key=anthropic_key)

Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  weaviate_client = weaviate.Client(
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/cl

In [None]:
def combine_segments(data, window_size=3):
    combined_segments = []
    for i in range(0, len(data), window_size):
        window = data[i:i + window_size]
        filtered_text = [seg['text'] for seg in window if len(seg['text'].split()) > 1]
        if filtered_text:
            combined_text = ' '.join(filtered_text)
            combined_segments.append({
                'text': combined_text,
                'start': window[0]['start'],
                'duration': sum([seg['duration'] for seg in window]),
                'filename': filename
            })
    return combined_segments


In [26]:
weaviate_client.schema.delete_class("LATranscript")


In [None]:
schema = {
    "class": "LATranscript",
    "vectorizer": "text2vec-openai",  # vectorizer
    "moduleConfig": {
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text"
        }
    },
    "properties": [
        {"name": "text", "dataType": ["text"]},
        {"name": "start", "dataType": ["number"]},
        {"name": "duration", "dataType": ["number"]},
        {"name": "filename", "dataType": ["text"]}
    ]
}

weaviate_client.schema.create_class(schema)

In [34]:
def batch_import(segments, batch_size=20):  # Reduced batch size
    """Import data in smaller batches"""
    for i in range(0, len(segments), batch_size):
        batch = segments[i:i + batch_size]
        retries = 3
        while retries > 0:
            try:
                with weaviate_client.batch(batch_size=batch_size, dynamic=True) as batch_processor:
                    for segment in batch:
                        batch_processor.add_data_object(
                            data_object=segment,
                            class_name="LATranscript"
                        )
                print(f"Successfully imported batch {i//batch_size}")
                time.sleep(2)  # Longer delay between batches
                break
            except Exception as e:
                print(f"Error in batch {i//batch_size}: {str(e)}")
                retries -= 1
                time.sleep(10)  # Longer delay on error
                if retries == 0:
                    print(f"Failed to import batch after 3 attempts")
transcript_dir = '/Users/petersapountzis/Desktop/tulane/fall2024/cmps4010/Entergy-AI/parsers/CLEANED_LA_PSC_transcripts'

# Process each file
for filename in os.listdir(transcript_dir):
    if filename.endswith('.json'):
        print(f"Processing {filename}")
        filepath = os.path.join(transcript_dir, filename)
        
        try:
            with open(filepath) as f:
                data = json.load(f)
            combined_data = combine_segments(data)
            print(f"Combined {len(data)} segments into {len(combined_data)} chunks")
            
            batch_import(combined_data)
            print(f"Successfully imported {filename}")
            
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

print("Import complete")

Processing Louisiana Public Service Commission Live Stream - September 2023.json
Combined 4860 segments into 1607 chunks


            Use the `client.batch.configure()` method to configure your batch process, and `client.batch` to enter the context manager.

            See https://weaviate.io/developers/weaviate/client-libraries/python for details.


Successfully imported batch 0
Successfully imported batch 1
Successfully imported batch 2
Successfully imported batch 3
Successfully imported batch 4
Successfully imported batch 5
Successfully imported batch 6
Successfully imported batch 7
Successfully imported batch 8
Successfully imported batch 9
Successfully imported batch 10
Successfully imported batch 11
Successfully imported batch 12
Successfully imported batch 13
Successfully imported batch 14
Successfully imported batch 15
Successfully imported batch 16
Successfully imported batch 17
Successfully imported batch 18
Successfully imported batch 19
Successfully imported batch 20
Successfully imported batch 21
Successfully imported batch 22
Successfully imported batch 23
Successfully imported batch 24
Successfully imported batch 25
Successfully imported batch 26
Successfully imported batch 27
Successfully imported batch 28
Successfully imported batch 29
Successfully imported batch 30
Successfully imported batch 31
Successfully impor

In [38]:
def search_transcripts(query, limit=5):
    try:
        result = weaviate_client.query.get(
            "LATranscript",
            ["text", "filename", "start"]
        ).with_near_text({
            "concepts": [query]
        }).with_limit(limit).do()

        print(f"\nResults for query: '{query}'")
        print("-" * 50)
        
        # Get unique results (avoid duplicates)
        seen_texts = set()
        for t in result['data']['Get']['LATranscript']:
            if t['text'] not in seen_texts:
                seen_texts.add(t['text'])
                print(f"\nFile: {t['filename']}")
                
                # Convert timestamp to minutes and seconds
                minutes = int(t['start'] // 60)
                seconds = int(t['start'] % 60)
                print(f"Timestamp: {minutes}:{seconds:02d}")
                
                print(f"Text: {t['text']}\n")
                print("-" * 50)

    except Exception as e:
        if "data" not in result:
            print("No results found")
            return
        print(f"Error: {str(e)}")

# Test specific queries
queries = [
    "rate increases",
    "customer complaints about Entergy",
    "renewable energy projects"
]

for query in queries:
    search_transcripts(query)


Results for query: 'rate increases'
--------------------------------------------------

File: Louisiana Public Service Commission Live Stream - August 2023.json
Timestamp: 67:46
Text: continue to increase and this is for the average last two decades of this Century under that higher scenario

--------------------------------------------------

File: Louisiana Public Service Commission Live Stream - January 2023.json
Timestamp: 76:32
Text: you you know there's been a series of Federal Reserve rate increases and that's driven rates quite a bit higher

--------------------------------------------------

File: Louisiana Public Service Commission Live Stream - February 2022.json
Timestamp: 121:02
Text: interest rate increases

--------------------------------------------------

Results for query: 'customer complaints about Entergy'
--------------------------------------------------

File: Louisiana Public Service Commission Live Stream - December 2022.json
Timestamp: 194:56
Text: Utilities

In [6]:
import weaviate
import anthropic
from anthropic import Anthropic
from rich.console import Console
from rich.panel import Panel
from rich.logging import RichHandler
import logging

class PSC_RAG:
    def __init__(self, weaviate_url, weaviate_key, anthropic_key):
        # Initialize Weaviate
        self.weaviate_client = weaviate.Client(
            url=weaviate_url,
            auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_key),
            additional_headers={
                "X-OpenAI-Api-Key": openai_key  # Still needed for embeddings
            }
        )
        
        # Initialize Anthropic
        self.claude = Anthropic(api_key=anthropic_key)
        
    def get_context(self, query, limit=5):
        result = self.weaviate_client.query.get(
            "LATranscript",
            ["text", "filename", "start"]
        ).with_near_text({
            "concepts": [query]
        }).with_limit(limit).do()
        
        contexts = []
        for r in result['data']['Get']['LATranscript']:
            contexts.append(f"From {r['filename']} at {int(r['start']//60)}:{int(r['start']%60):02d}: {r['text']}")
            
        return "\n".join(contexts)
    
    def ask(self, question):
        try:
            context = self.get_context(question)
            response = self.claude.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=1000,
                system="You are an expert in analyzing PSC meeting transcripts. Provide clear, specific answers based on the provided context.",
                messages=[{
                    "role": "user",
                    "content": f"""Based on these PSC meeting transcript excerpts, please answer the question and cite specific transcript dates and timestamps.
                    If you can't answer based on the provided context, say so.

                    Context:
                    {context}

                    Question: {question}"""
                }]
            )
            
            # Make sure we return a string
            if hasattr(response.content, 'text'):
                return response.content.text
            return str(response.content)
                
        except Exception as e:
            logger.error(f"Error processing question: {str(e)}")
            return "Sorry, I encountered an error processing your question."

# Usage
def main():
    console = Console()
    rag = PSC_RAG(
        weaviate_url=weaviate_url,
        weaviate_key=weaviate_key,
        anthropic_key=anthropic_key
    )
    
    while True:
        question = console.input("\n[bold cyan]Ask a question about PSC meetings (or 'quit' to exit):[/] ")

        # question = input("\nAsk a question about PSC meetings (or 'quit' to exit): ")
        if question.lower() == 'quit':
            break
            
        answer = rag.ask(question)
        # Extract just the text from the TextBlock
        if hasattr(answer, 'text'):
            answer_text = answer.text
        else:
            answer_text = str(answer)
            
        # Remove any TextBlock wrapper if present
        if answer_text.startswith('[TextBlock'):
            answer_text = answer_text.split('text=\'')[1].split('\', type=')[0]

        answer_text = answer_text.replace('\\n', '\n')

            
        console.print(Panel(
            answer_text,
            title="[bold blue]Answer[/]",
            border_style="blue",
            padding=(1, 2),
            expand=True
        ))
        
        console.print("=" * 80)


if __name__ == "__main__":
    main()

Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  self.weaviate_client = weaviate.Client(


NameError: name 'logger' is not defined