In [None]:
import os
import pandas as pd
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
import asyncio
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from dotenv import load_dotenv

# Load settings from a .env file before anything below reads the environment
# (OPENAI_API_KEY is looked up with os.getenv when the LLM client is built).
# NOTE(review): per python-dotenv's documentation, override=True lets values
# from .env replace variables that are already set in the process environment.
load_dotenv(override=True)

# Pydantic model describing the structured output expected from the LLM.
# It is handed to PydanticOutputParser (pydantic_object=CallAnalysis), and the
# parser's format instructions are injected into the system prompt.
# NOTE(review): the class docstring and the Field descriptions presumably end
# up in those format instructions (via the model's JSON schema), so treat that
# text as prompt content, not as free-form documentation — confirm before
# rewording it.
class CallAnalysis(BaseModel):
    """Analysis results for a customer service call transcript."""
    # Free-text fields returned by the model.
    transcript_summary: str = Field(description="Summary of the call transcript in 2-4 sentences")
    call_intent: str = Field(description="Main purpose or intention of the call")
    # Ratings are constrained to the inclusive range 1..5 (ge=1, le=5); a value
    # outside that range fails validation of the model.
    agents_performance_rating: int = Field(ge=1, le=5, description="Agent performance rating (1-5)")
    agents_performance_rating_rationale: str = Field(description="Rationale for agent performance rating")
    customer_sentiment_rating: int = Field(ge=1, le=5, description="Customer sentiment rating (1-5)")
    customer_sentiment_rationale: str = Field(description="Rationale for customer sentiment rating")

def _is_blank_transcript(value) -> bool:
    """Return True when *value* holds no usable transcript text.

    Covers whitespace-only strings as well as the missing-value markers
    (None, NaN, pd.NA) that show up for empty cells in a loaded DataFrame.
    """
    if isinstance(value, str):
        return not value.strip()
    return bool(pd.isna(value))


def process_single_transcript(row, llm, parser, prompt):
    """Analyze one transcript row with the LLM chain.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            'conversation_id' and 'call_transcript' keys. A missing key raises
            KeyError to the caller, because the lookups happen before the
            guarded section.
        llm: Chat model placed in the middle of the chain.
        parser: Output parser whose result exposes the analysis attributes
            (transcript_summary, call_intent, ratings and rationales).
        prompt: Prompt template expecting a 'call_transcript' variable.

    Returns:
        A ``(output_row, error_row)`` tuple in which exactly one element is a
        dict and the other is None. ``output_row`` carries the input columns
        plus the analysis fields; ``error_row`` carries the conversation id,
        the error message, the exception class name and an ISO timestamp.
    """
    conversation_id = row['conversation_id']
    call_transcript = row['call_transcript']

    try:
        # Reject missing/blank transcripts up front. Without this guard a NaN
        # cell is formatted into the prompt as the text "nan" and the model's
        # answer is stored as if it were a genuine analysis.
        if _is_blank_transcript(call_transcript):
            raise ValueError("call_transcript is empty or missing")

        # Compose prompt -> model -> parser and run it for this transcript.
        chain = prompt | llm | parser
        result = chain.invoke({"call_transcript": call_transcript})

        output_row = {
            'conversation_id': conversation_id,
            'call_transcript': call_transcript,
            'transcript_summary': result.transcript_summary,
            'call_intent': result.call_intent,
            'agents_performance_rating': result.agents_performance_rating,
            'agents_performance_rating_rationale': result.agents_performance_rating_rationale,
            'customer_sentiment_rating': result.customer_sentiment_rating,
            'customer_sentiment_rationale': result.customer_sentiment_rationale,
        }

        print(f"✅ Processed conversation {conversation_id}")
        return output_row, None

    except Exception as e:
        # Deliberately broad: one failing row (API error, parse/validation
        # failure, blank input) must not abort the whole batch, so it is
        # reported as an error row instead.
        error_row = {
            'conversation_id': conversation_id,
            'error_message': str(e),
            'error_type': type(e).__name__,
            'timestamp': datetime.now().isoformat()
        }
        print(f"❌ Error processing conversation {conversation_id}: {e}")
        return None, error_row

def parse_csv_with_llm(input_csv: str, output_csv: str, error_csv: str, max_workers: int = 30):
    """
    Run the call-analysis chain over every row of a CSV using a thread pool.

    Each row is handed to process_single_transcript; rows that succeed are
    written to ``output_csv`` and rows that fail are written to ``error_csv``.
    Either file is only written when it has at least one row.

    Args:
        input_csv: Path of the CSV holding the transcripts to analyze
        output_csv: Destination CSV for successfully analyzed rows
        error_csv: Destination CSV for per-row error records
        max_workers: Size of the thread pool used for concurrent requests

    Raises:
        Exception: Any failure while reading, processing or writing is
            printed as a fatal error and re-raised unchanged.
    """

    # Model client and the parser that validates replies against CallAnalysis.
    llm = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )
    parser = PydanticOutputParser(pydantic_object=CallAnalysis)

    # Prompt pieces; {format_instructions} is pre-filled from the parser below.
    system_message = """You are an AI that analyzes call transcripts between customers and agents.
         
         Extract the following information:
         - A 2-4 sentence summary of the call
         - The main intent/purpose of the call
         - Agent performance rating (1=very poor, 5=excellent)
         - Rationale for agent rating
         - Customer sentiment rating (1=very negative, 5=very positive)  
         - Rationale for sentiment rating
         
         {format_instructions}"""
    user_message = "Analyze this call transcript:\n\n{call_transcript}"
    template = ChatPromptTemplate.from_messages([
        ("system", system_message),
        ("user", user_message),
    ])
    prompt = template.partial(format_instructions=parser.get_format_instructions())

    try:
        # Quote/escape settings let multi-line, quoted transcripts load intact.
        df = pd.read_csv(input_csv, quotechar='"', escapechar='\\')
        print(f"Loaded {len(df)} transcripts to process")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Queue every row first, then wait for the outcomes in row order.
            pending = [
                executor.submit(process_single_transcript, record, llm, parser, prompt)
                for _, record in df.iterrows()
            ]
            outcomes = [task.result() for task in pending]

        # Each outcome is an (output_row, error_row) pair; split the two sides.
        successful_results = [done for done, _ in outcomes if done]
        error_results = [failed for _, failed in outcomes if failed]

        if successful_results:
            pd.DataFrame(successful_results).to_csv(output_csv, index=False)
            print(f"✅ Saved {len(successful_results)} processed transcripts to {output_csv}")

        if error_results:
            pd.DataFrame(error_results).to_csv(error_csv, index=False)
            print(f"⚠️ Saved {len(error_results)} errors to {error_csv}")

        print(f"📈 Processing complete: {len(successful_results)} successful, {len(error_results)} errors")

    except Exception as e:
        # Report the failure for the operator, then let the caller see it too.
        print(f"💥 Fatal error: {e}")
        raise

# Example usage: analyze the sample transcripts when run as a script.
if __name__ == "__main__":
    # Relative paths, so they resolve against the current working directory.
    input_csv = "../data/call_transcript_sample.csv"
    output_csv = "../data/analyzed_calls_langchain.csv"
    error_csv = "../data/processing_errors.csv"
    
    # max_workers=50 overrides the function's default of 30 concurrent workers.
    parse_csv_with_llm(input_csv, output_csv, error_csv, max_workers=50)