# In [None]:
# Advanced RAG Content Engine - Generic Version
import asyncio
import hashlib
import logging
import os
from datetime import datetime
from typing import Any, Dict, List

import numpy as np
import pandas as pd

# Core imports
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

# FIXED IMPORT - SemanticChunker is in experimental module
from langchain_experimental.text_splitter import SemanticChunker

# Evaluation imports
try:
    from ragas import evaluate
    from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
    from datasets import Dataset
    RAGAS_AVAILABLE = True
except ImportError:
    print("RAGAS not available. Install with: pip install ragas")
    RAGAS_AVAILABLE = False

# Advanced features
from langchain.schema import Document
from sentence_transformers import SentenceTransformer

from dotenv import load_dotenv

load_dotenv()

class GenericContentEngine:
    """Retrieval-augmented question answering over a directory of PDF files.

    The engine loads PDFs, splits them with both a character-based and a
    semantic splitter, indexes the chunks in a hybrid (Chroma vector + BM25)
    retriever and wires a prompt to a Groq-hosted chat model.

    Call ``setup_complete_pipeline()`` before using ``retriever``, ``chain``
    or ``evaluate_rag_performance()``.
    """

    # Embedding model shared by the semantic chunker and the vector store.
    EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

    def __init__(self, pdf_path: str = 'pdfs/', persist_directory: str = 'db'):
        """Store configuration and set up logging; no documents are loaded yet.

        Args:
            pdf_path: Directory that is searched recursively for ``*.pdf``.
            persist_directory: Directory where Chroma persists the index.
        """
        self.pdf_path = pdf_path
        self.persist_directory = persist_directory
        self.setup_logging()
        self.metrics = {}
        self.document_sources = set()  # Track document sources dynamically

        # Populated by setup_complete_pipeline(); declared here so that
        # "pipeline not built yet" is detectable as None, not AttributeError.
        self.retriever = None
        self.llm = None
        self.prompt = None
        self.chain = None

        # Lazily loaded embedding model, see _get_embeddings().
        self._embeddings = None

    def setup_logging(self):
        """Configure root logging (file + console) once and bind ``self.logger``."""
        # basicConfig() is a no-op when the root logger already has handlers,
        # but building the handler list would still open the log file each
        # time. Guarding here avoids leaking one file handle per instance.
        if not logging.getLogger().handlers:
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                handlers=[
                    logging.FileHandler('content_engine.log'),
                    logging.StreamHandler()
                ]
            )
        self.logger = logging.getLogger(__name__)

    def _get_embeddings(self) -> HuggingFaceEmbeddings:
        """Return the shared embedding model, loading it on first use."""
        if self._embeddings is None:
            self._embeddings = HuggingFaceEmbeddings(model_name=self.EMBEDDING_MODEL_NAME)
        return self._embeddings

    @staticmethod
    def _chunk_id(doc: Document) -> str:
        """Return a deterministic id for a chunk.

        The id is derived from the source path, page, chunk type and text.
        Volatile metadata such as ``load_timestamp`` is deliberately left out
        so the same chunk gets the same id on every run.
        """
        meta = doc.metadata
        identity = "\x1f".join((
            str(meta.get('source', '')),
            str(meta.get('page', '')),
            str(meta.get('chunk_type', '')),
            doc.page_content,
        ))
        # errors='replace' keeps hashing total even if extracted PDF text
        # contains characters that cannot be encoded as UTF-8.
        return hashlib.sha256(identity.encode('utf-8', errors='replace')).hexdigest()

    def load_and_process_documents(self) -> List[Document]:
        """Load every PDF under ``pdf_path`` and enrich its metadata.

        Each returned document gets ``load_timestamp`` and ``word_count``
        metadata; the lower-cased base name of its source file is recorded in
        ``self.document_sources``.

        Returns:
            The documents produced by the PDF loader (possibly empty).
        """
        self.logger.info("Loading documents...")
        loader = PyPDFDirectoryLoader(path=self.pdf_path, glob="**/*.pdf")
        pdfs = loader.load()

        if not pdfs:
            self.logger.warning(f"No PDF content found under '{self.pdf_path}'")

        # Loading already happened above, so one timestamp describes the
        # whole batch.
        loaded_at = datetime.now().isoformat()

        for doc in pdfs:
            doc.metadata['load_timestamp'] = loaded_at
            doc.metadata['word_count'] = len(doc.page_content.split())

            # `or ''` also covers a 'source' key that is present but None.
            source_file = (doc.metadata.get('source') or '').lower()
            if source_file:
                self.document_sources.add(os.path.basename(source_file))

        self.logger.info(f"Loaded {len(pdfs)} documents from sources: {sorted(self.document_sources)}")
        return pdfs

    def advanced_chunking(self, documents: List[Document]) -> List[Document]:
        """Split documents with both a character splitter and a semantic splitter.

        Every chunk carries a ``chunk_type`` metadata entry of either
        ``'traditional'`` or ``'semantic'``. Blank pages and blank chunks are
        skipped by the semantic pass so empty documents are never indexed.

        Returns:
            Traditional chunks followed by semantic chunks. If semantic
            chunking fails for any reason, only the traditional chunks.
        """
        traditional_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        traditional_chunks = traditional_splitter.split_documents(documents)

        # Tag before the fallible part so the fallback result is tagged too.
        for chunk in traditional_chunks:
            chunk.metadata['chunk_type'] = 'traditional'

        try:
            semantic_splitter = SemanticChunker(
                embeddings=self._get_embeddings(),
                breakpoint_threshold_type="percentile",
                breakpoint_threshold_amount=95
            )

            semantic_chunks = []
            for doc in documents:
                if not doc.page_content.strip():
                    continue  # nothing to split on a blank page
                for text in semantic_splitter.split_text(doc.page_content):
                    if not text.strip():
                        continue
                    semantic_chunks.append(Document(
                        page_content=text,
                        metadata={**doc.metadata, 'chunk_type': 'semantic'}
                    ))
        except Exception as e:
            # Deliberate best-effort: semantic chunking is an enhancement,
            # the traditional chunks alone are still a usable index.
            self.logger.warning(f"Semantic chunking failed: {e}, using traditional chunking only")
            return traditional_chunks

        all_chunks = traditional_chunks + semantic_chunks
        self.logger.info(f"Created {len(all_chunks)} chunks ({len(traditional_chunks)} traditional + {len(semantic_chunks)} semantic)")
        return all_chunks

    def create_hybrid_retriever(self, documents: List[Document]) -> EnsembleRetriever:
        """Build an ensemble of a Chroma vector retriever and a BM25 retriever.

        Chunks are written to Chroma under deterministic ids, so re-running
        against the same ``persist_directory`` updates existing entries
        instead of appending duplicates. Entries written earlier under other
        ids are left untouched.

        Args:
            documents: Chunks to index; must not be empty.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError(
                "Cannot build a retriever from an empty document list; "
                "check that pdf_path contains readable PDFs"
            )

        # Chroma rejects duplicate ids within one batch, so collapse exact
        # duplicates (same source, page, chunk type and text) first.
        unique_chunks = {}
        for doc in documents:
            unique_chunks.setdefault(self._chunk_id(doc), doc)
        chunk_ids = list(unique_chunks)
        chunks = list(unique_chunks.values())

        dropped = len(documents) - len(chunks)
        if dropped:
            self.logger.info(f"Skipped {dropped} duplicate chunks before indexing")

        # Vector retriever
        vectorstore = Chroma.from_documents(
            chunks,
            self._get_embeddings(),
            ids=chunk_ids,
            persist_directory=self.persist_directory
        )
        vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

        # BM25 retriever
        bm25_retriever = BM25Retriever.from_documents(chunks)
        bm25_retriever.k = 10

        # Ensemble retriever (hybrid); weights favor vector search.
        return EnsembleRetriever(
            retrievers=[vector_retriever, bm25_retriever],
            weights=[0.7, 0.3]
        )

    def initialize_advanced_llm(self) -> ChatGroq:
        """Create the Groq chat model, reading the key from ``GROQ_API_KEY``."""
        return ChatGroq(
            groq_api_key=os.getenv('GROQ_API_KEY'),
            model_name="llama-3.3-70b-versatile",
            temperature=0.3,
            max_tokens=1000
        )

    def create_generic_prompt(self) -> PromptTemplate:
        """Return the document-agnostic prompt with ``context`` and ``question`` slots."""
        template = """You are an expert document analyst with access to various PDF documents. Your task is to analyze the provided context and answer questions accurately based on the available information.

Context Information:
{context}

Instructions:
1. Analyze the provided context carefully from the available documents
2. Answer the question with specific details, numbers, and facts when available
3. Cite the source document and page number when referencing specific information
4. If comparing multiple entities/companies/topics, clearly distinguish between them
5. If information is insufficient or missing, clearly state what's not available
6. Provide reasoning and evidence for your conclusions
7. Be objective and factual in your analysis

Question: {question}

Detailed Analysis and Answer:"""

        return PromptTemplate(
            template=template,
            input_variables=['question', 'context']
        )

    def get_document_summary(self) -> str:
        """Return a one-line listing of the loaded source files, without extensions.

        Names are sorted so the summary is stable between runs.
        """
        if self.document_sources:
            sources_list = ", ".join(
                os.path.splitext(source)[0] for source in sorted(self.document_sources)
            )
            return f"Available documents: {sources_list}"
        return "Multiple documents available for analysis"

    def evaluate_rag_performance(self, questions: List[str], ground_truths: List[str]) -> Dict:
        """Answer each question with the pipeline and score the results with RAGAS.

        Args:
            questions: Questions to run through retrieval and generation.
            ground_truths: Reference answers, one per question, same order.

        Returns:
            The RAGAS evaluation result, also stored in ``self.metrics``.
            An empty dict if RAGAS is not installed, the pipeline has not
            been set up, the inputs differ in length, or evaluation raises.
        """
        if not RAGAS_AVAILABLE:
            self.logger.warning("RAGAS not available. Skipping evaluation.")
            return {}

        if self.retriever is None or self.chain is None:
            self.logger.error("Evaluation failed: call setup_complete_pipeline() first")
            return {}

        if len(questions) != len(ground_truths):
            self.logger.error(
                f"Evaluation failed: {len(questions)} questions but "
                f"{len(ground_truths)} ground truths"
            )
            return {}

        try:
            answers = []
            contexts = []

            for question in questions:
                docs = self.retriever.invoke(question)
                passages = [d.page_content for d in docs]
                # One entry per retrieved chunk, so context metrics can judge
                # passages individually rather than one merged blob.
                contexts.append(passages)

                response = self.chain.invoke({
                    "question": question,
                    "context": "\n".join(passages)
                })
                answers.append(response.content)

            eval_dataset = Dataset.from_dict({
                "question": questions,
                "answer": answers,
                "contexts": contexts,
                "ground_truth": ground_truths
            })

            result = evaluate(
                eval_dataset,
                metrics=[faithfulness, answer_relevancy, context_precision, context_recall]
            )
        except Exception as e:
            # Evaluation is optional tooling; keep the traceback in the log
            # and hand the caller an empty result.
            self.logger.exception(f"Evaluation failed: {e}")
            return {}

        self.metrics = result
        return result

    def setup_complete_pipeline(self):
        """Load, chunk and index the PDFs, then build the prompt-to-LLM chain.

        Populates ``self.retriever``, ``self.llm``, ``self.prompt`` and
        ``self.chain``.

        Raises:
            ValueError: If no document content could be loaded.
        """
        documents = self.load_and_process_documents()
        chunks = self.advanced_chunking(documents)

        self.retriever = self.create_hybrid_retriever(chunks)

        self.llm = self.initialize_advanced_llm()
        self.prompt = self.create_generic_prompt()

        # Runnable composition: the formatted prompt is piped into the LLM.
        self.chain = self.prompt | self.llm

        self.logger.info("Generic pipeline setup complete")
        self.logger.info(self.get_document_summary())

# Build the engine and run the full setup (load -> chunk -> retriever -> chain).
# `engine` stays a module-level name so later cells can keep using it.
engine = GenericContentEngine()
engine.setup_complete_pipeline()

# Smoke-test questions phrased so they apply to any document collection.
test_questions = [
    "What are the main topics covered in the available documents?",
    "What financial information is available in the documents?",
    "What are the key risk factors mentioned across the documents?",
    "Compare the business models or strategies mentioned in different documents",
    "What revenue or financial metrics are reported in the documents?",
]

print("=== GENERIC RAG SYSTEM TESTING ===")
print(f"Document Sources: {list(engine.document_sources)}")

for i, question in enumerate(test_questions, start=1):
    print(f"\n--- Query {i}: {question} ---")

    try:
        # Retrieval: fetch the chunks and flatten them into one context string.
        docs = engine.retriever.invoke(question)
        context = "\n".join(d.page_content for d in docs)

        # Generation: fill the prompt and ask the LLM.
        response = engine.chain.invoke(dict(question=question, context=context))

        print(f"Answer: {response.content}")
        print(f"Sources: {len(docs)} documents retrieved")

    except Exception as e:
        # Best-effort run: report the failure and move on to the next question.
        print(f"Error processing question: {e}")

print("\n=== GENERIC PIPELINE SETUP COMPLETE ===")
