In [1]:
# Read key/value pairs from a local .env file into the process environment
# before any client that needs credentials is constructed.
from dotenv import load_dotenv
# As the cell's last expression, the call's return value is what the notebook displays.
load_dotenv()

True

In [2]:
from pathlib import Path

import pandas as pd

# Golden question/answer pairs used as the evaluation set.
# inputs[i] is the question whose expected answer is outputs[i].
inputs = [
    "What is the name of the first chapter in this pdf",
    "what is the atomic number of hydrogen?",
    "Total number of chapters in this document?",
]

outputs = [
    "The first chapter is about Physical Quantities and Measurement.",
    "The atomic number of hydrogen is 1.",
    "There are a total of 16 chapters in this document."
]

# zip() silently drops unmatched trailing items, so a forgotten answer would
# quietly shrink the dataset. Fail loudly instead.
if len(inputs) != len(outputs):
    raise ValueError(
        "inputs and outputs must pair up one-to-one: "
        f"got {len(inputs)} questions and {len(outputs)} answers"
    )

# One record per pair; the keys become the CSV column names.
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# Persist the goldens next to the project data. to_csv() does not create
# missing directories, so make sure the parent exists on a fresh checkout.
csv_path = "../data/goldens.csv"
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)



In [3]:
from langsmith import Client

client = Client()
dataset_name = "llmops_dataset_new"

# create_dataset() fails once the name is taken, which would break this cell
# on every re-run. Reuse the existing dataset when there is one.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="input and expected output pairs for llmops series",
    )

# Upload only into an empty dataset: uploading again would duplicate every
# example and skew evaluation results.
dataset_is_empty = next(iter(client.list_examples(dataset_id=dataset.id, limit=1)), None) is None
upload_summary = (
    client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )
    if dataset_is_empty
    else None
)

# Last expression of the cell: shows the upload result (nothing when skipped).
upload_summary

{'example_ids': ['35a1e1ed-5ce6-4a36-8398-dc26e211e54e',
  '9e56b795-e746-4e12-8463-d7126676cdbd',
  '729e6156-fc01-4bda-be69-bbcfe679dbff'],
 'count': 3}

In [4]:
import sys
# Make the project's `src` package importable from the notebook.
# NOTE(review): this is an absolute path on one developer's Windows machine;
# the imports below fail anywhere else unless the project is otherwise on sys.path.
sys.path.append("D:\\Machine learning\\llmops_series")

from pathlib import Path
# Project-local modules resolved through the sys.path entry added above.
from src.document_ingestion.data_ingestion import ChatIngestor
from src.document_chat.retrieval import ConversationalRAG 
import os


# Simple file adapter for local file paths
class LocalFileAdapter:
    """Wrap a file on disk in a small upload-like object for ChatIngestor.

    Attributes:
        path: The wrapped location as a ``pathlib.Path``.
        name: Final path component (the file's base name).
        filename: Same value as ``name``, offered under a second attribute name.
    """

    def __init__(self, file_path: str):
        source = Path(file_path)
        self.path = source
        # Both spellings carry the base name so either attribute can be read.
        self.name = self.filename = source.name

    def read(self) -> bytes:
        """Return the complete file contents; the file is re-read on every call."""
        with self.path.open("rb") as handle:
            return handle.read()

    def getbuffer(self) -> bytes:
        """Return the complete file contents as ``bytes``, same data as ``read()``."""
        return self.path.read_bytes()


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "D://Machine learning/llmops_series/data/Physics 9th Ch 1 Final.pdf",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer one question about a local document using the project's RAG pipeline.

    Each call ingests ``data_path`` into a fresh session, loads the resulting
    FAISS index and asks the question with an empty chat history. The function
    never raises: every failure is reported inside the returned dictionary.

    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the document to ingest (defaults to a Physics PDF)
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve

    Returns:
        Dictionary with a single "answer" key. Its value is the pipeline's
        answer, or one of "No question provided", "Data file not found: <path>"
        or "Error: <message>" when the call could not be completed.
    """
    # Single source of truth for the index root: it is handed to the ingestor
    # and reused to derive the load path, so the two cannot drift apart.
    faiss_base = "faiss_index"

    try:
        # Reject a missing, empty or whitespace-only question before doing any
        # expensive ingestion work.
        question = inputs.get("question", "")
        if not question or (isinstance(question, str) and not question.strip()):
            return {"answer": "No question provided"}

        # Fail fast with a readable message instead of an ingestion error.
        if not Path(data_path).exists():
            return {"answer": f"Data file not found: {data_path}"}

        # Wrap the path in an upload-like object for the ingestor.
        file_adapter = LocalFileAdapter(data_path)

        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base=faiss_base,
            use_session_dirs=True
        )

        # NOTE: the index is rebuilt on every call, so each question pays the
        # full ingestion cost.
        ingestor.built_retriever(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )

        # Assumes the ingestor stored this session's index under
        # <faiss_base>/<session_id>, since that is where it is loaded from.
        session_id = ingestor.session_id
        index_path = f"{faiss_base}/{session_id}"

        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )

        # Every call is a standalone question, hence the empty history.
        answer = rag.invoke(question, chat_history=[])

        return {"answer": answer}

    except Exception as e:
        # Deliberate best-effort: callers always get a dict back, with the
        # failure surfaced in the answer text rather than raised.
        return {"answer": f"Error: {e}"}

In [5]:
# Smoke test: send one golden question through the pipeline end to end
# and print the question next to the answer that came back.
test_input = {"question": "What is the name of the first chapter in this pdf?"}
result = answer_ai_report_question(test_input)
print(f"question: {test_input['question']}")
print(f"answer: {result['answer']}")

{"timestamp": "2025-11-12T16:32:54.072444Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-12T16:32:54.074458Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-12T16:32:54.074458Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-12T16:32:54.076475Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"timestamp": "2025-11-12T16:32:54.076475Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_Kv...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-12T16:32:54.076475Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-12T16:32:54.085149Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251112_213254_606830ad", "temp_dir": "data\\session_20251112_213254_606830ad", "

question: What is the name of the first chapter in this pdf?
answer: The first chapter is titled "PHYSICAL QUANTITIES AND MEASUREMENT".
