In [1]:
# Load environment variables from a local .env file before anything else runs.
# load_dotenv() is the last expression, so its boolean result is the cell output.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from pathlib import Path

import pandas as pd

# Golden QA pairs: each question in `inputs` is matched by position with the
# expected answer at the same index in `outputs`.
inputs = [
    "What is the name of the first chapter in this pdf",
    "what is the atomic number of hydrogen?",
    "Total number of chapters in this document?",
]

outputs = [
    "The first chapter is about Physical Quantities and Measurement.",
    "The atomic number of hydrogen is 1.",
    "There are a total of 16 chapters in this document."
]

# zip() silently drops unmatched trailing items, so fail loudly if the two
# lists ever drift out of sync instead of writing a truncated dataset.
assert len(inputs) == len(outputs), "inputs and outputs must have the same length"

# dataset
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)

# write to csv
csv_path = "../data/goldens.csv"
# to_csv does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)



In [3]:
from langsmith import Client 

client = Client()
dataset_name = "llmops_dataset_new"

# Store
# Creating a dataset whose name is already taken is rejected by LangSmith, so
# re-running this cell used to fail. Reuse the existing dataset instead, and
# only upload the examples when the dataset is newly created so that re-runs
# do not insert duplicate examples.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    created_examples = None
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="input and expected output pairs for llmops series",
    )
    created_examples = client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )

# Cell output: the create_examples response on first creation, nothing when
# the dataset already existed.
created_examples

{'example_ids': ['16215aee-1455-4896-9bf1-33b7b31baf74',
  'd3043341-9ae9-419d-a545-fdb4d4f32216',
  '2d9bd7d4-5673-4333-b634-f85de4d5d695'],
 'count': 3}

In [4]:
import sys
# Extend the module search path so the `src.*` imports below can be resolved.
# NOTE(review): this is a hard-coded absolute Windows path, so the cell only
# works on a machine with this exact directory — consider deriving the project
# root from the notebook's location instead.
sys.path.append("D:\\Machine learning\\llmops_series")

from pathlib import Path
from src.document_ingestion.data_ingestion import ChatIngestor
from src.document_chat.retrieval import ConversationalRAG 
import os


# Simple file adapter for local file paths
class LocalFileAdapter:
    """Wrap a file on disk so it can be handed to ChatIngestor as an upload.

    Exposes the attributes and methods of a minimal uploaded-file object:
    ``name`` / ``filename`` (the base name of the file) plus ``read()`` and
    ``getbuffer()``, both of which return the complete file contents.
    """

    def __init__(self, file_path: str):
        location = Path(file_path)
        base_name = location.name
        self.path = location
        self.name = base_name
        # Same value as `name`; kept as a separate attribute for compatibility
        self.filename = base_name

    def read(self) -> bytes:
        """Return the whole file as bytes (re-read from disk on every call)."""
        contents = self.path.read_bytes()
        return contents

    def getbuffer(self) -> bytes:
        """Return the whole file as bytes, mirroring ``read()``."""
        contents = self.path.read_bytes()
        return contents


def answer_ai_report_question(
    inputs: dict,
    data_path: str = "D://Machine learning/llmops_series/data/Physics 9th Ch 1 Final.pdf",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    k: int = 5
) -> dict:
    """
    Answer a question about a local document using the project's RAG pipeline.

    Each call ingests ``data_path`` through a new ``ChatIngestor`` (with
    session directories enabled), loads the resulting FAISS index into a
    ``ConversationalRAG`` instance and asks it the question with an empty
    chat history.

    Args:
        inputs: Dictionary containing the question, e.g., {"question": "What is RAG?"}
        data_path: Path to the source document to index (the default points at
            a physics chapter PDF)
        chunk_size: Size of text chunks for splitting
        chunk_overlap: Overlap between chunks
        k: Number of documents to retrieve

    Returns:
        Dictionary with a single "answer" key, e.g., {"answer": "RAG stands for..."}.
        Problems are reported through the same key instead of being raised:
        "No question provided" for a missing/blank question,
        "Data file not found: <path>" when ``data_path`` is not an existing
        file, and "Error: <message>" for any other exception.
    """
    try:
        # Extract question from inputs; a blank or whitespace-only question is
        # rejected up front so it never triggers the expensive ingestion step.
        question = inputs.get("question", "")
        if not question or not str(question).strip():
            return {"answer": "No question provided"}

        # The source must be an actual file: is_file() also rejects directories,
        # which exists() would let through only to fail later with an opaque error.
        if not Path(data_path).is_file():
            return {"answer": f"Data file not found: {data_path}"}

        # Create file adapter
        file_adapter = LocalFileAdapter(data_path)

        # Single source of truth for the FAISS base directory, shared by the
        # ingestor and the index path handed to the RAG loader below.
        faiss_base = "faiss_index"

        # Build index using ChatIngestor
        ingestor = ChatIngestor(
            temp_base="data",
            faiss_base=faiss_base,
            use_session_dirs=True
        )

        # Build retriever
        ingestor.built_retriever(
            uploaded_files=[file_adapter],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            k=k
        )

        # Get session ID and index path
        session_id = ingestor.session_id
        index_path = f"{faiss_base}/{session_id}"

        # Create RAG instance and load retriever
        rag = ConversationalRAG(session_id=session_id)
        rag.load_retriever_from_faiss(
            index_path=index_path,
            k=k,
            index_name=os.getenv("FAISS_INDEX_NAME", "index")
        )

        # Get answer
        answer = rag.invoke(question, chat_history=[])

        return {"answer": answer}

    except Exception as e:
        # Deliberate best-effort: the failure is returned as the answer text
        # rather than propagated to the caller.
        return {"answer": f"Error: {str(e)}"}

In [5]:
# Smoke test: send one question through the RAG function and show the reply
test_input = dict(question="What is the name of the first chapter in this pdf?")
result = answer_ai_report_question(test_input)
print(f"question: {test_input['question']!s}")
print(f"answer: {result['answer']!s}")

{"timestamp": "2025-11-13T15:40:31.272063Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-13T15:40:31.273094Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-13T15:40:31.273094Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_Kv...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-13T15:40:31.278956Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-13T15:40:31.287452Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251113_204031_cd419d3a", "temp_dir": "data\\session_20251113_204031_cd419d3a", "faiss_dir": "faiss_index\\session_20251113_204031_cd419d3a", "sessionized": true, "timestamp": "2025-11-13T15:40:31.291087Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "Physics 9th Ch 1 Final.pdf", "saved_as": "da

question: What is the name of the first chapter in this pdf?
answer: The first chapter in this PDF is "PHYSICAL QUANTITIES AND MEASUREMENT".


In [6]:
# Example: run every golden question through the RAG function and print each
# question/answer pair followed by a separator line.
print("Testing all questions from the dataset:\n")
for i, q in enumerate(inputs, start=1):
    test_input = {"question": q}
    result = answer_ai_report_question(test_input)
    print(
        f"Q{i}: {q}",
        f"A{i}: {result['answer']}\n",
        "-" * 80 + "\n",
        sep="\n",
    )

{"timestamp": "2025-11-13T15:40:40.649717Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-13T15:40:40.655525Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-13T15:40:40.660810Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_Kv...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-13T15:40:40.662587Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-13T15:40:40.678016Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251113_204040_f78ed35e", "temp_dir": "data\\session_20251113_204040_f78ed35e", "faiss_dir": "faiss_index\\session_20251113_204040_f78ed35e", "sessionized": true, "timestamp": "2025-11-13T15:40:40.684813Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "Physics 9th Ch 1 Final.pdf", "saved_as": "da

Testing all questions from the dataset:



{"count": 23, "timestamp": "2025-11-13T15:40:42.267482Z", "level": "info", "event": "Documents loaded"}
{"chunks": 62, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-11-13T15:40:42.272255Z", "level": "info", "event": "Documents split"}
{"model": "models/text-embedding-004", "timestamp": "2025-11-13T15:40:42.273469Z", "level": "info", "event": "Loading embedding model"}
{"added": 1, "index": "faiss_index\\session_20251113_204040_f78ed35e", "timestamp": "2025-11-13T15:40:45.744218Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-13T15:40:45.746237Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-13T15:40:45.756505Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-13T15:40:45.760084Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-13T15:40:45.764612Z", "level": "info", "event": "Loaded GOOGLE_API_KEY f

Q1: What is the name of the first chapter in this pdf
A1: The first chapter in this PDF is "PHYSICAL QUANTITIES AND MEASUREMENT".

--------------------------------------------------------------------------------



{"count": 23, "timestamp": "2025-11-13T15:40:50.334196Z", "level": "info", "event": "Documents loaded"}
{"chunks": 62, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-11-13T15:40:50.337238Z", "level": "info", "event": "Documents split"}
{"model": "models/text-embedding-004", "timestamp": "2025-11-13T15:40:50.342198Z", "level": "info", "event": "Loading embedding model"}
{"added": 1, "index": "faiss_index\\session_20251113_204048_90cff720", "timestamp": "2025-11-13T15:40:54.325235Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-13T15:40:54.327146Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-13T15:40:54.339287Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-13T15:40:54.342068Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-13T15:40:54.345329Z", "level": "info", "event": "Loaded GOOGLE_API_KEY f

Q2: what is the atomic number of hydrogen?
A2: I'm sorry, but the provided text does not contain the answer to this question.

--------------------------------------------------------------------------------



{"count": 23, "timestamp": "2025-11-13T15:40:59.829126Z", "level": "info", "event": "Documents loaded"}
{"chunks": 62, "chunk_size": 1000, "overlap": 200, "timestamp": "2025-11-13T15:40:59.834789Z", "level": "info", "event": "Documents split"}
{"model": "models/text-embedding-004", "timestamp": "2025-11-13T15:40:59.836983Z", "level": "info", "event": "Loading embedding model"}
{"added": 1, "index": "faiss_index\\session_20251113_204058_259318b7", "timestamp": "2025-11-13T15:41:03.429249Z", "level": "info", "event": "FAISS index updated"}
{"k": 5, "fetch_k": 20, "lambda_mult": 0.5, "timestamp": "2025-11-13T15:41:03.429249Z", "level": "info", "event": "Using MMR search"}
{"timestamp": "2025-11-13T15:41:03.434259Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-13T15:41:03.434259Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-13T15:41:03.436271Z", "level": "info", "event": "Loaded GOOGLE_API_KEY f

Q3: Total number of chapters in this document?
A3: I'm sorry, but the provided documents do not contain information about the total number of chapters.

--------------------------------------------------------------------------------



In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Dataset to evaluate against and the "cot_qa" string evaluator used for grading
dataset_name = "llmops_dataset_new"
qa_evaluator = [LangChainStringEvaluator("cot_qa")]

# Run evaluation using our RAG function; the metadata records the settings
# attached to this experiment run.
experiment_results = evaluate(
    answer_ai_report_question,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-llmops_dataset-qa-rag",
    metadata=dict(
        variant="RAG with FAISS and AI Engineering Report",
        chunk_size=1000,
        chunk_overlap=200,
        k=5,
    ),
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'test-llmops_dataset-qa-rag-cb5cc9c4' at:
https://smith.langchain.com/o/e8008ce9-eb06-5861-b8e8-97dd6daeac49/datasets/60a4e1ba-caf7-4cd7-9450-04c89b74c7fa/compare?selectedSessions=bc59ef6e-2008-4ab4-9664-7773588b0a2e




0it [00:00, ?it/s]{"timestamp": "2025-11-13T15:41:22.331632Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-11-13T15:41:22.334119Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-11-13T15:41:22.334119Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_Kv...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-11-13T15:41:22.334119Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["embedding_model", "retriever", "llm"], "timestamp": "2025-11-13T15:41:22.345162Z", "level": "info", "event": "YAML config loaded"}
{"session_id": "session_20251113_204122_35ef129a", "temp_dir": "data\\session_20251113_204122_35ef129a", "faiss_dir": "faiss_index\\session_20251113_204122_35ef129a", "sessionized": true, "timestamp": "2025-11-13T15:41:22.348696Z", "level": "info", "event": "ChatIngestor initialized"}
{"uploaded": "Physics 9th Ch 1 Final.pdf