# Generate Queries to test RAG System

We put the entire text of the file into context and send it to Gemini-2.5-Flash and ask it to generate a list of queries that we can use to test the RAG system.

In [1]:
from pydantic import BaseModel
from typing import List
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

class TestCase(BaseModel):
    query: str
    expected_answer: str
    citations: List[str]

class OutputSchema(BaseModel):
    test_cases: List[TestCase]

# Load document context
with open("../texts/RomeoAndJuliet.txt", "r") as f:
    document_context = f.read()

# Prepare system prompt
system_prompt = """
You are a world-class assistant specializing in generating comprehensive test cases for Retrieval-Augmented Generation (RAG) systems.

Given the full context of a document, your task is to create a diverse set of at least 100 queries to rigorously evaluate the RAG system. Each query should target specific facts, details, or concepts from the context, and once specific queries are exhausted, include broader or inferential questions that still relate to the content.

For each test case, provide:
- `query`: A clear, concise question that could be asked of the RAG system.
- `expected_answer`: The precise answer that should be returned, based strictly on the context.
- `citations`: A list of references (quotes, chapter titles, or locations) from the context that support the answer.

Ensure queries cover a wide range of topics, including factual recall, reasoning, chronology, character analysis, and thematic understanding. Avoid duplication and strive for variety in question types.

Return the output as a JSON object with a list of test cases in the following format:
{{
    "test_cases": [
        {{
            "query": "...",
            "expected_answer": "...",
            "citations": ["..."]
        }},
        ...
    ]
}}

Context:
{context}
"""

# Set up Gemini LLM with LangChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    thinking_budget=0,
    response_mime_type="application/json",
    response_schema=OutputSchema.schema()
)

prompt = PromptTemplate(
    input_variables=["context"],
    template=system_prompt
)

chain = LLMChain(llm=llm, prompt=prompt)

# Generate the test set
result = chain.run(context=document_context)

result

/var/folders/yy/t6y4brw52tvdr8k9xbwc01d40000gn/T/ipykernel_72223/3288076892.py:61: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_schema=OutputSchema.schema()
  chain = LLMChain(llm=llm, prompt=prompt)
  result = chain.run(context=document_context)
Key '$defs' is not supported in schema, ignoring




In [4]:
import json

parsed = json.loads(result)
print(parsed["test_cases"])

[{'citations': ['Copyright \n© 2024 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.'], 'expected_answer': 'The copyright for the AWS Prescriptive Guidance: Cloud design patterns, architectures, and implementations is held by Amazon Web Services, Inc. and/or its affiliates, as of 2024.', 'query': 'Who holds the copyright for this document and when was it issued?'}, {'citations': ['Table of Contents', 'Introduction .............................................................................................................................................. 1'], 'expected_answer': 'The Introduction section of the document starts on page 1.', 'query': 'On which page does the Introduction section begin?'}, {'citations': ['Table of Contents', 'Anti-corruption layer pattern .................................................................................................................... 3'], 'expected_answer': 'The Anti-corruption layer pattern section starts on page 3.

# Turn into DataFrame

In [5]:
import pandas as pd
df = pd.DataFrame(parsed["test_cases"])
df.head()

Unnamed: 0,citations,expected_answer,query
0,"[Copyright \n© 2024 Amazon Web Services, Inc. ...",The copyright for the AWS Prescriptive Guidanc...,Who holds the copyright for this document and ...
1,"[Table of Contents, Introduction ................",The Introduction section of the document start...,On which page does the Introduction section be...
2,"[Table of Contents, Anti-corruption layer patt...",The Anti-corruption layer pattern section star...,What page number is the Anti-corruption layer ...
3,"[Table of Contents, API routing patterns ........",API routing patterns are covered starting on p...,Where can I find information about API routing...
4,"[Table of Contents, Circuit breaker pattern .....",The Circuit breaker pattern begins on page 19.,What is the starting page for the Circuit brea...


In [6]:
# Reorder the columns, query first, then expected_answer, then citations
df = df[["query", "expected_answer", "citations"]]

In [8]:
# Save the DataFrame to a CSV file
df.to_csv("../tests/longCDPTestCases.csv", index=False)