# Generate Queries to test RAG System

We put the entire text of the file into context and send it to Gemini-2.5-Flash and ask it to generate a list of queries that we can use to test the RAG system.

In [None]:
from pydantic import BaseModel
from typing import List
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

class TestCase(BaseModel):
    query: str
    expected_answer: str
    citations: List[str]

class OutputSchema(BaseModel):
    test_cases: List[TestCase]

# Load document context
with open("../texts/adk-docs.txt", "r") as f:
    document_context = f.read()

# Prepare system prompt
system_prompt = """
You are a world-class assistant specializing in generating comprehensive test cases for Retrieval-Augmented Generation (RAG) systems.

Given the full context of a document, your task is to create a diverse set of at least 100 queries to rigorously evaluate the RAG system. Each query should target specific facts, details, or concepts from the context, and once specific queries are exhausted, include broader or inferential questions that still relate to the content.

For each test case, provide:
- `query`: A clear, concise question that could be asked of the RAG system.
- `expected_answer`: The precise answer that should be returned, based strictly on the context.
- `citations`: A list of references (quotes) from the context that support the answer.

Ensure queries cover a wide range of topics, including factual recall, reasoning, chronology, character analysis, and thematic understanding. Avoid duplication and strive for variety in question types.

Return the output as a JSON object with a list of test cases in the following format:
{{
    "test_cases": [
        {{
            "query": "...",
            "expected_answer": "...",
            "citations": ["..."]
        }},
        ...
    ]
}}

Context:
{context}
"""

# Set up Gemini LLM with LangChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    thinking_budget=0,
    response_mime_type="application/json",
    response_schema=OutputSchema.schema()
)

prompt = PromptTemplate(
    input_variables=["context"],
    template=system_prompt
)

chain = LLMChain(llm=llm, prompt=prompt)

# Generate the test set
result = chain.run(context=document_context)

result

/var/folders/yy/t6y4brw52tvdr8k9xbwc01d40000gn/T/ipykernel_17908/2799777735.py:61: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_schema=OutputSchema.schema()
Key '$defs' is not supported in schema, ignoring


'{\n  "test_cases": [\n    {\n      "citations": [\n        "# Code owners file.",\n        "This file controls who is tagged for review for any given pull request.",\n        "*                                      @GoogleCloudPlatform/teams/generative-ai-devrel"\n      ],\n      "expected_answer": "The code owner for all files is the GitHub team `@GoogleCloudPlatform/teams/generative-ai-devrel`.",\n      "query": "Who is the code owner for this repository?"\n    },\n    {\n      "citations": [\n        "name: Bug report",\n        "about: Create a report to help us improve",\n        "title: \'\'",\n        "labels: \'\'",\n        "assignees: \'\'",\n        "**Describe the bug**",\n        "A clear and concise description of what the bug is.",\n        "**To Reproduce**",\n        "Steps to reproduce the behavior:",\n        "1. Go to \'...\'",\n        "2. Click on \'....\'",\n        "3. See error",\n        "**Expected behavior**",\n        "A clear and concise description of wh

In [5]:
import json

parsed = json.loads(result)
print(parsed["test_cases"])

[{'citations': ['# Code owners file.', 'This file controls who is tagged for review for any given pull request.', '*                                      @GoogleCloudPlatform/teams/generative-ai-devrel'], 'expected_answer': 'The code owner for all files is the GitHub team `@GoogleCloudPlatform/teams/generative-ai-devrel`.', 'query': 'Who is the code owner for this repository?'}, {'citations': ['name: Bug report', 'about: Create a report to help us improve', "title: ''", "labels: ''", "assignees: ''", '**Describe the bug**', 'A clear and concise description of what the bug is.', '**To Reproduce**', 'Steps to reproduce the behavior:', "1. Go to '...'", "2. Click on '....'", '3. See error', '**Expected behavior**', 'A clear and concise description of what you expected to happen.', '**Screenshots**', 'If applicable, add screenshots to help explain your problem.', '**Versions**', ' - OS: [e.g. Windows, Mac, Linux]', ' - ADK version:', ' - Python version:', '**Additional context**', 'Add any

# Turn into DataFrame

In [6]:
import pandas as pd
df = pd.DataFrame(parsed["test_cases"])
df.head()

Unnamed: 0,citations,expected_answer,query
0,"[# Code owners file., This file controls who i...",The code owner for all files is the GitHub tea...,Who is the code owner for this repository?
1,"[name: Bug report, about: Create a report to h...",The bug report template includes sections for ...,What information should be included in a bug r...
2,"[name: Feature request, about: Suggest an idea...",A feature request should describe the problem ...,What are the key sections of a feature request?
3,"[allowedCopyrightHolders:, - ""Google LLC"", a...","The header checker lint configuration allows ""...",What are the configurations for the header che...
4,"[ ""extends"": [, ""config:recommended"", ]...",The Renovate configuration extends `config:rec...,What are the general settings in the Renovate ...


In [7]:
# Reorder the columns, query first, then expected_answer, then citations
df = df[["query", "expected_answer", "citations"]]

In [10]:
# Save the DataFrame to a CSV file
df.to_csv("../tests/longADKDocsTestCases.csv", index=False)