# 🎯 Generate Synthetic Test Data with RAGAS

### Load dependencies

In [5]:
import sys, os
from pathlib import Path

from dotenv import load_dotenv
# Load environment variables from the .env file located in the parent of the
# current working directory. The call is the cell's last expression, so its
# boolean result is displayed.
env_file = Path.cwd().parent / ".env"
load_dotenv(env_file)

True

### Load the Data

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

# Source PDFs are expected in a "data" folder inside the parent of the
# current working directory.
data_path = Path.cwd().parent / "data"
selected_pdfs = [
    "CIS_Amazon_Web_Services_Foundations_Benchmark_v6.0.0.pdf",
    "OWASP_Application_Security_Verification_Standard_5.0.0_en.pdf",
]

docs = []          # accumulated loader output across all selected PDFs
missing_pdfs = []  # selected PDFs that were not found under data_path

for pdf_name in selected_pdfs:
    try:
        pdf_path = data_path / pdf_name
        if pdf_path.exists():
            loader = PyMuPDFLoader(str(pdf_path))
            docs.extend(loader.load())
            print(f"Loaded: {pdf_name} - {len(docs)} total pages so far")
        else:
            # Report missing inputs instead of skipping them silently, so a
            # wrong path or filename does not quietly produce an empty corpus.
            missing_pdfs.append(pdf_name)
            print(f"Skipped (file not found): {pdf_path}")
    except Exception as e:
        # Best-effort loading: one unreadable PDF should not abort the rest.
        print(f"Error loading {pdf_name}: {e}")

print(f"\nTotal documents loaded: {len(docs)}")

Loaded: CIS_Amazon_Web_Services_Foundations_Benchmark_v6.0.0.pdf - 277 total pages so far
Loaded: OWASP_Application_Security_Verification_Standard_5.0.0_en.pdf - 400 total pages so far

Total documents loaded: 400


### Set the Generator and Embedding Model

In [6]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
# The chat model name is read from the LLM_MODEL environment variable.
# Fail fast with an actionable message when it is unset or empty, rather than
# passing None on to ChatOpenAI.
llm_model_name = os.getenv("LLM_MODEL")
if not llm_model_name:
    raise RuntimeError(
        "LLM_MODEL is not set. Define it in the environment (or the .env file) "
        "before creating the generator LLM."
    )

# Ragas wrappers around the LangChain chat model and embedding model.
generator_llm = LangchainLLMWrapper(ChatOpenAI(model=llm_model_name))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

### Create the Knowledge Graph

In [2]:
from ragas.testset.graph import KnowledgeGraph, Node, NodeType

In [None]:
# Start from an empty knowledge graph.
kg = KnowledgeGraph()

# Add one DOCUMENT node per loaded document, carrying its text and metadata.
kg.nodes.extend(
    Node(
        type=NodeType.DOCUMENT,
        properties={
            "page_content": page.page_content,
            "document_metadata": page.metadata,
        },
    )
    for page in docs
)
kg


KnowledgeGraph(nodes: 400, relationships: 0)

In [7]:
from openai import OpenAI
import os

# Send a minimal chat completion through the raw-response interface so the
# HTTP headers (which carry the rate-limit counters) can be inspected.
# NOTE(review): this probes "gpt-4o-mini", while the generator uses the model
# named by LLM_MODEL; confirm the reported limits apply to that model.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

response = client.with_raw_response.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "test"}],
    max_tokens=5,
)

# Print each rate-limit header next to a readable label.
headers = response.headers
rate_limit_fields = [
    ("Remaining Requests", "x-ratelimit-remaining-requests"),
    ("Remaining Tokens", "x-ratelimit-remaining-tokens"),
    ("Reset Requests", "x-ratelimit-reset-requests"),
    ("Reset Tokens", "x-ratelimit-reset-tokens"),
]
for label, header_name in rate_limit_fields:
    print(f"{label}: {headers.get(header_name)}")

Remaining Requests: 5153
Remaining Tokens: 199997
Reset Requests: 11h37m53.292s
Reset Tokens: 0s


In [8]:
# The knowledge-graph transforms reuse the generator's LLM and embedding model.
transformer_llm, embedding_model = generator_llm, generator_embeddings

In [None]:
# apply transformations
from ragas.testset.transforms import default_transforms, apply_transforms
import time

transforms = default_transforms(
    documents=docs,
    llm=transformer_llm,
    embedding_model=embedding_model,
)

# Retry / pacing settings (all delays in seconds).
max_retries = 5             # attempts per transform before giving up
retry_delay = 60            # base back-off; doubled after every failed attempt
inter_transform_delay = 45  # proactive pause between consecutive transforms


def _is_rate_limit_error(exc):
    """Return True when ``exc`` looks like an API rate-limit failure.

    Checks a 429 ``status_code`` attribute, the exception class name, and the
    message text ("rate_limit" or "rate limit"), so detection does not hinge
    on one exact wording of the error message.
    """
    if getattr(exc, "status_code", None) == 429:
        return True
    if type(exc).__name__ == "RateLimitError":
        return True
    message = str(exc).lower()
    return "rate_limit" in message or "rate limit" in message


def apply_with_backoff(graph, transform, retries=max_retries, base_delay=retry_delay):
    """Apply a single transform to ``graph``, retrying on rate-limit errors.

    Args:
        graph: Knowledge graph passed to ``apply_transforms``.
        transform: The transform to apply (applied on its own).
        retries: Maximum number of attempts; must be at least 1.
        base_delay: Seconds to wait after the first rate-limit failure; the
            wait doubles on each subsequent attempt.

    Raises:
        ValueError: If ``retries`` is less than 1.
        Exception: Re-raises the underlying error when it is not a rate-limit
            failure, or when the final attempt also fails.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        try:
            apply_transforms(graph, [transform])  # Apply ONE transform at a time
            return
        except Exception as e:
            if _is_rate_limit_error(e) and attempt < retries - 1:
                wait_time = base_delay * (2 ** attempt)
                print(f"⚠ Rate limit hit on attempt {attempt + 1}. Waiting {wait_time}s...")
                time.sleep(wait_time)
            else:
                print(f"✗ Error applying transform: {e}")
                raise


# NOTE: this loop passes `kg` to apply_transforms on every attempt; re-running
# the cell applies the transforms to the same graph object again.
for idx, transform in enumerate(transforms):
    print(f"\n=== Applying Transform {idx + 1}/{len(transforms)}: {transform.__class__.__name__} ===")

    apply_with_backoff(kg, transform)
    print(f"✓ Transform {idx + 1} completed successfully")

    # PROACTIVE DELAY between transforms to avoid rate limits
    if idx < len(transforms) - 1:  # Don't wait after the last one
        print(f"Waiting {inter_transform_delay}s before next transform...")
        time.sleep(inter_transform_delay)

print("\n✓ All transforms applied successfully!")
print(f"KG Stats - Nodes: {len(kg.nodes)}, Relationships: {len(kg.relationships)}")


=== Applying Transform 1/4: SummaryExtractor ===


Applying SummaryExtractor:   0%|          | 0/376 [00:00<?, ?it/s]

✓ Transform 1 completed successfully
Waiting 45s before next transform...

=== Applying Transform 2/4: CustomNodeFilter ===


Applying CustomNodeFilter:   0%|          | 0/400 [00:00<?, ?it/s]

Node 30c33e2e-70e0-4c1a-816a-31546e1c4801 does not have a summary. Skipping filtering.
Node 45120ed0-0842-4166-920e-87e3359f03ba does not have a summary. Skipping filtering.
Node ed996de7-1d13-412e-873a-0d4b2a1df149 does not have a summary. Skipping filtering.
Node 0f7ff088-d4e8-4060-8d9d-df762d9326d7 does not have a summary. Skipping filtering.
Node c4534ff8-d77c-447b-8430-9aab26ae3a3c does not have a summary. Skipping filtering.
Node cbb482ba-f39d-41b8-bfca-351e3fa4a305 does not have a summary. Skipping filtering.
Node d6598b04-9157-44e0-b259-d79780e238ec does not have a summary. Skipping filtering.
Node 27153224-50be-492f-89b4-d3d054fc8e43 does not have a summary. Skipping filtering.
Node e2707ee5-ccf5-4e74-b10b-9bf96e7f28bc does not have a summary. Skipping filtering.
Node 6ce473b7-fa6d-40e1-82ec-f18276a4ca95 does not have a summary. Skipping filtering.
Node 4194a6d1-06ac-4a83-9853-0b16053b30ed does not have a summary. Skipping filtering.
Node acd05ffd-71ff-4236-8f16-bf3ed834e6ac d

✓ Transform 2 completed successfully
Waiting 45s before next transform...

=== Applying Transform 3/4: Parallel ===


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/1167 [00:00<?, ?it/s]

✓ Transform 3 completed successfully
Waiting 45s before next transform...

=== Applying Transform 4/4: OverlapScoreBuilder ===


Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

✓ Transform 4 completed successfully

✓ All transforms applied successfully!
KG Stats - Nodes: 397, Relationships: 881


In [None]:
# Persist the transformed graph to disk so it can be reloaded later.
kg_output_file = "golden_data_kg.json"
kg.save(kg_output_file)

KnowledgeGraph(nodes: 397, relationships: 881)

In [3]:
# Reload the previously saved knowledge graph and display its summary.
kg_input_file = "golden_data_kg.json"
golden_data_kg = KnowledgeGraph.load(kg_input_file)
golden_data_kg

KnowledgeGraph(nodes: 397, relationships: 881)

In [9]:
# Initialize personas

# Reference: https://docs.ragas.io/en/stable/howtos/customizations/testgenerator/_persona_generator/#personas-in-testset-generation

from ragas.testset.persona import Persona

# Persona name -> role description; each entry becomes one Persona, in order.
persona_descriptions = {
    "Cloud Engineer": "Seeks technical implementation details, infrastructure security best practices, and cloud-native security controls. Asks about IAM policies, network segmentation, and automated security tooling integration.",
    "Security Engineer": "Focuses on vulnerability assessments, threat modeling, compliance frameworks, and security architecture. Requests detailed technical guidance on security controls, incident response procedures, and security tool configurations.",
}

personas = [
    Persona(name=persona_name, role_description=description)
    for persona_name, description in persona_descriptions.items()
]

# create the test generator
from ragas.testset import TestsetGenerator

# Build the test-set generator on top of the reloaded knowledge graph,
# using the custom personas defined above in this cell.
generator = TestsetGenerator(
    llm=generator_llm,
    persona_list=personas,
    embedding_model=embedding_model,
    knowledge_graph=golden_data_kg,
)

In [14]:
# set the query distribution
# https://docs.ragas.io/en/stable/references/synthesizers/

from ragas.testset.synthesizers import SingleHopSpecificQuerySynthesizer, MultiHopAbstractQuerySynthesizer, MultiHopSpecificQuerySynthesizer


# Each entry pairs a synthesizer with its weight; the two active entries are
# weighted 0.5 each.
single_hop_specific = SingleHopSpecificQuerySynthesizer(llm=generator_llm)
multi_hop_specific = MultiHopSpecificQuerySynthesizer(llm=generator_llm)

query_distribution = [
    (single_hop_specific, 0.5),
    # Disabled: (MultiHopAbstractQuerySynthesizer(llm=generator_llm), 0.5),
    (multi_hop_specific, 0.5),
]

In [15]:
# Generate the synthetic test set and convert it to a DataFrame.
testset_size = 60  # number of samples requested from the generator
dataset = generator.generate(
    testset_size=testset_size,
    query_distribution=query_distribution,
)
df = dataset.to_pandas()

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/60 [00:00<?, ?it/s]

In [17]:
# Write the generated test set to CSV (without the index column), then
# display the DataFrame.
# NOTE(review): reference_contexts appears to hold lists; presumably these are
# stored as their string form in the CSV, so confirm how downstream readers parse them.
output_csv = "golden_test_data.csv"
df.to_csv(output_csv, index=False)

df

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What is the version of the CIS Amazon Web Serv...,[Internal Only - General \nCIS Amazon Web \nSe...,The version of the CIS Amazon Web Services Fou...,single_hop_specifc_query_synthesizer
1,Wut is CIS Legal?,[Page 1 \nTerms of Use \nPlease see the below ...,CIS Legal is the department to contact for gui...,single_hop_specifc_query_synthesizer
2,What are the key components and intended audie...,[Page 2 \nTable of Contents \nTerms of Use ......,The CIS Amazon Web Services Foundations Benchm...,single_hop_specifc_query_synthesizer
3,What is RDS in the context of cloud services?,[Page 3 \n2.2 Ensure security contact informat...,"RDS refers to the Relational Database Service,...",single_hop_specifc_query_synthesizer
4,What is important to monitor for VPC changes?,[Page 4 \n5.1 Ensure unauthorized API calls ar...,Ensure VPC changes are monitored (Manual) as s...,single_hop_specifc_query_synthesizer
5,Wht are CIS Benchmrks and how do they help in ...,[Page 5 \nOverview \nAll CIS Benchmarks™ (Benc...,CIS Benchmarks™ focus on technical configurati...,single_hop_specifc_query_synthesizer
6,What Intune Benchmark for?,[Page 6 \n \nApply the Correct Version of a Be...,Intune Benchmark is applicable to the way sett...,single_hop_specifc_query_synthesizer
7,How do Build Kits relate to the Benchmark in s...,[Page 7 \nRemediation \nCIS has developed Buil...,Build Kits are designed to correspond to the B...,single_hop_specifc_query_synthesizer
8,What role does AWS Config play in security con...,[Page 8 \nTarget Technology Details \nThis doc...,AWS Config is included in the prescriptive gui...,single_hop_specifc_query_synthesizer
9,How is the CIS Benchmark developed according t...,[Page 9 \nConsensus Guidance \nThis CIS Benchm...,The CIS Benchmark is developed using a consens...,single_hop_specifc_query_synthesizer
