#### Generating Test Data for Evaluations

In [2]:
import os
import nest_asyncio
import pandas as pd
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from ragas.testset import TestsetGenerator
from ragas.dataset_schema import EvaluationDataset
from ragas.testset.synthesizers import SingleHopSpecificQuerySynthesizer, ComparativeAbstractQuerySynthesizer

ImportError: cannot import name 'ComparativeAbstractQuerySynthesizer' from 'ragas.testset.synthesizers' (c:\Users\jdram\.pyenv\pyenv-win\versions\3.10.5\lib\site-packages\ragas\testset\synthesizers\__init__.py)

In [None]:
nest_asyncio.apply()

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

if openai_api_key is None:
    raise ValueError("OpenAI API Key not found. Please ensure you have a .env file with 'OPENAI_API_KEY'.")

In [None]:
# Check if the Weave repository already exists; if not, download it using sparse checkout
repo_dir = "weave_docs"

if not os.path.exists(repo_dir):
    os.system(f"git init {repo_dir}")
    os.chdir(repo_dir)
    os.system("git remote add origin https://github.com/wandb/weave.git")
    os.system("git sparse-checkout init --cone")
    os.system("git sparse-checkout set docs/docs/guides/tracking")
    os.system("git pull origin master")
    os.chdir("..")
else:
    print(f"{repo_dir} already exists, skipping download.")

In [None]:
path = os.path.join(repo_dir, "docs/docs/guides/tracking")
loader = DirectoryLoader(path, glob="**/*.md")
docs = loader.load()

In [None]:

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [None]:
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator = TestsetGenerator(llm=evaluator_llm)

query_distribution = [
    (ComparativeAbstractQuerySynthesizer(llm=evaluator_llm), 0.5),
    (SingleHopSpecificQuerySynthesizer(llm=evaluator_llm), 0.5),
]

dataset = generator.generate_with_langchain_docs(
    splits, 
    testset_size=30, 
    query_distribution=query_distribution
)

In [None]:
df = dataset.to_pandas()

print(df)


In [None]:
output_csv_path = "generated_testset.csv"

df.to_csv(output_csv_path, index=False)

print(f"Generated testset saved to {output_csv_path}")