### Create ragas TestsetGenerator


#### Import modules and set up the environment

In [3]:
import os

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Move the working directory two levels up (presumably the project root, so
# that `src` can be imported — confirm against the repository layout).
# The resolved path is cached in a module-level name so that re-running this
# cell does not climb two more directories on every execution.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.abspath("../../")
os.chdir(_PROJECT_ROOT)
from src import load_api_keys, load_and_process_dataset

# Project helper; presumably makes the API keys available to the OpenAI
# clients created below — confirm in `src`.
load_api_keys()


#### Define LLMs

In [4]:
# Embedding model, created with the library's default settings.
embeddings = OpenAIEmbeddings()

# Chat model handed to the TestsetGenerator as its critic.
critic_llm = ChatOpenAI(model="gpt-4o")

# Chat model handed to the TestsetGenerator as its generator.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")

#### Define generator and distributions

In [11]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Wire the two chat models and the embedding model into a ragas generator.
generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# Share of each question type in the generated test set (weights sum to 1.0).
distributions = {simple: 0.5, multi_context: 0.25, reasoning: 0.25}

In [6]:
# Load the corpus through the project helper; the result is sampled below and
# the sample is passed to ragas as langchain documents.
documents = load_and_process_dataset()

#### Get a random sample of the documents

In [21]:
import random
from typing import List
from langchain.docstore.document import Document

def get_random_document_sample(documents: List[Document], sample_size: int = 3) -> List[Document]:
    """Return a random sample of documents without modifying the input list.

    Sampling is done without replacement using the module-level ``random``
    generator, so call ``random.seed(...)`` beforehand for reproducible picks.

    Args:
        documents (List[Document]): The list of Document objects to sample from.
        sample_size (int, optional): Maximum number of documents to return.
            If it exceeds ``len(documents)``, all documents are returned in
            random order. Defaults to 3.

    Returns:
        List[Document]: A new list holding ``min(sample_size, len(documents))``
        randomly chosen documents.

    Raises:
        ValueError: If ``sample_size`` is negative.
    """
    if sample_size < 0:
        # A negative size used as a slice bound would silently return
        # all-but-the-last-|n| documents instead of a sample of that size.
        raise ValueError(f"sample_size must be non-negative, got {sample_size}")

    # Clamp the request so that asking for more documents than exist returns
    # them all (shuffled) rather than making random.sample raise.
    count = min(sample_size, len(documents))

    # random.sample builds a new list, so the caller's list is left untouched.
    return random.sample(documents, count)

In [22]:
# Seed the module-level RNG so the same documents are drawn on every
# Restart & Run All (42 is an arbitrary fixed seed), then take the sample.
random.seed(42)
sample_docs = get_random_document_sample(documents, sample_size=3)

In [23]:
# Sanity check: number of documents in the sample.
len(sample_docs)

3

#### Generate the testset

In [24]:
# Build a two-question test set from the sampled documents, using the
# question-type weights defined above. raise_exceptions=False is passed
# through to ragas (presumably so generation failures do not abort the run —
# confirm against the ragas documentation).
testset = generator.generate_with_langchain_docs(
    sample_docs,
    test_size=2,
    distributions=distributions,
    raise_exceptions=False,
)

embedding nodes:   0%|          | 0/12 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
# Display the generated test set object (the repr can be long).
testset

TestDataset(test_data=[DataRow(question='How did Donald Trump help Rory McIlroy retrieve his golf club?', contexts=['(CNN)With a little bit of help from Donald Trump, Rory McIlroy was re-united with the golf club he famously threw into the lake at Doral -- but probably wished the golf-loving tycoon had not bothered. Never one to miss a media opportunity, Trump, the owner of the Blue Monster course in Florida, got a scuba diver to retrieve the 3-iron club which world No. 1 McIlroy had thought he had seen the last of during Friday\'s second round at the WGC-Cadillac Championship. The 68-year-old American entrepreneur presented it to McIlroy before his final round Sunday, telling him that it was unlucky to continue playing with 13 clubs as against the usual 14 allowed under golf\'s rules. "He\'s never one to miss an opportunity," McIlroy told the official PGA Tour website after his round. "It was fine. It was good fun." Not that opportunity knocked for McIlroy when he chose the 3-iron to 

In [27]:
# Convert the test set to a pandas DataFrame
df = testset.to_pandas()

#### Save the test set to a CSV

In [28]:
import pandas as pd

# Save to a CSV file; index=False keeps the row numbers out of the file so it
# does not gain an extra unnamed column when read back.
df.to_csv("evaluation_set.csv", index=False)