# Synthetic Dataset 

We generate an evaluation dataset for a RAG task

In [24]:
DATASET_DIR = "../../datasets/books/"

In [25]:
import os

DATASET_PATH = os.path.join(DATASET_DIR, "data.csv")
QA_PROMPT_PATH = os.path.join(DATASET_DIR, "qa_prompt.txt")
EVAL_PATH = os.path.join(DATASET_DIR, "eval.csv")

In [26]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4")

 ········


---

Prompt to generate QA, given a document as context

In [27]:
with open(QA_PROMPT_PATH, 'r') as file:
    prompt_template = file.read()

print(prompt_template)

Your task is to write a factoid question and an answer given the name and description of a book.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::


In [10]:
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#data loader
loader = CSVLoader(file_path=DATASET_PATH, encoding='utf-8')
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
    add_start_index=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

docs_processed = []
for doc in docs:
    docs_processed += text_splitter.split_documents([doc])

In [28]:
# Cost/compute intensive operation

import random
from tqdm.auto import tqdm

EVAL_SIZE = 100
outputs = []

for sample in tqdm(random.sample(docs_processed, EVAL_SIZE)):
    
    messages = [
        (
            "system",
            prompt_template.format(context=sample.page_content),
        )
    ]
    qa = llm.invoke(messages).content

    try:
        question = qa.split("Question: ")[-1].split("Answer: ")[0]
        answer = qa.split("Answer: ")[-1]
        assert len(answer) < 300, "Answer is too long"
        outputs.append(
            {
                "context": sample.page_content,
                "question": question,
                "answer": answer,
                "source_doc": sample.metadata["source"],
            }
        )
    except:
        continue


  0%|          | 0/100 [00:00<?, ?it/s]

In [29]:
outputs[-1]

 'question': 'Who is the author of the book Heart of a Warrior Angel?\n',
 'answer': 'Lali A. Love',
 'source_doc': '../datasets/books/data.csv'}

In [30]:
import pandas as pd

df = pd.DataFrame(outputs)
df.to_csv(EVAL_PATH, index=False)