In [None]:
# Import all required libraries for synthetic dataset generation using RAGAS

# Core libraries for version checking
import langchain
import ragas

# Environment and configuration
from dotenv import load_dotenv

# Document processing libraries
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# RAGAS wrappers for LLM and embeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator

# OpenAI integration
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Report the installed library versions so generated results can be tied to a
# known environment.
version_lines = [
    f"LangChain Version: {langchain.__version__}",
    f"Ragas Version: {ragas.__version__}",
]
print("\n".join(version_lines))

In [None]:
# Load API keys from a local .env file into the process environment so that no
# secret has to be hardcoded in this notebook.
# NOTE(review): presumably this supplies the OpenAI key used by ChatOpenAI /
# OpenAIEmbeddings further down — confirm the variable name your .env defines.
# The call is left as the last expression, so its return value is shown as the
# cell output.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


LangChain Version: 0.3.26
Ragas Version: 0.3.0


In [None]:
# Set the file path for document processing.
# NOTE: "Your file path" is a placeholder — replace it with the path to the PDF
# you want to process before running the following cells; the value is passed
# directly to PyMuPDFLoader.
FILE_PATH = "Your file path"

True

In [None]:
# Document preprocessing pipeline
# Step 1: Load the PDF a single time with PyMuPDFLoader.
loader = PyMuPDFLoader(FILE_PATH)
raw_docs = loader.load()

# Step 2: Split the already-loaded documents into overlapping chunks.
# split_documents() works on the in-memory documents; calling
# loader.load_and_split() here instead would read and parse the PDF a second
# time (and that loader method is marked as deprecated in LangChain).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(raw_docs)
print(f"Number of document chunks: {len(docs)}")

In [None]:
# Inspect the metadata attached to the first chunk to see which fields the
# loader/splitter produced.
first_chunk = docs[0]
print("Document metadata:", first_chunk.metadata)

100

In [None]:
# Configure the models used for synthetic dataset generation.

# Generator LLM: the OpenAI "gpt-4o-mini" chat model (a small model, chosen for
# cost efficiency), wrapped so RAGAS can drive it.
chat_model = ChatOpenAI(model="gpt-4o-mini")
generator_llm = LangchainLLMWrapper(chat_model)

# Embeddings: OpenAI "text-embedding-3-large", wrapped for RAGAS.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
generator_embeddings = LangchainEmbeddingsWrapper(embedding_model)

## Generate Synthetic Data

In [None]:
# Build the RAGAS test set generator from the wrapped LLM and embeddings.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

# Number of synthetic samples to generate.
TESTSET_SIZE = 10

# Generate the synthetic dataset (question-answer pairs and evaluation
# scenarios) from the chunked LangChain documents.
dataset = generator.generate_with_langchain_docs(docs, testset_size=TESTSET_SIZE)

In [None]:
# Turn the generated test set into a pandas DataFrame for easier analysis,
# then report success together with the frame's (rows, columns) shape.
df = dataset.to_pandas()
print(
    "Synthetic dataset generated successfully!",
    f"Dataset shape: {df.shape}",
    sep="\n",
)

Applying SummaryExtractor:   0%|          | 0/84 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/100 [00:00<?, ?it/s]        Node c9590b5a-6b4d-4573-a7fd-10b337804b99 does not have a summary. Skipping filtering.
Node f97715ec-b514-4dea-ae33-8851e12d964e does not have a summary. Skipping filtering.
Applying CustomNodeFilter:   3%|▎         | 3/100 [00:00<00:18,  5.28it/s]Node 3511447e-750e-4036-aec3-7a6a5243272b does not have a summary. Skipping filtering.
Node af3c2804-debf-40b5-a590-54c08807c8d9 does not have a summary. Skipping filtering.
Applying CustomNodeFilter:  12%|█▏        | 12/100 [00:00<00:04, 21.92it/s]Node 48acd8b6-bd9b-498f-b055-c96e1e814ef8 does not have a summary. Skipping filtering.
Applying CustomNodeFilter:  22%|██▏       | 22/100 [00:01<00:03, 23.78it/s]Node 71852507-0f34-4e75-a808-624463835db3 does not have a summary. Skipping filtering.
Applying CustomNodeFilter:  42%|████▏     | 42/100 [00:02<00:02, 21.66it/s]Node 0da916dd-7f02-4db1-89e3-c0062936279e does not have a summary. Skipping filtering.
Node 1a616fcb-b7a9

In [None]:
# Display the generated synthetic dataset.
# Left as a bare last expression so the notebook renders the DataFrame with its
# rich (table) representation rather than plain printed text.
df

In [None]:
# Persist the synthetic dataset as CSV (without the index column) so it can be
# reused without regenerating it.
csv_path = "./ragas_synthetic_dataset.csv"
df.to_csv(csv_path, index=False)
print("Dataset saved to 'ragas_synthetic_dataset.csv'")