In [1]:
import os
import getpass

# Get API keys

In [2]:
# Prompt interactively so the key never appears in the notebook source.
os.environ["OPENAI_API_KEY"] = getpass.getpass(
    "Enter your OpenAI API key: "
)

In [3]:
# Same hidden prompt for the Cohere key; it is exported to the process environment.
os.environ["COHERE_API_KEY"] = getpass.getpass(
    "Cohere API Key:"
)

# Load Data

In [4]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load the files under `path` that match *.txt, reading each one with TextLoader.
path = "./../data/"
loader = DirectoryLoader(
    path,
    glob="*.txt",
    loader_cls=TextLoader,
)
docs = loader.load()

# Unrolled version of SDG for more flexibility with testset distribution

In [7]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper

# Wrap the LangChain embeddings and chat model in the ragas adapter classes;
# both wrappers are reused by the transform and generation cells below.
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-nano"))

In [6]:
from ragas.testset.graph import KnowledgeGraph

# Start from an empty graph; the repr below confirms it has no nodes or relationships yet.
kg = KnowledgeGraph()
kg

KnowledgeGraph(nodes: 0, relationships: 0)

In [12]:
from ragas.testset.graph import KnowledgeGraph, Node, NodeType

### NOTICE: We're using a subset of the data for this example - this is to keep costs/time down.
# Rebuild the graph from scratch instead of appending to the existing one, so
# re-running this cell cannot add the same documents a second time.
kg = KnowledgeGraph(
    nodes=[
        Node(
            type=NodeType.DOCUMENT,
            properties={
                "page_content": doc.page_content,
                "document_metadata": doc.metadata,
            },
        )
        for doc in docs[:15]
    ]
)
kg

KnowledgeGraph(nodes: 15, relationships: 0)

In [None]:
from ragas.testset.transforms import default_transforms, apply_transforms

transformer_llm = generator_llm
embedding_model = generator_embeddings

# Keep the built pipeline under its own name so `default_transforms` continues to
# refer to the imported factory rather than to the object it returned.
# The pipeline is sized from the same 15-document subset that populates `kg`,
# not from the full corpus, so its choice of transforms reflects what is in the graph.
kg_transforms = default_transforms(
    documents=docs[:15],
    llm=transformer_llm,
    embedding_model=embedding_model,
)
apply_transforms(kg, kg_transforms)
kg

In [None]:
# Write the transformed graph to product_data_kg.json, then load it back from
# that file; the generator below is built from the reloaded copy.
kg.save("product_data_kg.json")
product_data_kg = KnowledgeGraph.load("product_data_kg.json")
product_data_kg

KnowledgeGraph(nodes: 33, relationships: 360)

In [11]:
from ragas.testset import TestsetGenerator

# Build the generator on the reloaded graph, reusing the LLM and embedding wrappers.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=embedding_model,
    knowledge_graph=product_data_kg,
)

In [12]:
from ragas.testset.synthesizers import (
    default_query_distribution,
    SingleHopSpecificQuerySynthesizer,
    MultiHopAbstractQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
)

# All of the weight goes to the single-hop synthesizer.
query_distribution = [
    # not enough clusters for multi-hop
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1),
]

In [13]:
# Request 25 samples using the distribution defined above, then show them as a table.
testset = generator.generate(
    testset_size=25,
    query_distribution=query_distribution,
)
testset.to_pandas()

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/25 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,In which countries is the Nacho Cheese Flavore...,[Product Information Summary UPC: 028400070560...,The Nacho Cheese Flavored Tortilla Chips produ...,single_hop_specifc_query_synthesizer
1,In which countries is the Nacho Cheese Flavore...,[Product Information Summary UPC: 028400070560...,The Nacho Cheese Flavored Tortilla Chips produ...,single_hop_specifc_query_synthesizer
2,Is the product available in France?,[Product Information Summary UPC: 028400070560...,"Yes, the product is available in France, as in...",single_hop_specifc_query_synthesizer
3,Is Frito Lay the company that makes the Nacho ...,[Product Information Summary UPC: 028400070560...,"Yes, Frito Lay is the manufacturer of the Nach...",single_hop_specifc_query_synthesizer
4,Is Frito Lay the company that makes the Nacho ...,[Product Information Summary UPC: 028400070560...,"Yes, Nacho Cheese Flavored Tortilla Chips is a...",single_hop_specifc_query_synthesizer
5,What is OpenFoodFacts and how can it assist he...,[CONSUMER GUIDANCE This product summary is int...,OpenFoodFacts is a collaborative database that...,single_hop_specifc_query_synthesizer
6,What is OpenFoodFacts and how can it help me m...,[CONSUMER GUIDANCE This product summary is int...,OpenFoodFacts is a collaborative database that...,single_hop_specifc_query_synthesizer
7,What should I check for regarding the product ...,[CONSUMER GUIDANCE This product summary is int...,Consumers should refer to the actual product p...,single_hop_specifc_query_synthesizer
8,Can you tell me about the OpenFoodFacts databa...,[CONSUMER GUIDANCE This product summary is int...,This product summary is intended to help consu...,single_hop_specifc_query_synthesizer
9,What is OpenFoodFacts and how can I use it to ...,[CONSUMER GUIDANCE This product summary is int...,OpenFoodFacts is a collaborative database that...,single_hop_specifc_query_synthesizer


# Create Testsets for RAG Variants

In [15]:
import copy

# Every RAG variant writes its own responses and retrieved contexts into the
# samples, so each one gets an independent deep copy of the generated testset.
(
    ensemble_testset,
    naive_testset,
    bm25_testset,
    compression_testset,
) = (copy.deepcopy(testset) for _ in range(4))

# Build Naive Graph

In [None]:
import sys

# Add the src directory to Python path
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
# Drop any entry left by a previous run before inserting, so re-running this
# cell keeps exactly one copy of the path and it stays at the front.
if src_path in sys.path:
    sys.path.remove(src_path)
sys.path.insert(0, src_path)

# Now you can import from utils
from utils.rag_graph import build_rag_graph

In [26]:
# Builds the naive RAG graph. This relies on `build_rag_graph` being the builder
# imported from utils.rag_graph; if another builder has since been bound to that
# name, re-run the import cell above before re-running this one.
graph = build_rag_graph()

# Evaluate Naive Graph

In [18]:
# Run every generated question through the naive graph and record, on the
# sample itself, the answer and the text of each retrieved context.
for row in naive_testset:
    sample = row.eval_sample
    rag_output = graph.invoke({"question": sample.user_input})
    sample.response = rag_output["response"]
    sample.retrieved_contexts = [
        retrieved.page_content for retrieved in rag_output["context"]
    ]

In [19]:
from ragas import EvaluationDataset

# Convert the populated naive testset into an evaluation dataset via pandas.
evaluation_dataset = EvaluationDataset.from_pandas(
    naive_testset.to_pandas()
)

In [20]:
from ragas.llms import LangchainLLMWrapper

# The judge uses gpt-4.1-mini, a different model from the gpt-4.1-nano generator.
evaluator_llm = LangchainLLMWrapper(
    ChatOpenAI(model="gpt-4.1-mini")
)

In [22]:
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from ragas import evaluate, RunConfig

# Keep the long per-call timeout and additionally cap concurrency. Evaluation
# runs that share this config logged 429 tokens-per-minute errors; fewer
# parallel workers keeps the request rate under that limit.
custom_run_config = RunConfig(timeout=360, max_workers=4)

# Score the naive pipeline with the full metric suite, judged by evaluator_llm.
naive_result = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=custom_run_config,
)

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Exception raised in Job[107]: InternalServerError(upstream connect error or disconnect/reset before headers. reset reason: connection termination)


In [23]:
# Display the aggregate metric scores for the naive pipeline.
naive_result

{'context_recall': 0.7993, 'faithfulness': 0.7426, 'factual_correctness': 0.6244, 'answer_relevancy': 0.8102, 'context_entity_recall': 0.5970, 'noise_sensitivity_relevant': 0.1215}

# Build BM25 + Naive Ensemble Graph

In [24]:
# Import under a distinct alias so the shared name `build_rag_graph` is not
# rebound to this variant's builder; otherwise re-running an earlier cell that
# calls `build_rag_graph()` would silently build the ensemble graph instead.
from utils.ensemble_rag_graph import build_rag_graph as build_ensemble_rag_graph

ensemble_graph = build_ensemble_rag_graph()

# Evaluate BM25 + Naive Ensemble Graph

In [25]:
# Run every generated question through the ensemble graph and record, on the
# sample itself, the answer and the text of each retrieved context.
for row in ensemble_testset:
    sample = row.eval_sample
    rag_output = ensemble_graph.invoke({"question": sample.user_input})
    sample.response = rag_output["response"]
    sample.retrieved_contexts = [
        retrieved.page_content for retrieved in rag_output["context"]
    ]

In [26]:

# Convert the populated ensemble testset into an evaluation dataset via pandas.
ensemble_evaluation_dataset = EvaluationDataset.from_pandas(
    ensemble_testset.to_pandas()
)

In [27]:
# Score the ensemble pipeline with the six-metric suite, judged by evaluator_llm
# under the shared run configuration.
ensemble_result = evaluate(
    dataset=ensemble_evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=custom_run_config,
)

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Exception raised in Job[49]: APIConnectionError(Connection error.)
Exception raised in Job[64]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-mini in organization org-7Afz73t5u8875SgyGqpWnq4o on tokens per min (TPM): Limit 200000, Used 197454, Requested 5249. Please try again in 810ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[79]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-mini in organization org-7Afz73t5u8875SgyGqpWnq4o on tokens per min (TPM): Limit 200000, Used 198380, Requested 6416. Please try again in 1.438s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[113]: InternalServerError(upstream connect error or disconnect/reset before headers. reset reason: connection te

In [28]:
# Display the aggregate metric scores for the BM25 + naive ensemble pipeline.
ensemble_result

{'context_recall': 0.8733, 'faithfulness': 0.8921, 'factual_correctness': 0.6136, 'answer_relevancy': 0.8749, 'context_entity_recall': 0.4100, 'noise_sensitivity_relevant': 0.2213}

# Build BM25 Graph

In [29]:
# Import under a distinct alias so the shared name `build_rag_graph` is not
# rebound to this variant's builder; otherwise re-running an earlier cell that
# calls `build_rag_graph()` would silently build the BM25 graph instead.
from utils.bm25_rag_graph import build_rag_graph as build_bm25_rag_graph

bm25_graph = build_bm25_rag_graph()

# Evaluate BM25 Graph

In [30]:
# Run every generated question through the BM25 graph and record, on the
# sample itself, the answer and the text of each retrieved context.
for row in bm25_testset:
    sample = row.eval_sample
    rag_output = bm25_graph.invoke({"question": sample.user_input})
    sample.response = rag_output["response"]
    sample.retrieved_contexts = [
        retrieved.page_content for retrieved in rag_output["context"]
    ]

In [31]:

# Convert the populated BM25 testset into an evaluation dataset via pandas.
bm25_evaluation_dataset = EvaluationDataset.from_pandas(
    bm25_testset.to_pandas()
)

In [32]:
# Score the BM25 pipeline with the six-metric suite, judged by evaluator_llm
# under the shared run configuration.
bm25_result = evaluate(
    dataset=bm25_evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=custom_run_config,
)

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Exception raised in Job[62]: AttributeError('StringIO' object has no attribute 'statements')
Exception raised in Job[97]: APIConnectionError(Connection error.)
Exception raised in Job[84]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-mini in organization org-7Afz73t5u8875SgyGqpWnq4o on tokens per min (TPM): Limit 200000, Used 197426, Requested 4231. Please try again in 497ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[79]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4.1-mini in organization org-7Afz73t5u8875SgyGqpWnq4o on tokens per min (TPM): Limit 200000, Used 200000, Requested 5429. Please try again in 1.628s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})


In [33]:
# Display the aggregate metric scores for the BM25 pipeline.
bm25_result

{'context_recall': 0.7222, 'faithfulness': 0.8584, 'factual_correctness': 0.4504, 'answer_relevancy': 0.8353, 'context_entity_recall': 0.3336, 'noise_sensitivity_relevant': 0.3783}

# Build Contextual Compression Graph

In [34]:
# Import under a distinct alias so the shared name `build_rag_graph` is not
# rebound to this variant's builder; otherwise re-running an earlier cell that
# calls `build_rag_graph()` would silently build the compression graph instead.
from utils.compression_rag_graph import build_rag_graph as build_compression_rag_graph

compression_graph = build_compression_rag_graph()

# Evaluate Contextual Compression Graph

In [35]:
# Run every generated question through the contextual-compression graph and
# record, on the sample itself, the answer and the text of each retrieved context.
for row in compression_testset:
    sample = row.eval_sample
    rag_output = compression_graph.invoke({"question": sample.user_input})
    sample.response = rag_output["response"]
    sample.retrieved_contexts = [
        retrieved.page_content for retrieved in rag_output["context"]
    ]

In [36]:

# Convert the populated compression testset into an evaluation dataset via pandas.
compression_evaluation_dataset = EvaluationDataset.from_pandas(
    compression_testset.to_pandas()
)

In [37]:
# Score the contextual-compression pipeline with the six-metric suite, judged by
# evaluator_llm under the shared run configuration.
compression_result = evaluate(
    dataset=compression_evaluation_dataset,
    metrics=[
        LLMContextRecall(),
        Faithfulness(),
        FactualCorrectness(),
        ResponseRelevancy(),
        ContextEntityRecall(),
        NoiseSensitivity(),
    ],
    llm=evaluator_llm,
    run_config=custom_run_config,
)

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

In [None]:
# Display the aggregate metric scores for the contextual-compression pipeline.
compression_result

{'context_recall': 0.7527, 'faithfulness': 0.8238, 'factual_correctness': 0.6252, 'answer_relevancy': 0.8091, 'context_entity_recall': 0.6294, 'noise_sensitivity_relevant': 0.0762}

# RAG Graph Comparison

In [51]:
# Print each variant's aggregate scores, with one blank line between variants.
print("Naive Result: ", naive_result, end="\n\n")
print("BM25 Result: ", bm25_result, end="\n\n")
print("Contextual Compression Result: ", compression_result, end="\n\n")
print("Ensemble Result: ", ensemble_result)

Naive Result:  {'context_recall': 0.7993, 'faithfulness': 0.7426, 'factual_correctness': 0.6244, 'answer_relevancy': 0.8102, 'context_entity_recall': 0.5970, 'noise_sensitivity_relevant': 0.1215}

BM25 Result:  {'context_recall': 0.7222, 'faithfulness': 0.8584, 'factual_correctness': 0.4504, 'answer_relevancy': 0.8353, 'context_entity_recall': 0.3336, 'noise_sensitivity_relevant': 0.3783}

Contextual Compression Result:  {'context_recall': 0.7527, 'faithfulness': 0.8238, 'factual_correctness': 0.6252, 'answer_relevancy': 0.8091, 'context_entity_recall': 0.6294, 'noise_sensitivity_relevant': 0.0762}

Ensemble Result:  {'context_recall': 0.8733, 'faithfulness': 0.8921, 'factual_correctness': 0.6136, 'answer_relevancy': 0.8749, 'context_entity_recall': 0.4100, 'noise_sensitivity_relevant': 0.2213}


In [52]:
def _mean_metric_scores(result):
    """Return ``{metric_name: mean_score}`` for one evaluation result.

    The per-sample table from ``result.to_pandas()`` is averaged column-wise over
    its numeric columns (missing scores are skipped) and rounded to 4 decimals.

    NOTE(review): assumes the metric scores are the only numeric columns in that
    table — confirm against the evaluation dataset schema.
    """
    return result.to_pandas().mean(numeric_only=True).round(4).to_dict()


# Derive the summaries from the live result objects rather than pasting numbers
# from an earlier run, so the comparison cannot go stale after a re-evaluation.
naive_result_dict = _mean_metric_scores(naive_result)
bm25_result_dict = _mean_metric_scores(bm25_result)
compression_result_dict = _mean_metric_scores(compression_result)
ensemble_result_dict = _mean_metric_scores(ensemble_result)

In [54]:
import pandas as pd

# One row per RAG variant, one column per metric; row labels are attached after
# the frame is built from the four score dictionaries.
comparison = pd.DataFrame(
    [
        naive_result_dict,
        bm25_result_dict,
        compression_result_dict,
        ensemble_result_dict,
    ]
)
comparison.index = [
    "Naive",
    "BM25",
    "Contextual Compression",
    "BM25 + Naive Ensemble",
]
comparison


Unnamed: 0,context_recall,faithfulness,factual_correctness,answer_relevancy,context_entity_recall,noise_sensitivity_relevant
Naive,0.7993,0.7426,0.6244,0.8102,0.597,0.1215
BM25,0.7222,0.8584,0.4504,0.8353,0.3336,0.3783
Contextual Compression,0.7527,0.8238,0.6252,0.8091,0.6294,0.0762
BM25 + Naive Ensemble,0.8733,0.8921,0.6136,0.8749,0.41,0.2213
