In [1]:
!pip install ragas

Collecting ragas
  Downloading ragas-0.2.14-py3-none-any.whl.metadata (8.5 kB)
Collecting datasets (from ragas)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting filelock (from datasets->ragas)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->ragas)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->ragas)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->ragas)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets->ragas)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata

In [35]:
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
import os
api_version = "2024-02-15-preview"  # Update to the correct version
azure_embedding_deployment_name = "text-embedding-ada-002"
azure_llm_deployment_name = "gpt-4o-mini"
openai_api_key = os.getenv("OPENAI_API_KEY")

In [36]:
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-ada-002"))

# Factual Correctness

In [7]:
import pandas as pd
# Load the Excel file
df = pd.read_excel('processed_final_test.xlsx')

# Ensure the necessary columns are present
if 'Parsed_LLM_Output' in df.columns and 'Gold_Input' in df.columns:
    # Create the dataset
    dataset = [
        {
            "response": row['Parsed_LLM_Output'],
            "reference": row['Gold_Input']
        }
        for _, row in df.iterrows()
    ]
else:
    raise KeyError("The required columns 'Parsed_LLM_Output' and 'Gold_Input' are not present in the Excel file.")

# Display the dataset
for entry in dataset:
    print(entry)

{'response': 'User is Responsible For ISO Processes: Risk management process, Validation process\nUser is Supporting: Stakeholder needs and requirements definition process, System requirements definition process\nUser is Designing: ', 'reference': 'User is Responsible For ISO Processes: verification process, Project planning process, risk management process, Stakeholder needs and requirements definition process\nUser is Supporting: risk management process, life cycle model management process, infrastructure management process, stakeholder needs and requirements definition process, quality assurance process\nUser is Designing: System architecture definition process, Stakeholder needs and requirements definition process, System requirements definition process, Risk management process, Verification process, Validation process'}
{'response': 'User is Responsible For ISO Processes: Stakeholder needs and requirements definition process\nUser is Supporting: Quality assurance process, Validati

In [8]:
from ragas import EvaluationDataset
from ragas import evaluate

In [21]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics._factual_correctness import FactualCorrectness

In [10]:
evaluation_dataset = EvaluationDataset.from_list(dataset)

In [39]:
from ragas.metrics import FactualCorrectness

# Initialize the metric with the LLM
factual_correctness_metric = FactualCorrectness(llm=evaluator_llm)


In [40]:
import pandas as pd
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics._factual_correctness import FactualCorrectness

In [65]:
factual_correctness_metric = FactualCorrectness(llm=evaluator_llm)


In [42]:
# Load the Excel file
df = pd.read_excel('processed_final_test.xlsx')

# Ensure the necessary columns are present
if 'Parsed_LLM_Output' not in df.columns or 'Gold_Input' not in df.columns:
    raise KeyError("The required columns 'Parsed_LLM_Output' and 'Gold_Input' are not present in the Excel file.")

# Create SingleTurnSample instances
samples = [
    SingleTurnSample(
        response=row['Parsed_LLM_Output'],
        reference=row['Gold_Input']
    )
    for _, row in df.iterrows()
]


In [43]:
samples

[SingleTurnSample(user_input=None, retrieved_contexts=None, reference_contexts=None, response='User is Responsible For ISO Processes: Risk management process, Validation process\nUser is Supporting: Stakeholder needs and requirements definition process, System requirements definition process\nUser is Designing: ', multi_responses=None, reference='User is Responsible For ISO Processes: verification process, Project planning process, risk management process, Stakeholder needs and requirements definition process\nUser is Supporting: risk management process, life cycle model management process, infrastructure management process, stakeholder needs and requirements definition process, quality assurance process\nUser is Designing: System architecture definition process, Stakeholder needs and requirements definition process, System requirements definition process, Risk management process, Verification process, Validation process', rubrics=None),
 SingleTurnSample(user_input=None, retrieved_con

In [70]:
import asyncio

# Function to evaluate a single sample
async def evaluate_sample(sample):
    score = await factual_correctness_metric.single_turn_ascore(sample)
    return score

# Evaluate all samples asynchronously
async def evaluate_all_samples(samples):
    tasks = [evaluate_sample(sample) for sample in samples]
    return await asyncio.gather(*tasks)

# Run the evaluation
scores = asyncio.run(evaluate_all_samples(samples))


In [45]:
scores

[0.21,
 0.0,
 0.31,
 0.4,
 0.24,
 0.29,
 0.0,
 0.17,
 0.57,
 0.4,
 0.42,
 0.46,
 0.0,
 0.4,
 0.24,
 0.5,
 0.27,
 0.29]

In [46]:
def average(lst):
    return sum(lst) / len(lst) if lst else 0  # Avoid division by zero

In [67]:
avg_f1_score_1=average(scores)

In [69]:
avg_f1_score_2=average(scores)

In [71]:
avg_f1_score_3=average(scores)

In [72]:
average([avg_f1_score_1,avg_f1_score_2,avg_f1_score_3])

0.3194444444444445

# Precision

In [73]:
factual_correctness_metric = FactualCorrectness(llm=evaluator_llm, mode="precision")

In [78]:
import asyncio

# Function to evaluate a single sample
async def evaluate_sample(sample):
    score = await factual_correctness_metric.single_turn_ascore(sample)
    return score

# Evaluate all samples asynchronously
async def evaluate_all_samples(samples):
    tasks = [evaluate_sample(sample) for sample in samples]
    return await asyncio.gather(*tasks)

# Run the evaluation
scores = asyncio.run(evaluate_all_samples(samples))


In [75]:
avg_precision_1=average(scores)

In [77]:
avg_precision_2=average(scores)

In [79]:
avg_precision_3=average(scores)

In [80]:
average([avg_precision_1,avg_f1_score_2,avg_precision_3])

0.40814814814814815

# Recall

In [81]:
factual_correctness_metric = FactualCorrectness(llm=evaluator_llm, mode="recall")

In [82]:
import asyncio

# Function to evaluate a single sample
async def evaluate_sample(sample):
    score = await factual_correctness_metric.single_turn_ascore(sample)
    return score

# Evaluate all samples asynchronously
async def evaluate_all_samples(samples):
    tasks = [evaluate_sample(sample) for sample in samples]
    return await asyncio.gather(*tasks)

# Run the evaluation
scores1 = asyncio.run(evaluate_all_samples(samples))
scores2 = asyncio.run(evaluate_all_samples(samples))
scores3 = asyncio.run(evaluate_all_samples(samples))


In [83]:
avg_recall_1=average(scores1)
avg_recall_2=average(scores2)
avg_recall_3=average(scores3)

In [84]:
average([avg_recall_1,avg_recall_2,avg_recall_3])

0.2662962962962963