# Evaluation using RAGAS 
- https://docs.ragas.io/en/stable/howtos/integrations/langchain/#evaluate

In [None]:
# Import all necessary libraries
import os
import ast
import pandas as pd
import langchain
import ragas

# Environment and configuration
from dotenv import load_dotenv

# Hugging Face datasets
from datasets import Dataset

# LangChain core components
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser

# RAGAS evaluation
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    ContextPrecision,
    ContextRecall,
    ContextRelevance,
    ContextEntityRecall,
    NoiseSensitivity,
    ResponseRelevancy,
    Faithfulness,
    ResponseGroundedness,
)

print(f"LangChain Version: {langchain.__version__}")
print(f"Ragas Version: {ragas.__version__}")

In [None]:
# Load API KEY information
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


LangChain Version: 0.3.26
Ragas Version: 0.3.0


In [None]:
# Load synthetic dataset
df = pd.read_csv("./ragas_synthetic_dataset.csv")

True

### Load Sythetic Dataset

In [None]:
import pandas as pd

df = pd.read_csv("./ragas_synthetic_dataset.csv")

In [None]:
# datasets is a library developed by Hugging Face that provides tools for easily loading and processing datasets for machine learning.
from datasets import Dataset


test_dataset = Dataset.from_pandas(df)
test_dataset

Dataset({
    features: ['user_input', 'reference_contexts', 'reference', 'synthesizer_name'],
    num_rows: 12
})

In [None]:
# Convert strings in context column into List
import ast


def convert_to_list(example):
    reference_contexts = ast.literal_eval(example["reference_contexts"])
    return {"reference_contexts": reference_contexts}


test_dataset = test_dataset.map(convert_to_list)
print(test_dataset)

Map: 100%|██████████| 12/12 [00:00<00:00, 1499.97 examples/s]

Dataset({
    features: ['user_input', 'reference_contexts', 'reference', 'synthesizer_name'],
    num_rows: 12
})





In [None]:
test_dataset[1]["reference_contexts"]

['D.gov 이슈분석 / 2024-06호 /\n2  ❘ \n「D.gov 이슈분석」은 정부의 디지털 전환을 위한 다양한 이슈 분석과 향후 정책 \n방향을 모색하기 위해 한국지능정보사회진흥원에서 기획․발간하는 보고서입니다.\n한국지능정보사회진흥원의 사전 승인 없이 본 보고서의 무단전재나 복제를 금하며, \n가공·인용할 때는 반드시 출처를 명시하여 주시기 바랍니다.\n본 보고서의 내용은 한국지능정보사회진흥원의 공식 견해와 다를 수 있으며, 문의 \n및 제안은 아래 연락처로 해 주시기 바랍니다.\n■ 발행처: 한국지능정보사회진흥원\n■ 발행인: 황종성\n■ 작성자: 한국지능정보사회진흥원 디지털플랫폼정부본부 정책기획팀\n          - 송지향 책임(jhsong@nia.or.kr)\n          - 박슬기 선임(psk64@nia.or.kr) \n■ 보고서 온라인 서비스: www.nia.or.kr']

## Simple RAG

In [None]:
FILE_PATH = r"C:\Users\jongb\dev\jb_langchain\assets\250620_해외_디지털정부_전문조직_심층분석_및_시사점.pdf"

In [None]:
### Pre-processing
# Step 1: Load Document
loader = PyMuPDFLoader(FILE_PATH)
docs = loader.load()

# Step 2: Split Documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
split_documents = text_splitter.split_documents(docs)

# Step 3: Embedding
embeddings = OpenAIEmbeddings()

# Step 4: Create Vector DB and Save Embedded Chunks
vectorstore = FAISS.from_documents(documents=split_documents, embedding=embeddings)


### RAG Run Time

# Step 5: Create Retriever
retriever = vectorstore.as_retriever()

# Step 6: Create Prompt Template
prompt = PromptTemplate.from_template(
    """
    You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.

    Answer in KOREAN.

    # CONTEXT 
    {context}

    # QUESTION
    {question}

    # ANSWER: 
    
    """
)

# Step 7: Define LLM
llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=0)

# Step 8: Build Chain
chain = prompt | llm | StrOutputParser()

In [None]:
def format_docs(relevant_docs):
    return "\n".join(doc.page_content for doc in relevant_docs)

### Create Batch dataset 

In [None]:
# Create batch dataset including retrieved context for each question
batch_dataset = []
retrieved_contexts = []
for question in test_dataset["user_input"]:
    # Search context for each question
    contexts = retriever.invoke(question)

    retrieved_docs = [doc.page_content for doc in contexts]
    retrieved_contexts.append(retrieved_docs)
    # Store question and retrieved context together
    batch_dataset.append({"question": question, "context": format_docs(contexts)})

batch_dataset[:3]

['Can you provide an overview of the digital government specialized organizations in the 영국 as mentioned in the 2024-6호 report?',
 'Who is 박슬기 in the context of the D.gov 이슈분석 report?',
 '영국 GDS는 어떤 역할을 하나요?']

### Generate answer by calling batch()

In [None]:
# Method 1: Generate answers using existing chain
answer = chain.batch(batch_dataset)

### For RAGAS Evaluation, the Evaluation dataset must contain the following 4 columns:
 -"user_input"
 -"retrieved_contexts"
 -"response"
 -"reference"

In [None]:
# Add response column
if "response" in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns(["response"]).add_column(
        "response", answer
    )


else:

    test_dataset = test_dataset.add_column("response", answer)

In [None]:
# Next, add retrieved_contexts column
if "retrieved_contexts" in test_dataset.column_names:
    test_dataset = test_dataset.remove_columns(["retrieved_contexts"]).add_column(
        "retrieved_contexts", retrieved_contexts
    )
else:
    test_dataset = test_dataset.add_column("retrieved_contexts", retrieved_contexts)

In [85]:
test_dataset

Dataset({
    features: ['user_input', 'reference_contexts', 'reference', 'synthesizer_name', 'response', 'retrieved_contexts'],
    num_rows: 12
})

## RAG Evaluation 

In [None]:
llm = ChatOpenAI(model="gpt-4.1-mini")
evaluator_llm = LangchainLLMWrapper(llm)

Evaluating_RAG_metrics = [
    ContextPrecision(llm=evaluator_llm),
    ContextRecall(llm=evaluator_llm),
    ContextRelevance(llm=evaluator_llm),
    ContextEntityRecall(llm=evaluator_llm),
    NoiseSensitivity(llm=evaluator_llm),
    ResponseRelevancy(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    ResponseGroundedness(llm=evaluator_llm),
]

In [123]:
eval_rag_result = evaluate(
    dataset=test_dataset,
    metrics=Evaluating_RAG_metrics,
    # llm=evaluator_llm,
)
eval_rag_result

Evaluating:  97%|█████████▋| 93/96 [03:03<00:33, 11.14s/it]Exception raised in Job[28]: TimeoutError()
Evaluating:  99%|█████████▉| 95/96 [03:31<00:13, 13.67s/it]Exception raised in Job[68]: TimeoutError()
Evaluating: 100%|██████████| 96/96 [03:33<00:00,  2.22s/it]


{'context_precision': 0.7384, 'context_recall': 0.6895, 'nv_context_relevance': 1.0000, 'context_entity_recall': 0.2805, 'noise_sensitivity(mode=relevant)': 0.4049, 'answer_relevancy': 0.8976, 'faithfulness': 0.8075, 'nv_response_groundedness': 0.8958}

In [None]:
eval_rag_df = eval_rag_result.to_pandas()
eval_rag_df.head()