# RAG Pipeline Evaluation with RAGChecker


In [1]:
from pathlib import Path
from dotenv import load_dotenv
import sys, pandas as pd

# The notebook is assumed to live under tests/, so the project root is the
# parent of the current working directory.
ROOT = Path.cwd().parent
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from services.orchestrator import router_node
from ragas import SingleTurnSample

from config import OPENAI_API_KEY

# Load variables from the project's .env file, then report the resolved root.
# NOTE(review): `config` is imported before this call; if it reads environment
# variables at import time it will not see values coming from .env — confirm.
dotenv_file = ROOT / ".env"
load_dotenv(dotenv_file)
print(f"ROOT - {ROOT}")
print("환경 변수 로드 완료")

2025-08-04 21:36:38,163 | INFO | Loading faiss with AVX2 support.
2025-08-04 21:36:38,323 | INFO | Successfully loaded faiss with AVX2 support.
2025-08-04 21:36:38,335 | INFO | Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.


ROOT - c:\Users\insung\Finance_Agent
환경 변수 로드 완료


In [None]:
import litellm
from ragchecker import RAGChecker
from kiwipiepy import Kiwi

# Module-level Kiwi analyzer, created once and used by KiwiTokenizer.
kiwi = Kiwi()

def openai_api_function(prompts: list[str], model: str = "gpt-4o") -> list[str]:
    """Send each prompt to a chat model through litellm and return the reply texts.

    Every prompt is submitted as its own single-turn conversation in one
    ``litellm.batch_completion`` call. A request that fails is reported by
    litellm as an exception object sitting in the returned list, so each
    item is inspected individually: one failed request yields an empty
    string for that prompt only instead of discarding every answer.

    Args:
        prompts: User prompts, one conversation per entry.
        model: litellm model identifier. Defaults to ``"gpt-4o"``.

    Returns:
        One string per returned response, in order. A failed request, a
        malformed response, or a response without text content maps to ``""``.
    """
    if not prompts:
        # Nothing to send; avoid a pointless API round trip.
        return []

    try:
        responses = litellm.batch_completion(
            model=model,
            messages=[[{"role": "user", "content": p}] for p in prompts],
        )
    except Exception as e:
        # The whole batch failed; keep the evaluation going with empty answers.
        print(f"API 호출 중 에러 발생: {e}")
        return ["" for _ in prompts]

    texts = []
    for res in responses:
        if isinstance(res, Exception):
            # Per-request failure returned in place of a response object.
            print(f"API 호출 중 에러 발생: {res}")
            texts.append("")
            continue
        try:
            content = res.choices[0].message.content
        except (AttributeError, IndexError, TypeError) as e:
            print(f"API 호출 중 에러 발생: {e}")
            content = None
        # The content field may be None; callers expect strings.
        texts.append(content or "")
    return texts

class KiwiTokenizer:
    """Adapter exposing the module-level ``kiwi`` analyzer as tokenize/lemmatize.

    Both methods return plain lists of strings taken from the ``form``
    attribute of the tokens produced by ``kiwi.tokenize``.
    """

    def __init__(self):
        """No per-instance state is required."""

    def tokenize(self, text):
        """Return the ``form`` of every token Kiwi produces for ``text``."""
        forms = []
        for morpheme in kiwi.tokenize(text):
            forms.append(morpheme.form)
        return forms

    def lemmatize(self, text):
        """Return the same token forms as ``tokenize``.

        NOTE(review): no lemma normalization is applied here; the output is
        identical to ``tokenize``. Confirm that this is intended.
        """
        return list(morpheme.form for morpheme in kiwi.tokenize(text))

# Build the evaluator with the Kiwi-based tokenizer and the custom LLM function.
# NOTE(review): confirm that the installed RAGChecker actually honors the
# `tokenizer` and `language` keywords rather than silently ignoring them.
checker = RAGChecker(
    custom_llm_api_func=openai_api_function,
    language="ko",
    tokenizer=KiwiTokenizer(),
)

print("rag-checker 초기화 완료")


W0804 21:23:51.085000 25528 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


rag-checker 초기화 완료


In [4]:
# Sample questions sent through the pipeline (English glosses in the comments).
SAMPLE_QUESTIONS = [
    # "Tell me how to clear an OTP password error"
    "OTP 비밀번호 오류 해제 방법 알려줘",
    # "What are the conditions for receiving the preferential rate on the
    # 첫급여 우리적금 savings product?"
    "첫급여 우리적금에서 우대이율을 받기 위한 조건은 무엇인가요?",
    # "Which post-maturity rate applies when a regular installment savings
    # account is closed after maturity?"
    "정기적금을 만기 지난 뒤 해지하면 어떤 만기후이율이 적용되나요?",
    # "How is the weather today?"
    "오늘 날씨 어때?",
]

In [5]:
# Run every sample question through the pipeline and reshape each result
# into the record format used for evaluation.
from services.orchestrator import router_node

records = []
for question in SAMPLE_QUESTIONS:
    result = router_node.invoke(question)
    # A missing or empty context yields no retrieved chunks; otherwise the
    # context string is split on blank lines.
    raw_context = result.get("context")
    contexts = raw_context.split("\n\n") if raw_context else []
    records.append(
        {
            "question": question,
            "answer": result["answer"],
            "contexts": contexts,
        }
    )
print(f"{len(records)}개 레코드 수집 완료")


4개 레코드 수집 완료


## RAGChecker Evaluation

In [6]:
from ragchecker import RAGResult, RAGResults

# Wrap each collected record in a RAGResult, keyed by its position.
# NOTE(review): no ground-truth answer is supplied (gt_answer=None); confirm
# that the metrics computed later can work without one.
rag_result_objects = [
    RAGResult(
        query_id=f"q_{idx}",
        query=rec["question"],
        response=rec["answer"],
        retrieved_context=rec["contexts"],
        gt_answer=None,
    )
    for idx, rec in enumerate(records)
]

rag_results = RAGResults(results=rag_result_objects)

print(rag_results)

RAGResults(
  4 RAG results,
  Metrics:
  {
    "overall_metrics": {},
    "retriever_metrics": {},
    "generator_metrics": {}
  }
)


In [7]:
# Overall metrics (precision / recall / F1)


def _print_metrics(metrics, indent=""):
    """Print a metrics mapping, recursing into nested metric groups.

    Args:
        metrics: Mapping of metric name to either a value or a nested
            mapping of metrics.
        indent: Prefix used for the current nesting level.

    Values that support float formatting are shown with three decimals,
    exactly as before; anything else is printed as-is instead of raising.
    """
    for name, value in metrics.items():
        if isinstance(value, dict):
            # A metric group: print its name, then its members one level deeper.
            print(f"{indent}{name}:")
            _print_metrics(value, indent + "  ")
            continue
        try:
            rendered = f"{value:.3f}"
        except (TypeError, ValueError):
            rendered = str(value)
        print(f"{indent}{name:<10}: {rendered}")


overall = checker.evaluate(rag_results)
print("### Overall ###")
_print_metrics(overall)

[32m2025-08-04 21:25:31.261[0m | [1mINFO    [0m | [36mragchecker.evaluator[0m:[36mextract_claims[0m:[36m113[0m - [1mExtracting claims for response of 4 RAG results.[0m
  0%|          | 0/1 [00:00<?, ?it/s][92m21:25:31 - LiteLLM:INFO[0m: utils.py:3260 - 
LiteLLM completion() model= meta.llama3-70b-instruct-v1:0; provider = bedrock
2025-08-04 21:25:31,279 | INFO | 
LiteLLM completion() model= meta.llama3-70b-instruct-v1:0; provider = bedrock
[92m21:25:31 - LiteLLM:INFO[0m: utils.py:3260 - 
LiteLLM completion() model= meta.llama3-70b-instruct-v1:0; provider = bedrock
2025-08-04 21:25:31,303 | INFO | 
LiteLLM completion() model= meta.llama3-70b-instruct-v1:0; provider = bedrock
[92m21:25:31 - LiteLLM:INFO[0m: utils.py:3260 - 
LiteLLM completion() model= meta.llama3-70b-instruct-v1:0; provider = bedrock
2025-08-04 21:25:31,317 | INFO | 
LiteLLM completion() model= meta.llama3-70b-instruct-v1:0; provider = bedrock
[92m21:25:31 - LiteLLM:INFO[0m: utils.py:3260 - 
LiteLLM c


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.



AttributeError: 'AuthenticationError' object has no attribute 'choices'

In [None]:
# Retriever diagnostics (top-k precision / recall, etc.)
# NOTE(review): confirm that the installed RAGChecker provides
# `diagnose_retriever` and accepts `k_values`.
diag_retr = checker.diagnose_retriever(
    rag_results,
    k_values=[1, 3, 5],
)
print("\n### Retriever Diagnostics ###")
display(diag_retr)

In [None]:
# Generator diagnostics
# NOTE(review): confirm that the installed RAGChecker provides
# `diagnose_generator`.
diag_gen = checker.diagnose_generator(
    rag_results,
)
print("### Generator Diagnostics ###")
display(diag_gen)

In [None]:
# Claim-level entailment
# NOTE(review): confirm that `check_entailment` exists on the installed
# RAGChecker and that its output has an "entailment" column that can be averaged.
entail = checker.check_entailment(rag_results)
df_ent = pd.DataFrame(entail)

print("### Claim-level Entailment ###")
display(df_ent.head())

# Mean of the "entailment" column, rounded to three decimals.
entailment_accuracy = df_ent["entailment"].mean().round(3)
print("\nEntailment accuracy :", entailment_accuracy)
