In [1]:
from dotenv import load_dotenv

# API KEY 정보로드
load_dotenv()
from langchain_teddynote import logging

# 프로젝트 이름을 입력합니다.
logging.langsmith("CH16-Evaluations")

LangSmith 추적을 시작합니다.
[프로젝트명]
CH16-Evaluations


# RAG 성능 테스트를 위한 함수 정의

In [2]:
from myrag import PDFRAG
from langchain_openai import ChatOpenAI

# PDFRAG 객체 생성
rag = PDFRAG(
    "data/SPRI_AI_Brief_2023년12월호_F.pdf",
    ChatOpenAI(model="gpt-4o-mini", temperature=0),
)

# 검색기(retriever) 생성
retriever = rag.create_retriever()

# 체인(chain) 생성
chain = rag.create_chain(retriever)

# 질문에 대한 답변 생성
chain.invoke("삼성전자가 자체 개발한 생성형 AI의 이름은 무엇인가요?")

"삼성전자가 자체 개발한 생성형 AI의 이름은 '삼성 가우스'입니다."

In [3]:
# 질문에 대한 답변하는 함수를 생성
def ask_question(inputs: dict):
    return {"answer": chain.invoke(inputs["question"])}

# 사용자 정의 Evaluator 구성

In [4]:
from langsmith.schemas import Run, Example
import random

def random_score_evaluator(run: Run, example: Example) -> dict:
    # 랜덤 점수 반환
    score = random.randint(1, 11)
    return {"key": "random_score", "score": score}

In [5]:
from langsmith.evaluation import evaluate

# 데이터셋 이름 설정
dataset_name = "RAG_EVAL_DATASET"

# 실행
experiment_results = evaluate(
    ask_question,
    data=dataset_name,
    evaluators=[random_score_evaluator],
    experiment_prefix="CUSTOM-EVAL",
    metadata={
        "variant": "랜덤 점수 평가자",
    }
)

View the evaluation results for experiment: 'CUSTOM-EVAL-2b42f832' at:
https://smith.langchain.com/o/9b141874-d093-4103-946d-7bc247255f98/datasets/899fb1c5-744d-4f35-a48e-68fe78d807f1/compare?selectedSessions=3dc8d2e7-7c33-4bde-8c6f-f307c59bd605




0it [00:00, ?it/s]

# Custom LLM-as-Judge

In [6]:
# Context를 반환하는 RAG 결과 반환 함수
def context_answer_rag_answer(inputs: dict):
    context = retriever.invoke(inputs["question"])
    return {
        "context": "\n".join([doc.page_content for doc in context]),
        "answer": chain.invoke(inputs["question"]),
        "question": inputs["question"],
    }

In [7]:
from langchain import hub

# 평가자 Prompt 가져오기
llm_evaluator_prompt = hub.pull("teddynote/context-answer-evaluator")
llm_evaluator_prompt.pretty_print()


As an LLM evaluator (judge), please assess the LLM's response to the given question. Evaluate the response's accuracy, comprehensiveness, and context precision based on the provided context. After your evaluation, return only the numerical scores in the following format:
Accuracy: [score]
Comprehensiveness: [score]
Context Precision: [score]
Final: [normalized score]
Grading rubric:

Accuracy (0-10 points):
Evaluate how well the answer aligns with the information provided in the given context.

0 points: The answer is completely inaccurate or contradicts the provided context
4 points: The answer partially aligns with the context but contains significant inaccuracies
7 points: The answer mostly aligns with the context but has minor inaccuracies or omissions
10 points: The answer fully aligns with the provided context and is completely accurate


Comprehensiveness (0-10 points):

0 points: The answer is completely inadequate or irrelevant
3 points: The answer is accurate but too brief t

In [8]:
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# 평가자 생성
custom_llm_evaluator = (
    llm_evaluator_prompt
    | ChatOpenAI(temperature=0.0, model="gpt-4o-mini")
    | StrOutputParser()
)

In [9]:
# 답변을 생성합니다.
output = context_answer_rag_answer(
    {"question": "삼성전자가 자체 개발한 생성형 AI의 이름은 무엇인가요?"}
)

# 점수 평가 실행
custom_llm_evaluator.invoke(output)

'1.00'

In [13]:
from langsmith.schemas import Run, Example


def custom_evaluator(run: Run, example: Example) -> dict:
    print(run)
    print(example)
    # LLM 생성 답변, 정답 답변 가져오기
    llm_answer = run.outputs.get("answer", "")
    context = run.outputs.get("context", "")
    question = example.outputs.get("question", "")

    # 랜덤 점수 반환
    score = custom_llm_evaluator.invoke(
        {"question": question, "answer": llm_answer, "context": context}
    )
    return {"key": "custom_score", "score": float(score)}

In [14]:
from langsmith.evaluation import evaluate

# 데이터셋 이름 설정
dataset_name = "RAG_EVAL_DATASET"

# 실행
experiment_results = evaluate(
    context_answer_rag_answer,
    data=dataset_name,
    evaluators=[custom_evaluator],
    experiment_prefix="CUSTOM-LLM-EVAL",
    # 실험 메타데이터 지정
    metadata={
        "variant": "Custom LLM Evaluator 활용한 평가",
    },
)

View the evaluation results for experiment: 'CUSTOM-LLM-EVAL-d68b66e6' at:
https://smith.langchain.com/o/9b141874-d093-4103-946d-7bc247255f98/datasets/899fb1c5-744d-4f35-a48e-68fe78d807f1/compare?selectedSessions=28ead4bd-f376-4f3a-94a0-e204a1552427




0it [00:00, ?it/s]

id=UUID('158b6af3-64da-4887-a7e6-bdca2a7f4645') name='Target' start_time=datetime.datetime(2025, 4, 16, 8, 37, 3, 109101, tzinfo=datetime.timezone.utc) run_type='chain' end_time=datetime.datetime(2025, 4, 16, 8, 37, 6, 512494, tzinfo=datetime.timezone.utc) extra={'metadata': {'revision_id': '3c15601-dirty', 'variant': 'Custom LLM Evaluator 활용한 평가', 'num_repetitions': 1, 'example_version': '2025-04-16T06:21:58.637211+00:00', 'ls_method': 'traceable', 'LANGSMITH_TRACING': 'true', 'LANGSMITH_ENDPOINT': 'https://api.smith.langchain.com', 'LANGSMITH_PROJECT': 'CH16-Evaluations'}, 'runtime': {'sdk': 'langsmith-py', 'sdk_version': '0.3.16', 'library': 'langsmith', 'platform': 'Windows-10-10.0.22621-SP0', 'runtime': 'python', 'py_implementation': 'CPython', 'runtime_version': '3.11.9', 'langchain_version': '0.3.21', 'langchain_core_version': '0.3.45'}} error=None serialized={'name': 'Target', 'signature': '(inputs: dict)', 'doc': None} events=[] inputs={'inputs': {'question': '구글이 테디노트에게 20억달러