In [1]:
# Configuration for managing API keys through environment variables
from dotenv import load_dotenv

# Load the API key information
load_dotenv()
from langchain_teddynote import logging

# Enter the project name; the cell output shows LangSmith tracing starting
# under this project.
logging.langsmith("CH16-Evaluations")

LangSmith 추적을 시작합니다.
[프로젝트명]
CH16-Evaluations


In [2]:
from langchain import hub

from langchain_openai import ChatOpenAI
from langsmith.schemas import Example, Run
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langsmith.evaluation import evaluate


def _parse_preference(raw_verdict) -> str:
    """Normalize the judge's raw reply to ``"A"``, ``"B"`` or ``""``.

    The prompt shows the labels wrapped in backticks, so the model may echo
    that formatting, append a newline or a period, or answer in lowercase.
    Surrounding whitespace, quotes, backticks, asterisks and periods are
    removed and the remainder is upper-cased before comparison.

    Args:
        raw_verdict: Text returned by the grading chain.

    Returns:
        ``"A"`` or ``"B"`` when the cleaned reply is exactly that label,
        otherwise an empty string (no usable preference).
    """
    verdict = str(raw_verdict).strip().strip("`'\"*. ").upper()
    return verdict if verdict in ("A", "B") else ""


def evaluate_pairwise(runs: list, example) -> dict:
    """Pairwise LLM-as-judge evaluator for two answers to the same question.

    The judge prompt prefers the more detailed and informative answer and
    treats an answer unrelated to the question as a bad answer.

    Args:
        runs: Runs to compare. ``runs[0]`` is presented to the judge as
            answer A and ``runs[1]`` as answer B; each must expose ``id`` and
            ``outputs["answer"]``.
        example: Dataset example exposing ``inputs["question"]``.

    Returns:
        ``{"key": "ranked_preference", "scores": {run_id: score}}`` where the
        preferred run scores 1 and every other run scores 0. If the judge's
        reply cannot be read as ``A`` or ``B``, all runs score 0.
    """
    # Start every run at 0 so a run the judge never compared cannot end up
    # with an arbitrary non-zero score.
    scores = {run.id: 0 for run in runs}

    # The pair of answers for this example, plus the question they respond to.
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    # Judge model; temperature=0 is used to keep verdicts as repeatable as possible.
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Grading prompt: the judge is asked to reply with a single label, A or B.
    grade_prompt = PromptTemplate.from_template(
        """
        You are an LLM judge. Compare the following two answers to a question and determine which one is better.
        Better answer is the one that is more detailed and informative.
        If the answer is not related to the question, it is not a good answer.

        # Question:
        {question}

        #Answer A:
        {answer_a}

        #Answer B:
        {answer_b}

        Output should be either `A` or `B`. Pick the answer that is better.

        #Preference:
        """
    )
    answer_grader = grade_prompt | llm | StrOutputParser()

    # Ask the judge, then normalize the free-text reply before comparing it.
    raw_verdict = answer_grader.invoke(
        {
            "question": question,
            "answer_a": answer_a,
            "answer_b": answer_b,
        }
    )
    preference = _parse_preference(raw_verdict)

    # Give the preferred run a score of 1; everything else keeps its 0.
    if preference == "A":
        scores[runs[0].id] = 1
    elif preference == "B":
        scores[runs[1].id] = 1

    return {"key": "ranked_preference", "scores": scores}


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
from langsmith.evaluation import evaluate_comparative

# Names (or IDs) of the two experiments to compare; replace with your own.
experiment_pair = ["MODEL_COMPARE_EVAL-7c182adb", "MODEL_COMPARE_EVAL-e1c7c62c"]

# Compare the experiments with the pairwise judge as the only evaluator.
# Kept as the last expression so the result object is shown as the cell output.
evaluate_comparative(experiment_pair, evaluators=[evaluate_pairwise])

View the pairwise evaluation results at:
https://smith.langchain.com/o/9b141874-d093-4103-946d-7bc247255f98/datasets/899fb1c5-744d-4f35-a48e-68fe78d807f1/compare?selectedSessions=af9c00f7-b6d0-41c9-951c-15e88f19c4b6%2C158b56cb-28d5-45a5-b439-bfd0c504d3f4&comparativeExperiment=90fe03e0-a027-4e08-8cfc-6f7c71209b0f




  0%|          | 0/5 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x1c2f0821850>