### Pairwise Evaluation
* 두 개 이상의 LLM 생성물을 서로 비교

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

def evaluate_pairwise(runs: list, example) -> dict:
    # 점수 저장
    scores = {}
    for i, run in enumerate(runs):
        scores[run.id] = i
    
    # 각 예제에 대한 실행 쌍
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    grade_prompt = PromptTemplate.from_template(
        """
        You are an LLM judge. Compare the following two answers to a question and determine which one is better.
        Better answer is the one that is more detailed and informative.
        If the answer is not related to the question, it is not a good answer.

        
        #Question:
        {question}
        
        #Answer A: 
        {answer_a}
        
        #Answer B: 
        {answer_b}
        
        Output should be either `A` or `B`. Pick the answer that is better.
        
        #Preference:
        """
    )

    answer_grader = grade_prompt | llm | StrOutputParser()

    score = answer_grader.invoke(
        {
            "question":question,
            "answer_a":answer_a,
            "answer_b":answer_b
        }
    )

    if score == "A":    # A가 답변을 더 잘했다.
        scores[runs[0].id] = 1
        scores[runs[1].id] = 0
    elif score == "B":  # B가 답변을 더 잘했다.
        scores[runs[0].id] = 0
        scores[runs[1].id] = 1
    else:
        scores[runs[0].id] = 0
        scores[runs[1].id] = 0

    return {"key":"ranked_preference", "scores":scores}

In [5]:
from rag import PDFRAG
from langchain_openai import ChatOpenAI

def ask_question_with_llm(llm):
    rag = PDFRAG(
        "data/snow-white.pdf",
        llm
    )
    
    retriever = rag.create_retriever()

    rag_chain = rag.create_chain(retriever)

    def _ask_question(inputs: dict):
        context = retriever.invoke(inputs["question"])
        context = "\n".join([doc.page_content for doc in context])
        return {
            "question":inputs["question"],
            "context":context,
            "answer":rag_chain.invoke(inputs["question"])
        }
    
    return _ask_question

In [6]:
from langchain_openai import ChatOpenAI

gpt3 = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
gpt3.invoke("안녕하세요?")

AIMessage(content='안녕하세요! 무엇을 도와드릴까요?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-b436c3b8-16f3-4256-8743-6d06363eed29-0', usage_metadata={'input_tokens': 13, 'output_tokens': 21, 'total_tokens': 34, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

In [7]:
!pip install -q langchain_ollama

In [8]:
from langchain_ollama import ChatOllama

# Ollama 모델 불러오기
ollama = ChatOllama(model="gemma2:9b")

# Ollama 모델 호출
ollama.invoke("안녕하세요?")

AIMessage(content='안녕하세요! 👋 \n\n무엇을 도와드릴까요? 😊', additional_kwargs={}, response_metadata={'model': 'gemma2:9b', 'created_at': '2024-11-04T05:37:47.1582547Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 3026605700, 'load_duration': 58629400, 'prompt_eval_count': 13, 'prompt_eval_duration': 302800000, 'eval_count': 19, 'eval_duration': 2663151000}, id='run-c692ecf7-320d-45e4-9075-e4483ca3310a-0', usage_metadata={'input_tokens': 13, 'output_tokens': 19, 'total_tokens': 32})

In [9]:
gpt4o_chain = ask_question_with_llm(ChatOpenAI(model="gpt-4o-mini", temperature=0))
gpt3_chain = ask_question_with_llm(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))

# Ollama 사용시
ollama_chain = ask_question_with_llm(ChatOllama(model="gemma2:9b"))

In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

cot_qa_evaluator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm":ChatOpenAI(model="gpt-4o-mini", temperature=0)},  # 평가자
    prepare_data=lambda run, example: {
        "prediction":run.outputs["answer"],
        "reference":run.outputs["context"],
        "input":example.inputs["question"],
    }
)

dataset_name = "RAG_EVALUATION_DATASET"

experiment_result1 = evaluate(
    gpt3_chain,
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    experiment_prefix="MODEL_COMPARE_EVALUATION",
    metadata={
        "variant":"GPT-3.5-turbo 평가 (cot_qa)"
    }
)

experiment_result2 = evaluate(
    gpt4o_chain,
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    experiment_prefix="MODEL_COMPARE_EVALUATION",
    metadata={
        "variant": "GPT-4o-mini 평가 (cot_qa)"
    }
)

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-6e8c530b' at:
https://smith.langchain.com/o/0952f302-ca2e-4d37-9a5b-1830d66b833b/datasets/a81bc91f-29c2-4be9-a030-f450d6fd800a/compare?selectedSessions=0625bbee-6422-42ae-ab21-98716c9c6f06




0it [00:00, ?it/s]

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-6a551a5c' at:
https://smith.langchain.com/o/0952f302-ca2e-4d37-9a5b-1830d66b833b/datasets/a81bc91f-29c2-4be9-a030-f450d6fd800a/compare?selectedSessions=4c3dee64-da5e-4b41-838c-563fcf2fee14




0it [00:00, ?it/s]

In [None]:
from langsmith.evaluation import evaluate_comparative

evaluate_comparative(
    ["MODEL_COMPARE_EVALUATION-393290e4","MODEL_COMPARE_EVALUATION-d0667ace"],
    # 평가자
    evaluators=[evaluate_pairwise]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/0952f302-ca2e-4d37-9a5b-1830d66b833b/datasets/a81bc91f-29c2-4be9-a030-f450d6fd800a/compare?selectedSessions=ddd324f6-9545-4196-ab49-7fc887545ef4%2C2841d0b6-4482-4cc6-973c-cbf0b6ae4412&comparativeExperiment=841b8b9c-041d-41f5-8f3d-192eba40657b




  0%|          | 0/5 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x2d65e57bc50>