In [4]:
from dotenv import load_dotenv

# Load API key settings into the environment.
# NOTE(review): this is called before the LangSmith setup below, presumably so
# the keys are already available when tracing starts - keep this order.
load_dotenv()
from langchain_teddynote import logging

# Enter the LangSmith project name; the cell output reports that tracing has
# started for this project.
logging.langsmith("CH16-Evaluations")

LangSmith 추적을 시작합니다.
[프로젝트명]
CH16-Evaluations


# RAG 성능 테스트를 위한 함수 정의

In [1]:
from myrag import PDFRAG
from langchain_openai import ChatOpenAI


def ask_question_with_llm(llm, file_path="data/SPRI_AI_Brief_2023년12월호_F.pdf"):
    """Create a function that answers questions with a PDF-backed RAG chain.

    A ``PDFRAG`` object is built once for ``file_path`` and ``llm``; its
    retriever and chain are then reused by every call of the returned function.

    Args:
        llm: Chat model handed to ``PDFRAG``.
        file_path: Path of the PDF handed to ``PDFRAG``. Defaults to the
            report this notebook evaluates, so existing one-argument calls
            behave as before.

    Returns:
        A callable that takes ``{"question": str}`` and returns a dict with
        three keys: ``"question"`` (the input question), ``"context"`` (the
        ``page_content`` of the retrieved documents joined with newlines) and
        ``"answer"`` (the result of invoking the chain with the question).
    """
    rag = PDFRAG(file_path, llm)

    # The retriever and the chain are created once and captured by the closure.
    retriever = rag.create_retriever()
    rag_chain = rag.create_chain(retriever)

    def _ask_question(inputs: dict):
        question = inputs["question"]
        # Retrieve explicitly so the context can be returned next to the answer.
        # NOTE(review): the chain is invoked separately below; whether it runs
        # its own retrieval depends on PDFRAG.create_chain - confirm.
        documents = retriever.invoke(question)
        return {
            "question": question,
            "context": "\n".join(doc.page_content for doc in documents),
            "answer": rag_chain.invoke(question),
        }

    return _ask_question

In [9]:
# Answer functions for the two models being compared; both run at temperature 0.
gpt_chain = ask_question_with_llm(ChatOpenAI(model="gpt-4o-mini", temperature=0))
# Name the second model explicitly rather than relying on ChatOpenAI's implicit
# default, so the comparison stays reproducible if the library default changes.
# NOTE(review): "gpt-3.5-turbo" is assumed to be the installed default - confirm.
gpt_chain2 = ask_question_with_llm(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))

In [10]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

def _cot_qa_inputs(run, example):
    """Map a run and its dataset example onto the evaluator's input fields."""
    return {
        "prediction": run.outputs["answer"],
        # The reference is the context produced by the run itself.
        "reference": run.outputs["context"],
        "input": example.inputs["question"],
    }


# LLM that grades the answers.
_judge_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Create the "cot_qa" evaluator.
cot_qa_evalulator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": _judge_llm},
    prepare_data=_cot_qa_inputs,
)

dataset_name = "RAG_EVAL_DATASET"


def _run_cot_qa_experiment(target, variant):
    """Evaluate ``target`` on ``dataset_name`` with the cot_qa evaluator.

    Args:
        target: Callable under evaluation (an answer function).
        variant: Label stored under the ``"variant"`` key of the experiment
            metadata.

    Returns:
        Whatever ``evaluate`` returns for this experiment.
    """
    return evaluate(
        target,
        data=dataset_name,
        evaluators=[cot_qa_evalulator],
        experiment_prefix="MODEL_COMPARE_EVAL",
        # Experiment metadata
        metadata={"variant": variant},
    )


# Run the evaluation for the gpt-4o-mini chain.
experiment_results1 = _run_cot_qa_experiment(gpt_chain, "GPT-4o-mini 평가 (cot_qa)")

# Run the evaluation for the second chain. gpt_chain2 is built from ChatOpenAI,
# not Ollama, so the label names the OpenAI model rather than EEVE-Korean.
# NOTE(review): assumes the chain's model is gpt-3.5-turbo (ChatOpenAI default).
experiment_results2 = _run_cot_qa_experiment(
    gpt_chain2, "GPT-3.5-turbo (ChatOpenAI 기본 모델) 평가 (cot_qa)"
)

View the evaluation results for experiment: 'MODEL_COMPARE_EVAL-e1c7c62c' at:
https://smith.langchain.com/o/9b141874-d093-4103-946d-7bc247255f98/datasets/899fb1c5-744d-4f35-a48e-68fe78d807f1/compare?selectedSessions=158b56cb-28d5-45a5-b439-bfd0c504d3f4




0it [00:00, ?it/s]

View the evaluation results for experiment: 'MODEL_COMPARE_EVAL-7c182adb' at:
https://smith.langchain.com/o/9b141874-d093-4103-946d-7bc247255f98/datasets/899fb1c5-744d-4f35-a48e-68fe78d807f1/compare?selectedSessions=af9c00f7-b6d0-41c9-951c-15e88f19c4b6




0it [00:00, ?it/s]