In [1]:
from dotenv import load_dotenv

# Load settings from a .env file into the process environment.
# NOTE(review): the API keys for the clients used later in this notebook are
# presumably supplied this way — confirm against the local .env file.
load_dotenv()

True

In [5]:
import pandas as pd

# Evaluation questions and their reference answers, aligned by position
inputs = [
    "백설공주는 왜 백설공주라는 이름을 가지게 되었나요?",
    "백설공주를 죽이려고 했던 사람은 누구인가요?",
    "백설공주는 어떤 음식을 먹고 쓰러졌나요?",
]

outputs = [
    "백설공주는 피부가 눈처럼 하얗고 아름다워서백설공주라는 이름을 가지게 되었어요.",
    "백설공주의 새어머니인 왕비가 백설공주를 죽이려고 했어요. 왕비는 자신이 세상에서 가장 아름다운 사람이 되고싶어서 백설공주를 질투했기 때문이에요",
    "백설공주는 빨간 독사과를 먹고 쓰러졌어요.",
]

# Build one {"question", "answer"} record per pair.
# zip() walks both lists in lockstep, pairing items that share an index.
record_keys = ("question", "answer")
qa_pairs = [dict(zip(record_keys, pair)) for pair in zip(inputs, outputs)]

# One row per record; the column order is pinned explicitly.
df = pd.DataFrame(qa_pairs, columns=list(record_keys))

df.head()

Unnamed: 0,question,answer
0,백설공주는 왜 백설공주라는 이름을 가지게 되었나요?,백설공주는 피부가 눈처럼 하얗고 아름다워서백설공주라는 이름을 가지게 되었어요.
1,백설공주를 죽이려고 했던 사람은 누구인가요?,백설공주의 새어머니인 왕비가 백설공주를 죽이려고 했어요. 왕비는 자신이 세상에서 가...
2,백설공주는 어떤 음식을 먹고 쓰러졌나요?,백설공주는 빨간 독사과를 먹고 쓰러졌어요.


In [7]:
from langsmith import Client

# LangSmith API client. It is constructed without arguments, so no connection
# settings are given here — presumably they come from the environment; confirm.
client = Client()
# Name of the LangSmith dataset that holds the evaluation examples.
dataset_name = "RAG_EVALUATION_DATASET"

def create_dataset(client, datatset_name, description=None):
    """Return the dataset named ``datatset_name``, creating it if it is missing.

    Args:
        client: LangSmith client. Only ``list_datasets()`` and
            ``create_dataset()`` are called on it.
        datatset_name: Name of the dataset to look up or create. The
            misspelled parameter name is kept so that existing keyword
            callers keep working.
        description: Optional description, used only when a new dataset
            has to be created.

    Returns:
        The existing dataset whose name matches, otherwise the newly
        created dataset.
    """
    # Compare against the argument itself. Reading a module-level
    # ``dataset_name`` here would ignore whatever name the caller passed in.
    for dataset in client.list_datasets():
        if dataset.name == datatset_name:
            # A dataset with this name already exists: reuse it
            return dataset

    # No dataset with this name exists yet: create one
    return client.create_dataset(
        dataset_name=datatset_name,
        description=description,
    )

# Create the dataset, or fetch it when it already exists
dataset = create_dataset(client, dataset_name)

# Re-running this cell must not append the same examples again, so collect the
# questions the dataset already holds and upload only the missing ones.
existing_questions = {
    (example.inputs or {}).get("question")
    for example in client.list_examples(dataset_id=dataset.id)
}
new_rows = df[~df["question"].isin(existing_questions)]

if not new_rows.empty:
    client.create_examples(
        inputs=[{"question": q} for q in new_rows["question"].tolist()],
        outputs=[{"answer": a} for a in new_rows["answer"].tolist()],
        dataset_id=dataset.id,
    )

In [None]:
from rag import PDFRAG
from langchain_openai import ChatOpenAI
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Chat model handed to the RAG helper
answer_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Project-local RAG helper built over the Snow White PDF
rag = PDFRAG(file_path="data/snow-white.pdf", llm=answer_llm)

# The chain is built from the retriever that the same helper creates
retriever = rag.create_retriever()
chain = rag.create_chain(retriever)

# chain.invoke("백설공주는 어떤 과일을 먹고 쓰러졌나요?")

def ask_question(inputs: dict):
    """Answer one dataset example by running the chain on its question.

    Args:
        inputs: Example inputs; must contain a "question" entry.

    Returns:
        A dict holding the chain's result under "answer".
    """
    question = inputs["question"]
    answer = chain.invoke(question)
    return {"answer": answer}

# llm_answer = ask_question(
#     {"question" : "백설공주는 어떤 과일을 먹고 쓰러졌나요?"}
# )
# llm_answer

def print_evaluator_prompt(evaluator):
    """Pretty-print the prompt of the evaluator wrapped by ``evaluator``.

    Args:
        evaluator: Object exposing ``.evaluator.prompt.pretty_print()``.

    Returns:
        Whatever ``pretty_print()`` returns.
    """
    prompt = evaluator.evaluator.prompt
    return prompt.pretty_print()

# Build the "qa" evaluator and show the prompt it grades with
qa_evaluator = LangChainStringEvaluator("qa")
print_evaluator_prompt(qa_evaluator)

dataset_name = "RAG_EVALUATION_DATASET"

# Metadata attached to the experiment
qa_experiment_metadata = {"variant": "QA Evaluator 를 활용한 평가"}

experiment_results = evaluate(
    ask_question,  # function under evaluation
    data=dataset_name,  # dataset to run against
    evaluators=[qa_evaluator],  # graders to apply
    experiment_prefix="RAG_EVALUATION",  # experiment name prefix
    metadata=qa_experiment_metadata,
)

def rag_context_answer(inputs: dict):
    """Run retrieval and the chain for one question and return both results.

    Args:
        inputs: Example inputs; must contain a "question" entry.

    Returns:
        A dict with "context" (the retrieved documents' ``page_content``
        joined by newlines), "answer" (the chain's result) and "query"
        (the question itself).
    """
    question = inputs["question"]
    retrieved_docs = retriever.invoke(question)
    context_text = "\n".join(doc.page_content for doc in retrieved_docs)
    return {
        "context": context_text,
        "answer": chain.invoke(question),
        "query": question,
    }

# Call the function once with a sample question.
# NOTE(review): the return value is not assigned or printed, and this is not
# the last expression of the cell, so the result is presumably never
# displayed — confirm whether this call is still needed.
rag_context_answer(
    {"question": "백설공주는 어떤 과일을 먹고 쓰러졌나요?"}
)

def _context_as_reference(run, example):
    """Build the evaluator fields, using the run's context as the reference."""
    return {
        "prediction": run.outputs["answer"],  # answer produced by the LLM
        "reference": run.outputs["context"],  # context returned with the run
        "input": example.inputs["question"],  # question from the dataset
    }


# "cot_qa" evaluator
cot_qa_evaluator = LangChainStringEvaluator(
    "cot_qa", prepare_data=_context_as_reference
)

# "context_qa" evaluator, fed the same fields
context_qa_evaluator = LangChainStringEvaluator(
    "context_qa", prepare_data=_context_as_reference
)

print_evaluator_prompt(cot_qa_evaluator)

# Dataset to evaluate against
dataset_name = "RAG_EVALUATION_DATASET"

# Run the evaluation with both context-based graders.
# The recorded output of this run shows HTTP 429 (tokens-per-minute) errors
# from the grader model, which leaves runs unscored. max_concurrency=1 caps
# concurrency so fewer grader requests are in flight at once.
# NOTE(review): this lowers the request rate but cannot guarantee staying under
# the quota — confirm against the account's limits.
evaluate(
    rag_context_answer,
    data=dataset_name,
    evaluators=[cot_qa_evaluator, context_qa_evaluator],
    experiment_prefix="RAG_EVALUATION",
    metadata={
        "variant": "COT_QA & CONTEXT_QA Evaluatior 를 활용한 평가"
    },
    max_concurrency=1,
)

from langsmith.evaluation import evaluate, LangChainStringEvaluator

# One "criteria" evaluator per criterion:
#   conciseness - is the answer concise
#   misogyny    - is the answer demeaning to women
#   criminality - does the answer promote crime
criteria_evaluator = [
    LangChainStringEvaluator("criteria", config={"criteria": criterion})
    for criterion in ("conciseness", "misogyny", "criminality")
]

# Dataset to evaluate against
dataset_name = "RAG_EVALUATION_DATASET"

# Run the criteria-based evaluation.
# The recorded output of this run shows HTTP 429 (tokens-per-minute) errors
# from the grader model, which leaves runs unscored. max_concurrency=1 caps
# concurrency so fewer grader requests are in flight at once.
# NOTE(review): this lowers the request rate but cannot guarantee staying under
# the quota — confirm against the account's limits.
experiment_results = evaluate(
    ask_question,
    data=dataset_name,
    evaluators=criteria_evaluator,
    experiment_prefix="CRITERIAL_EVALUATION",
    metadata={
        "variant": "criteria를 활용한 평가"
    },
    max_concurrency=1,
)

You are a teacher grading a quiz.
You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: CORRECT or INCORRECT here

Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 

QUESTION: {query}
STUDENT ANSWER: {result}
TRUE ANSWER: {answer}
GRADE:
View the evaluation results for experiment: 'RAG_EVALUATION-d642b324' at:
https://smith.langchain.com/o/4f4b6069-9fa7-4219-889b-6de7c9b52ea4/datasets/d27aef40-c129-4c8c-85da-f16249bcb82f/compare?selectedSessions=4989d06d-7a59-4fba-8b5b-576e2b5f1247




20it [00:05,  3.65it/s]


You are a teacher grading a quiz.
You are given a question, the context the question is about, and the student's answer. You are asked to score the student's answer as either CORRECT or INCORRECT, based on the context.
Write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset.

Example Format:
QUESTION: question here
CONTEXT: context the question is about here
STUDENT ANSWER: student's answer here
EXPLANATION: step by step reasoning here
GRADE: CORRECT or INCORRECT here

Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 

QUESTION: {query}
CONTEXT: {context}
STUDENT ANSWER: {result}
EXPLANATION:
View t

0it [00:00, ?it/s]Error running evaluator <DynamicRunEvaluator evaluate> on run 7c2532bc-a1ed-4f6d-b085-de4a41bd0d94: RateLimitError("Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4 in organization org-bhhAVavshSkAZcM7eVekvSrb on tokens per min (TPM): Limit 10000, Used 9748, Requested 604. Please try again in 2.112s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}")
Traceback (most recent call last):
  File "c:\Users\kangyeonjin\miniforge3\envs\buzz\Lib\site-packages\langsmith\evaluation\_runner.py", line 1384, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
                         ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kangyeonjin\miniforge3\envs\buzz\Lib\site-packages\langsmith\evaluation\evaluator.py", line 329, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kangyeonjin\miniforge3\envs\buzz\Lib\site-packages\langsmith\

View the evaluation results for experiment: 'CRITERIAL_EVALUATION-3c3b77d3' at:
https://smith.langchain.com/o/4f4b6069-9fa7-4219-889b-6de7c9b52ea4/datasets/d27aef40-c129-4c8c-85da-f16249bcb82f/compare?selectedSessions=64524624-70a7-442a-b29a-109e8719a019




0it [00:00, ?it/s]Error running evaluator <DynamicRunEvaluator evaluate> on run 6c456f66-1fbd-4904-8d25-3aef061cb5ae: RateLimitError("Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4 in organization org-bhhAVavshSkAZcM7eVekvSrb on tokens per min (TPM): Limit 10000, Used 9955, Requested 241. Please try again in 1.176s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}")
Traceback (most recent call last):
  File "c:\Users\kangyeonjin\miniforge3\envs\buzz\Lib\site-packages\langsmith\evaluation\_runner.py", line 1384, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
                         ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kangyeonjin\miniforge3\envs\buzz\Lib\site-packages\langsmith\evaluation\evaluator.py", line 329, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kangyeonjin\miniforge3\envs\buzz\Lib\site-packages\langsmith\