# dataset 생성


In [2]:
from langsmith import Client
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from a local .env file before creating the client.
# NOTE(review): presumably this is where the LangSmith credentials come from — confirm.
load_dotenv()
client = Client()
# Name of the LangSmith dataset that the evaluation cells should target.
dataset_name = "globalmacro_custom_dataset"
# Spreadsheet providing the "question" and "ground_truth" columns read below.
df = pd.read_excel("./data/custom_testdataset.xlsx")


def create_dataset(client, dataset_name, description=None):
    """Return the dataset named ``dataset_name``, creating it when absent.

    Args:
        client: Object exposing ``list_datasets()`` and ``create_dataset()``.
        dataset_name: Name to look up among the datasets listed by ``client``.
        description: Description passed along only when a dataset is created.

    Returns:
        The first listed dataset whose ``name`` equals ``dataset_name``, or
        the newly created dataset when no such dataset is listed.
    """
    existing = next(
        (
            candidate
            for candidate in client.list_datasets()
            if candidate.name == dataset_name
        ),
        None,
    )
    if existing is not None:
        return existing

    return client.create_dataset(
        dataset_name=dataset_name,
        description=description,
    )


# Fetch the dataset if it already exists, otherwise create it.
dataset = create_dataset(client, dataset_name)

# Upload one example per spreadsheet row: the "question" column becomes the
# input and the "ground_truth" column becomes the reference answer.
# NOTE(review): create_dataset returns an existing dataset unchanged, so
# re-running this cell calls create_examples on the same dataset again —
# confirm whether that produces duplicate examples.
client.create_examples(
    inputs=[{"question": q} for q in df["question"].tolist()],
    outputs=[{"answer": a} for a in df["ground_truth"].tolist()],
    dataset_id=dataset.id,
)

# chain


In [9]:
from dotenv import load_dotenv
from langchain_teddynote.community.pinecone import init_pinecone_index
from langchain_upstage.embeddings import UpstageEmbeddings
from langchain_teddynote.community.pinecone import PineconeKiwiHybridRetriever
from langchain_teddynote.korean import stopwords
import os

# Load environment variables (PINECONE_API_KEY is read below) from a .env file.
load_dotenv()

# Index and hybrid-search settings; the returned value is unpacked into the
# retriever constructor below.
# NOTE: the namespace spelling "financical-data-00" matches the namespace
# reported by the index in this cell's output, so it must not be "corrected".
pinecone_params = init_pinecone_index(
    index_name="globalmacro-chatbot",
    namespace="financical-data-00",
    api_key=os.environ["PINECONE_API_KEY"],
    sparse_encoder_path="../data/sparse_encoder_01.pkl",
    stopwords=stopwords(),
    tokenizer="kiwi",
    embeddings=UpstageEmbeddings(model="solar-embedding-1-large-query"),
    top_k=10,
    alpha=0.4,  # dense/sparse weighting; e.g. alpha=0.75 means 0.75 dense embedding, 0.25 sparse embedding
)


# Hybrid retriever shared by every chain built later in the notebook.
pinecone_retriever = PineconeKiwiHybridRetriever(**pinecone_params)

[init_pinecone_index]
{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {'financical-data-00': {'vector_count': 2012}},
 'total_vector_count': 2012}


In [10]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import PromptTemplate
import pandas as pd
import sys
import os

# Add the parent of the current working directory to sys.path so that the
# DataProcessing package imported just below can be resolved.
# NOTE(review): this depends on the directory the notebook is started from.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
from DataProcessing.utils import load_yaml
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_community.document_compressors import JinaRerank
from langchain_cohere import CohereRerank


def create_chain(reranker=None, model="claude"):
    """Build the retriever and the RAG chain used by the evaluation cells.

    Args:
        reranker: Name of the reranker wrapped around ``pinecone_retriever``
            (``"cohere"``, ``"jina"``, ``"bge"`` or ``"ko-reranker"``), or
            ``None`` to use ``pinecone_retriever`` without reranking.
        model: ``"claude"`` selects ChatAnthropic; any other value selects
            ChatOpenAI.

    Returns:
        A ``(retriever, rag_chain)`` tuple, where ``retriever`` is the same
        retriever that feeds the ``context`` slot of ``rag_chain``.

    Raises:
        ValueError: If ``reranker`` is neither ``None`` nor a supported name.
    """
    prompt_template = load_yaml("../prompts/Retriever._prompt.yaml")["prompt"]
    prompt = PromptTemplate.from_template(prompt_template)

    if model == "claude":
        llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0.5)
    else:
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.5)

    if reranker is None:
        # No reranking requested: query the hybrid retriever directly.
        retriever = pinecone_retriever
    else:
        if reranker == "cohere":
            # NOTE(review): unlike the other rerankers, no top_n is set here,
            # so the library default applies — confirm this is intended.
            compressor = CohereRerank(model="rerank-multilingual-v3.0")
        elif reranker == "jina":
            compressor = JinaRerank(model="jina-reranker-v2-base-multilingual", top_n=5)
        elif reranker == "bge":
            # Use a dedicated local name so the ``model`` argument is not shadowed.
            cross_encoder = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")
            compressor = CrossEncoderReranker(model=cross_encoder, top_n=5)
        elif reranker == "ko-reranker":
            cross_encoder = HuggingFaceCrossEncoder(model_name="Dongjin-kr/ko-reranker")
            compressor = CrossEncoderReranker(model=cross_encoder, top_n=5)
        else:
            # Fail with a clear message instead of an unbound-variable error.
            raise ValueError(
                f"Unsupported reranker: {reranker!r}. Expected None, 'cohere', "
                "'jina', 'bge' or 'ko-reranker'."
            )

        retriever = ContextualCompressionRetriever(
            base_compressor=compressor, base_retriever=pinecone_retriever
        )

    rag_chain = (
        {
            "context": retriever,
            "question": RunnablePassthrough(),
        }
        | prompt
        | llm
        | StrOutputParser()
    )

    return retriever, rag_chain

# LLM-as-Judge


In [48]:
# Reranker options accepted by create_chain (None means no reranking).
reranker_list = [None, "cohere", "jina", "bge", "ko-reranker"]
# Reranker under evaluation; also used in the experiment prefixes below.
reranker = "ko-reranker"
# Any value other than "claude" makes create_chain use the OpenAI chat model.
model = "openai"
retriever, chain = create_chain(reranker=reranker, model=model)
# Must match the name under which the dataset was created; the previous value
# was misspelled ("cutom") and pointed the evaluations at a different dataset.
dataset_name = "globalmacro_custom_dataset"

## Question-Answer Evaluator


In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator


def ask_question(inputs: dict):
    """Run the RAG chain on one dataset example.

    Args:
        inputs: Example inputs; the ``"question"`` entry is sent to ``chain``.

    Returns:
        A dict holding the chain's output under ``"answer"``.
    """
    question = inputs["question"]
    answer = chain.invoke(question)
    return {"answer": answer}


# Built-in "qa" string evaluator.
# The variable name's spelling is kept as-is because it is referenced below.
qa_evalulator = LangChainStringEvaluator("qa")


# Run ask_question against the dataset and score the outputs with the QA
# evaluator; the experiment prefix records which reranker was used.
experiment_results = evaluate(
    ask_question,
    data=dataset_name,
    evaluators=[qa_evalulator],
    experiment_prefix=f"qa_{reranker}",
    metadata={
        "variant": "QA Evaluator 를 활용한 평가",
    },
)

## Context QA Evaluator


In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator


def context_answer_rag_answer(inputs: dict):
    """Answer one example and also return the context retrieved for it.

    Args:
        inputs: Example inputs; the ``"question"`` entry is used for both
            the retriever and the chain.

    Returns:
        A dict with the newline-joined page contents of the retrieved
        documents (``"context"``), the chain output (``"answer"``) and the
        question itself (``"query"``).
    """
    retrieved_docs = retriever.invoke(inputs["question"])
    joined_context = "\n".join(doc.page_content for doc in retrieved_docs)
    answer = chain.invoke(inputs["question"])
    return {
        "context": joined_context,
        "answer": answer,
        "query": inputs["question"],
    }


def _context_as_reference(run, example):
    """Map a run/example pair to evaluator fields, using the run's context as reference."""
    return {
        "prediction": run.outputs["answer"],
        "reference": run.outputs["context"],
        "input": example.inputs["question"],
    }


def _build_context_evaluator(evaluator_name):
    """Create a string evaluator judged by ``eval_llm`` against the run's context."""
    return LangChainStringEvaluator(
        evaluator_name,
        prepare_data=_context_as_reference,
        config={"llm": eval_llm},
    )


# Judge model shared by both evaluators.
eval_llm = ChatOpenAI(temperature=0.0, model="gpt-4o-mini")

cot_qa_evaluator = _build_context_evaluator("cot_qa")
context_qa_evaluator = _build_context_evaluator("context_qa")


# Score answers against the context returned alongside them.
evaluate(
    context_answer_rag_answer,
    data=dataset_name,
    evaluators=[cot_qa_evaluator, context_qa_evaluator],
    experiment_prefix=f"cot_qa_{reranker}",
    metadata={
        "variant": "COT_QA & Context_QA Evaluator 를 활용한 평가",
    },
)

## Embedding distance Evaluator


In [38]:
from langsmith.evaluation import LangChainStringEvaluator
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_upstage import UpstageEmbeddings
from langchain_openai import OpenAIEmbeddings
import os

# NOTE(review): presumably read by the HuggingFace tokenizers library to
# allow parallel tokenization — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

model_name = "BAAI/bge-m3"

# Local HuggingFace embedding model, run on CPU.
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu"},  # cuda, cpu
    # encode_kwargs={"normalize_embeddings": True},
)

# Create the embedding-distance evaluators, one per embedding model.
# NOTE(review): this evaluator uses cosine distance while the two below use
# euclidean distance — confirm the mix of metrics is intentional.
hf_embedding_evaluator = LangChainStringEvaluator(
    "embedding_distance",
    config={
        "embeddings": hf_embeddings,
        "distance_metric": "cosine",  # "cosine", "euclidean", "chebyshev", "hamming", and "manhattan"
    },
)

upstage_embedding_evaluator = LangChainStringEvaluator(
    "embedding_distance",
    config={
        "embeddings": UpstageEmbeddings(model="solar-embedding-1-large-query"),
        "distance_metric": "euclidean",  # "cosine", "euclidean", "chebyshev", "hamming", and "manhattan"
    },
)

openai_embedding_evaluator = LangChainStringEvaluator(
    "embedding_distance",
    config={
        "embeddings": OpenAIEmbeddings(model="text-embedding-3-small"),
        "distance_metric": "euclidean",  # "cosine", "euclidean", "chebyshev", "hamming", and "manhattan"
    },
)

In [None]:
# When several embedding models are used for a single metric, the result is reported as their average.
from langsmith.evaluation import evaluate


def ask_question(inputs: dict):
    """Run the RAG chain on one dataset example.

    Args:
        inputs: Example inputs; the ``"question"`` entry is sent to ``chain``.

    Returns:
        A dict holding the chain's output under ``"answer"``.
    """
    question = inputs["question"]
    answer = chain.invoke(question)
    return {"answer": answer}


# Score the chain's answers with all three embedding-distance evaluators in a
# single experiment; the prefix records which reranker was used.
experiment_results = evaluate(
    ask_question,
    data=dataset_name,
    evaluators=[
        hf_embedding_evaluator,
        upstage_embedding_evaluator,
        openai_embedding_evaluator,
    ],
    experiment_prefix=f"EMBEDDING-EVAL_{reranker}",
    metadata={
        "variant": "embedding_distance 활용한 평가",
    },
)

## Groundedness Evaluator


In [41]:
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate
from langchain_upstage import UpstageGroundednessCheck


def ask_question(inputs: dict):
    """Answer one example and return the question and retrieved context too.

    Args:
        inputs: Example inputs; the ``"question"`` entry is used for both
            the retriever and the chain.

    Returns:
        A dict with the question (``"question"``), the newline-joined page
        contents of the retrieved documents (``"context"``) and the chain
        output (``"answer"``).
    """
    retrieved_docs = retriever.invoke(inputs["question"])
    joined_context = "\n".join(doc.page_content for doc in retrieved_docs)
    question = inputs["question"]
    answer = chain.invoke(inputs["question"])
    return {
        "question": question,
        "context": joined_context,
        "answer": answer,
    }


def upstage_groundness_check_evaluator(run: Run, example: Example) -> dict:
    """Score a run with the Upstage groundedness checker.

    Args:
        run: Run whose outputs provide ``"answer"`` and ``"context"``
            (each defaults to an empty string when missing).
        example: Dataset example for the run; not used here.

    Returns:
        A dict with key ``"groundness_score"`` and a score of 1 when the
        checker returns ``"grounded"``, otherwise 0.
    """
    payload = {
        "answer": run.outputs.get("answer", ""),
        "context": run.outputs.get("context", ""),
    }
    verdict = UpstageGroundednessCheck().invoke(payload)
    is_grounded = verdict == "grounded"

    return {"key": "groundness_score", "score": int(is_grounded)}

In [42]:
from langsmith.schemas import Run, Example
from langchain_teddynote.evaluator import GroundnessChecker
from langchain_openai import ChatOpenAI

# Groundedness checker judged by gpt-4o-mini at temperature 0; built once and
# reused by the evaluator function below.
groundedness_check = GroundnessChecker(
    ChatOpenAI(model="gpt-4o-mini", temperature=0)
).create()


def teddynote_groundness_check_evaluator(run: Run, example: Example) -> dict:
    """Score a run with the module-level ``groundedness_check`` checker.

    Args:
        run: Run whose outputs provide ``"answer"`` and ``"context"``
            (each defaults to an empty string when missing).
        example: Dataset example for the run; not used here.

    Returns:
        A dict with key ``"groundness_score"`` and a score of 1 when the
        checker result's ``score`` attribute equals ``"yes"``, otherwise 0.
    """
    payload = {
        "answer": run.outputs.get("answer", ""),
        "context": run.outputs.get("context", ""),
    }
    verdict = groundedness_check.invoke(payload)
    is_grounded = verdict.score == "yes"

    return {"key": "groundness_score", "score": int(is_grounded)}

In [None]:
from langsmith.evaluation import evaluate


# Score each answer against its retrieved context with both groundedness
# evaluators; the prefix records which reranker was used.
# NOTE: both evaluators report under the same key, "groundness_score".
experiment_results = evaluate(
    ask_question,
    data=dataset_name,
    evaluators=[
        upstage_groundness_check_evaluator,
        teddynote_groundness_check_evaluator,
    ],
    experiment_prefix=f"GROUNDEDNESS-EVAL_{reranker}",
    metadata={
        "variant": "Upstage & teddynote Groundness Checker 를 활용한 Hallucination 평가",
    },
)