In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [1]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

def evaluate_pairwise(runs: list, example) -> dict:

    # 점수 저장
    scores = {}
    for i, run in enumerate(runs):
        scores[run.id] = i
    

    # 각 예제에 대한 실행 쌍
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    grade_prompt = PromptTemplate.from_template(
        """
        You are an LLM judge. Compare the following two answers to a question and determine which one is better.
        Better answer is the one that is more detailed and informative.
        If the answer is not related to the question, it is not a good answer.

        
        # Question:
        {question}
        
        #Answer A: 
        {answer_a}
        
        #Answer B: 
        {answer_b}
        
        Output should be either `A` or `B`. Pick the answer that is better.
        
        #Preference:
        """
    )
    answer_grader = grade_prompt | llm | StrOutputParser()

    score = answer_grader.invoke(
        {
            "question": question,
            "answer_a": answer_a,
            "answer_b": answer_b
        }
    )

    if score == "A": # A가 더 답변을 잘했다.
        scores[runs[0].id] = 1
        scores[runs[1].id] = 0
    elif score == "B": # B가 더 답변을 잘했다.
        scores[runs[0].id] = 0
        scores[runs[1].id] = 1
    else:
        scores[runs[0].id] = 0
        scores[runs[1].id] = 0
        
    return {"key": "ranked_preference", "scores": scores}

In [22]:
from rag import PDFRAG
from langchain_openai import ChatOpenAI

def ask_question_with_llm(llm):

    rag = PDFRAG(
        "data/snow-white.pdf",
        llm
    )

    retriever = rag.create_retriever()

    rag_chain = rag.create_chain(retriever)

    def _ask_question(inputs: dict):
        context = retriever.invoke(inputs["question"])
        context = "\n".join([doc.page_content for doc in context])
        return {
            "question": inputs["question"],
            "context": context,
            "answer": rag_chain.invoke(inputs["question"])
        }
    return _ask_question

In [23]:
from langchain_openai import ChatOpenAI

gpt3 = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

gpt3.invoke("안녕하세요?")


AIMessage(content='안녕하세요! 무엇을 도와드릴까요?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'audio_tokens': None, 'reasoning_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': None, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-d84d1dc0-cde9-46fd-bbba-69fe33a88e1f-0', usage_metadata={'input_tokens': 13, 'output_tokens': 21, 'total_tokens': 34, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 0}})

In [10]:
!pip install -q langchain_ollama


In [38]:
pip install langchain langchain-ollama

Note: you may need to restart the kernel to use updated packages.


In [41]:
from langchain_ollama import ChatOllama

# Ollama 모델을 불러옴
ollama = ChatOllama(model="gemma2")

# Ollama 모델 호출
ollama.invoke("안녕하세요?")

# 응답 출력
# print(response)

AIMessage(content='안녕하세요! 👋  \n\n무엇을 도와드릴까요? 😊', additional_kwargs={}, response_metadata={'model': 'gemma2', 'created_at': '2024-11-04T05:54:44.9747338Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 3157001700, 'load_duration': 64104900, 'prompt_eval_count': 13, 'prompt_eval_duration': 364349000, 'eval_count': 19, 'eval_duration': 2727027000}, id='run-41bbf29a-6d7e-4c5f-ba56-0702692362bb-0', usage_metadata={'input_tokens': 13, 'output_tokens': 19, 'total_tokens': 32})

In [42]:

questions = [
    "오늘 날씨는 어때?",
    "인공지능의 미래는 어떻게 될까?",
    "Python의 장점은 무엇인가요?"
]

for question in questions:
    response = ollama.invoke(question)
    print(f"Q: {question}\nA: {response}\n")

Q: 오늘 날씨는 어때?
A: content='죄송합니다. 저는 실시간 정보에 접근할 수 없어서 현재 날씨를 알려드릴 수 없습니다. 날씨 확인을 위해 기상 웹사이트나 앱을 사용해 보세요! 😊' additional_kwargs={} response_metadata={'model': 'gemma2', 'created_at': '2024-11-04T05:55:26.1372414Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 9259345800, 'load_duration': 31052600, 'prompt_eval_count': 17, 'prompt_eval_duration': 1727131000, 'eval_count': 49, 'eval_duration': 7499222000} id='run-6c889c49-de98-4154-af91-6fa02bfd896a-0' usage_metadata={'input_tokens': 17, 'output_tokens': 49, 'total_tokens': 66}

Q: 인공지능의 미래는 어떻게 될까?
A: content='인공지능의 미래에 대한 예측은 항상 흥미롭고, 다양한 가능성을 가지고 있습니다. 저는 대규모 언어 모델로서 인간의 지식과 창의력을 모방하도록 설계되었지만, 미래를 예측하는 것은 불가능합니다. 그러나 과학 기술 발전 추세와 현재 트렌드를 바탕으로 몇 가지 가능성을 제시해 드릴 수 있습니다.\n\n**긍정적인 측면:**\n\n* **생활의 질 향상:** 인공지능은 의료 진단, 개인 맞춤형 교육, 자율 주행 기술 등 다양한 분야에서 사람들의 삶을 더욱 편리하고 안전하게 만들 수 있습니다.\n* **혁신과 창조성 증대:** 인공지능은 새로운 재료 개발, 디자인 설계, 예술 작품 창작 등 창의적인 활동을 지원하여 인류의 지식과 기술 발전을 촉진할 수 있습니다.\n* **효율성

In [43]:
response = ollama.invoke("안녕하세요?", config={"max_tokens": 100})

In [25]:
gpt4o_chain = ask_question_with_llm(ChatOpenAI(model="gpt-4o-mini", temperature=0))
gpt3_chain = ask_question_with_llm(ChatOpenAI(model="gpt-3.5-turbo", temperature=0))

# ollama 사용시
# ollama_chain = ask_question_with_llm(ChatOllama(model=""))

In [26]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

cot_qa_evaluator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": ChatOpenAI(model="gpt-4o-mini", temperature=0)}, # 평가자
    prepare_data=lambda run, example: {
        "prediction": run.outputs["answer"],
        "reference": run.outputs["context"],
        "input": example.inputs["question"]
    }
)

dataset_name = "RAG_EVALUATION_DATASET"

experiment_result1 = evaluate(
    gpt3_chain,
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    experiment_prefix="MODEL_COMPARE_EVALUATION",
    metadata={
        "variant": "GPT-3.5-turbo 평가 (cot_qa)"
    }
)

experiment_result2 = evaluate(
    gpt4o_chain,
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    experiment_prefix="MODEL_COMPARE_EVALUATION",
    metadata={
        "variant": "GPT-4o-mini 평가 (cot_qa)"
    }
)

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-256328d6' at:
https://smith.langchain.com/o/4f4b6069-9fa7-4219-889b-6de7c9b52ea4/datasets/d27aef40-c129-4c8c-85da-f16249bcb82f/compare?selectedSessions=2b3ec683-2ba4-4c2d-92af-6ff8af908451




0it [00:00, ?it/s]

View the evaluation results for experiment: 'MODEL_COMPARE_EVALUATION-8db0a31c' at:
https://smith.langchain.com/o/4f4b6069-9fa7-4219-889b-6de7c9b52ea4/datasets/d27aef40-c129-4c8c-85da-f16249bcb82f/compare?selectedSessions=eecfec86-ee46-4697-bf49-15f09455b80d




0it [00:00, ?it/s]

In [32]:
from langsmith.evaluation import evaluate_comparative

evaluate_comparative(
    ["MODEL_COMPARE_EVALUATION-256328d6","MODEL_COMPARE_EVALUATION-8db0a31c"],
    # 평가자
     evaluators=[evaluate_pairwise]
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/4f4b6069-9fa7-4219-889b-6de7c9b52ea4/datasets/d27aef40-c129-4c8c-85da-f16249bcb82f/compare?selectedSessions=2b3ec683-2ba4-4c2d-92af-6ff8af908451%2Ceecfec86-ee46-4697-bf49-15f09455b80d&comparativeExperiment=458c97a7-5502-4271-b8dc-a94cf8342fb1




  0%|          | 0/8 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x226b1ae00d0>