<a href="https://colab.research.google.com/github/hanhanwu/Hanhan_LangGraph_Exercise/blob/main/RAG_Chatbot/try_langsmith_model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## About

* Compare 2 LLM solutions' performance
* Performance evaluation uses a testset with questions and their correct answer
* See comparison on LangSmith dashboard

In [5]:
%%capture --no-stderr
%pip install -U --quiet langsmith langchain_openai thefuzz

In [2]:
import pandas as pd
import os
from google.colab import userdata

def _set_env(key_name: str, key_value: str):
    if key_name not in os.environ:
      os.environ[key_name] = key_value

# load the environment variables set in colab
_set_env('OPENAI_API_KEY', userdata.get('OPENAI_API_KEY'))
_set_env('LANGSMITH_API_KEY', userdata.get('LANGSMITH_API_KEY'))

## Upload Testet to LangSmith

* Testset is saved [here][1]
* The dataset is uploaded to LangSmith UI [like this][2]
  * `dataset_name` was defined in this process
  * Only choose "question" as LangSmith dataset input, and choose "answer" as output, otherwise LangSmith might print out ValueError

[1]:https://github.com/hanhanwu/Hanhan_LangGraph_Exercise/blob/main/RAG_Chatbot/model_eval_testset.csv
[2]:https://docs.smith.langchain.com/old/evaluation/faq/manage-datasets

In [3]:
from langsmith import Client
client = Client(api_key=os.environ['LANGSMITH_API_KEY'])
dataset_name = 'model_eval_testset'

client.list_examples(dataset_name)

<generator object Client.list_examples at 0x7df9f914d0e0>

In [31]:
display_data = pd.read_csv('https://raw.githubusercontent.com/hanhanwu/Hanhan_LangGraph_Exercise/refs/heads/main/RAG_Chatbot/model_eval_testset.csv')
display(display_data)

Unnamed: 0,question_id,question,answer
0,1,What are the battery types for electric wheelc...,"Lithium-ion battery, Dry batteries, Shielded b..."
1,2,Does aisle seats have movable armrests?,Some aisle seats have movable armrests for eas...
2,3,What is escort?,"If any of the following applies to you, you ar..."
3,4,What documents to bring for guide dogs?,We will check the guide dog vest or harness (w...
4,5,Does escort include guide dogs?,Guide dogs are considered as escort.
5,6,Is there staff available to guide visually dis...,Our staff will escort you at the airport and i...
6,7,Can I request a pre-boarding service to settle...,Pre-boarding service is available for passenge...


## Define Metrics

In [7]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate
from langsmith.evaluation import LangChainStringEvaluator
from langsmith.schemas import Run, Example
from thefuzz import fuzz


correctness_prompt_template = """You are an ANA (All Nippon Airways) accessibility policy expert.
                      Now you need to evaluate answers of customers' questions related to accessibility policy.
                    Here's customer' question:
                    {query}
                    Here is the real answer:
                    {answer}
                    You are evaluating the following predicted answer:
                    {result}
                    Respond with CORRECT or INCORRECT:
                    Evaluate:
                  """

correctness_prompt = PromptTemplate(template=correctness_prompt_template,
    input_variables=["query", "answer", "result"],  # names here matter
)
eval_llm = ChatOpenAI(temperature=0.0)
correctness_evaluator = LangChainStringEvaluator("qa",  # has to name as 'qa' for correctness
                                        config={"llm": eval_llm,
                                                "prompt": correctness_prompt})

In [15]:
def evaluate_fuzz_ratio(run: Run, example: Example) -> dict:
    referred_answer = example.outputs.get("answer") or ""
    predicted_answer = run.outputs.get("output") or ""
    fuzz_ratio = round(fuzz.ratio(referred_answer, predicted_answer), 4)
    return {'key':'fuzzy_ratio', 'score': fuzz_ratio}  # has to have "key' and 'score'

## Run Evaluations

In [16]:
def eval_query(question, openai_model_name):
    return openai_client.chat.completions.create(
        model=openai_model_name,
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": "Respond to customer's question about All Nippon Airways' accessibility policy."
            },
            {
                "role": "user",
                "content": question,
            }
        ],
    ).choices[0].message.content


def langsmith_eval_all_queries(inputs, openai_model_name):
    output = eval_query(inputs["question"], openai_model_name)
    return {'output': output}

In [17]:
import openai
from langsmith import evaluate
from functools import partial

openai_client = openai.Client()
model1_name = 'gpt-3.5-turbo'
model2_name = 'gpt-4o-mini'

eval_func1 = partial(langsmith_eval_all_queries, openai_model_name=model1_name)
eval_func2 = partial(langsmith_eval_all_queries, openai_model_name=model2_name)

In [19]:
experiment_results = evaluate(
    eval_func1,
    data=dataset_name,
    evaluators=[correctness_evaluator, evaluate_fuzz_ratio], # eval metrics
    experiment_prefix="model1",
  )

View the evaluation results for experiment: 'model1-4e20b3c8' at:
https://smith.langchain.com/o/e9860761-8952-4d6a-a09f-3b40cecc4980/datasets/06ca5d2d-4af3-470a-8608-599b736808be/compare?selectedSessions=1bb64957-2c7a-4027-b2cc-f8671bf9fe8e




0it [00:00, ?it/s]

In [20]:
experiment_results = evaluate(
    eval_func2,
    data=dataset_name,
    evaluators=[correctness_evaluator, evaluate_fuzz_ratio], # eval metrics
    experiment_prefix="model2",
  )

View the evaluation results for experiment: 'model2-ed3704cb' at:
https://smith.langchain.com/o/e9860761-8952-4d6a-a09f-3b40cecc4980/datasets/06ca5d2d-4af3-470a-8608-599b736808be/compare?selectedSessions=4253aa81-5029-40a5-932a-8c247a9a18c4




0it [00:00, ?it/s]

## Then Compare 2 experiments' performance on LangSmith Dashboard

In [28]:
from IPython.core.display import Image, display

correctness_image = 'https://raw.githubusercontent.com/hanhanwu/Hanhan_LangGraph_Exercise/refs/heads/main/RAG_Chatbot/images/correctness_comparison.png'
display(Image(url=correctness_image, width=1200, unconfined=True))


In [30]:
correctness_image = 'https://raw.githubusercontent.com/hanhanwu/Hanhan_LangGraph_Exercise/refs/heads/main/RAG_Chatbot/images/fuzzy_ratio_comparison.png'
display(Image(url=correctness_image, width=1200, unconfined=True))