# LangSmithを使った評価
LangSmithの基本的なトレーシングの方法と、feedbackやevaluationを使った評価を行う。

In [2]:
from dotenv import load_dotenv

load_dotenv()

True

## LangSmith関連の環境変数

In [3]:
import os

print(f"""Env Variables
LANGCHAIN_TRACING_V2: {os.environ["LANGCHAIN_TRACING_V2"]}
LANGCHAIN_PROJECT: {os.environ["LANGCHAIN_PROJECT"]}
LANGCHAIN_ENDPOINT: {os.environ["LANGCHAIN_ENDPOINT"]}
""")

Env Variables
LANGCHAIN_TRACING_V2: true
LANGCHAIN_PROJECT: machine-learning-workshop
LANGCHAIN_ENDPOINT: https://api.smith.langchain.com



## LangSmithのトレースの基本
traceableデコレータを使用することで、任意の関数の引数と返り値をLangSmithで確認できるようになる。

In [4]:
from IPython.display import display, Markdown
from langsmith import traceable
import openai

openai_client = openai.Client()


@traceable
def format_prompt(question):
    return [
        {
            "role": "system",
            "content": "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
        },
        {"role": "user", "content": f"{question}"},
    ]


@traceable(run_type="llm")
def invoke_llm(messages):
    return openai_client.chat.completions.create(
        messages=messages, model="gpt-4o", temperature=0
    )


@traceable
def parse_output(response):
    return response.choices[0].message.content


@traceable
def run_pipeline():
    messages = format_prompt("ウェブサイトの回遊率")
    response = invoke_llm(messages)
    return parse_output(response)


display(Markdown(run_pipeline()))

```sql
SELECT
  user_id,
  COUNT(DISTINCT page_id) AS pages_visited,
  COUNT(DISTINCT session_id) AS sessions,
  COUNT(DISTINCT page_id) / COUNT(DISTINCT session_id) AS page_views_per_session
FROM
  `your_dataset.your_table`
GROUP BY
  user_id
```

openaiとのやり取りの可観測にするラッパー `wrap_openai` を使うと、モデルの情報などを簡単に取得可能になる。

In [5]:
from langsmith.wrappers import wrap_openai

wrap_openai_client = wrap_openai(openai.Client())


@traceable(name="run_pipeline with wrap_openai")
def run_pipeline_with_wrap_llm():
    messages = format_prompt("ウェブサイトの回遊率")
    response = wrap_openai_client.chat.completions.create(
        messages=messages, model="gpt-4o", temperature=0
    )
    return parse_output(response)


display(Markdown(run_pipeline_with_wrap_llm()))

```sql
SELECT
  user_id,
  COUNT(DISTINCT page_id) AS pages_visited,
  COUNT(DISTINCT session_id) AS sessions,
  COUNT(DISTINCT page_id) / COUNT(DISTINCT session_id) AS page_views_per_session
FROM
  `your_dataset.your_table`
GROUP BY
  user_id
```

LCEL (LangChain Expression Language) を使えば、LangSmithでの観測が楽にできる。

In [6]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

model = ChatOpenAI(model="gpt-4")

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
        ),
        ("human", "{question}"),
    ]
)
output_parser = StrOutputParser()

chain = prompt | model | output_parser
display(Markdown(chain.invoke("ウェブサイトの回遊率")))

以下は、BigQueryでウェブサイトの回遊率を計算するためのSQLクエリです。ここでは、`website_logs`というテーブルが存在し、それぞれのユーザーのセッションとページビューを記録していると仮定しています。

```sql
SELECT session_id, COUNT(DISTINCT page_id) AS page_views
FROM `project_id.dataset_id.website_logs`
GROUP BY session_id
```

このクエリは、各セッションで見られたページの数（page_views）を計算します。回遊率を計算するためには、1つ以上のページを見たセッションの割合を計算する必要があります。以下はそのためのクエリです。

```sql
WITH page_views_per_session AS (
  SELECT session_id, COUNT(DISTINCT page_id) AS page_views
  FROM `project_id.dataset_id.website_logs`
  GROUP BY session_id
)
SELECT COUNTIF(page_views > 1) / COUNT(*) AS bounce_rate
FROM page_views_per_session
```

このクエリは、最初のクエリの結果を使用して、ページビューが1以上のセッションの割合（bounce_rate）を計算します。ここで、`COUNTIF(page_views > 1)`はページビューが1以上のセッションの数を、`COUNT(*)`は全体のセッション数をそれぞれ計算します。

## FeedbackとEvaluationを使った評価


In [7]:
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt


def mm(graph):
    graphbytes = graph.encode("ascii")
    base64_bytes = base64.b64encode(graphbytes)
    base64_string = base64_bytes.decode("ascii")
    display(Image(url="https://mermaid.ink/img/" + base64_string))

In [8]:
mm("""
graph TD
  A[LangSmith] --> B[User and Product Team Feedback]
  A --> C[Prepared Dataset Experiments]

  subgraph B[Feedback]
    B1[Annotate Runs] --> B2[Save Feedback]
  end

  subgraph C[Evaluation]
    C1[Input and Output] --> C2[Run Experiments]
  end

  classDef main fill:#f9f,stroke:#333,stroke-width:2px;
  classDef sub fill:#bbf,stroke:#333,stroke-width:2px;
  classDef detail fill:#fb3,stroke:#333,stroke-width:2px;

  class A,B,C main;
  class B1,B2,C1,C2 detail;   
   """)

### Feedback
- Traceされた実行の中に含まれるRunに、自分で定義したTagやKeyをAnnotate
  - trace_id1つに対して複数のrun_idが含まれる構造
  - 最初のrun_idはtrace_idと同一
- API経由のfeedbackではKey, 手動のfeedbackではTagでAnnotateする仕組みとなっているが、TagもKeyとして保存されている
"- ただし、API経由のfeedbackはrecordが追加・上書きできるのに対して、手動のfeedbackは上書きのみという違いがある
"- 数値データで同じキーのものは集計されて表示される
- LLMアプリを使っているユーザからのフィードバックは、基本的にAPI経由の登録となる


In [9]:
from langchain_openai import ChatOpenAI
from langchain import hub


llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Get the prompt to use - you can modify this!
prompt = hub.pull("hwchase17/openai-functions-agent")
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [10]:
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor
from langchain_community.tools.tavily_search import TavilySearchResults

search = TavilySearchResults()
tools = [search]

agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
agent_executor.invoke({"input": "how can langsmith help with testing?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'how can LangSmith help with testing'}`


[0m[36;1m[1;3m[{'url': 'https://blog.logrocket.com/langsmith-test-llms-ai-applications/', 'content': 'This function helps to load the specific language models and tools required for the task as shown in the code snippet below:\nAs a next step, initialize an agent by calling the initialize_agent function with several parameters like tools, llms, and agent:\nThe verbose parameter is set to false, indicating that the agent will not provide verbose or detailed output.\n You can accomplish this by following the shell commands provided below:\nCreating a LangSmith client\nNext, create a LangSmith client to interact with the API:\nIf you’re using Python, run the following commands to import the module:\n This code also handles exceptions that may occur during the agent execution:\nIt’s also important to call the wait_for_all_tracers 

{'input': 'how can langsmith help with testing?',
 'output': 'LangSmith is a platform designed to assist with the development, debugging, testing, and continuous improvement of Language Learning Models (LLMs) and AI applications. Here are some ways LangSmith can help with testing:\n\n1. **Prototyping and Debugging**:\n   - LangSmith makes it easy to prototype LLM applications and agents. It provides tools to iterate on prompts, chains, and other components, which is crucial for building a high-quality product.\n   - The platform allows for detailed logging and tracing, helping developers identify and fix issues quickly.\n\n2. **Evaluating Labeled and Unlabeled Datasets**:\n   - LangSmith offers comprehensive evaluators for measuring the correctness of responses to prompts, especially useful for labeled datasets.\n   - For unlabeled datasets, built-in and custom evaluators written in natural language can be used to assess the performance of the models.\n\n3. **Automated Testing**:\n   -

LangSmith上で手動でFeedbackを付与することができる

#### Runの情報を取得
LangSmithのAPIを使ってFeedbackを作成するためには、run idを取得する必要がある。

In [11]:
from langchain.callbacks import tracing_v2_enabled

with tracing_v2_enabled() as cb:
    agent_executor.invoke({"input": "how can langsmith help with testing?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'how can LangSmith help with testing?'}`


[0m[36;1m[1;3m[{'url': 'https://www.datacamp.com/tutorial/introduction-to-langsmith', 'content': 'How it Works, Use Cases, Alternatives & More\nRichie Cotton\nHow AI is Changing Cybersecurity with Brian Murphy, CEO of ReliaQuest\nAdel Nehme\n32 min\nAn Introductory Guide to Fine-Tuning LLMs\nJosep Ferrer\n12 min\nSalesforce XGen-7B: A Step-by-Step Tutorial on Using And Fine-Tuning XGen-7B\nBex Tuychiev\n15 min\nGrow your data skills with DataCamp for Mobile\nMake progress on the go with our mobile courses and daily 5-minute coding challenges.\n For labeled datasets like the CSV dataset we uploaded, LangSmith offers more comprehensive evaluators for measuring the correctness of the response to a prompt:\nLet’s try the last one on our examples:\nCoTQA criterion returns a score called Contextual accuracy, as depicted in the GIF 

In [13]:
# RunのURLを取得
cb.get_run_url()

'https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/projects/p/dc1fff1d-e827-4ee4-b948-e71e0914f7dd/r/8cf28732-35e3-46aa-a408-548c35257ae0?poll=true'

In [15]:
# Run IDを取得
run_id = cb.latest_run.id
run_id

UUID('8cf28732-35e3-46aa-a408-548c35257ae0')

#### Feedbackの作成
run_idを指定してFeedbackを追加することができる。
keyは自分で任意の値を指定することが可能。

In [16]:
from langsmith import Client

ls_client = Client()

score_feedback1 = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-score", score=1
)
score_feedback1

Feedback(id=UUID('b368a5a7-ed79-48a7-bbb4-c3ed06585fc1'), created_at=datetime.datetime(2024, 6, 26, 1, 22, 58, 129345, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 26, 1, 22, 58, 129349, tzinfo=datetime.timezone.utc), run_id=UUID('8cf28732-35e3-46aa-a408-548c35257ae0'), key='test-score', score=1, value=None, comment=None, correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

同じキーを指定すると別のfeedbackとして保存される

In [20]:
score_feedback2 = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-score", score=100
)
score_feedback2

Feedback(id=UUID('6a3e59d2-f57a-4352-a45e-9782d7a2abdc'), created_at=datetime.datetime(2024, 6, 26, 1, 29, 54, 589717, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 26, 1, 29, 54, 589718, tzinfo=datetime.timezone.utc), run_id=UUID('8cf28732-35e3-46aa-a408-548c35257ae0'), key='test-score', score=100, value=None, comment=None, correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

feedback_idを指定して、上書きすることも可能。

In [22]:
update_score_feedback1 = ls_client.create_feedback(
    run_id=cb.latest_run.id, feedback_id=score_feedback1.id, key="test-score", score=500
)
update_score_feedback1

Feedback(id=UUID('b368a5a7-ed79-48a7-bbb4-c3ed06585fc1'), created_at=datetime.datetime(2024, 6, 26, 1, 32, 4, 807318, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 26, 1, 32, 4, 807322, tzinfo=datetime.timezone.utc), run_id=UUID('8cf28732-35e3-46aa-a408-548c35257ae0'), key='test-score', score=500, value=None, comment=None, correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

In [23]:
comment_feedback = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-comment", comment="test comment"
)
comment_feedback

Feedback(id=UUID('74d0b074-05d7-476f-9097-35301f569d52'), created_at=datetime.datetime(2024, 6, 26, 1, 32, 16, 902910, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 26, 1, 32, 16, 902912, tzinfo=datetime.timezone.utc), run_id=UUID('8cf28732-35e3-46aa-a408-548c35257ae0'), key='test-comment', score=None, value=None, comment='test comment', correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

#### Feedbackの削除

In [19]:
# 作成したfeedbackを削除
ls_client.delete_feedback(score_feedback1.id)
ls_client.delete_feedback(comment_feedback.id)

In [20]:
# まとめて消去することも可能
for feedback in ls_client.list_feedback(run_ids=[cb.latest_run.id]):
    ls_client.delete_feedback(feedback.id)

### Evaluation

In [21]:
import textwrap

from langsmith import Client
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

ls_client = Client()  # LangSmithのクライアント

# 作成するデータセット
dataset_name = "SQL Samples"

# データセットがあれば削除
if ls_client.has_dataset(dataset_name=dataset_name):
    dataset = ls_client.delete_dataset(dataset_name=dataset_name)

dataset = ls_client.create_dataset(
    dataset_name, description="ML Workshop用のサンプルクエリ"
)

# データセットにexampleを保存
ls_client.create_examples(
    inputs=[
        {"question": "MAUを取得"},
        {"question": "新規ユーザ数の推移"},
    ],
    outputs=[
        {
            "query": textwrap.dedent("""
           SELECT
               COUNT(DISTINCT user_id) AS monthly_active_users
           FROM
               `your_dataset.user_activities`
           WHERE
               activity_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH) AND CURRENT_DATE()
        """),
            "tables": ["user_activities"],
        },
        {
            "query": textwrap.dedent("""
            SELECT
                signup_date,
                COUNT(user_id) AS new_users
            FROM
                `your_dataset.user_activities`
            GROUP BY
                signup_date
            ORDER BY
            　　 signup_date
        """),
            "tables": ["user_activities"],
        },
    ],
    dataset_id=dataset.id,
)

In [22]:
dataset

Dataset(name='SQL Samples', description='ML Workshop用のサンプルクエリ', data_type=<DataType.kv: 'kv'>, id=UUID('b5c798dc-ea07-4213-97ae-7e2f3072513b'), created_at=datetime.datetime(2024, 6, 25, 22, 36, 11, 696756, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 25, 22, 36, 11, 696756, tzinfo=datetime.timezone.utc), example_count=0, session_count=0, last_session_start_time=None)

example_countが0となっているので、LangSmith Clientを使ってdatasetを読み直す

In [23]:
ls_client.read_dataset(dataset_name=dataset_name)

Dataset(name='SQL Samples', description='ML Workshop用のサンプルクエリ', data_type=<DataType.kv: 'kv'>, id=UUID('b5c798dc-ea07-4213-97ae-7e2f3072513b'), created_at=datetime.datetime(2024, 6, 25, 22, 36, 11, 696756, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 25, 22, 36, 11, 696756, tzinfo=datetime.timezone.utc), example_count=2, session_count=0, last_session_start_time=None)

In [24]:
dataset.url

'https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/datasets/b5c798dc-ea07-4213-97ae-7e2f3072513b'

In [25]:
ls_client.create_examples(
    inputs=[
        {"question": "月ごとのCV数の推移"},
    ],
    outputs=[
        {
            "query": textwrap.dedent("""
           SELECT
               FORMAT_TIMESTAMP('%Y-%m', conv_date) AS conversion_month,
               COUNT(conv_id) AS conversions
           FROM
               `your_dataset.your_table`
           WHERE
               conv_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AND CURRENT_DATE()
           GROUP BY
               conversion_month
           ORDER BY
               conversion_month
        """),
            "tables": ["user_activities"],
        },
    ],
    dataset_id=dataset.id,
)

exampleを増やしたことにより、datasetのバージョンも変更されている

In [26]:
ls_client.read_dataset(dataset_name=dataset_name).example_count

3

In [27]:
# datasetに保存されているexampleの一覧
for example in ls_client.list_examples(dataset_name=dataset_name):
    print(f"""
question: {example.inputs["question"]}
query: {example.outputs["query"]}
    """)


question: 月ごとのCV数の推移
query: 
SELECT
    FORMAT_TIMESTAMP('%Y-%m', conv_date) AS conversion_month,
    COUNT(conv_id) AS conversions
FROM
    `your_dataset.your_table`
WHERE
    conv_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AND CURRENT_DATE()
GROUP BY
    conversion_month
ORDER BY
    conversion_month

    

question: MAUを取得
query: 
SELECT
    COUNT(DISTINCT user_id) AS monthly_active_users
FROM
    `your_dataset.user_activities`
WHERE
    activity_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH) AND CURRENT_DATE()

    

question: 新規ユーザ数の推移
query: 
SELECT
    signup_date,
    COUNT(user_id) AS new_users
FROM
    `your_dataset.user_activities`
GROUP BY
    signup_date
ORDER BY
　　 signup_date

    


## LangSmith Evaluation
### Custom Evaluation

In [28]:
# inputsにexampleが1つずつ渡される
def predict(inputs: dict) -> dict:
    model = ChatOpenAI(model="gpt-4")
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
            ),
            ("human", "{question}, tableはuser_activitiesを使います."),
        ]
    )
    output_parser = StrOutputParser()
    llm = prompt | model | output_parser
    return {"output": llm.invoke(inputs)}


# Custom Evaluation
def must_have_user_activities(run: Run, example: Example) -> dict:
    prediction = run.outputs.get("output") or ""
    print(f"run id: {run.id}\n")
    required = example.outputs.get("tables") or []  # outputsのキー (tables) と合わせる
    print(required)
    print(prediction)
    score = all(
        phrase in prediction for phrase in required
    )  # scoreは自分で定義したものでよい
    return {
        "key": "must_have_user_activities",
        "score": score,
        "comment": "comment test",
    }  # key, score, commentを返す


experiment_results = evaluate(
    predict,
    data=dataset_name,  # The data to predict and grade over
    evaluators=[must_have_user_activities],  # The evaluators to score the results
    experiment_prefix="ml-workshop",  # A prefix for your experiment names to easily identify them
    metadata={
        "version": "1.0.0",
    },
)

View the evaluation results for experiment: 'ml-workshop-5202c67c' at:
https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/datasets/b5c798dc-ea07-4213-97ae-7e2f3072513b/compare?selectedSessions=7ccb23a7-364e-447f-ae54-9813a475ab54




0it [00:00, ?it/s]

run id: ed966304-d7af-4848-a81c-6f46974f31a3
run id: 800fb134-2c72-455d-a9ae-0d209b684b3e

['user_activities']
```
SELECT 
  DATE(user_activities.timestamp) AS date, 
  COUNT(DISTINCT user_activities.user_id) AS new_users 
FROM 
  `your_project.your_dataset.user_activities` 
WHERE 
  user_activities.is_new_user = TRUE 
GROUP BY 
  date 
ORDER BY 
  date ASC;
```
run id: aa7a22af-6896-4dc1-bb46-b330a8d77f04

['user_activities']
```sql
SELECT 
  EXTRACT(YEAR FROM activity_date) AS year, 
  EXTRACT(MONTH FROM activity_date) AS month, 
  COUNT(DISTINCT user_id) AS MAU 
FROM 
  `project_id.dataset.user_activities` 
GROUP BY 
  year, 
  month 
ORDER BY 
  year DESC, 
  month DESC;
```
ここで、"project_id"と"dataset"は適切なプロジェクトIDとデータセットに置き換えてください。

['user_activities']
```
SELECT 
  FORMAT_TIMESTAMP('%Y-%m', timestamp) AS Month,
  COUNT(*) AS CV_Count
FROM 
  `project.dataset.user_activities`
WHERE 
  activity = 'CV'
GROUP BY 
  Month
ORDER BY 
  Month ASC
```
