# LangSmithを使った評価
LangSmithの基本的なトレーシングの方法と、feedbackやevaluationを使った評価を行う。

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

## LangSmith関連の環境変数

In [2]:
import os

# Show the LangSmith-related settings that were loaded into the environment.
# os.environ.get() is used instead of indexing so that an unset variable is
# reported with a placeholder rather than aborting the cell with KeyError.
unset = "<not set>"
print(f"""Env Variables
LANGCHAIN_TRACING_V2: {os.environ.get("LANGCHAIN_TRACING_V2", unset)}
LANGCHAIN_PROJECT: {os.environ.get("LANGCHAIN_PROJECT", unset)}
LANGCHAIN_ENDPOINT: {os.environ.get("LANGCHAIN_ENDPOINT", unset)}
""")

Env Variables
LANGCHAIN_TRACING_V2: true
LANGCHAIN_PROJECT: machine-learning-workshop
LANGCHAIN_ENDPOINT: https://api.smith.langchain.com



## LangSmithのトレースの基本
traceableデコレータを使用することで、任意の関数の引数と返り値をLangSmithで確認できるようになる。

In [3]:
from IPython.display import display, Markdown
from langsmith import traceable
import openai

openai_client = openai.Client()


@traceable
def format_prompt(question):
    """Build the chat messages sent to the model for one question.

    Args:
        question: The user's request; it is interpolated into the user message.

    Returns:
        A two-element list of role/content dicts: the fixed system instruction
        (act as a BigQuery expert and output only the query) followed by the
        user message carrying ``question``.
    """
    system_message = {
        "role": "system",
        "content": "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
    }
    user_message = {"role": "user", "content": f"{question}"}
    return [system_message, user_message]


@traceable(run_type="llm")
def invoke_llm(messages):
    """Send ``messages`` to the chat completions API and return the raw response.

    Args:
        messages: Chat messages in the role/content format built by
            ``format_prompt``.

    Returns:
        Whatever ``openai_client.chat.completions.create`` returns for a
        ``gpt-4o`` request with ``temperature=0``.
    """
    request_options = {"messages": messages, "model": "gpt-4o", "temperature": 0}
    return openai_client.chat.completions.create(**request_options)


@traceable
def parse_output(response):
    """Return the message content of the first choice in ``response``."""
    first_choice = response.choices[0]
    return first_choice.message.content


@traceable
def run_pipeline():
    """Run prompt formatting, the model call, and output parsing for a fixed question.

    Returns:
        The value produced by ``parse_output`` for the model's response.
    """
    # Each step is its own traceable function, so the three calls are chained
    # explicitly here rather than collapsed into one helper.
    return parse_output(invoke_llm(format_prompt("ウェブサイトの回遊率")))


display(Markdown(run_pipeline()))

```sql
SELECT
  (SUM(CASE WHEN pageviews > 1 THEN 1 ELSE 0 END) / COUNT(*)) AS bounce_rate
FROM
  `your_dataset.your_table`
WHERE
  session_id IS NOT NULL;
```

openaiとのやり取りを可観測にするラッパー `wrap_openai` を使うと、モデルの情報などを簡単に取得可能になる。

In [4]:
from langsmith.wrappers import wrap_openai

wrap_openai_client = wrap_openai(openai.Client())


@traceable(name="run_pipeline with wrap_openai")
def run_pipeline_with_wrap_llm():
    """Same pipeline as ``run_pipeline``, but calling the wrapped OpenAI client directly.

    Returns:
        The value produced by ``parse_output`` for the model's response.
    """
    chat_messages = format_prompt("ウェブサイトの回遊率")
    completion = wrap_openai_client.chat.completions.create(
        model="gpt-4o", temperature=0, messages=chat_messages
    )
    return parse_output(completion)


display(Markdown(run_pipeline_with_wrap_llm()))

```sql
SELECT
  user_id,
  COUNT(DISTINCT page_id) AS pages_visited,
  COUNT(DISTINCT session_id) AS sessions,
  COUNT(DISTINCT page_id) / COUNT(DISTINCT session_id) AS page_views_per_session
FROM
  `your_project.your_dataset.your_table`
GROUP BY
  user_id
```

LCEL (LangChain Expression Language) を使えば、LangSmithでの観測が楽にできる。

In [5]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Chat model for the LCEL example.
# NOTE(review): the earlier cells call "gpt-4o" with temperature=0, while this
# one uses "gpt-4" with no temperature — confirm the difference is intentional.
model = ChatOpenAI(model="gpt-4")

# Prompt template: the fixed system instruction plus the caller's {question}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
        ),
        ("human", "{question}"),
    ]
)
# Parser placed at the end of the chain, after the model.
output_parser = StrOutputParser()

# Compose prompt -> model -> parser with the pipe operator, invoke the chain
# with the question text, and render the result as Markdown.
chain = prompt | model | output_parser
display(Markdown(chain.invoke("ウェブサイトの回遊率")))

下記のクエリは、ウェブサイトの各セッションにおけるページビュー数を計算します。これは、ウェブサイトの回遊率を理解するための一つの方法です。

```
SELECT
  sessionId,
  COUNT(pageViewId) as pageViews
FROM
  `project.dataset.table`
GROUP BY
  sessionId
```

ここで、`project.dataset.table`はあなたのBigQueryデータセットの具体的な場所を指します。また、`sessionId` と `pageViewId` はセッションIDとページビューIDを追跡するためのフィールド名を指します。これらのフィールド名は、あなたの具体的なデータ構造に基づいて変更する必要があるかもしれません。

## FeedbackとEvaluationを使った評価


In [6]:
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt


def mm(graph):
    """Display a Mermaid diagram inline using the mermaid.ink image endpoint.

    The diagram source is base64-encoded and appended to the
    ``https://mermaid.ink/img/`` URL, which is then shown as an image.

    Args:
        graph: Mermaid diagram source text.
    """
    # UTF-8 instead of ASCII: diagram labels containing non-ASCII characters
    # (e.g. Japanese text) would otherwise raise UnicodeEncodeError.
    graph_bytes = graph.encode("utf-8")
    # URL-safe alphabet: standard base64 can emit "+" and "/", and a "/" inside
    # the payload would be read as a path separator in the URL.
    encoded = base64.urlsafe_b64encode(graph_bytes).decode("ascii")
    display(Image(url="https://mermaid.ink/img/" + encoded))

In [7]:
# Overview diagram: LangSmith branches into Feedback and Evaluation.
# NOTE(review): ids B and C are declared both as plain nodes (with the longer
# labels) and as subgraphs — confirm the rendered labels are the intended ones.
# The "sub" classDef is defined but not applied to any node.
mm("""
graph TD
  A[LangSmith] --> B[User and Product Team Feedback]
  A --> C[Prepared Dataset Experiments]

  subgraph B[Feedback]
    B1[Annotate Runs] --> B2[Save Feedback]
  end

  subgraph C[Evaluation]
    C1[Input and Output] --> C2[Run Experiments]
  end

  classDef main fill:#f9f,stroke:#333,stroke-width:2px;
  classDef sub fill:#bbf,stroke:#333,stroke-width:2px;
  classDef detail fill:#fb3,stroke:#333,stroke-width:2px;

  class A,B,C main;
  class B1,B2,C1,C2 detail;   
   """)

### Feedback
- Traceされた実行の中に含まれるRunに、自分で定義したTagやKeyをAnnotate
  - trace_id1つに対して複数のrun_idが含まれる構造
  - 最初のrun_idはtrace_idと同一
- API経由のfeedbackではKey, 手動のfeedbackではTagでAnnotateする仕組みとなっているが、TagもKeyとして保存されている
  - ただし、API経由のfeedbackはrecordが追加・上書きできるのに対して、手動のfeedbackは上書きのみという違いがある
  - 数値データで同じキーのものは集計されて表示される
- LLMアプリを使っているユーザからのフィードバックは、基本的にAPI経由の登録となる


In [8]:
from langchain_openai import ChatOpenAI
from langchain import hub


# Chat model used by the agent built in the next cell.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Get the prompt to use - you can modify this!
prompt = hub.pull("hwchase17/openai-functions-agent")
# Show the message templates contained in the pulled prompt.
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [9]:
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor
from langchain_community.tools.tavily_search import TavilySearchResults

# Tavily search is the only tool handed to the agent.
search = TavilySearchResults()
tools = [search]

# Build the tool-calling agent from the model, tools and prompt, wrap it in an
# executor (verbose=True), and run a single question through it.
agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
agent_executor.invoke({"input": "how can langsmith help with testing?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'Langsmith testing features'}`


[0m[36;1m[1;3m[{'url': 'https://blog.logrocket.com/langsmith-test-llms-ai-applications/', 'content': 'This function helps to load the specific language models and tools required for the task as shown in the code snippet below:\nAs a next step, initialize an agent by calling the initialize_agent function with several parameters like tools, llms, and agent:\nThe verbose parameter is set to false, indicating that the agent will not provide verbose or detailed output.\n You can accomplish this by following the shell commands provided below:\nCreating a LangSmith client\nNext, create a LangSmith client to interact with the API:\nIf you’re using Python, run the following commands to import the module:\n This code also handles exceptions that may occur during the agent execution:\nIt’s also important to call the wait_for_all_tracers function 

{'input': 'how can langsmith help with testing?',
 'output': 'Langsmith offers a variety of features to help with testing, debugging, and evaluating Language Learning Models (LLMs) and AI applications. Here are some key aspects:\n\n1. **Automated Testing**: Langsmith supports setting up automated testing workflows that can be integrated into CI/CD pipelines. This ensures that your models are consistently evaluated and tested as part of your development process.\n\n2. **Dataset Creation and Management**: You can create and manage datasets within Langsmith. This includes labeled datasets for more comprehensive evaluation and unlabeled datasets for general testing.\n\n3. **Evaluation Metrics**: Langsmith provides built-in and custom evaluators to measure the correctness and performance of your models. This includes metrics like contextual accuracy and other custom criteria.\n\n4. **Prompt Management**: Langsmith allows you to refine, test, and version your prompts in one place. This helps

LangSmith上で手動でFeedbackを付与することができる

#### Runの情報を取得
LangSmithのAPIを使ってFeedbackを作成するためには、run idを取得する必要がある。

In [10]:
from langchain.callbacks import tracing_v2_enabled

# Run the agent inside tracing_v2_enabled() and keep the context object as
# `cb`; the following cells read the run URL and run id from it.
with tracing_v2_enabled() as cb:
    agent_executor.invoke({"input": "how can langsmith help with testing?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'Langsmith testing features'}`


[0m[36;1m[1;3m[{'url': 'https://www.datacamp.com/tutorial/introduction-to-langsmith', 'content': 'How it Works, Use Cases, Alternatives & More\nRichie Cotton\nHow AI is Changing Cybersecurity with Brian Murphy, CEO of ReliaQuest\nAdel Nehme\n32 min\nAn Introductory Guide to Fine-Tuning LLMs\nJosep Ferrer\n12 min\nSalesforce XGen-7B: A Step-by-Step Tutorial on Using And Fine-Tuning XGen-7B\nBex Tuychiev\n15 min\nGrow your data skills with DataCamp for Mobile\nMake progress on the go with our mobile courses and daily 5-minute coding challenges.\n For labeled datasets like the CSV dataset we uploaded, LangSmith offers more comprehensive evaluators for measuring the correctness of the response to a prompt:\nLet’s try the last one on our examples:\nCoTQA criterion returns a score called Contextual accuracy, as depicted in the GIF below (als

In [11]:
# Get the URL of the run
cb.get_run_url()

'https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/projects/p/dc1fff1d-e827-4ee4-b948-e71e0914f7dd/r/73c37ddb-d255-443d-a136-35eee99e5a58?poll=true'

In [12]:
# Get the run ID of the most recent run recorded by the tracing context
run_id = cb.latest_run.id
run_id

UUID('73c37ddb-d255-443d-a136-35eee99e5a58')

#### Feedbackの作成
run_idを指定してFeedbackを追加することができる。
keyは自分で任意の値を指定することが可能。

In [13]:
from langsmith import Client

# LangSmith client used for the feedback calls below.
ls_client = Client()

# Attach feedback with key "test-score" and score 1 to the traced run.
score_feedback1 = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-score", score=1
)
score_feedback1

Feedback(id=UUID('28f1db89-a530-4d41-beb4-0c143a9c8b95'), created_at=datetime.datetime(2024, 6, 26, 1, 52, 2, 559765, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 26, 1, 52, 2, 559768, tzinfo=datetime.timezone.utc), run_id=UUID('73c37ddb-d255-443d-a136-35eee99e5a58'), key='test-score', score=1, value=None, comment=None, correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

feedback_idを指定せずに同じキーでcreate_feedbackを呼ぶと、上書きではなく別のfeedbackとして保存される

In [14]:
# Same run and same key as score_feedback1, but no feedback_id is passed.
score_feedback2 = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-score", score=100
)
score_feedback2

Feedback(id=UUID('a210e562-3f1d-44b4-8711-46d2699a7dfb'), created_at=datetime.datetime(2024, 6, 26, 1, 52, 3, 237451, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 26, 1, 52, 3, 237463, tzinfo=datetime.timezone.utc), run_id=UUID('73c37ddb-d255-443d-a136-35eee99e5a58'), key='test-score', score=100, value=None, comment=None, correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

feedback_idを指定して、上書きすることも可能。

In [15]:
# Pass the id of score_feedback1 as feedback_id so that the existing record is
# targeted; the score is set to 500.
update_score_feedback1 = ls_client.create_feedback(
    run_id=cb.latest_run.id, feedback_id=score_feedback1.id, key="test-score", score=500
)
update_score_feedback1

Feedback(id=UUID('28f1db89-a530-4d41-beb4-0c143a9c8b95'), created_at=datetime.datetime(2024, 6, 26, 1, 52, 3, 910193, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 26, 1, 52, 3, 910200, tzinfo=datetime.timezone.utc), run_id=UUID('73c37ddb-d255-443d-a136-35eee99e5a58'), key='test-score', score=500, value=None, comment=None, correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

In [16]:
# Feedback carrying only a comment (no score) under the key "test-comment".
comment_feedback = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-comment", comment="test comment"
)
comment_feedback

Feedback(id=UUID('81bf1761-efef-404c-9bc0-20dfaa319e23'), created_at=datetime.datetime(2024, 6, 26, 1, 52, 5, 112288, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 26, 1, 52, 5, 112292, tzinfo=datetime.timezone.utc), run_id=UUID('73c37ddb-d255-443d-a136-35eee99e5a58'), key='test-comment', score=None, value=None, comment='test comment', correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

#### Feedbackの削除
feedback_idを指定して、作成したfeedbackを削除することも可能。

In [17]:
# Delete the feedback records created above, by id
ls_client.delete_feedback(score_feedback1.id)
ls_client.delete_feedback(comment_feedback.id)

In [18]:
# Fetch every feedback attached to the run with list_feedback and delete each one
for feedback in ls_client.list_feedback(run_ids=[cb.latest_run.id]):
    ls_client.delete_feedback(feedback.id)

### Evaluation
- DatasetにあらかじめInputとOutputの組み合わせからなるExampleを保存
- ExampleのInputを使ってLLMを実行し、得られたOutputを、Datasetに保存されているOutput (期待値) と比較して評価
- 評価の方法は、LangSmithがあらかじめ用意しているものか、カスタムで作成することができる
- 評価結果は、key (評価指標の名前), score (評価結果), commentとして残すことが可能


In [19]:
# Diagram of the evaluation pipeline: dataset examples supply inputs and
# expected outputs, the LLM produces run outputs, and evaluators turn both into
# an evaluation result made of key, score and comment.
# The "sub" classDef is defined but not applied to any node.
mm("""
graph TD
  subgraph EvaluationPipeline[Evaluation Pipeline]
    B[Datasets] --> C[Examples]
    C --> D[Inputs]
    C --> E[Expected Outputs]
    D --> F[LLM]
    F --> G[Run Outputs]
    E --> H[Evaluators]
    G --> H
    H --> I[Evaluation Result]

    subgraph I[Evaluation Result]
      direction LR
      I1[Key: metric name] -.- I2[Score: metric value] -.- I3[Comment: reasoning]
      I3 -.- I1
    end
  end

  classDef main fill:#f9f,stroke:#333,stroke-width:2px;
  classDef sub fill:#bbf,stroke:#333,stroke-width:2px;
  classDef detail fill:#fb3,stroke:#333,stroke-width:2px;

  class B,C,D,E,F,G,H,I main;
  class I1,I2,I3 detail;   
   """)

#### DatasetとExampleを作成
Datasetを作成し、Exampleを保存。

In [20]:
import textwrap

from langsmith import Client
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

ls_client = Client()  # LangSmith client

# Name of the dataset to (re)create
dataset_name = "SQL Samples"

# Delete an existing dataset of the same name so the cell can be re-run.
# The result of delete_dataset is not bound to `dataset`: that name is assigned
# by create_dataset immediately below, so the earlier binding was dead.
if ls_client.has_dataset(dataset_name=dataset_name):
    ls_client.delete_dataset(dataset_name=dataset_name)

dataset = ls_client.create_dataset(
    dataset_name, description="ML Workshop用のサンプルクエリ"
)

# Store the examples in the dataset. inputs[i] is paired with outputs[i]; each
# output holds the reference query and the table names the custom evaluator
# later looks for in the prediction.
# The last line of the second query was indented with full-width (U+3000)
# spaces; they are replaced with ASCII spaces so the stored SQL contains only
# ordinary whitespace.
ls_client.create_examples(
    inputs=[
        {"question": "MAUを取得"},
        {"question": "新規ユーザ数の推移"},
    ],
    outputs=[
        {
            "query": textwrap.dedent("""
           SELECT
               COUNT(DISTINCT user_id) AS monthly_active_users
           FROM
               `your_dataset.user_activities`
           WHERE
               activity_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH) AND CURRENT_DATE()
        """),
            "tables": ["user_activities"],
        },
        {
            "query": textwrap.dedent("""
            SELECT
                signup_date,
                COUNT(user_id) AS new_users
            FROM
                `your_dataset.user_activities`
            GROUP BY
                signup_date
            ORDER BY
                signup_date
        """),
            "tables": ["user_activities"],
        },
    ],
    dataset_id=dataset.id,
)

In [21]:
# Get the URL of the dataset
dataset.url

'https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/datasets/ec7e16e3-d0fe-4aa9-9b24-241327e6f1ce'

In [22]:
# Get the number of examples recorded on the dataset object
dataset.example_count

0

example_countが0となっているので、LangSmith Clientを使ってdatasetを読み直す

In [23]:
# Re-read the dataset through the client: the object returned by create_dataset
# still reported example_count == 0 after the examples were added.
dataset = ls_client.read_dataset(dataset_name=dataset_name)
dataset.example_count

2

In [24]:
# Add one more example (monthly conversion counts) to the existing dataset.
# NOTE(review): the reference query reads `your_dataset.your_table` while
# "tables" lists "user_activities" — confirm which table name is intended.
ls_client.create_examples(
    inputs=[
        {"question": "月ごとのCV数の推移"},
    ],
    outputs=[
        {
            "query": textwrap.dedent("""
           SELECT
               FORMAT_TIMESTAMP('%Y-%m', conv_date) AS conversion_month,
               COUNT(conv_id) AS conversions
           FROM
               `your_dataset.your_table`
           WHERE
               conv_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AND CURRENT_DATE()
           GROUP BY
               conversion_month
           ORDER BY
               conversion_month
        """),
            "tables": ["user_activities"],
        },
    ],
    dataset_id=dataset.id,
)

exampleを増やしたことにより、datasetのバージョンも変更されている

In [25]:
# Re-read the dataset and print its modification timestamp and example count.
dataset = ls_client.read_dataset(dataset_name=dataset_name)
print(f"""
modified_at: {dataset.modified_at}
example_count: {dataset.example_count}
""")


modified_at: 2024-06-26 01:52:09.950582+00:00
example_count: 3



In [26]:
# List the examples stored in the dataset, printing each question and its reference query
for example in ls_client.list_examples(dataset_name=dataset_name):
    print(f"""
question: {example.inputs["question"]}
query: {example.outputs["query"]}
    """)


question: 月ごとのCV数の推移
query: 
SELECT
    FORMAT_TIMESTAMP('%Y-%m', conv_date) AS conversion_month,
    COUNT(conv_id) AS conversions
FROM
    `your_dataset.your_table`
WHERE
    conv_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AND CURRENT_DATE()
GROUP BY
    conversion_month
ORDER BY
    conversion_month

    

question: MAUを取得
query: 
SELECT
    COUNT(DISTINCT user_id) AS monthly_active_users
FROM
    `your_dataset.user_activities`
WHERE
    activity_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH) AND CURRENT_DATE()

    

question: 新規ユーザ数の推移
query: 
SELECT
    signup_date,
    COUNT(user_id) AS new_users
FROM
    `your_dataset.user_activities`
GROUP BY
    signup_date
ORDER BY
　　 signup_date

    


#### Custom Evaluationを実行

In [27]:
# Examples are passed to `inputs` one at a time
def predict(inputs: dict) -> dict:
    """Generate a query for one dataset example.

    Args:
        inputs: The example's input dict; its ``question`` entry fills the
            prompt template.

    Returns:
        ``{"output": <text returned by the prompt | model | parser chain>}``.
    """
    chat_model = ChatOpenAI(model="gpt-4")
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
            ),
            # The human turn also tells the model which table to use.
            ("human", "{question}, tableはuser_activitiesを使います."),
        ]
    )
    pipeline = chat_prompt | chat_model | StrOutputParser()
    generated = pipeline.invoke(inputs)
    return {"output": generated}


# Custom evaluator
def must_have_user_activities(run: Run, example: Example) -> dict:
    """Check that the prediction mentions every table listed in the example.

    Args:
        run: The run produced for this example; the prediction is read from
            ``run.outputs["output"]``.
        example: The dataset example; the required table names are read from
            ``example.outputs["tables"]``.

    Returns:
        A dict with ``key`` (metric name), ``score`` (True when every required
        name occurs as a substring of the prediction) and ``comment``.
    """
    # Fall back to an empty dict when outputs is None, so that a run or example
    # without outputs is scored instead of raising AttributeError on .get().
    run_outputs = run.outputs or {}
    reference_outputs = example.outputs or {}

    prediction = run_outputs.get("output") or ""
    print(f"run id: {run.id}\n")
    required = reference_outputs.get("tables") or []  # must match the "tables" key stored in the example outputs
    print(required)
    print(prediction)
    # The score can be defined however you like; here it is a plain substring check.
    score = all(phrase in prediction for phrase in required)
    # Return key, score and comment
    return {
        "key": "must_have_user_activities",
        "score": score,
        "comment": "comment test",
    }


# Run the experiment: call predict on each example of the dataset and score the
# results with the custom evaluator.
experiment_results = evaluate(
    predict,
    data=dataset_name,  # The data to predict and grade over
    evaluators=[must_have_user_activities],  # The evaluators to score the results
    experiment_prefix="ml-workshop",  # A prefix for your experiment names to easily identify them
    # Free-form metadata passed along with the experiment
    metadata={
        "version": "1.0.0",
    },
)

View the evaluation results for experiment: 'ml-workshop-97d77c3e' at:
https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/datasets/ec7e16e3-d0fe-4aa9-9b24-241327e6f1ce/compare?selectedSessions=335a9383-c555-4d54-b4e2-658a4bc17c8c




0it [00:00, ?it/s]

run id: cf15f7b3-6b9d-41fa-a94e-704a2f64e5b5
run id: 602a44ee-fd61-4818-b532-0cc4feb9a80c

['user_activities']
```
SELECT 
  EXTRACT(YEAR FROM activity_date) AS year,
  EXTRACT(MONTH FROM activity_date) AS month,
  COUNT(DISTINCT user_id) AS MAU
FROM 
  `project_id.dataset_id.user_activities`
GROUP BY 
  year, 
  month
ORDER BY 
  year DESC, 
  month DESC
```

['user_activities']
```
SELECT 
  DATE(created_at) AS date, 
  COUNT(DISTINCT user_id) AS new_user_count
FROM 
  user_activities
WHERE 
  new_user = TRUE
GROUP BY 
  date
ORDER BY 
  date
```
run id: e9428f05-a949-4fea-af40-e52872d75d5f

['user_activities']
```
SELECT 
  FORMAT_TIMESTAMP('%Y-%m', TIMESTAMP_SECONDS(time)) as Month,
  COUNT(*) as CV 
FROM 
  `project.dataset.user_activities` 
WHERE 
  activity_type = 'conversion'
GROUP BY 
  Month
ORDER BY 
  Month
```
