In [4]:
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
import os

print(f"""Env Variables
LANGCHAIN_TRACING_V2: {os.environ["LANGCHAIN_TRACING_V2"]}
LANGCHAIN_PROJECT: {os.environ["LANGCHAIN_PROJECT"]}
LANGCHAIN_ENDPOINT: {os.environ["LANGCHAIN_ENDPOINT"]}
""")

Env Variables
LANGCHAIN_TRACING_V2: true
LANGCHAIN_PROJECT: machine-learning-workshop
LANGCHAIN_ENDPOINT: https://api.smith.langchain.com



## LangSmithのトレースの基本
traceableデコレータを使用することで、任意の関数の引数と返り値をLangSmithで確認できるようになる。

In [6]:
from IPython.display import display, Markdown
from langsmith import traceable
import openai

openai_client = openai.Client()


@traceable
def format_prompt(question):
    return [
        {
            "role": "system",
            "content": "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
        },
        {"role": "user", "content": f"{question}"},
    ]


@traceable(run_type="llm")
def invoke_llm(messages):
    return openai_client.chat.completions.create(
        messages=messages, model="gpt-4o", temperature=0
    )


@traceable
def parse_output(response):
    return response.choices[0].message.content


@traceable
def run_pipeline():
    messages = format_prompt("ウェブサイトの回遊率")
    response = invoke_llm(messages)
    return parse_output(response)


display(Markdown(run_pipeline()))

```sql
SELECT
  user_id,
  COUNT(DISTINCT page_id) AS pages_visited,
  COUNT(DISTINCT session_id) AS sessions,
  COUNT(DISTINCT page_id) / COUNT(DISTINCT session_id) AS page_views_per_session
FROM
  `your_dataset.your_table`
GROUP BY
  user_id
```

openaiとのやり取りの可観測にするラッパー `wrap_openai` を使うと詳細な情報を簡単に取得可能になる。

In [7]:
from langsmith.wrappers import wrap_openai

wrap_openai_client = wrap_openai(openai.Client())


@traceable(name="run_pipeline with wrap_openai")
def run_pipeline_with_wrap_llm():
    messages = format_prompt("ウェブサイトの回遊率")
    response = wrap_openai_client.chat.completions.create(
        messages=messages, model="gpt-4o", temperature=0
    )
    return parse_output(response)


display(Markdown(run_pipeline_with_wrap_llm()))

```sql
SELECT
  user_id,
  COUNT(DISTINCT page_id) AS pages_visited,
  COUNT(DISTINCT session_id) AS sessions,
  COUNT(DISTINCT page_id) / COUNT(DISTINCT session_id) AS page_views_per_session
FROM
  `your_dataset.your_table`
GROUP BY
  user_id
```

LCEL (LangChain Expression Language) を使えば、LangSmithでの観測が楽にできる。

In [8]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

model = ChatOpenAI(model="gpt-4")

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
        ),
        ("human", "{question}"),
    ]
)
output_parser = StrOutputParser()

chain = prompt | model | output_parser
display(Markdown(chain.invoke("ウェブサイトの回遊率")))

以下は、ユーザーの訪問回数とそのセッションの数をカウントするためのBigQuery SQLクエリです。

```sql
SELECT 
  fullVisitorId,
  COUNT(DISTINCT visitId) as sessions,
  COUNT(*) as total_page_views,
  ROUND(COUNT(*) / COUNT(DISTINCT visitId), 2) as avg_page_views_per_session
FROM 
  `project_id.dataset_id.table_id`
GROUP BY 
  fullVisitorId
ORDER BY 
  avg_page_views_per_session DESC
```

ここで、`project_id`, `dataset_id`, `table_id`は適切なプロジェクト、データセット、テーブルIDに置き換える必要があります。

このクエリでは、各ユーザー（`fullVisitorId`）のセッション数（`visitId`）、総ページビュー数、およびセッションあたりの平均ページビュー数を計算します。これにより、ウェブサイトの回遊率を把握することができます。

## Feedback

In [9]:
from langchain_openai import ChatOpenAI
from langchain import hub


llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Get the prompt to use - you can modify this!
prompt = hub.pull("hwchase17/openai-functions-agent")
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a helpful assistant')),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}')),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [10]:
from langchain.agents import create_tool_calling_agent
from langchain.agents import AgentExecutor
from langchain_community.tools.tavily_search import TavilySearchResults

search = TavilySearchResults()
tools = [search]

agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)
agent_executor.invoke({"input": "how can langsmith help with testing?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'how can LangSmith help with testing?'}`


[0m[36;1m[1;3m[{'url': 'https://www.datacamp.com/tutorial/introduction-to-langsmith', 'content': 'How it Works, Use Cases, Alternatives & More\nRichie Cotton\nHow AI is Changing Cybersecurity with Brian Murphy, CEO of ReliaQuest\nAdel Nehme\n32 min\nAn Introductory Guide to Fine-Tuning LLMs\nJosep Ferrer\n12 min\nSalesforce XGen-7B: A Step-by-Step Tutorial on Using And Fine-Tuning XGen-7B\nBex Tuychiev\n15 min\nGrow your data skills with DataCamp for Mobile\nMake progress on the go with our mobile courses and daily 5-minute coding challenges.\n For labeled datasets like the CSV dataset we uploaded, LangSmith offers more comprehensive evaluators for measuring the correctness of the response to a prompt:\nLet’s try the last one on our examples:\nCoTQA criterion returns a score called Contextual accuracy, as depicted in the GIF 

{'input': 'how can langsmith help with testing?',
 'output': 'LangSmith is a platform designed to assist with the development, testing, and deployment of Language Learning Models (LLMs) and AI applications. Here are some ways LangSmith can help with testing:\n\n1. **Comprehensive Evaluators**: LangSmith offers built-in and custom evaluators for measuring the correctness of responses to prompts. This is particularly useful for labeled datasets, where you can get detailed metrics like contextual accuracy.\n\n2. **Debugging and Iteration**: LangSmith makes it easy to debug and iterate on your prompts, chains, and other components. This helps in refining the model to achieve higher quality outputs.\n\n3. **Automated Testing**: You can run multiple prompts and evaluate their outputs in a single line of code using functions like `run_on_dataset`. This streamlines the testing process and makes it more efficient.\n\n4. **Exception Handling**: LangSmith provides tools for handling exceptions du

### Runの情報を取得

In [11]:
from langchain.callbacks import tracing_v2_enabled

with tracing_v2_enabled() as cb:
    agent_executor.invoke({"input": "how can langsmith help with testing?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'Langsmith testing features'}`


[0m[36;1m[1;3m[{'url': 'https://www.datacamp.com/tutorial/introduction-to-langsmith', 'content': 'How it Works, Use Cases, Alternatives & More\nRichie Cotton\nHow AI is Changing Cybersecurity with Brian Murphy, CEO of ReliaQuest\nAdel Nehme\n32 min\nAn Introductory Guide to Fine-Tuning LLMs\nJosep Ferrer\n12 min\nSalesforce XGen-7B: A Step-by-Step Tutorial on Using And Fine-Tuning XGen-7B\nBex Tuychiev\n15 min\nGrow your data skills with DataCamp for Mobile\nMake progress on the go with our mobile courses and daily 5-minute coding challenges.\n For labeled datasets like the CSV dataset we uploaded, LangSmith offers more comprehensive evaluators for measuring the correctness of the response to a prompt:\nLet’s try the last one on our examples:\nCoTQA criterion returns a score called Contextual accuracy, as depicted in the GIF below (als

In [12]:
cb.get_run_url()

'https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/projects/p/dc1fff1d-e827-4ee4-b948-e71e0914f7dd/r/d0b9dc17-c299-42e7-9159-5ea2f1814f06?poll=true'

In [13]:
run_id = cb.latest_run.id

### Feedbackの作成

In [14]:
from langsmith import Client

ls_client = Client()

score_feedback1 = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-score", score=1
)
score_feedback1

Feedback(id=UUID('964ca95e-9186-44a2-b0c4-36b63ec6fed2'), created_at=datetime.datetime(2024, 6, 25, 22, 35, 27, 683459, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 25, 22, 35, 27, 683463, tzinfo=datetime.timezone.utc), run_id=UUID('d0b9dc17-c299-42e7-9159-5ea2f1814f06'), key='test-score', score=1, value=None, comment=None, correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

In [16]:
score_feedback2 = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-score", score=10
)
score_feedback2

Feedback(id=UUID('62276b5f-f154-4625-8973-87ee2214a9d6'), created_at=datetime.datetime(2024, 6, 25, 22, 35, 35, 947858, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 25, 22, 35, 35, 947862, tzinfo=datetime.timezone.utc), run_id=UUID('d0b9dc17-c299-42e7-9159-5ea2f1814f06'), key='test-score', score=10, value=None, comment=None, correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

In [17]:
comment_feedback = ls_client.create_feedback(
    run_id=cb.latest_run.id, key="test-comment", comment="test comment"
)
comment_feedback

Feedback(id=UUID('835374bc-1867-44d7-867c-1f0471f5a491'), created_at=datetime.datetime(2024, 6, 25, 22, 35, 40, 463695, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 25, 22, 35, 40, 463697, tzinfo=datetime.timezone.utc), run_id=UUID('d0b9dc17-c299-42e7-9159-5ea2f1814f06'), key='test-comment', score=None, value=None, comment='test comment', correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}), session_id=None, comparative_experiment_id=None, feedback_group_id=None)

### Feedbackの削除

In [19]:
# 作成したfeedbackを削除
ls_client.delete_feedback(score_feedback1.id)
ls_client.delete_feedback(comment_feedback.id)

In [20]:
# まとめて消去することも可能
for feedback in ls_client.list_feedback(run_ids=[cb.latest_run.id]):
    ls_client.delete_feedback(feedback.id)

## Evaluation

In [21]:
import textwrap

from langsmith import Client
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

ls_client = Client()  # LangSmithのクライアント

# 作成するデータセット
dataset_name = "SQL Samples"

# データセットがあれば削除
if ls_client.has_dataset(dataset_name=dataset_name):
    dataset = ls_client.delete_dataset(dataset_name=dataset_name)

dataset = ls_client.create_dataset(
    dataset_name, description="ML Workshop用のサンプルクエリ"
)

# データセットにexampleを保存
ls_client.create_examples(
    inputs=[
        {"question": "MAUを取得"},
        {"question": "新規ユーザ数の推移"},
    ],
    outputs=[
        {
            "query": textwrap.dedent("""
           SELECT
               COUNT(DISTINCT user_id) AS monthly_active_users
           FROM
               `your_dataset.user_activities`
           WHERE
               activity_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH) AND CURRENT_DATE()
        """),
            "tables": ["user_activities"],
        },
        {
            "query": textwrap.dedent("""
            SELECT
                signup_date,
                COUNT(user_id) AS new_users
            FROM
                `your_dataset.user_activities`
            GROUP BY
                signup_date
            ORDER BY
            　　 signup_date
        """),
            "tables": ["user_activities"],
        },
    ],
    dataset_id=dataset.id,
)

In [22]:
dataset

Dataset(name='SQL Samples', description='ML Workshop用のサンプルクエリ', data_type=<DataType.kv: 'kv'>, id=UUID('b5c798dc-ea07-4213-97ae-7e2f3072513b'), created_at=datetime.datetime(2024, 6, 25, 22, 36, 11, 696756, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 25, 22, 36, 11, 696756, tzinfo=datetime.timezone.utc), example_count=0, session_count=0, last_session_start_time=None)

example_countが0となっているので、LangSmith Clientを使ってdatasetを読み直す

In [23]:
ls_client.read_dataset(dataset_name=dataset_name)

Dataset(name='SQL Samples', description='ML Workshop用のサンプルクエリ', data_type=<DataType.kv: 'kv'>, id=UUID('b5c798dc-ea07-4213-97ae-7e2f3072513b'), created_at=datetime.datetime(2024, 6, 25, 22, 36, 11, 696756, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2024, 6, 25, 22, 36, 11, 696756, tzinfo=datetime.timezone.utc), example_count=2, session_count=0, last_session_start_time=None)

In [24]:
dataset.url

'https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/datasets/b5c798dc-ea07-4213-97ae-7e2f3072513b'

In [25]:
ls_client.create_examples(
    inputs=[
        {"question": "月ごとのCV数の推移"},
    ],
    outputs=[
        {
            "query": textwrap.dedent("""
           SELECT
               FORMAT_TIMESTAMP('%Y-%m', conv_date) AS conversion_month,
               COUNT(conv_id) AS conversions
           FROM
               `your_dataset.your_table`
           WHERE
               conv_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AND CURRENT_DATE()
           GROUP BY
               conversion_month
           ORDER BY
               conversion_month
        """),
            "tables": ["user_activities"],
        },
    ],
    dataset_id=dataset.id,
)

exampleを増やしたことにより、datasetのバージョンも変更されている

In [26]:
ls_client.read_dataset(dataset_name=dataset_name).example_count

3

In [27]:
# datasetに保存されているexampleの一覧
for example in ls_client.list_examples(dataset_name=dataset_name):
    print(f"""
question: {example.inputs["question"]}
query: {example.outputs["query"]}
    """)


question: 月ごとのCV数の推移
query: 
SELECT
    FORMAT_TIMESTAMP('%Y-%m', conv_date) AS conversion_month,
    COUNT(conv_id) AS conversions
FROM
    `your_dataset.your_table`
WHERE
    conv_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 YEAR) AND CURRENT_DATE()
GROUP BY
    conversion_month
ORDER BY
    conversion_month

    

question: MAUを取得
query: 
SELECT
    COUNT(DISTINCT user_id) AS monthly_active_users
FROM
    `your_dataset.user_activities`
WHERE
    activity_date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH) AND CURRENT_DATE()

    

question: 新規ユーザ数の推移
query: 
SELECT
    signup_date,
    COUNT(user_id) AS new_users
FROM
    `your_dataset.user_activities`
GROUP BY
    signup_date
ORDER BY
　　 signup_date

    


## LangSmith Evaluation
### Custom Evaluation

In [28]:
# inputsにexampleが1つずつ渡される
def predict(inputs: dict) -> dict:
    model = ChatOpenAI(model="gpt-4")
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "あなたはBigQueryのエキスパートです. 出したいデータのクエリを作成してください. 出力はクエリのみで他の情報は不要です.",
            ),
            ("human", "{question}, tableはuser_activitiesを使います."),
        ]
    )
    output_parser = StrOutputParser()
    llm = prompt | model | output_parser
    return {"output": llm.invoke(inputs)}


# Custom Evaluation
def must_have_user_activities(run: Run, example: Example) -> dict:
    prediction = run.outputs.get("output") or ""
    print(f"run id: {run.id}\n")
    required = example.outputs.get("tables") or []  # outputsのキー (tables) と合わせる
    print(required)
    print(prediction)
    score = all(
        phrase in prediction for phrase in required
    )  # scoreは自分で定義したものでよい
    return {
        "key": "must_have_user_activities",
        "score": score,
        "comment": "comment test",
    }  # key, score, commentを返す


experiment_results = evaluate(
    predict,
    data=dataset_name,  # The data to predict and grade over
    evaluators=[must_have_user_activities],  # The evaluators to score the results
    experiment_prefix="ml-workshop",  # A prefix for your experiment names to easily identify them
    metadata={
        "version": "1.0.0",
    },
)

View the evaluation results for experiment: 'ml-workshop-5202c67c' at:
https://smith.langchain.com/o/bd14a154-65e7-52b4-bdce-b9a16d5e3513/datasets/b5c798dc-ea07-4213-97ae-7e2f3072513b/compare?selectedSessions=7ccb23a7-364e-447f-ae54-9813a475ab54




0it [00:00, ?it/s]

run id: ed966304-d7af-4848-a81c-6f46974f31a3
run id: 800fb134-2c72-455d-a9ae-0d209b684b3e

['user_activities']
```
SELECT 
  DATE(user_activities.timestamp) AS date, 
  COUNT(DISTINCT user_activities.user_id) AS new_users 
FROM 
  `your_project.your_dataset.user_activities` 
WHERE 
  user_activities.is_new_user = TRUE 
GROUP BY 
  date 
ORDER BY 
  date ASC;
```
run id: aa7a22af-6896-4dc1-bb46-b330a8d77f04

['user_activities']
```sql
SELECT 
  EXTRACT(YEAR FROM activity_date) AS year, 
  EXTRACT(MONTH FROM activity_date) AS month, 
  COUNT(DISTINCT user_id) AS MAU 
FROM 
  `project_id.dataset.user_activities` 
GROUP BY 
  year, 
  month 
ORDER BY 
  year DESC, 
  month DESC;
```
ここで、"project_id"と"dataset"は適切なプロジェクトIDとデータセットに置き換えてください。

['user_activities']
```
SELECT 
  FORMAT_TIMESTAMP('%Y-%m', timestamp) AS Month,
  COUNT(*) AS CV_Count
FROM 
  `project.dataset.user_activities`
WHERE 
  activity = 'CV'
GROUP BY 
  Month
ORDER BY 
  Month ASC
```
