# LLM Evaluation and Tracing with W&B

<!--- @wandbcode{dlai_04} -->

## 1. 평가에 테이블 사용

이 섹션에서는 OpenAI LLM을 호출하여 게임 에셋의 이름을 생성하겠습니다. 생성된 에셋을 평가하기 위해 W&B 테이블을 사용하겠습니다.

In [1]:
import os
import random
import time
import datetime

import openai

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential, # for exponential backoff
)  
import wandb
from wandb.sdk.data_types.trace_tree import Trace

In [2]:
# get openai API key
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [3]:
PROJECT = "dlai_llm"
MODEL_NAME = "gpt-3.5-turbo"

In [4]:
wandb.login(anonymous="allow")

[34m[1mwandb[0m: Currently logged in as: [33mkimwooglae[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
run = wandb.init(project=PROJECT, job_type="generation")

### 간단한 생성
먼저 OpenAI `ChatCompletion`을 사용하여 게임 에셋의 이름을 생성하고 결과 생성물을 W&B 테이블에 저장해 보겠습니다.

In [6]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)

In [7]:
def generate_and_print(system_prompt, user_prompt, table, n=5):
    messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    start_time = time.time()
    responses = completion_with_backoff(
        model=MODEL_NAME,
        messages=messages,
        n = n,
        )
    elapsed_time = time.time() - start_time
    for response in responses.choices:
        generation = response.message.content
        print(generation)
    table.add_data(system_prompt,
                user_prompt,
                [response.message.content for response in responses.choices],
                elapsed_time,
                datetime.datetime.fromtimestamp(responses.created),
                responses.model,
                responses.usage.prompt_tokens,
                responses.usage.completion_tokens,
                responses.usage.total_tokens
                )

In [8]:
system_prompt = """You are a creative copywriter.
You're given a category of game asset, \
and your goal is to design a name of that asset.
The game is set in a fantasy world \
where everyone laughs and respects each other, 
while celebrating diversity."""

In [9]:
# Define W&B Table to store generations
columns = ["system_prompt", "user_prompt", "generations", "elapsed_time", "timestamp",\
            "model", "prompt_tokens", "completion_tokens", "total_tokens"]
table = wandb.Table(columns=columns)

In [10]:
user_prompt = "hero"
generate_and_print(system_prompt, user_prompt, table)

Harmony's Valor
Joybringer
Name: Harmony's Valor
Unity's Pride
Name: Unity's Champion


In [11]:
user_prompt = "jewel"
generate_and_print(system_prompt, user_prompt, table)

Gleamverse Gems
Gem Harmony
Chromatic Splendor
Rainbow Gems
Gleamstone


In [12]:
wandb.log({"simple_generations": table})
run.finish()

## 2. 트레이서를 사용하여 더 복잡한 체인 기록하기

어떻게 하면 더 창의적인 결과물을 얻을 수 있을까요? 먼저 판타지 세계를 무작위로 선택한 다음 캐릭터 이름을 생성하는 LLM 체인을 설계해 보겠습니다. 이 시나리오에서 트레이서를 사용하는 방법을 보여드리겠습니다. 입력 및 출력, 시작 및 종료 시간, OpenAI 호출 성공 여부, 토큰 사용량, 추가 메타데이터를 기록할 것입니다.

In [13]:
worlds = [
    "a mystic medieval island inhabited by intelligent and funny frogs",
    "a modern castle sitting on top of a volcano in a faraway galaxy",
    "a digital world inhabited by friendly machine learning engineers"
]

In [14]:
# define your config
model_name = "gpt-3.5-turbo"
temperature = 0.7
system_message = """You are a creative copywriter. 
You're given a category of game asset and a fantasy world.
Your goal is to design a name of that asset.
Provide the resulting name only, no additional description.
Single name, max 3 words output, remember!"""

In [15]:
def run_creative_chain(query):
    # part 1 - a chain is started...
    start_time_ms = round(datetime.datetime.now().timestamp() * 1000)

    root_span = Trace(
          name="MyCreativeChain",
          kind="chain",
          start_time_ms=start_time_ms,
          metadata={"user": "student_1"},
          model_dict={"_kind": "CreativeChain"}
          )

    # part 2 - your chain picks a fantasy world
    time.sleep(3)
    world = random.choice(worlds)
    expanded_prompt = f'Game asset category: {query}; fantasy world description: {world}'
    tool_end_time_ms = round(datetime.datetime.now().timestamp() * 1000)

    # create a Tool span 
    tool_span = Trace(
          name="WorldPicker",
          kind="tool",
          status_code="success",
          start_time_ms=start_time_ms,
          end_time_ms=tool_end_time_ms,
          inputs={"input": query},
          outputs={"result": expanded_prompt},
          model_dict={"_kind": "tool", "num_worlds": len(worlds)}
          )

    # add the TOOL span as a child of the root
    root_span.add_child(tool_span)

    # part 3 - the LLMChain calls an OpenAI LLM...
    messages=[
      {"role": "system", "content": system_message},
      {"role": "user", "content": expanded_prompt}
    ]

    response = completion_with_backoff(model=model_name,
                                       messages=messages,
                                       max_tokens=12,
                                       temperature=temperature)   

    llm_end_time_ms = round(datetime.datetime.now().timestamp() * 1000)
    response_text = response["choices"][0]["message"]["content"]
    token_usage = response["usage"].to_dict()

    llm_span = Trace(
          name="OpenAI",
          kind="llm",
          status_code="success",
          metadata={"temperature":temperature,
                    "token_usage": token_usage, 
                    "model_name":model_name},
          start_time_ms=tool_end_time_ms,
          end_time_ms=llm_end_time_ms,
          inputs={"system_prompt":system_message, "query":expanded_prompt},
          outputs={"response": response_text},
          model_dict={"_kind": "Openai", "engine": response["model"], "model": response["object"]}
          )

    # add the LLM span as a child of the Chain span...
    root_span.add_child(llm_span)

    # update the end time of the Chain span
    root_span.add_inputs_and_outputs(
          inputs={"query":query},
          outputs={"response": response_text})

    # update the Chain span's end time
    root_span.end_time_ms = llm_end_time_ms


    # part 4 - log all spans to W&B by logging the root span
    root_span.log(name="creative_trace")
    print(f"Result: {response_text}")


In [16]:
# Let's start a new wandb run
wandb.init(project=PROJECT, job_type="generation")

In [17]:
run_creative_chain("hero")

Result: Croakmaster


In [18]:
run_creative_chain("jewel")

Result: Volcano Gem


In [19]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

## Langchain agent

세 번째 시나리오에서는 월드피커와 네임밸류데이터와 같은 도구를 사용해 최종적인 이름을 도출하는 에이전트를 소개합니다. 또한 여기서는 Langchain을 사용하여 W&B 통합을 시연할 것입니다.

In [20]:
# Import things that are needed generically
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.tools import BaseTool

from typing import Optional

from langchain.callbacks.manager import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)

In [21]:
wandb.init(project=PROJECT, job_type="generation")

In [22]:
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"

In [26]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "http://localhost:1984"
os.environ["LANGCHAIN_PROJECT"] = "W_and_B"

In [27]:
class WorldPickerTool(BaseTool):
    name = "pick_world"
    description = "pick a virtual game world for your character or item naming"
    worlds = [
                "a mystic medieval island inhabited by intelligent and funny frogs",
                "a modern anthill featuring a cyber-ant queen and her cyber-ant-workers",
                "a digital world inhabited by friendly machine learning engineers"
            ]

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Use the tool."""
        time.sleep(1)
        return random.choice(self.worlds)

    async def _arun(
        self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError("pick_world does not support async")
        
class NameValidatorTool(BaseTool):
    name = "validate_name"
    description = "validate if the name is properly generated"

    def _run(
        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        """Use the tool."""
        time.sleep(1)
        if len(query) < 20:
            return f"This is a correct name: {query}"
        else:
            return f"This name is too long. It should be shorter than 20 characters."

    async def _arun(
        self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
    ) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError("validate_name does not support async")

In [28]:
llm = ChatOpenAI(temperature=0.7)

In [29]:
tools = [WorldPickerTool(), NameValidatorTool()]
agent = initialize_agent(
    tools, 
    llm, 
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    handle_parsing_errors=True,
    verbose=True
)

In [30]:
agent.run(
    "Find a virtual game world for me and imagine the name of a hero in that world"
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to pick a virtual game world first and then come up with a name for a hero in that world.
Action: pick_world
Action Input: None[0m
Observation: [36;1m[1;3ma mystic medieval island inhabited by intelligent and funny frogs[0m
Thought:[32;1m[1;3mI have picked a virtual game world that is a mystic medieval island inhabited by intelligent and funny frogs. Now I need to generate a name for a hero in that world.
Action: validate_name
Action Input: None[0m
Observation: [33;1m[1;3mThis is a correct name: None[0m
Thought:[32;1m[1;3mI need to generate a name for a hero in the virtual game world "mystic medieval island inhabited by intelligent and funny frogs".
Action: validate_name
Action Input: "Frogbert the Wise"[0m
Observation: [33;1m[1;3mThis is a correct name: Frogbert the Wise[0m
Thought:[32;1m[1;3mI have generated a name for a hero in the virtual game world "mystic medieval island inhabited by intelligent

'Frogbert the Wise'

In [34]:
agent.run(
    "Find a virtual game world for me and imagine the name of a jewel in that world"
)

[34m[1mwandb[0m: Streaming LangChain activity to W&B at https://wandb.ai/kimwooglae/DeepLearning.AI-Evaluating%20and%20Debugging%20Generative%20AI/runs/08ld68z3
[34m[1mwandb[0m: `WandbTracer` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `langchain`.




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to pick a virtual game world first and then come up with a name for a jewel in that world.[0m
Observation: Invalid Format: Missing 'Action:' after 'Thought:'
Thought:[32;1m[1;3mI need to pick a virtual game world first and then come up with a name for a jewel in that world.
Action: pick_world
Action Input: None[0m
Observation: [36;1m[1;3ma modern anthill featuring a cyber-ant queen and her cyber-ant-workers[0m
Thought:[32;1m[1;3mI have picked a virtual game world called "a modern anthill featuring a cyber-ant queen and her cyber-ant-workers". Now I need to come up with a name for a jewel in that world.
Action: validate_name
Action Input: "Cyber Shard"[0m
Observation: [33;1m[1;3mThis is a correct name: Cyber Shard[0m
Thought:[32;1m[1;3mI now know the final answer
Final Answer: In the virtual game world "a modern anthill featuring a cyber-ant queen and her cyber-ant-workers", the name of the jewel is "Cybe

'In the virtual game world "a modern anthill featuring a cyber-ant queen and her cyber-ant-workers", the name of the jewel is "Cyber Shard".'

In [32]:
agent.run(
    "Find a virtual game world for me and imagine the name of food in that world."
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI need to pick a virtual game world and generate a name for food in that world.
Action: pick_world
Action Input: None[0m
Observation: [36;1m[1;3ma modern anthill featuring a cyber-ant queen and her cyber-ant-workers[0m
Thought:[32;1m[1;3mI have picked a virtual game world called "Cyber Ant Hill" for the name of food in that world.
Action: validate_name
Action Input: Cyber Ant Hill Food[0m
Observation: [33;1m[1;3mThis is a correct name: Cyber Ant Hill Food[0m
Thought:[32;1m[1;3mI now know the final answer.
Final Answer: The name of the food in the virtual game world "Cyber Ant Hill" is "Cyber Ant Hill Food".[0m

[1m> Finished chain.[0m


'The name of the food in the virtual game world "Cyber Ant Hill" is "Cyber Ant Hill Food".'

In [33]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…