In [None]:
%pip uninstall -y --no-cache-dir arize-phoenix openinference-instrumentation-langchain langchain-core langchain langchain_openai openai --quiet

In [None]:
%pip install --no-cache-dir arize-phoenix openinference-instrumentation-langchain langchain-core langchain langchain_openai openai --quiet

In [None]:
%pip show arize-phoenix

In [1]:
import os
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = "api_key=8ac45cfa86835d4e746:0fea70b"
os.environ["PHOENIX_CLIENT_HEADERS"] = "api_key=8ac45cfa86835d4e746:0fea70b"
#os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "https://app.phoenix.arize.com"

# Phoenix running on cloud

In [None]:
# Connect your application to Phoenix
# collect traces -> configure OpenTelemetry TracerProvider to send traces to Phoenix
# The register utility from the phoenix.otel module streamlines this process.
%pip install --no-cache-dir arize-phoenix-otel --quiet

**Any OpenTelemetry traces you generate will be sent to your Phoenix instance.**
(see below)



In [None]:
import os
from phoenix.otel import register

# configure the Phoenix tracer
tracer_provider = register(
  endpoint="https://app.phoenix.arize.com/v1/traces",
) 

In [None]:
%pip install --no-cache-dir openinference-instrumentation-openai openai --quiet

In [None]:
#https://app.phoenix.arize.com/projects ?
# ISSUES WITH: https://app.phoenix.arize.com/v1/traces

In [3]:
# Enable OpenAi integration
from openinference.instrumentation.openai import OpenAIInstrumentor

OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

In [None]:
import openai

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Write a haiku."}],
)
print(response.choices[0].message.content)

## Agent Function Calling Evaluation

In [None]:
from openinference.instrumentation.langchain import LangChainInstrumentor

LangChainInstrumentor().instrument(tracer_provider=tracer_provider)

# Now that you have tracing setup, all invocations of chains will be streamed to your running Phoenix
#  for observability and evaluation.

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

prompt = ChatPromptTemplate.from_template("{x} {y} {z}?").partial(x="why is", z="blue")
chain = prompt | ChatOpenAI(model_name="gpt-4o-mini")
chain.invoke(dict(y="sky"))



In [6]:
from langchain.agents import AgentType, initialize_agent
from langchain.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import tool
from langchain_openai import ChatOpenAI
import pandas as pd

import nest_asyncio
import pandas as pd
from phoenix.evals import (
    TOOL_CALLING_PROMPT_RAILS_MAP,
    TOOL_CALLING_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

In [7]:
## function definitions using pydantic decorator
from langchain.tools import tool

@tool
def product_comparison(product_a_id: str, product_b_id: str) -> dict:
    """
    Compare features of two products.

    Parameters:
    product_a_id (str): The unique identifier of Product A.
    product_b_id (str): The unique identifier of Product B.

    Returns:
    dict: A dictionary containing the comparison of the two products.
    """

    if product_a_id == "" or product_b_id == "":
        return {"error": "missing product id"}

    # Implement the function logic here
    return {"comparison": "Similar"}


@tool
def product_details(product_id: str) -> dict:
    """
    Get detailed features on one product.

    Parameters:
    product_id (str): The unique identifier of the Product.

    Returns:
    dict: A dictionary containing product details.
    """

    if product_id == "":
        return {"error": "missing product id"}

    # Implement the function logic here
    return {"name": "Product Name", "price": "$12.50", "Availability": "In Stock"}


@tool
def apply_discount_code(order_id: int, discount_code: str) -> dict:
    """
    Applies a discount code to an order.

    Parameters:
    order_id (str): The unique identifier of the order.
    discount_code (str): The discount code to apply.

    Returns:
    dict: A dictionary containing the updated order details.
    """

    if order_id == "" or discount_code == "":
        return {"error": "missing order id or discount code"}

    # Implement the function logic here
    return {"applied": "True"}


@tool
def product_search(
    query: str,
    category: str = None,
    min_price: float = 0.0,
    max_price: float = None,
    page: int = 1,
    page_size: int = 20,
) -> dict:
    """
    Search for products based on criteria.

    Parameters:
    query (str): The search query string.
    category (str, optional): The category to filter the search. Default is None.
    min_price (float, optional): The minimum price of the products to search. Default is 0.
    max_price (float, optional): The maximum price of the products to search. Default is None.
    page (int, optional): The page number for pagination. Default is 1.
    page_size (int, optional): The number of results per page. Default is 20.

    Returns:
    dict: A dictionary containing the search results and pagination info.
    """

    if query == "":
        return {"error": "missing query"}

    # Implement the function logic here
    return {"results": [], "pagination": {"total": 0, "page": 1, "page_size": 20}}


@tool
def customer_support(issue_type: str) -> dict:
    """
    Get contact information for customer support regarding an issue.

    Parameters:
    issue_type (str): The type of issue (e.g., billing, technical support).

    Returns:
    dict: A dictionary containing the contact information for customer support.
    """

    if issue_type == "":
        return {"error": "missing issue type"}

    # Implement the function logic here
    return {"contact": issue_type}


@tool
def track_package(tracking_number: int) -> dict:
    """
    Track the status of a package based on the tracking number.

    Parameters:
    tracking_number (str): The tracking number of the package.

    Returns:
    dict: A dictionary containing the tracking status of the package.
    """
    if tracking_number == "":
        return {"error": "missing tracking number"}

    # Implement the function logic here
    return {"status": "Delivered"}


tools = [
    product_comparison,
    product_search,
    customer_support,
    track_package,
    apply_discount_code,
    product_details,
]



In [8]:
GEN_TEMPLATE = """
You are an assistant that generates complex customer service questions. You will try to answer the question with the tool if possible,
do your best to answer, ask for more information only if needed.
The questions should often involve:

Please reference the product names, the product details, product IDS and product information.

Multiple Categories: Questions that could logically fall into more than one category (e.g., combining product details with a discount code).
Vague Details: Questions with limited or vague information that require clarification to categorize correctly.
Mixed Intentions: Queries where the customer’s goal or need is unclear or seems to conflict within the question itself.
Indirect Language: Use of indirect or polite phrasing that obscures the direct need or request (e.g., using "I was wondering if..." or "Perhaps you could help me with...").
For specific categories:

Track Package: Include vague timing references (e.g., "recently" or "a while ago") instead of specific dates.
Product Comparison and Product Search: Include generic descriptors without specific product names or IDs (e.g., "high-end smartphones" or "energy-efficient appliances").
Apply Discount Code: Include questions about discounts that might apply to hypothetical or past situations, or without mentioning if they have made a purchase.
Product Details: Ask for comparisons or details that involve multiple products or categories ambiguously (e.g., "Tell me about your range of electronics that are good for home office setups").
Examples of More Challenging Questions
Multiple Categories

"I recently bought a samsung 106i smart phone, and I was wondering if there's a way to check what deals I might have missed or if my order is on its way?"
"Could you tell me if the samsung 15H adapater in my last order are covered under warranty and if they have shipped yet?"
Vague Details

"There's an issue with one of the Vizio 14Y TV I think I bought last month—what should I do?"
"I need help with a iPhone 16H I ordered, or maybe I'm just looking for something new. Can you help?"
Mixed Intentions

"I'm not sure if I should ask for a refund or just find out when it will arrive. What do you suggest?"
"Could you help me decide whether to upgrade my product or just track the current one?"
Indirect Language

"I was wondering if you might assist me in figuring out a problem I have with an order, or maybe it's more of a query?"
"Perhaps you could help me understand the benefits of your premium products compared to the regular ones?"

Some questions should be straightforward uses of the provided functions

Respond with a list, one question per line. Do not include any numbering at the beginning of each line. Do not include any category headings.
Generate 5 questions.
"""


In [None]:
model = OpenAIModel(model="gpt-4o-mini", max_tokens=1300)
resp = model(GEN_TEMPLATE)
split_response = resp.strip().split("\n")

# Remove quotation marks from strings before creating DataFrame
clean_response = [s.strip('"').strip("'") for s in split_response]
questions_df = pd.DataFrame(clean_response, columns=["questions"])
print(questions_df)

In [None]:
llm = ChatOpenAI(model="gpt-4o-mini")
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant"),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)
agent_executor = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS)

questions_df["response"] = questions_df["questions"].apply(lambda x: agent_executor.invoke(x)['output'])

questions_df


### Now evaluate the tool calls!

In [11]:
from phoenix.trace import SpanEvaluations
from phoenix.trace.dsl import SpanQuery

# Evaluating:

    # Inputs
    # Outputs
    # Function call columns

# METHOD TO QUERY YOUR SPAN DATA
query = (
    SpanQuery()
    .where(
        # Filter for the `LLM` span kind.
        # The filter condition is a string of valid Python boolean expression.
        "span_kind == 'LLM'",
    )
    .select(
        # Extract and rename the following span attributes
        question="llm.input_messages",
        response="llm.output_messages",
        tool_call="llm.function_call",
    )
)


In [None]:
import phoenix as px
#px.launch_app()
trace_df = tracer_provider.query_spans(query)
trace_df["tool_call"] = trace_df["tool_call"].fillna("No tool used")

- pass in tool definitions to the evaluator
 
- best when json formatted

In [None]:
# @title JSON Function / Tool
json_tools = """
tools = [
    {
        "name": "product_comparison",
        "description": "Compare features of two products.",
        "parameters": {
            "type": "object",
            "properties": {
                "product_a_id": {
                    "type": "string",
                    "description": "The unique identifier of Product A."
                },
                "product_b_id": {
                    "type": "string",
                    "description": "The unique identifier of Product B."
                }
            },
            "required": ["product_a_id", "product_b_id"]
        }
    },
    {
        "name": "product_search",
        "description": "Search for products based on criteria.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The search query string."
                },
                "category": {
                    "type": "string",
                    "description": "The category to filter the search.",
                    "default": None
                },
                "min_price": {
                    "type": "number",
                    "description": "The minimum price of the products to search.",
                    "default": 0
                },
                "max_price": {
                    "type": "number",
                    "description": "The maximum price of the products to search.",
                    "default": None
                },
                "page": {
                    "type": "integer",
                    "description": "The page number for pagination.",
                    "default": 1
                },
                "page_size": {
                    "type": "integer",
                    "description": "The number of results per page.",
                    "default": 20
                }
            },
            "required": ["query"]
        }
    },
    {
        "name": "customer_support",
        "description": "Get contact information for customer support regarding an issue.",
        "parameters": {
            "type": "object",
            "properties": {
                "issue_type": {
                    "type": "string",
                    "description": "The type of issue (e.g., billing, technical support)."
                }
            },
            "required": ["issue_type"]
        }
    },
    {
        "name": "track_package",
        "description": "Track the status of a package based on the tracking number.",
        "parameters": {
            "type": "object",
            "properties": {
                "tracking_number": {
                    "type": "integer",
                    "description": "The tracking number of the package."
                }
            },
            "required": ["tracking_number"]
        }
    },
    {
        "name": "product_details",
        "description": "Returns details for a given product id",
        "parameters": {
            "type": "object",
            "properties": {
                "product_id": {
                    "type": "string",
                    "description": "The id of a product to look up."
                }
            },
            "required": ["product_id"]
        }
    },
    {
        "name": "apply_discount_code",
        "description": "Applies the discount code to a given order.",
        "parameters": {
            "type": "object",
            "properties": {
                "order_id": {
                    "type": "integer",
                    "description": "The id of the order to apply the discount code to."
                },
                "discount_code": {
                    "type": "string",
                    "description": "The discount code to apply
                }
            },
            "required": ["order_id, discount_code"]
        }
    }
]

"""

# **Phoenix running locally**

In [1]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
# http://0.0.0.0:6006/projects/UHJvamVjdDox
# steps:
# 1. install arize-phoenix
# 2. phoenix serve
# 3. connect to the application
# 3a. pip install arize-phoenix-otel
# 3b. from phoenix.otel import register ...

In [None]:
# in the terminal run: phoenix serve
# enter: http://0.0.0.0:6006/projects

In [2]:
import os
from phoenix.otel import register

# configure the Phoenix tracer
tracer_provider = register(
  endpoint="http://localhost:4317",  # Sends traces using gRPC
) 

  from .autonotebook import tqdm as notebook_tqdm


🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: default
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: localhost:4317
|  Transport: gRPC
|  Transport Headers: {'user-agent': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



## Generate traces using OpenAI

other integrations: https://docs.arize.com/phoenix/tracing/integrations-tracing

In [None]:
%pip install --no-cache-dir openinference-instrumentation-openai openai --quiet

In [5]:
from openinference.instrumentation.openai import OpenAIInstrumentor

OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

In [6]:
import openai, os

os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Write a haiku."}],
)
print(response.choices[0].message.content)

Whispering leaves fall,  
Autumn's gentle breath descends,  
Nature's quiet song.  


## Generate traces with Langchain

In [None]:
import langchain_core
print(langchain_core.__version__)

In [7]:
from openinference.instrumentation.langchain import LangChainInstrumentor

LangChainInstrumentor().instrument(tracer_provider=tracer_provider)

**Now that you have tracing setup, all invocations of chains will be streamed to your running Phoenix for observability and evaluation.**

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

prompt = ChatPromptTemplate.from_template("{x} {y} {z}?").partial(x="why is", z="blue")
chain = prompt | ChatOpenAI(model_name="gpt-4o-mini")
chain.invoke(dict(y="sky"))

In [4]:
from langchain.agents import AgentType, initialize_agent
from langchain.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import tool
from langchain_openai import ChatOpenAI
import pandas as pd

import nest_asyncio
import pandas as pd
from phoenix.evals import (
    TOOL_CALLING_PROMPT_RAILS_MAP,
    TOOL_CALLING_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

In [5]:
## function definitions using pydantic decorator
from langchain.tools import tool
@tool
def product_comparison(product_a_id: str, product_b_id: str) -> dict:
    """
    Compare features of two products.

    Parameters:
    product_a_id (str): The unique identifier of Product A.
    product_b_id (str): The unique identifier of Product B.

    Returns:
    dict: A dictionary containing the comparison of the two products.
    """

    if product_a_id == "" or product_b_id == "":
        return {"error": "missing product id"}

    # Implement the function logic here
    return {"comparison": "Similar"}


@tool
def product_details(product_id: str) -> dict:
    """
    Get detailed features on one product.

    Parameters:
    product_id (str): The unique identifier of the Product.

    Returns:
    dict: A dictionary containing product details.
    """

    if product_id == "":
        return {"error": "missing product id"}

    # Implement the function logic here
    return {"name": "Product Name", "price": "$12.50", "Availability": "In Stock"}


@tool
def apply_discount_code(order_id: int, discount_code: str) -> dict:
    """
    Applies a discount code to an order.

    Parameters:
    order_id (str): The unique identifier of the order.
    discount_code (str): The discount code to apply.

    Returns:
    dict: A dictionary containing the updated order details.
    """

    if order_id == "" or discount_code == "":
        return {"error": "missing order id or discount code"}

    # Implement the function logic here
    return {"applied": "True"}


@tool
def product_search(
    query: str,
    category: str = None,
    min_price: float = 0.0,
    max_price: float = None,
    page: int = 1,
    page_size: int = 20,
) -> dict:
    """
    Search for products based on criteria.

    Parameters:
    query (str): The search query string.
    category (str, optional): The category to filter the search. Default is None.
    min_price (float, optional): The minimum price of the products to search. Default is 0.
    max_price (float, optional): The maximum price of the products to search. Default is None.
    page (int, optional): The page number for pagination. Default is 1.
    page_size (int, optional): The number of results per page. Default is 20.

    Returns:
    dict: A dictionary containing the search results and pagination info.
    """

    if query == "":
        return {"error": "missing query"}

    # Implement the function logic here
    return {"results": [], "pagination": {"total": 0, "page": 1, "page_size": 20}}


@tool
def customer_support(issue_type: str) -> dict:
    """
    Get contact information for customer support regarding an issue.

    Parameters:
    issue_type (str): The type of issue (e.g., billing, technical support).

    Returns:
    dict: A dictionary containing the contact information for customer support.
    """

    if issue_type == "":
        return {"error": "missing issue type"}

    # Implement the function logic here
    return {"contact": issue_type}


@tool
def track_package(tracking_number: int) -> dict:
    """
    Track the status of a package based on the tracking number.

    Parameters:
    tracking_number (str): The tracking number of the package.

    Returns:
    dict: A dictionary containing the tracking status of the package.
    """
    if tracking_number == "":
        return {"error": "missing tracking number"}

    # Implement the function logic here
    return {"status": "Delivered"}


tools = [
    product_comparison,
    product_search,
    customer_support,
    track_package,
    apply_discount_code,
    product_details,
]

In [6]:
GEN_TEMPLATE = """
You are an assistant that generates complex customer service questions. You will try to answer the question with the tool if possible,
do your best to answer, ask for more information only if needed.
The questions should often involve:

Please reference the product names, the product details, product IDS and product information.

Multiple Categories: Questions that could logically fall into more than one category (e.g., combining product details with a discount code).
Vague Details: Questions with limited or vague information that require clarification to categorize correctly.
Mixed Intentions: Queries where the customer’s goal or need is unclear or seems to conflict within the question itself.
Indirect Language: Use of indirect or polite phrasing that obscures the direct need or request (e.g., using "I was wondering if..." or "Perhaps you could help me with...").
For specific categories:

Track Package: Include vague timing references (e.g., "recently" or "a while ago") instead of specific dates.
Product Comparison and Product Search: Include generic descriptors without specific product names or IDs (e.g., "high-end smartphones" or "energy-efficient appliances").
Apply Discount Code: Include questions about discounts that might apply to hypothetical or past situations, or without mentioning if they have made a purchase.
Product Details: Ask for comparisons or details that involve multiple products or categories ambiguously (e.g., "Tell me about your range of electronics that are good for home office setups").
Examples of More Challenging Questions
Multiple Categories

"I recently bought a samsung 106i smart phone, and I was wondering if there's a way to check what deals I might have missed or if my order is on its way?"
"Could you tell me if the samsung 15H adapater in my last order are covered under warranty and if they have shipped yet?"
Vague Details

"There's an issue with one of the Vizio 14Y TV I think I bought last month—what should I do?"
"I need help with a iPhone 16H I ordered, or maybe I'm just looking for something new. Can you help?"
Mixed Intentions

"I'm not sure if I should ask for a refund or just find out when it will arrive. What do you suggest?"
"Could you help me decide whether to upgrade my product or just track the current one?"
Indirect Language

"I was wondering if you might assist me in figuring out a problem I have with an order, or maybe it's more of a query?"
"Perhaps you could help me understand the benefits of your premium products compared to the regular ones?"

Some questions should be straightforward uses of the provided functions

Respond with a list, one question per line. Do not include any numbering at the beginning of each line. Do not include any category headings.
Generate 5 questions.
"""

model = OpenAIModel(model="gpt-4o-mini", max_tokens=1300)
resp = model(GEN_TEMPLATE)
split_response = resp.strip().split("\n")

# Remove quotation marks from strings before creating DataFrame
clean_response = [s.strip('"').strip("'") for s in split_response]
questions_df = pd.DataFrame(clean_response, columns=["questions"])
print(questions_df)

                                           questions
0  I was curious if there are any ongoing promoti...
1  Could you provide me with the specifications o...
2  I recently placed an order for a Dell XPS 13 l...
3  I was wondering if you could help me compare t...
4  There's a problem with my order for the Bose Q...


In [7]:
llm = ChatOpenAI(model="gpt-4o-mini")
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant"),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)
agent_executor = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS)

  agent_executor = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS)


In [8]:
questions_df["response"] = questions_df["questions"].apply(lambda x: agent_executor.invoke(x)['output'])

In [9]:
questions_df

Unnamed: 0,questions,response
0,I was curious if there are any ongoing promoti...,It looks like there are currently no ongoing p...
1,Could you provide me with the specifications o...,It seems that I couldn't find any specificatio...
2,I recently placed an order for a Dell XPS 13 l...,It depends on what you need right now:\n\n1. *...
3,I was wondering if you could help me compare t...,The comparison between the Apple Watch Series ...
4,There's a problem with my order for the Bose Q...,"To help you better, could you please provide m..."


## Evaluate the tool calls!

In [10]:
from phoenix.trace import SpanEvaluations
from phoenix.trace.dsl import SpanQuery
import phoenix as px

Evaluating: 

- inputs

- outputs

- function call columns

In [11]:
# METHOD TO QUERY YOUR SPAN DATA
query = (
    SpanQuery()
    .where(
        # Filter for the `LLM` span kind.
        # The filter condition is a string of valid Python boolean expression.
        "span_kind == 'LLM'",
    )
    .select(
        # Extract and rename the following span attributes
        question="llm.input_messages",
        response="llm.output_messages",
        tool_call="llm.function_call",
    )
)

trace_df = px.Client().query_spans(query)
trace_df["tool_call"] = trace_df["tool_call"].fillna("No tool used")

In [12]:
#trace_df.iloc[-1]['question']
trace_df

Unnamed: 0_level_0,question,response,tool_call
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d85a00a10dbba66e,"[{'message': {'content': 'Write a haiku.', 'fu...",[{'message': {'content': 'Whispers of the bree...,No tool used
933c4ec2439077d0,"[{'message': {'content': 'Write a haiku.', 'fu...",[{'message': {'content': 'Whispers of cool bre...,No tool used
5ea4441a9cea1370,"[{'message': {'content': 'why is sky blue?', '...",[{'message': {'content': 'The sky appears blue...,No tool used
c10cdf2bda98d73c,"[{'message': {'content': 'why is sky blue?', '...",[{'message': {'content': 'The sky appears blue...,No tool used
bc10a9b0ee25ecea,"[{'message': {'content': 'why is ocean blue?',...",[{'message': {'content': 'The ocean appears bl...,No tool used
...,...,...,...
5ea677e9e1557c22,[{'message': {'content': 'You are a helpful AI...,"[{'message': {'content': None, 'function_call_...","{""arguments"": {""query"": ""Samsung Galaxy Tab S7..."
38e587d5f40c858e,[{'message': {'content': 'You are a helpful AI...,"[{'message': {'content': None, 'function_call_...","{""arguments"": {""tracking_number"": 1}, ""name"": ..."
e1b87c6c4d8ff9b5,[{'message': {'content': 'You are a helpful AI...,[{'message': {'content': 'The Samsung Galaxy T...,No tool used
29d0aa8f0420024b,[{'message': {'content': 'You are a helpful AI...,"[{'message': {'content': None, 'function_call_...",No tool used


- pass in tool definitions to the evaluator

- best when json formatted

In [19]:
json_tools = """
{
    "name": "tool_definitions",
    "tools": [
        {
            "name": "product_comparison",
            "description": "Compare features of two products.",
            "parameters": {
                "type": "object",
                "properties": {
                    "product_a_id": {
                        "type": "string",
                        "description": "The unique identifier of Product A."
                    },
                    "product_b_id": {
                        "type": "string",
                        "description": "The unique identifier of Product B."
                    }
                },
                "required": ["product_a_id", "product_b_id"]
            }
        },
        {
            "name": "product_search",
            "description": "Search for products based on criteria.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The search query string."
                    },
                    "category": {
                        "type": "string",
                        "description": "The category to filter the search."
                    },
                    "min_price": {
                        "type": "number",
                        "description": "The minimum price of the products to search."
                    },
                    "max_price": {
                        "type": "number",
                        "description": "The maximum price of the products to search."
                    },
                    "page": {
                        "type": "integer",
                        "description": "The page number for pagination."
                    },
                    "page_size": {
                        "type": "integer",
                        "description": "The number of results per page."
                    }
                },
                "required": ["query"]
            }
        },
        {
            "name": "customer_support",
            "description": "Get contact information for customer support regarding an issue.",
            "parameters": {
                "type": "object",
                "properties": {
                    "issue_type": {
                        "type": "string",
                        "description": "The type of issue (e.g., billing, technical support)."
                    }
                },
                "required": ["issue_type"]
            }
        },
        {
            "name": "track_package",
            "description": "Track the status of a package based on the tracking number.",
            "parameters": {
                "type": "object",
                "properties": {
                    "tracking_number": {
                        "type": "integer",
                        "description": "The tracking number of the package."
                    }
                },
                "required": ["tracking_number"]
            }
        },
        {
            "name": "product_details",
            "description": "Returns details for a given product id",
            "parameters": {
                "type": "object",
                "properties": {
                    "product_id": {
                        "type": "string",
                        "description": "The id of a product to look up."
                    }
                },
                "required": ["product_id"]
            }
        },
        {
            "name": "apply_discount_code",
            "description": "Applies the discount code to a given order.",
            "parameters": {
                "type": "object",
                "properties": {
                    "order_id": {
                        "type": "integer",
                        "description": "The id of the order to apply the discount code to."
                    },
                    "discount_code": {
                        "type": "string",
                        "description": "The discount code to apply."
                    }
                },
                "required": ["order_id", "discount_code"]
            }
        }
    ],
    "version": "1.0"
}
"""

In [20]:
eval_model = OpenAIModel(model="gpt-4o-mini")

And we're ready to call our evaluator! The method below takes in the dataframe of traces to evaluate, our built in evaluation prompt, the eval model to use, and a rails object to snap responses from our model to a set of binary classification responses.

We'll also instruct our model to provide explanations for its responses.

In [21]:
rails = list(TOOL_CALLING_PROMPT_RAILS_MAP.values())
rails

['correct', 'incorrect']

In [22]:
response_classifications = llm_classify(
    dataframe=trace_df,
    template=TOOL_CALLING_PROMPT_TEMPLATE.template.replace("{tool_definitions}", json_tools),
    model=eval_model,
    rails=rails,
    provide_explanation=True,
)

llm_classify |          | 0/218 (0.0%) | ⏳ 16:49<? | ?it/s
llm_classify |          | 0/218 (0.0%) | ⏳ 00:00<? | ?it/s 

Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '\n    "name"'
Retries exhausted after 1 attempts: Missing template variable: '

In [23]:
response_classifications

Unnamed: 0_level_0,label,explanation,exceptions,execution_status,execution_seconds
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d85a00a10dbba66e,,,[PhoenixTemplateMappingError('Missing template...,MISSING INPUT,0.000905
933c4ec2439077d0,,,[PhoenixTemplateMappingError('Missing template...,MISSING INPUT,0.001736
5ea4441a9cea1370,,,[PhoenixTemplateMappingError('Missing template...,MISSING INPUT,0.002230
c10cdf2bda98d73c,,,[PhoenixTemplateMappingError('Missing template...,MISSING INPUT,0.002872
bc10a9b0ee25ecea,,,[PhoenixTemplateMappingError('Missing template...,MISSING INPUT,0.003491
...,...,...,...,...,...
5ea677e9e1557c22,,,[],DID NOT RUN,0.000000
38e587d5f40c858e,,,[],DID NOT RUN,0.000000
e1b87c6c4d8ff9b5,,,[],DID NOT RUN,0.000000
29d0aa8f0420024b,,,[],DID NOT RUN,0.000000
