### Evaluation & Observability of Agents

The ability to observe and evaluate an agent’s behavior is essential for:

- Debugging issues when tasks fail or produce suboptimal results
- Monitoring costs and performance in real time
- Improving reliability and safety through continuous feedback

In [1]:
# Install dependencies
# The commands below are commented out; uncomment and run them once if the
# packages are missing. `%pip` installs into the environment of the running kernel.
# %pip install 'smolagents[telemetry]'
# %pip install opentelemetry-sdk opentelemetry-exporter-otlp openinference-instrumentation-smolagents
# %pip install langfuse datasets 'smolagents[gradio]'

### Step 1: Instrument Our Agent

Configure the environment variables that connect the OpenTelemetry exporter to Langfuse.

In [2]:
import base64
import os
from getpass import getpass


def _require_secret(name: str) -> str:
    """Return the credential called ``name`` without storing it in the notebook.

    The value is taken from the environment variable ``name`` when it is set
    and non-empty; otherwise the user is asked for it through a non-echoing
    prompt.

    Args:
        name: Environment variable that holds the credential.

    Returns:
        The credential with surrounding whitespace removed.

    Raises:
        RuntimeError: If neither the environment nor the prompt yields a value.
    """
    value = (os.environ.get(name) or getpass(f"{name}: ")).strip()
    if not value:
        raise RuntimeError(f"{name} is required to send traces to Langfuse")
    return value


# Get your own keys from https://cloud.langfuse.com and provide them through
# the environment (or the prompt). Never paste real keys into the notebook:
# anything written here is saved and shared together with the file.
LANGFUSE_PUBLIC_KEY = _require_secret("LANGFUSE_PUBLIC_KEY")
LANGFUSE_SECRET_KEY = _require_secret("LANGFUSE_SECRET_KEY")
os.environ["LANGFUSE_PUBLIC_KEY"] = LANGFUSE_PUBLIC_KEY
os.environ["LANGFUSE_SECRET_KEY"] = LANGFUSE_SECRET_KEY

# Keys are tied to a region, so an already exported LANGFUSE_HOST wins.
#   🇪🇺 EU region example: https://cloud.langfuse.com
#   🇺🇸 US region example: https://us.cloud.langfuse.com (used when nothing is set)
LANGFUSE_HOST = (os.environ.get("LANGFUSE_HOST") or "https://us.cloud.langfuse.com").rstrip("/")
os.environ["LANGFUSE_HOST"] = LANGFUSE_HOST

# HTTP Basic credentials: base64 of "<public key>:<secret key>".
LANGFUSE_AUTH = base64.b64encode(
    f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()
).decode()

# Point the OTLP exporter at the Langfuse endpoint and attach the credentials.
os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = LANGFUSE_HOST + "/api/public/otel"
os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}"

# If a Hugging Face token is needed, export HF_TOKEN in the environment
# instead of assigning it in this cell.

In [3]:
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

# Provider that owns the span pipeline for this notebook.
trace_provider = TracerProvider()

# Hand every span to an OTLP exporter through a SimpleSpanProcessor.
span_processor = SimpleSpanProcessor(OTLPSpanExporter())
trace_provider.add_span_processor(span_processor)

# Register the provider globally and grab a tracer for this module.
trace.set_tracer_provider(trace_provider)
tracer = trace.get_tracer(__name__)

# Hook smolagents into the provider configured above.
SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)

### Step 2: Test Our Instrumentation

If everything is set up correctly, you will see logs/spans in your observability dashboard.

In [4]:
# Run a minimal, tool-less agent once to check the instrumentation.
from smolagents import CodeAgent, LiteLLMModel

# Model served locally and accessed through LiteLLM.
model = LiteLLMModel(
    api_base="http://127.0.0.1:11434",  # Default Ollama local server
    model_id="ollama_chat/qwen2.5-coder:7b",  # Or try other Ollama-supported models
    num_ctx=8192,
)

agent = CodeAgent(model=model, tools=[])

agent.run("1+1=")

2

### Step 3: Observe and Evaluate a More Complex Agent

In [5]:
# NOTE(review): InferenceClientModel is imported but not used in this cell.
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    InferenceClientModel,
)

# Same `model` as before, now paired with a DuckDuckGo search tool.
search_tool = DuckDuckGoSearchTool()
agent = CodeAgent(model=model, tools=[search_tool])

agent.run("Based on recent performances, which team do you think will win the 2025 NBA Finals?")


'Los Angeles Lakers'