Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add expect.score #631

Merged
merged 11 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions python/langsmith/_expect.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ def test_output_semantically_close():
expect.value(response_txt).to_contain("Hello!")
# Or using a custom check
expect.value(response_txt).against(lambda x: "Hello" in x)

# You can even use this for basic metric logging within unit tests

expect.score(0.8)
expect.score(0.7, key="similarity").to_be_greater_than(0.7)
""" # noqa: E501

from __future__ import annotations
Expand Down Expand Up @@ -72,7 +77,7 @@ def __init__(
max_workers=3
)
rt = rh.get_current_run_tree()
self._run_id = rt.id if rt else run_id
self._run_id = rt.trace_id if rt else run_id
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you're running this within a nested function, we'd prefer the feedback to be assigned to the root trace by default (since it's called beneath `@unit`).


def _submit_feedback(self, score: int, message: Optional[str] = None) -> None:
if not ls_utils.test_tracking_is_disabled():
Expand Down Expand Up @@ -336,6 +341,37 @@ def value(self, value: Any) -> _Matcher:
"""
return _Matcher(self.client, "value", value, _executor=self.executor)

def score(
    self,
    score: Union[float, int],
    *,
    key: str = "score",
    source_run_id: Optional[ls_client.ID_TYPE] = None,
    comment: Optional[str] = None,
) -> _Matcher:
    """Log a numeric score to LangSmith.

    The score is submitted as feedback through ``self._submit_feedback``
    and is also wrapped in a matcher so that assertions can be chained
    onto the same call.

    Args:
        score: The score value to log.
        key: The key to use for logging the score. Defaults to "score".
        source_run_id: Optional ID forwarded unchanged as the
            ``source_run_id`` field of the feedback payload.
        comment: Optional free-form text forwarded unchanged as the
            ``comment`` field of the feedback payload.

    Returns:
        A ``_Matcher`` over ``score`` registered under ``key``, sharing
        this object's client and executor.

    Examples:
        >>> expect.score(0.8)  # doctest: +ELLIPSIS
        <langsmith._expect._Matcher object at ...>

        >>> expect.score(0.8, key="similarity").to_be_greater_than(0.7)
    """
    # ``source_run_id`` and ``comment`` are always included in the payload,
    # even when they are None, exactly as the caller supplied them.
    self._submit_feedback(
        key,
        {
            "score": score,
            "source_info": {"method": "expect.score"},
            "source_run_id": source_run_id,
            "comment": comment,
        },
    )
    return _Matcher(self.client, key, score, _executor=self.executor)

## Private Methods

@overload
Expand All @@ -354,7 +390,7 @@ def __call__(

def _submit_feedback(self, key: str, results: dict):
current_run = rh.get_current_run_tree()
run_id = current_run.id if current_run else None
run_id = current_run.trace_id if current_run else None
if not ls_utils.test_tracking_is_disabled():
self.executor.submit(
self.client.create_feedback, run_id=run_id, key=key, **results
Expand Down
6 changes: 5 additions & 1 deletion python/langsmith/_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,11 @@ def _end_tests(
test_suite.client.update_project(
test_suite.experiment_id,
end_time=datetime.datetime.now(datetime.timezone.utc),
metadata={**git_info, "dataset_version": test_suite.get_version()},
metadata={
**git_info,
"dataset_version": test_suite.get_version(),
"revision_id": ls_env.get_langchain_env_var_metadata().get("revision_id"),
},
)
test_suite.wait()

Expand Down
26 changes: 13 additions & 13 deletions python/langsmith/run_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import functools
import inspect
import logging
import traceback
import uuid
import warnings
from contextvars import copy_context
Expand Down Expand Up @@ -448,8 +447,7 @@ async def async_wrapper(
):
function_result = await fr_coro
except BaseException as e:
stacktrace = traceback.format_exc()
_container_end(run_container, error=stacktrace)
_container_end(run_container, error=e)
raise e
_container_end(run_container, outputs=function_result)
return function_result
Expand Down Expand Up @@ -521,8 +519,7 @@ async def async_generator_wrapper(
except StopAsyncIteration:
pass
except BaseException as e:
stacktrace = traceback.format_exc()
_container_end(run_container, error=stacktrace)
_container_end(run_container, error=e)
raise e
if results:
if reduce_fn:
Expand Down Expand Up @@ -564,8 +561,7 @@ def wrapper(
func, *args, **kwargs
)
except BaseException as e:
stacktrace = traceback.format_exc()
_container_end(run_container, error=stacktrace)
_container_end(run_container, error=e)
raise e
_container_end(run_container, outputs=function_result)
return function_result
Expand Down Expand Up @@ -620,8 +616,7 @@ def generator_wrapper(
pass

except BaseException as e:
stacktrace = traceback.format_exc()
_container_end(run_container, error=stacktrace)
_container_end(run_container, error=e)
raise e
if results:
if reduce_fn:
Expand Down Expand Up @@ -712,7 +707,7 @@ def trace(
else:
new_run = run_trees.RunTree(
name=name,
run_id=run_id,
id=run_id or uuid.uuid4(),
reference_example_id=reference_example_id,
run_type=run_type,
extra=extra_outer,
Expand All @@ -730,7 +725,8 @@ def trace(
if exceptions_to_handle and isinstance(e, exceptions_to_handle):
tb = None
else:
tb = traceback.format_exc()
tb = utils._format_exc()
tb = f"{e.__class__.__name__}: {e}\n\n{tb}"
new_run.end(error=tb)
new_run.patch()
raise e
Expand Down Expand Up @@ -930,15 +926,19 @@ class _ContainerInput(TypedDict, total=False):
def _container_end(
container: _TraceableContainer,
outputs: Optional[Any] = None,
error: Optional[str] = None,
error: Optional[BaseException] = None,
):
"""End the run."""
run_tree = container.get("new_run")
if run_tree is None:
# Tracing disabled: no run was created, so there is nothing to end
return
outputs_ = outputs if isinstance(outputs, dict) else {"output": outputs}
run_tree.end(outputs=outputs_, error=error)
error_ = None
if error:
stacktrace = utils._format_exc()
error_ = f"{repr(error)}\n\n{stacktrace}"
run_tree.end(outputs=outputs_, error=error_)
run_tree.patch()
if error:
try:
Expand Down
9 changes: 9 additions & 0 deletions python/langsmith/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
import os
import pathlib
import subprocess
import sys
import threading
import traceback
from typing import (
Any,
Callable,
Expand Down Expand Up @@ -488,3 +490,10 @@ def with_optional_cache(
yield
else:
yield


def _format_exc() -> str:
    """Format the exception currently being handled, hiding internal frames.

    Used internally to format exceptions without cluttering the traceback:
    every formatted traceback entry that mentions a ``langsmith`` package
    path is dropped, so the reported trace shows the caller's own frames.

    Returns:
        The remaining traceback entries joined into a single string. The
        entries come from ``traceback.format_exception(*sys.exc_info())``,
        so this is meant to be called from inside an ``except`` block.
    """
    # Match both POSIX ("langsmith/") and Windows ("langsmith\") separators;
    # checking only the forward slash leaves internal frames in on Windows.
    internal_markers = ("langsmith/", "langsmith\\")
    tb_lines = traceback.format_exception(*sys.exc_info())
    return "".join(
        line
        for line in tb_lines
        if not any(marker in line for marker in internal_markers)
    )
2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.51"
version = "0.1.52"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <support@langchain.dev>"]
license = "MIT"
Expand Down
11 changes: 9 additions & 2 deletions python/tests/unit_tests/test_run_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import json
import sys
import time
import uuid
import warnings
from typing import Any, AsyncGenerator, Generator, Optional, cast
from unittest.mock import MagicMock, patch
Expand Down Expand Up @@ -473,8 +474,12 @@ async def some_async_func(queries: list) -> AsyncGenerator[list, None]:

@traceable
async def another_async_func(query: str) -> str:
with langsmith.trace(name="zee-cm", inputs={"query": query}) as run_tree:
rid = uuid.uuid4()
with langsmith.trace(
name="zee-cm", inputs={"query": query}, run_id=rid
) as run_tree:
run_tree.end(outputs={"query": query})
assert run_tree.id == rid
return query

@traceable
Expand Down Expand Up @@ -848,11 +853,13 @@ def child_fn(a: int, b: int) -> int:

mock_client_ = _get_mock_client()
with tracing_context(enabled=True):
rid = uuid.uuid4()
with langsmith.trace(
name="parent_fn", inputs={"a": 1, "b": 2}, client=mock_client_
name="parent_fn", inputs={"a": 1, "b": 2}, client=mock_client_, run_id=rid
hinthornw marked this conversation as resolved.
Show resolved Hide resolved
) as run:
result = child_fn(1, 2)
run.end(outputs={"result": result})
assert run.id == rid

assert result == 3
assert run.name == "parent_fn"
Expand Down
Loading