
Commit

Async Eval Functions (#605)
hinthornw committed Apr 15, 2024
1 parent 2066011 commit 662ce5d
Showing 9 changed files with 319 additions and 38 deletions.
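
In short: evaluators passed to aevaluate (and aevaluate_existing) may now be plain coroutine functions, and DynamicRunEvaluator detects them and dispatches to an async path. Below is a minimal sketch of the new usage, adapted from the docstring example added in this commit; the dataset name, the apredict target, and the "from langsmith.evaluation import aevaluate" import are assumptions, and a configured LangSmith API key is assumed.

import asyncio

from langsmith.evaluation import aevaluate  # assumed export; see _arunner.py below
from langsmith.schemas import Example, Run


async def apredict(inputs: dict) -> dict:
    # Placeholder async target under evaluation.
    return {"output": "Yes"}


async def helpfulness(run: Run, example: Example) -> dict:
    # Row-level async evaluator; replace the sleep with a real LLM judge call.
    await asyncio.sleep(0.1)
    return {"score": run.outputs["output"] == "Yes"}


results = asyncio.run(
    aevaluate(
        apredict,
        data="my dataset name",  # placeholder: an existing LangSmith dataset
        evaluators=[helpfulness],
        experiment_prefix="My Helpful Experiment",
    )
)
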
1 change: 1 addition & 0 deletions python/langsmith/_testing.py
@@ -232,6 +232,7 @@ def unit(*args: Any, **kwargs: Any) -> Callable:
def decorator(func: Callable) -> Callable:
if inspect.iscoroutinefunction(func):

@functools.wraps(func)
async def async_wrapper(*test_args: Any, **test_kwargs: Any):
if disable_tracking:
return await func(*test_args, **test_kwargs)
5 changes: 3 additions & 2 deletions python/langsmith/client.py
@@ -3387,15 +3387,16 @@ def create_feedback(
The ID of the run to provide feedback for. Either the run_id OR
the project_id must be provided.
key : str
The name of the metric, tag, or 'aspect' this feedback is about.
The name of the metric or 'aspect' this feedback is about.
score : float or int or bool or None, default=None
The score to rate this run on the metric or aspect.
value : float or int or bool or str or dict or None, default=None
The display value or non-numeric value for this feedback.
correction : dict or None, default=None
The proper ground truth for this run.
comment : str or None, default=None
A comment about this feedback.
A comment about this feedback, such as a justification for the score or
chain-of-thought trajectory for an LLM judge.
source_info : Dict[str, Any] or None, default=None
Information about the source of this feedback.
feedback_source_type : FeedbackSourceType or str, default=FeedbackSourceType.API
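
The broadened comment wording points at the intended use: storing an LLM judge's reasoning next to its score. A hedged sketch of such a call, assuming a configured API key; the run ID and comment text are placeholders.

from uuid import UUID

from langsmith import Client

client = Client()
client.create_feedback(
    UUID("00000000-0000-0000-0000-000000000000"),  # placeholder: the UUID of the judged run
    key="helpfulness",
    score=1,
    comment="The answer addresses the question directly and cites the retrieved source.",
)
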
37 changes: 24 additions & 13 deletions python/langsmith/evaluation/_arunner.py
@@ -28,6 +28,7 @@
from langsmith._internal import _aiter as aitertools
from langsmith.beta import warn_beta
from langsmith.evaluation._runner import (
AEVALUATOR_T,
DATA_T,
EVALUATOR_T,
SUMMARY_EVALUATOR_T,
@@ -42,10 +43,7 @@
_resolve_experiment,
_wrap_summary_evaluators,
)
from langsmith.evaluation.evaluator import (
EvaluationResults,
RunEvaluator,
)
from langsmith.evaluation.evaluator import EvaluationResults, RunEvaluator

logger = logging.getLogger(__name__)

@@ -57,7 +55,7 @@ async def aevaluate(
target: Union[ATARGET_T, AsyncIterable[dict]],
/,
data: Union[DATA_T, AsyncIterable[schemas.Example], Iterable[schemas.Example]],
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
@@ -192,6 +190,24 @@ async def aevaluate(
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
Using Async evaluators:
>>> async def helpfulness(run: Run, example: Example):
... # Row-level evaluator for helpfulness.
... await asyncio.sleep(0.1) # Replace with your LLM API call
... return {"score": run.outputs["output"] == "Yes"}
>>> results = asyncio.run(
... aevaluate(
... apredict,
... data=dataset_name,
... evaluators=[helpfulness],
... summary_evaluators=[precision],
... experiment_prefix="My Helpful Experiment",
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
return await _aevaluate(
target,
@@ -210,7 +226,7 @@ async def aevaluate_existing(
async def aevaluate_existing(
experiment: Union[str, uuid.UUID],
/,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
max_concurrency: Optional[int] = None,
@@ -313,7 +329,7 @@ async def _aevaluate(
target: Union[ATARGET_T, AsyncIterable[dict], Iterable[schemas.Run]],
/,
data: Union[DATA_T, AsyncIterable[schemas.Example]],
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
@@ -482,12 +498,7 @@ async def awith_predictions(

async def awith_evaluators(
self,
evaluators: Sequence[
Union[
EVALUATOR_T,
RunEvaluator,
]
],
evaluators: Sequence[Union[EVALUATOR_T, AEVALUATOR_T]],
*,
max_concurrency: Optional[int] = None,
) -> _AsyncExperimentManager:
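
The same async evaluator can also be applied retroactively to a finished experiment through aevaluate_existing. A sketch reusing the helpfulness coroutine from the first example; the experiment name is a placeholder and the aevaluate_existing export is assumed.

import asyncio

from langsmith.evaluation import aevaluate_existing  # assumed export, alongside aevaluate

results = asyncio.run(
    aevaluate_existing(
        "My Helpful Experiment:1234",  # placeholder: name or UUID of a finished experiment
        evaluators=[helpfulness],      # the async evaluator defined in the first sketch
    )
)
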
9 changes: 8 additions & 1 deletion python/langsmith/evaluation/_runner.py
@@ -13,6 +13,7 @@
import uuid
from contextvars import copy_context
from typing import (
Awaitable,
Callable,
DefaultDict,
Dict,
@@ -60,6 +61,12 @@
RunEvaluator,
Callable[[schemas.Run, Optional[schemas.Example]], EvaluationResult],
]
AEVALUATOR_T = Union[
Callable[
[schemas.Run, Optional[schemas.Example]],
Awaitable[Union[EvaluationResult, EvaluationResults]],
],
]


def evaluate(
@@ -972,7 +979,7 @@ def _end(self) -> None:


def _resolve_evaluators(
evaluators: Sequence[EVALUATOR_T],
evaluators: Sequence[Union[EVALUATOR_T, RunEvaluator, AEVALUATOR_T]],
) -> Sequence[RunEvaluator]:
results = []
for evaluator in evaluators:
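
The new AEVALUATOR_T alias covers an async callable that takes a Run and an optional Example and awaits an EvaluationResult (or an EvaluationResults dict). A minimal function matching that shape; the key name and comparison logic are illustrative only.

from typing import Optional

from langsmith.evaluation.evaluator import EvaluationResult
from langsmith.schemas import Example, Run


async def exact_match(run: Run, example: Optional[Example]) -> EvaluationResult:
    # Compare the run's output against the example's reference output.
    expected = (example.outputs or {}).get("output") if example else None
    actual = (run.outputs or {}).get("output")
    return EvaluationResult(key="exact_match", score=actual is not None and actual == expected)
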
116 changes: 95 additions & 21 deletions python/langsmith/evaluation/evaluator.py
@@ -1,9 +1,10 @@
"""This module contains the evaluator classes for evaluating runs."""

import asyncio
import inspect
import uuid
from abc import abstractmethod
from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
from typing import Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast

from typing_extensions import TypedDict

@@ -101,6 +102,9 @@ async def aevaluate_run(
)


_RUNNABLE_OUTPUT = Union[EvaluationResult, EvaluationResults, dict]


class DynamicRunEvaluator(RunEvaluator):
"""A dynamic evaluator that wraps a function and transforms it into a `RunEvaluator`.
Expand All @@ -115,8 +119,16 @@ class DynamicRunEvaluator(RunEvaluator):
def __init__(
self,
func: Callable[
[Run, Optional[Example]], Union[EvaluationResult, EvaluationResults, dict]
[Run, Optional[Example]],
Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]],
],
# Async function to be used for async evaluation. Optional
afunc: Optional[
Callable[
[Run, Optional[Example]],
Awaitable[_RUNNABLE_OUTPUT],
]
] = None,
):
"""Initialize the DynamicRunEvaluator with a given function.
@@ -127,14 +139,24 @@ def __init__(
wraps(func)(self)
from langsmith import run_helpers # type: ignore

self.func = cast(
run_helpers.SupportsLangsmithExtra,
(
func
if run_helpers.is_traceable_function(func)
else run_helpers.traceable()(func)
),
)
if afunc is not None:
self.afunc = run_helpers.ensure_traceable(afunc)
self._name = getattr(afunc, "__name__", "DynamicRunEvaluator")
if inspect.iscoroutinefunction(func):
if afunc is not None:
raise TypeError(
"Func was provided as a coroutine function, but afunc was "
"also provided. If providing both, func should be a regular "
"function to avoid ambiguity."
)
self.afunc = run_helpers.ensure_traceable(func)
self._name = getattr(func, "__name__", "DynamicRunEvaluator")
else:
self.func = cast(
run_helpers.SupportsLangsmithExtra[_RUNNABLE_OUTPUT],
run_helpers.ensure_traceable(func),
)
self._name = getattr(func, "__name__", "DynamicRunEvaluator")

def _coerce_evaluation_result(
self,
Expand All @@ -149,7 +171,7 @@ def _coerce_evaluation_result(
try:
if "key" not in result:
if allow_no_key:
result["key"] = getattr(self.func, "__name__")
result["key"] = self._name
return EvaluationResult(**{"source_run_id": source_run_id, **result})
except ValidationError as e:
raise ValueError(
@@ -174,6 +196,30 @@ def _coerce_evaluation_results(
cast(dict, results), allow_no_key=True, source_run_id=source_run_id
)

def _format_result(
self,
result: Union[EvaluationResult, EvaluationResults, dict],
source_run_id: uuid.UUID,
) -> Union[EvaluationResult, EvaluationResults]:
if isinstance(result, EvaluationResult):
if not result.source_run_id:
result.source_run_id = source_run_id
return result
if not isinstance(result, dict):
raise ValueError(
f"Expected a dict, EvaluationResult, or EvaluationResults, got {result}"
)
return self._coerce_evaluation_results(result, source_run_id)

@property
def is_async(self) -> bool:
"""Check if the evaluator function is asynchronous.
Returns:
bool: True if the evaluator function is asynchronous, False otherwise.
"""
return hasattr(self, "afunc")

def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
@@ -188,6 +234,15 @@ def evaluate_run(
Returns:
Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
""" # noqa: E501
if not hasattr(self, "func"):
running_loop = asyncio.get_event_loop()
if running_loop.is_running():
raise RuntimeError(
"Cannot call `evaluate_run` on an async run evaluator from"
" within an running event loop. Use `aevaluate_run` instead."
)
else:
return running_loop.run_until_complete(self.aevaluate_run(run, example))
source_run_id = uuid.uuid4()
metadata: Dict[str, Any] = {"target_run_id": run.id}
if getattr(run, "session_id", None):
@@ -197,15 +252,34 @@
example,
langsmith_extra={"run_id": source_run_id, "metadata": metadata},
)
if isinstance(result, EvaluationResult):
if not result.source_run_id:
result.source_run_id = source_run_id
return result
if not isinstance(result, dict):
raise ValueError(
f"Expected a dict, EvaluationResult, or EvaluationResults, got {result}"
)
return self._coerce_evaluation_results(result, source_run_id)
return self._format_result(result, source_run_id)

async def aevaluate_run(self, run: Run, example: Optional[Example] = None):
"""Evaluate a run asynchronously using the wrapped async function.
This method directly invokes the wrapped async function with the
provided arguments.
Args:
run (Run): The run to be evaluated.
example (Optional[Example]): An optional example to be used
in the evaluation.
Returns:
Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
"""
if not hasattr(self, "afunc"):
return await super().aevaluate_run(run, example)
source_run_id = uuid.uuid4()
metadata: Dict[str, Any] = {"target_run_id": run.id}
if getattr(run, "session_id", None):
metadata["experiment"] = str(run.session_id)
result = await self.afunc(
run,
example,
langsmith_extra={"run_id": source_run_id, "metadata": metadata},
)
return self._format_result(result, source_run_id)

def __call__(
self, run: Run, example: Optional[Example] = None
@@ -231,7 +305,7 @@ def __repr__(self) -> str:

def run_evaluator(
func: Callable[
[Run, Optional[Example]], Union[EvaluationResult, EvaluationResults, dict]
[Run, Optional[Example]], Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]]
],
):
"""Create a run evaluator from a function.
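
With these changes, decorating an async function with run_evaluator yields a DynamicRunEvaluator whose afunc is set: is_async reports True, aevaluate_run awaits the wrapped coroutine, and evaluate_run still works outside a running event loop by driving the coroutine itself. A small sketch of that dispatch; the concision metric is made up.

import asyncio

from langsmith.evaluation.evaluator import run_evaluator


@run_evaluator
async def concision(run, example):
    await asyncio.sleep(0)  # stand-in for an async LLM judge call
    return {"key": "concision", "score": len(str(run.outputs or {})) < 500}


assert concision.is_async
# Inside async code:      result = await concision.aevaluate_run(run, example)
# Outside an event loop:  result = concision.evaluate_run(run, example)
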
8 changes: 8 additions & 0 deletions python/langsmith/run_helpers.py
@@ -113,6 +113,14 @@ def is_traceable_function(func: Callable) -> bool:
)


def ensure_traceable(func: Callable[..., R]) -> Callable[..., R]:
"""Ensure that a function is traceable."""
return cast(
SupportsLangsmithExtra,
(func if is_traceable_function(func) else traceable()(func)),
)


def is_async(func: Callable) -> bool:
"""Inspect function or wrapped function to see if it is async."""
return inspect.iscoroutinefunction(func) or (
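
ensure_traceable is the small convenience used above: it wraps a plain callable with traceable() unless it is already traceable, in which case it is returned unchanged. A quick illustration; note that tracing only actually records anything when LangSmith tracing is enabled.

from langsmith import run_helpers


def add(x: int, y: int) -> int:
    return x + y


traced_add = run_helpers.ensure_traceable(add)       # wrapped with @traceable
assert run_helpers.is_traceable_function(traced_add)
assert traced_add(1, 2) == 3                         # behaves like the original function
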
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.47"
version = "0.1.48"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <support@langchain.dev>"]
license = "MIT"
30 changes: 30 additions & 0 deletions python/tests/evaluation/test_evaluation.py
@@ -102,3 +102,33 @@ async def test_baz():
await asyncio.sleep(0.1)
expect(3 + 4).to_equal(7)
return 7


@unit
@pytest.mark.parametrize("x, y", [(1, 2), (2, 3)])
def test_foo_parametrized(x, y):
expect(x + y).to_be_greater_than(0)
return x + y


@unit(output_keys=["z"])
@pytest.mark.parametrize("x, y, z", [(1, 2, 3), (2, 3, 5)])
def test_bar_parametrized(x, y, z):
expect(x + y).to_equal(z)
return {"z": x + y}


@unit
@pytest.mark.parametrize("x, y", [(1, 2), (2, 3)])
async def test_foo_async_parametrized(x, y):
await asyncio.sleep(0.1)
expect(x + y).to_be_greater_than(0)
return x + y


@unit(output_keys=["z"])
@pytest.mark.parametrize("x, y, z", [(1, 2, 3), (2, 3, 5)])
async def test_bar_async_parametrized(x, y, z):
await asyncio.sleep(0.1)
expect(x + y).to_equal(z)
return {"z": x + y}
