
Commit

Async Eval Functions (#605)
hinthornw committed Apr 15, 2024
1 parent 2066011 commit 662ce5d
Showing 9 changed files with 319 additions and 38 deletions.
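
In short: evaluators passed to aevaluate (and aevaluate_existing) may now be plain coroutine functions, and DynamicRunEvaluator detects them and dispatches to an async path. Below is a minimal sketch of the new usage, adapted from the docstring example added in this commit; the dataset name, the apredict target, and the "from langsmith.evaluation import aevaluate" import are assumptions, and a configured LangSmith API key is assumed.

import asyncio

from langsmith.evaluation import aevaluate  # assumed export; see _arunner.py below
from langsmith.schemas import Example, Run


async def apredict(inputs: dict) -> dict:
    # Placeholder async target under evaluation.
    return {"output": "Yes"}


async def helpfulness(run: Run, example: Example) -> dict:
    # Row-level async evaluator; replace the sleep with a real LLM judge call.
    await asyncio.sleep(0.1)
    return {"score": run.outputs["output"] == "Yes"}


results = asyncio.run(
    aevaluate(
        apredict,
        data="my dataset name",  # placeholder: an existing LangSmith dataset
        evaluators=[helpfulness],
        experiment_prefix="My Helpful Experiment",
    )
)
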
1 change: 1 addition & 0 deletions python/langsmith/_testing.py
@@ -232,6 +232,7 @@ def unit(*args: Any, **kwargs: Any) -> Callable:
def decorator(func: Callable) -> Callable:
if inspect.iscoroutinefunction(func):

@functools.wraps(func)
async def async_wrapper(*test_args: Any, **test_kwargs: Any):
if disable_tracking:
return await func(*test_args, **test_kwargs)
5 changes: 3 additions & 2 deletions python/langsmith/client.py
@@ -3387,15 +3387,16 @@ def create_feedback(
The ID of the run to provide feedback for. Either the run_id OR
the project_id must be provided.
key : str
The name of the metric, tag, or 'aspect' this feedback is about.
The name of the metric or 'aspect' this feedback is about.
score : float or int or bool or None, default=None
The score to rate this run on the metric or aspect.
value : float or int or bool or str or dict or None, default=None
The display value or non-numeric value for this feedback.
correction : dict or None, default=None
The proper ground truth for this run.
comment : str or None, default=None
A comment about this feedback.
A comment about this feedback, such as a justification for the score or
chain-of-thought trajectory for an LLM judge.
source_info : Dict[str, Any] or None, default=None
Information about the source of this feedback.
feedback_source_type : FeedbackSourceType or str, default=FeedbackSourceType.API
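
The broadened comment wording points at the intended use: storing an LLM judge's reasoning next to its score. A hedged sketch of such a call, assuming a configured API key; the run ID and comment text are placeholders.

from uuid import UUID

from langsmith import Client

client = Client()
client.create_feedback(
    UUID("00000000-0000-0000-0000-000000000000"),  # placeholder: the UUID of the judged run
    key="helpfulness",
    score=1,
    comment="The answer addresses the question directly and cites the retrieved source.",
)
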
37 changes: 24 additions & 13 deletions python/langsmith/evaluation/_arunner.py
@@ -28,6 +28,7 @@
from langsmith._internal import _aiter as aitertools
from langsmith.beta import warn_beta
from langsmith.evaluation._runner import (
AEVALUATOR_T,
DATA_T,
EVALUATOR_T,
SUMMARY_EVALUATOR_T,
@@ -42,10 +43,7 @@
_resolve_experiment,
_wrap_summary_evaluators,
)
from langsmith.evaluation.evaluator import (
EvaluationResults,
RunEvaluator,
)
from langsmith.evaluation.evaluator import EvaluationResults, RunEvaluator

logger = logging.getLogger(__name__)

@@ -57,7 +55,7 @@ async def aevaluate(
target: Union[ATARGET_T, AsyncIterable[dict]],
/,
data: Union[DATA_T, AsyncIterable[schemas.Example], Iterable[schemas.Example]],
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
@@ -192,6 +190,24 @@ async def aevaluate(
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
Using Async evaluators:
>>> async def helpfulness(run: Run, example: Example):
... # Row-level evaluator for helpfulness.
... await asyncio.sleep(0.1) # Replace with your LLM API call
... return {"score": run.outputs["output"] == "Yes"}
>>> results = asyncio.run(
... aevaluate(
... apredict,
... data=dataset_name,
... evaluators=[helpfulness],
... summary_evaluators=[precision],
... experiment_prefix="My Helpful Experiment",
... )
... ) # doctest: +ELLIPSIS
View the evaluation results for experiment:...
""" # noqa: E501
return await _aevaluate(
target,
@@ -210,7 +226,7 @@ async def aevaluate_existing(
async def aevaluate_existing(
experiment: Union[str, uuid.UUID],
/,
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
max_concurrency: Optional[int] = None,
@@ -313,7 +329,7 @@ async def _aevaluate(
target: Union[ATARGET_T, AsyncIterable[dict], Iterable[schemas.Run]],
/,
data: Union[DATA_T, AsyncIterable[schemas.Example]],
evaluators: Optional[Sequence[EVALUATOR_T]] = None,
evaluators: Optional[Sequence[Union[EVALUATOR_T, AEVALUATOR_T]]] = None,
summary_evaluators: Optional[Sequence[SUMMARY_EVALUATOR_T]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
@@ -482,12 +498,7 @@ async def awith_predictions(

async def awith_evaluators(
self,
evaluators: Sequence[
Union[
EVALUATOR_T,
RunEvaluator,
]
],
evaluators: Sequence[Union[EVALUATOR_T, AEVALUATOR_T]],
*,
max_concurrency: Optional[int] = None,
) -> _AsyncExperimentManager:
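
The same async evaluator can also be applied retroactively to a finished experiment through aevaluate_existing. A sketch reusing the helpfulness coroutine from the first example; the experiment name is a placeholder and the aevaluate_existing export is assumed.

import asyncio

from langsmith.evaluation import aevaluate_existing  # assumed export, alongside aevaluate

results = asyncio.run(
    aevaluate_existing(
        "My Helpful Experiment:1234",  # placeholder: name or UUID of a finished experiment
        evaluators=[helpfulness],      # the async evaluator defined in the first sketch
    )
)
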
9 changes: 8 additions & 1 deletion python/langsmith/evaluation/_runner.py
@@ -13,6 +13,7 @@
import uuid
from contextvars import copy_context
from typing import (
Awaitable,
Callable,
DefaultDict,
Dict,
@@ -60,6 +61,12 @@
RunEvaluator,
Callable[[schemas.Run, Optional[schemas.Example]], EvaluationResult],
]
AEVALUATOR_T = Union[
Callable[
[schemas.Run, Optional[schemas.Example]],
Awaitable[Union[EvaluationResult, EvaluationResults]],
],
]


def evaluate(
@@ -972,7 +979,7 @@ def _end(self) -> None:


def _resolve_evaluators(
evaluators: Sequence[EVALUATOR_T],
evaluators: Sequence[Union[EVALUATOR_T, RunEvaluator, AEVALUATOR_T]],
) -> Sequence[RunEvaluator]:
results = []
for evaluator in evaluators:
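
The new AEVALUATOR_T alias covers an async callable that takes a Run and an optional Example and awaits an EvaluationResult (or an EvaluationResults dict). A minimal function matching that shape; the key name and comparison logic are illustrative only.

from typing import Optional

from langsmith.evaluation.evaluator import EvaluationResult
from langsmith.schemas import Example, Run


async def exact_match(run: Run, example: Optional[Example]) -> EvaluationResult:
    # Compare the run's output against the example's reference output.
    expected = (example.outputs or {}).get("output") if example else None
    actual = (run.outputs or {}).get("output")
    return EvaluationResult(key="exact_match", score=actual is not None and actual == expected)
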
116 changes: 95 additions & 21 deletions python/langsmith/evaluation/evaluator.py
@@ -1,9 +1,10 @@
"""This module contains the evaluator classes for evaluating runs."""

import asyncio
import inspect
import uuid
from abc import abstractmethod
from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast
from typing import Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast

from typing_extensions import TypedDict

@@ -101,6 +102,9 @@ async def aevaluate_run(
)


_RUNNABLE_OUTPUT = Union[EvaluationResult, EvaluationResults, dict]


class DynamicRunEvaluator(RunEvaluator):
"""A dynamic evaluator that wraps a function and transforms it into a `RunEvaluator`.
Expand All @@ -115,8 +119,16 @@ class DynamicRunEvaluator(RunEvaluator):
def __init__(
self,
func: Callable[
[Run, Optional[Example]], Union[EvaluationResult, EvaluationResults, dict]
[Run, Optional[Example]],
Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]],
],
# Async function to be used for async evaluation. Optional
afunc: Optional[
Callable[
[Run, Optional[Example]],
Awaitable[_RUNNABLE_OUTPUT],
]
] = None,
):
"""Initialize the DynamicRunEvaluator with a given function.
@@ -127,14 +139,24 @@ def __init__(
wraps(func)(self)
from langsmith import run_helpers # type: ignore

self.func = cast(
run_helpers.SupportsLangsmithExtra,
(
func
if run_helpers.is_traceable_function(func)
else run_helpers.traceable()(func)
),
)
if afunc is not None:
self.afunc = run_helpers.ensure_traceable(afunc)
self._name = getattr(afunc, "__name__", "DynamicRunEvaluator")
if inspect.iscoroutinefunction(func):
if afunc is not None:
raise TypeError(
"Func was provided as a coroutine function, but afunc was "
"also provided. If providing both, func should be a regular "
"function to avoid ambiguity."
)
self.afunc = run_helpers.ensure_traceable(func)
self._name = getattr(func, "__name__", "DynamicRunEvaluator")
else:
self.func = cast(
run_helpers.SupportsLangsmithExtra[_RUNNABLE_OUTPUT],
run_helpers.ensure_traceable(func),
)
self._name = getattr(func, "__name__", "DynamicRunEvaluator")

def _coerce_evaluation_result(
self,
Expand All @@ -149,7 +171,7 @@ def _coerce_evaluation_result(
try:
if "key" not in result:
if allow_no_key:
result["key"] = getattr(self.func, "__name__")
result["key"] = self._name
return EvaluationResult(**{"source_run_id": source_run_id, **result})
except ValidationError as e:
raise ValueError(
@@ -174,6 +196,30 @@ def _coerce_evaluation_results(
cast(dict, results), allow_no_key=True, source_run_id=source_run_id
)

def _format_result(
self,
result: Union[EvaluationResult, EvaluationResults, dict],
source_run_id: uuid.UUID,
) -> Union[EvaluationResult, EvaluationResults]:
if isinstance(result, EvaluationResult):
if not result.source_run_id:
result.source_run_id = source_run_id
return result
if not isinstance(result, dict):
raise ValueError(
f"Expected a dict, EvaluationResult, or EvaluationResults, got {result}"
)
return self._coerce_evaluation_results(result, source_run_id)

@property
def is_async(self) -> bool:
"""Check if the evaluator function is asynchronous.
Returns:
bool: True if the evaluator function is asynchronous, False otherwise.
"""
return hasattr(self, "afunc")

def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> Union[EvaluationResult, EvaluationResults]:
@@ -188,6 +234,15 @@ def evaluate_run(
Returns:
Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
""" # noqa: E501
if not hasattr(self, "func"):
running_loop = asyncio.get_event_loop()
if running_loop.is_running():
raise RuntimeError(
"Cannot call `evaluate_run` on an async run evaluator from"
" within an running event loop. Use `aevaluate_run` instead."
)
else:
return running_loop.run_until_complete(self.aevaluate_run(run, example))
source_run_id = uuid.uuid4()
metadata: Dict[str, Any] = {"target_run_id": run.id}
if getattr(run, "session_id", None):
@@ -197,15 +252,34 @@
example,
langsmith_extra={"run_id": source_run_id, "metadata": metadata},
)
if isinstance(result, EvaluationResult):
if not result.source_run_id:
result.source_run_id = source_run_id
return result
if not isinstance(result, dict):
raise ValueError(
f"Expected a dict, EvaluationResult, or EvaluationResults, got {result}"
)
return self._coerce_evaluation_results(result, source_run_id)
return self._format_result(result, source_run_id)

async def aevaluate_run(self, run: Run, example: Optional[Example] = None):
"""Evaluate a run asynchronously using the wrapped async function.
This method directly invokes the wrapped async function with the
provided arguments.
Args:
run (Run): The run to be evaluated.
example (Optional[Example]): An optional example to be used
in the evaluation.
Returns:
Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
"""
if not hasattr(self, "afunc"):
return await super().aevaluate_run(run, example)
source_run_id = uuid.uuid4()
metadata: Dict[str, Any] = {"target_run_id": run.id}
if getattr(run, "session_id", None):
metadata["experiment"] = str(run.session_id)
result = await self.afunc(
run,
example,
langsmith_extra={"run_id": source_run_id, "metadata": metadata},
)
return self._format_result(result, source_run_id)

def __call__(
self, run: Run, example: Optional[Example] = None
@@ -231,7 +305,7 @@ def __repr__(self) -> str:

def run_evaluator(
func: Callable[
[Run, Optional[Example]], Union[EvaluationResult, EvaluationResults, dict]
[Run, Optional[Example]], Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]]
],
):
"""Create a run evaluator from a function.
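
With these changes, decorating an async function with run_evaluator yields a DynamicRunEvaluator whose afunc is set: is_async reports True, aevaluate_run awaits the wrapped coroutine, and evaluate_run still works outside a running event loop by driving the coroutine itself. A small sketch of that dispatch; the concision metric is made up.

import asyncio

from langsmith.evaluation.evaluator import run_evaluator


@run_evaluator
async def concision(run, example):
    await asyncio.sleep(0)  # stand-in for an async LLM judge call
    return {"key": "concision", "score": len(str(run.outputs or {})) < 500}


assert concision.is_async
# Inside async code:      result = await concision.aevaluate_run(run, example)
# Outside an event loop:  result = concision.evaluate_run(run, example)
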
8 changes: 8 additions & 0 deletions python/langsmith/run_helpers.py
@@ -113,6 +113,14 @@ def is_traceable_function(func: Callable) -> bool:
)


def ensure_traceable(func: Callable[..., R]) -> Callable[..., R]:
"""Ensure that a function is traceable."""
return cast(
SupportsLangsmithExtra,
(func if is_traceable_function(func) else traceable()(func)),
)


def is_async(func: Callable) -> bool:
"""Inspect function or wrapped function to see if it is async."""
return inspect.iscoroutinefunction(func) or (
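
ensure_traceable is the small convenience used above: it wraps a plain callable with traceable() unless it is already traceable, in which case it is returned unchanged. A quick illustration; note that tracing only actually records anything when LangSmith tracing is enabled.

from langsmith import run_helpers


def add(x: int, y: int) -> int:
    return x + y


traced_add = run_helpers.ensure_traceable(add)       # wrapped with @traceable
assert run_helpers.is_traceable_function(traced_add)
assert traced_add(1, 2) == 3                         # behaves like the original function
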
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.47"
version = "0.1.48"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <support@langchain.dev>"]
license = "MIT"
30 changes: 30 additions & 0 deletions python/tests/evaluation/test_evaluation.py
@@ -102,3 +102,33 @@ async def test_baz():
await asyncio.sleep(0.1)
expect(3 + 4).to_equal(7)
return 7


@unit
@pytest.mark.parametrize("x, y", [(1, 2), (2, 3)])
def test_foo_parametrized(x, y):
expect(x + y).to_be_greater_than(0)
return x + y


@unit(output_keys=["z"])
@pytest.mark.parametrize("x, y, z", [(1, 2, 3), (2, 3, 5)])
def test_bar_parametrized(x, y, z):
expect(x + y).to_equal(z)
return {"z": x + y}


@unit
@pytest.mark.parametrize("x, y", [(1, 2), (2, 3)])
async def test_foo_async_parametrized(x, y):
await asyncio.sleep(0.1)
expect(x + y).to_be_greater_than(0)
return x + y


@unit(output_keys=["z"])
@pytest.mark.parametrize("x, y, z", [(1, 2, 3), (2, 3, 5)])
async def test_bar_async_parametrized(x, y, z):
await asyncio.sleep(0.1)
expect(x + y).to_equal(z)
return {"z": x + y}
