From d550ab0b4f974cc9e420bdbb1f5e29d1e7e811f2 Mon Sep 17 00:00:00 2001 From: Kathryn May Date: Mon, 10 Nov 2025 12:21:36 -0500 Subject: [PATCH 1/4] Add doc for reading experiment results locally --- pipeline/preprocessors/link_map.py | 5 + src/langsmith/local.mdx | 4 + .../read-local-experiment-results.mdx | 163 ++++++++++++++++++ 3 files changed, 172 insertions(+) create mode 100644 src/langsmith/read-local-experiment-results.mdx diff --git a/pipeline/preprocessors/link_map.py b/pipeline/preprocessors/link_map.py index 0ee24f6ae0..52cf529fb5 100644 --- a/pipeline/preprocessors/link_map.py +++ b/pipeline/preprocessors/link_map.py @@ -212,6 +212,11 @@ class LinkMap(TypedDict): "on_llm_new_token": "langchain_core/callbacks/#langchain_core.callbacks.base.AsyncCallbackHandler.on_llm_new_token", # Rate limiters "InMemoryRateLimiter": "langchain_core/rate_limiters/#langchain_core.rate_limiters.InMemoryRateLimiter", + # LangSmith SDK + "Client": "langsmith/observability/sdk/client/#langsmith.client.Client", + "Client.evaluate": "langsmith/observability/sdk/client/#langsmith.client.Client.evaluate", + "Client.aevaluate": "langsmith/observability/sdk/client/#langsmith.client.Client.aevaluate", + "Client.get_experiment_results": "langsmith/observability/sdk/client/#langsmith.client.Client.get_experiment_results", # LangGraph "get_stream_writer": "langgraph/config/#langgraph.config.get_stream_writer", "StateGraph": "langgraph/graphs/#langgraph.graph.state.StateGraph", diff --git a/src/langsmith/local.mdx b/src/langsmith/local.mdx index 3b8238ae01..e01981a1eb 100644 --- a/src/langsmith/local.mdx +++ b/src/langsmith/local.mdx @@ -9,6 +9,10 @@ You can do this by using the LangSmith Python SDK and passing `upload_results=Fa This will run you application and evaluators exactly as it always does and return the same output, but nothing will be recorded to LangSmith. This includes not just the experiment results but also the application and evaluator traces. + +If you want to upload results to LangSmith but also need to process them in your script (for quality gates, custom aggregations, etc.), refer to [Read experiment results locally](/langsmith/read-local-experiment-results). + + ## Example Let's take a look at an example: diff --git a/src/langsmith/read-local-experiment-results.mdx b/src/langsmith/read-local-experiment-results.mdx new file mode 100644 index 0000000000..69828b6720 --- /dev/null +++ b/src/langsmith/read-local-experiment-results.mdx @@ -0,0 +1,163 @@ +--- +title: How to read experiment results locally +sidebarTitle: Read experiment results locally +--- + +When running [evaluations](/langsmith/evaluation-concepts), you may want to process results programmatically in your script rather than viewing them in the [LangSmith UI](https://smith.langchain.com). This is useful for scenarios like: + +- **CI/CD pipelines**: Implement quality gates that fail builds if evaluation scores drop below a threshold. +- **Local debugging**: Inspect and analyze results without API calls. +- **Custom aggregations**: Calculate metrics and statistics using your own logic. +- **Integration testing**: Use evaluation results to gate merges or deployments. + +This guide shows how to read and process [experiment](/langsmith/evaluation-concepts#experiment) results directly from the @[`Client.evaluate()`][Client.evaluate] response. + + +This page focuses on processing results programmatically while still uploading them to LangSmith. 
+ +If you want to run evaluations locally **without** recording anything to LangSmith (for quick testing or validation), refer to [Run an evaluation locally](/langsmith/local) which uses `upload_results=False`. + + +## Iterate over evaluation results + +The @[`evaluate()`][Client.evaluate] function returns an iterator when called with `blocking=False`. This allows you to process results as they're produced: + +```python +from langsmith import Client +import random + +client = Client() + +def target(inputs): + """Your application or LLM chain""" + return {"output": "MY OUTPUT"} + +def evaluator(run, example): + """Your evaluator function""" + return {"key": "randomness", "score": random.randint(0, 1)} + +# Run evaluation with blocking=False to get an iterator +streamed_results = client.evaluate( + target, + data="MY_DATASET_NAME", + evaluators=[evaluator], + blocking=False +) + +# Collect results as they stream in +aggregated_results = [] +for result in streamed_results: + aggregated_results.append(result) + +# Separate loop to avoid logging at the same time as logs from evaluate() +for result in aggregated_results: + print("Input:", result["run"].inputs) + print("Output:", result["run"].outputs) + print("Evaluation Results:", result["evaluation_results"]["results"]) + print("--------------------------------") +``` + +This produces output like: + +``` +Input: {'input': 'MY INPUT'} +Output: {'output': 'MY OUTPUT'} +Evaluation Results: [EvaluationResult(key='randomness', score=1, value=None, comment=None, correction=None, evaluator_info={}, feedback_config=None, source_run_id=UUID('7ebb4900-91c0-40b0-bb10-f2f6a451fd3c'), target_run_id=None, extra=None)] +-------------------------------- +``` + +## Understand the result structure + +Each result in the iterator contains: + +- `result["run"]`: The execution of your target function. + - `result["run"].inputs`: The inputs from your [dataset](/langsmith/evaluation-concepts#datasets) example. + - `result["run"].outputs`: The outputs produced by your target function. + - `result["run"].id`: The unique ID for this run. + +- `result["evaluation_results"]["results"]`: A list of `EvaluationResult` objects, one per evaluator. + - `key`: The metric name (from your evaluator's return value). + - `score`: The numeric score (typically 0-1 or boolean). + - `comment`: Optional explanatory text. + - `source_run_id`: The ID of the evaluator run. + +- `result["example"]`: The dataset example that was evaluated. + - `result["example"].inputs`: The input values. + - `result["example"].outputs`: The reference outputs (if any). + +## Example: Implement a quality gate + +This example shows how to use evaluation results to pass or fail a CI/CD build automatically based on quality thresholds. The script iterates through results, calculates an average accuracy score, and exits with a non-zero status code if the accuracy falls below 85%. This ensures that you can deploy code changes that meet quality standards. 
+ +```python +from langsmith import Client +import sys + +client = Client() + +def my_application(inputs): + # Your application logic + return {"response": "..."} + +def accuracy_evaluator(run, example): + # Your evaluation logic + is_correct = run.outputs["response"] == example.outputs["expected"] + return {"key": "accuracy", "score": 1 if is_correct else 0} + +# Run evaluation +results = client.evaluate( + my_application, + data="my_test_dataset", + evaluators=[accuracy_evaluator], + blocking=False +) + +# Calculate aggregate metrics +total_score = 0 +count = 0 + +for result in results: + eval_result = result["evaluation_results"]["results"][0] + total_score += eval_result.score + count += 1 + +average_accuracy = total_score / count + +print(f"Average accuracy: {average_accuracy:.2%}") + +# Fail the build if accuracy is too low +if average_accuracy < 0.85: + print("❌ Evaluation failed! Accuracy below 85% threshold.") + sys.exit(1) + +print("✅ Evaluation passed!") +``` + +## Example: Collect results for analysis + +Sometimes you may want to collect all results first before processing them. This is useful when you need to perform operations that require the full dataset (like calculating percentiles, sorting by score, or generating summary reports). Collecting results separately also prevents your output from being mixed with the logging from `evaluate()`. + +```python +# Collect all results first +all_results = [] +for result in client.evaluate(target, data=dataset, evaluators=[evaluator], blocking=False): + all_results.append(result) + +# Then process them separately +# (This avoids mixing your print statements with evaluation logs) +for result in all_results: + print("Input:", result["run"].inputs) + print("Output:", result["run"].outputs) + + # Access individual evaluation results + for eval_result in result["evaluation_results"]["results"]: + print(f" {eval_result.key}: {eval_result.score}") +``` + +For more information on running evaluations without uploading results, refer to [Run an evaluation locally](/langsmith/local). 
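## Example: Aggregate scores by metric

Because every result carries its evaluator feedback as `EvaluationResult` objects, you can build whatever custom aggregation you need on top of the collected results. The sketch below is a minimal example of this: it reuses the `target` and `evaluator` functions and the placeholder dataset name from the first example on this page, groups scores by metric key, and prints a per-metric average. Treat it as a starting point for your own statistics rather than a prescribed pattern.

```python
from collections import defaultdict
from statistics import mean

from langsmith import Client

client = Client()

# Collect every result up front; the object returned by evaluate() is iterable
collected_results = list(
    client.evaluate(
        target,
        data="MY_DATASET_NAME",
        evaluators=[evaluator],
        blocking=False,
    )
)

# Group scores by metric key across all results
scores_by_metric = defaultdict(list)
for result in collected_results:
    for eval_result in result["evaluation_results"]["results"]:
        # Some evaluators may return no numeric score; skip those entries
        if eval_result.score is not None:
            scores_by_metric[eval_result.key].append(eval_result.score)

# Print a simple per-metric summary
for key, scores in sorted(scores_by_metric.items()):
    print(f"{key}: mean={mean(scores):.2f} across {len(scores)} examples")
```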
+
+## Related
+
+- [Evaluate your LLM application](/langsmith/evaluate-llm-application)
+- [Run an evaluation locally](/langsmith/local)
+- [Fetch performance metrics from an experiment](/langsmith/fetch-perf-metrics-experiment)

From 8e440f55dc9230dfa4d23a484968a20ba918cf97 Mon Sep 17 00:00:00 2001
From: Kathryn May
Date: Wed, 12 Nov 2025 13:02:17 -0500
Subject: [PATCH 2/4] Update + feedback

Adjust
---
 pipeline/preprocessors/link_map.py    |  1 +
 .../read-local-experiment-results.mdx | 37 ++++++++++++-------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/pipeline/preprocessors/link_map.py b/pipeline/preprocessors/link_map.py
index 52cf529fb5..465355fd56 100644
--- a/pipeline/preprocessors/link_map.py
+++ b/pipeline/preprocessors/link_map.py
@@ -217,6 +217,7 @@ class LinkMap(TypedDict):
     "Client.evaluate": "langsmith/observability/sdk/client/#langsmith.client.Client.evaluate",
     "Client.aevaluate": "langsmith/observability/sdk/client/#langsmith.client.Client.aevaluate",
     "Client.get_experiment_results": "langsmith/observability/sdk/client/#langsmith.client.Client.get_experiment_results",
+    "ExperimentResults": "langsmith/observability/sdk/evaluation/#langsmith.evaluation._runner.ExperimentResults",
     # LangGraph
     "get_stream_writer": "langgraph/config/#langgraph.config.get_stream_writer",
     "StateGraph": "langgraph/graphs/#langgraph.graph.state.StateGraph",
diff --git a/src/langsmith/read-local-experiment-results.mdx b/src/langsmith/read-local-experiment-results.mdx
index 69828b6720..c7dae8e6db 100644
--- a/src/langsmith/read-local-experiment-results.mdx
+++ b/src/langsmith/read-local-experiment-results.mdx
@@ -10,7 +10,7 @@ When running [evaluations](/langsmith/evaluation-concepts), you may want to proc
 - **Custom aggregations**: Calculate metrics and statistics using your own logic.
 - **Integration testing**: Use evaluation results to gate merges or deployments.
 
-This guide shows how to read and process [experiment](/langsmith/evaluation-concepts#experiment) results directly from the @[`Client.evaluate()`][Client.evaluate] response.
+This guide shows you how to iterate over and process [experiment](/langsmith/evaluation-concepts#experiment) results from the @[`ExperimentResults`][ExperimentResults] object returned by @[`Client.evaluate()`][Client.evaluate].
 
 This page focuses on processing results programmatically while still uploading them to LangSmith.
@@ -20,7 +20,14 @@ If you want to run evaluations locally **without** recording anything to LangSmi
 
 ## Iterate over evaluation results
 
-The @[`evaluate()`][Client.evaluate] function returns an iterator when called with `blocking=False`. This allows you to process results as they're produced:
+The @[`evaluate()`][Client.evaluate] function returns an @[`ExperimentResults`][ExperimentResults] object that you can iterate over. The `blocking` parameter controls when results become available:
+
+- `blocking=False`: Returns immediately with an iterator that yields results as they're produced. This allows you to process results in real time as the evaluation runs.
+- `blocking=True` (default): Blocks until all evaluations complete before returning. When you iterate over the results, all data is already available.
+
+Both modes return the same `ExperimentResults` type; the difference is whether the function waits for completion before returning. Use `blocking=False` for streaming and real-time debugging, or `blocking=True` for batch processing when you need the complete dataset.
+
+The following example demonstrates `blocking=False`.
It iterates over results as they stream in, collects them in a list, then processes them in a separate loop: ```python from langsmith import Client @@ -87,7 +94,7 @@ Each result in the iterator contains: ## Example: Implement a quality gate -This example shows how to use evaluation results to pass or fail a CI/CD build automatically based on quality thresholds. The script iterates through results, calculates an average accuracy score, and exits with a non-zero status code if the accuracy falls below 85%. This ensures that you can deploy code changes that meet quality standards. +This example uses evaluation results to pass or fail a CI/CD build automatically based on quality thresholds. The script iterates through results, calculates an average accuracy score, and exits with a non-zero status code if the accuracy falls below 85%. This ensures that you can deploy code changes that meet quality standards. ```python from langsmith import Client @@ -133,19 +140,21 @@ if average_accuracy < 0.85: print("✅ Evaluation passed!") ``` -## Example: Collect results for analysis +## Example: Batch processing with blocking=True -Sometimes you may want to collect all results first before processing them. This is useful when you need to perform operations that require the full dataset (like calculating percentiles, sorting by score, or generating summary reports). Collecting results separately also prevents your output from being mixed with the logging from `evaluate()`. +When you need to perform operations that require the complete dataset (like calculating percentiles, sorting by score, or generating summary reports), use `blocking=True` to wait for all evaluations to complete before processing: ```python -# Collect all results first -all_results = [] -for result in client.evaluate(target, data=dataset, evaluators=[evaluator], blocking=False): - all_results.append(result) - -# Then process them separately -# (This avoids mixing your print statements with evaluation logs) -for result in all_results: +# Run evaluation and wait for all results +results = client.evaluate( + target, + data=dataset, + evaluators=[evaluator], + blocking=True # Wait for all evaluations to complete +) + +# Process all results after evaluation completes +for result in results: print("Input:", result["run"].inputs) print("Output:", result["run"].outputs) @@ -154,6 +163,8 @@ for result in all_results: print(f" {eval_result.key}: {eval_result.score}") ``` +With `blocking=True`, your processing code runs only after all evaluations are complete, avoiding mixed output with evaluation logs. + For more information on running evaluations without uploading results, refer to [Run an evaluation locally](/langsmith/local). 
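Whole-dataset operations like the "sorting by score" case mentioned above follow the same pattern. The sketch below is one possible way to do it, assuming the `results` object from the previous snippet has not already been consumed by another loop: it ranks rows by score and prints the lowest-scoring examples first, which is a convenient starting point for local debugging.

```python
# Rank the completed rows by score so the weakest examples surface first.
# Uses only the fields described in "Understand the result structure" above.
ranked = sorted(
    (
        (eval_result.score, eval_result.key, row["example"].inputs)
        for row in results
        for eval_result in row["evaluation_results"]["results"]
        if eval_result.score is not None  # skip evaluators without a numeric score
    ),
    key=lambda item: item[0],
)

# Inspect the five lowest-scoring examples
for score, key, inputs in ranked[:5]:
    print(f"{key}={score}: {inputs}")
```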
## Related

- [Evaluate your LLM application](/langsmith/evaluate-llm-application)
- [Run an evaluation locally](/langsmith/local)
- [Fetch performance metrics from an experiment](/langsmith/fetch-perf-metrics-experiment)

From d5e633df358fb65ac226abc1be8b676014099873 Mon Sep 17 00:00:00 2001
From: Kathryn May
Date: Wed, 12 Nov 2025 13:57:19 -0500
Subject: [PATCH 3/4] Fix merge conflict

---
 src/docs.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/docs.json b/src/docs.json
index 738c9e5e3f..dbdedd1b3f 100644
--- a/src/docs.json
+++ b/src/docs.json
@@ -1070,6 +1070,7 @@
         "langsmith/repetition",
         "langsmith/rate-limiting",
         "langsmith/local",
+        "langsmith/read-local-experiment-results",
         "langsmith/langchain-runnable",
         "langsmith/evaluate-graph",
         "langsmith/evaluate-existing-experiment",

From a5cc8fe72fceabad87f7d0a502016921858aaac3 Mon Sep 17 00:00:00 2001
From: Kathryn May
Date: Thu, 13 Nov 2025 13:55:29 -0500
Subject: [PATCH 4/4] LHS feedback

---
 src/langsmith/read-local-experiment-results.mdx | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/langsmith/read-local-experiment-results.mdx b/src/langsmith/read-local-experiment-results.mdx
index c7dae8e6db..5d2ecfa5d3 100644
--- a/src/langsmith/read-local-experiment-results.mdx
+++ b/src/langsmith/read-local-experiment-results.mdx
@@ -92,7 +92,9 @@ Each result in the iterator contains:
   - `result["example"].inputs`: The input values.
   - `result["example"].outputs`: The reference outputs (if any).
 
-## Example: Implement a quality gate
+## Examples
+
+### Implement a quality gate
 
 This example uses evaluation results to pass or fail a CI/CD build automatically based on quality thresholds. The script iterates through results, calculates an average accuracy score, and exits with a non-zero status code if the accuracy falls below 85%. This ensures that you can deploy code changes that meet quality standards.
 
@@ -140,7 +142,7 @@ if average_accuracy < 0.85:
 print("✅ Evaluation passed!")
 ```
 
-## Example: Batch processing with blocking=True
+### Batch processing with blocking=True
 
 When you need to perform operations that require the complete dataset (like calculating percentiles, sorting by score, or generating summary reports), use `blocking=True` to wait for all evaluations to complete before processing: