Skip to content

Commit

Permalink
Add diff_dataset_versions (#502)
Browse files Browse the repository at this point in the history
Add support for fetching the modifications between two different dataset
versions
  • Loading branch information
hinthornw authored Mar 6, 2024
1 parent 5d93731 commit 4952ae7
Show file tree
Hide file tree
Showing 9 changed files with 196 additions and 9 deletions.
2 changes: 1 addition & 1 deletion js/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "langsmith",
"version": "0.1.12",
"version": "0.1.13",
"description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.",
"packageManager": "yarn@1.22.19",
"files": [
Expand Down
50 changes: 50 additions & 0 deletions js/src/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { AsyncCaller, AsyncCallerParams } from "./utils/async_caller.js";
import {
DataType,
Dataset,
DatasetDiffInfo,
DatasetShareSchema,
Example,
ExampleCreate,
Expand Down Expand Up @@ -1710,6 +1711,41 @@ export class Client {
return result;
}

/**
 * Fetch the modifications between two versions of a dataset.
 *
 * Exactly one of `datasetId` or `datasetName` must be supplied. When only the
 * name is given, the dataset is read first so its ID can be used in the
 * request path.
 *
 * `fromVersion` and `toVersion` may each be a string (sent unchanged) or a
 * `Date` (sent as the result of `toISOString()`).
 *
 * @returns The response of `GET /datasets/{id}/versions/diff`.
 * @throws Error when neither, or both, of `datasetId` and `datasetName` are
 *   provided.
 */
public async diffDatasetVersions({
  datasetId,
  datasetName,
  fromVersion,
  toVersion,
}: {
  datasetId?: string;
  datasetName?: string;
  fromVersion: string | Date;
  toVersion: string | Date;
}): Promise<DatasetDiffInfo> {
  const idMissing = datasetId === undefined;
  const nameMissing = datasetName === undefined;
  if (idMissing && nameMissing) {
    throw new Error("Must provide either datasetName or datasetId");
  }
  if (!idMissing && !nameMissing) {
    throw new Error("Must provide either datasetName or datasetId, not both");
  }

  // Only the name was given: look the dataset up to obtain its ID.
  let resolvedId = datasetId;
  if (resolvedId === undefined) {
    resolvedId = (await this.readDataset({ datasetName })).id;
  }

  // Strings pass through untouched; Date objects are serialized.
  const asVersionParam = (version: string | Date): string =>
    typeof version === "string" ? version : version.toISOString();

  const query = new URLSearchParams();
  query.append("from_version", asVersionParam(fromVersion));
  query.append("to_version", asVersionParam(toVersion));

  const diff = await this._get<DatasetDiffInfo>(
    `/datasets/${resolvedId}/versions/diff`,
    query
  );
  return diff as DatasetDiffInfo;
}

public async readDatasetOpenaiFinetuning({
datasetId,
datasetName,
Expand Down Expand Up @@ -1939,10 +1975,14 @@ export class Client {
datasetId,
datasetName,
exampleIds,
asOf,
inlineS3Urls,
}: {
datasetId?: string;
datasetName?: string;
exampleIds?: string[];
asOf?: string | Date;
inlineS3Urls?: boolean;
} = {}): AsyncIterable<Example> {
let datasetId_;
if (datasetId !== undefined && datasetName !== undefined) {
Expand All @@ -1956,6 +1996,16 @@ export class Client {
throw new Error("Must provide a datasetName or datasetId");
}
const params = new URLSearchParams({ dataset: datasetId_ });
const dataset_version = asOf
? typeof asOf === "string"
? asOf
: asOf?.toISOString()
: undefined;
if (dataset_version) {
params.append("as_of", dataset_version);
}
const inlineS3Urls_ = inlineS3Urls ?? true;
params.append("inline_s3_urls", inlineS3Urls_.toString());
if (exampleIds !== undefined) {
for (const id_ of exampleIds) {
params.append("id", id_);
Expand Down
2 changes: 1 addition & 1 deletion js/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ export type {
export { RunTree, type RunTreeConfig } from "./run_trees.js";

// Update using yarn bump-version
export const __version__ = "0.1.12";
export const __version__ = "0.1.13";
6 changes: 6 additions & 0 deletions js/src/schemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,9 @@ export interface FeedbackConfig {
*/
categories?: FeedbackCategory[] | null;
}

/**
 * Result of comparing two versions of a dataset, as returned by
 * `Client.diffDatasetVersions`. Each field is a list of example identifiers.
 */
export interface DatasetDiffInfo {
  /** Identifiers of the examples reported as modified between the versions. */
  examples_modified: string[];
  /** Identifiers of the examples reported as added between the versions. */
  examples_added: string[];
  /** Identifiers of the examples reported as removed between the versions. */
  examples_removed: string[];
}
16 changes: 15 additions & 1 deletion js/src/tests/client.int.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ test.concurrent(
"Examples CRUD",
async () => {
const client = new Client({ autoBatchTracing: false });
const datasetName = "__test_examples_crud JS";
const datasetName = "__test_examples_crud JS" + Date.now();
await deleteDataset(client, datasetName);
const dataset = await client.createDataset(datasetName);
const example = await client.createExample(
Expand All @@ -449,6 +449,7 @@ test.concurrent(
}
);
const exampleValue = await client.readExample(example.id);
const initialVersion = exampleValue.modified_at;
expect(exampleValue.inputs.input).toEqual("hello world");
expect(exampleValue?.outputs?.output).toEqual("hi there");
// Create multiple
Expand All @@ -465,6 +466,10 @@ test.concurrent(
],
datasetId: dataset.id,
});
const initialExamplesList = await toArray(
client.listExamples({ datasetId: dataset.id, asOf: initialVersion })
);
expect(initialExamplesList.length).toEqual(1);
const examplesList = await toArray(
client.listExamples({ datasetId: dataset.id })
);
Expand All @@ -474,6 +479,15 @@ test.concurrent(
client.listExamples({ datasetId: dataset.id })
);
expect(examplesList2.length).toEqual(3);
const datasetDiff = await client.diffDatasetVersions({
datasetId: dataset.id,
fromVersion: initialVersion,
toVersion: "latest",
});
expect(datasetDiff.examples_added.length).toEqual(3);
expect(datasetDiff.examples_modified.length).toEqual(0);
expect(datasetDiff.examples_removed.length).toEqual(1);

await client.deleteDataset({ datasetId: dataset.id });
},
180_000
Expand Down
70 changes: 70 additions & 0 deletions python/langsmith/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2202,6 +2202,76 @@ def read_dataset(
_tenant_id=self._get_optional_tenant_id(),
)

def diff_dataset_versions(
    self,
    dataset_id: Optional[ID_TYPE] = None,
    *,
    dataset_name: Optional[str] = None,
    from_version: Union[str, datetime.datetime],
    to_version: Union[str, datetime.datetime],
) -> ls_schemas.DatasetDiffInfo:
    """Get the difference between two versions of a dataset.

    Parameters
    ----------
    dataset_id : str or None, default=None
        The ID of the dataset. When omitted, the dataset is looked up
        by ``dataset_name`` to obtain its ID.
    dataset_name : str or None, default=None
        The name of the dataset. Only consulted when ``dataset_id`` is None.
    from_version : str or datetime.datetime
        The starting version for the diff. Datetimes are sent as
        ``isoformat()`` strings; strings are sent unchanged.
    to_version : str or datetime.datetime
        The ending version for the diff. Handled like ``from_version``.

    Returns
    -------
    DatasetDiffInfo
        The difference between the two versions of the dataset.

    Raises
    ------
    ValueError
        If neither ``dataset_id`` nor ``dataset_name`` is provided.

    Examples
    --------
    .. code-block:: python

        # Get the difference between two tagged versions of a dataset
        from_version = "prod"
        to_version = "dev"
        diff = client.diff_dataset_versions(
            dataset_name="my-dataset",
            from_version=from_version,
            to_version=to_version,
        )
        print(diff)

        # Get the difference between two timestamped versions of a dataset
        from_version = datetime.datetime(2024, 1, 1)
        to_version = datetime.datetime(2024, 2, 1)
        diff = client.diff_dataset_versions(
            dataset_name="my-dataset",
            from_version=from_version,
            to_version=to_version,
        )
        print(diff)
    """
    if dataset_id is None and dataset_name is None:
        raise ValueError("Must provide either dataset name or ID")
    if dataset_id is None:
        # Resolve the ID from the name before building the request path.
        dataset_id = self.read_dataset(dataset_name=dataset_name).id
    dsid = _as_uuid(dataset_id, "dataset_id")

    # Datetimes are serialized; string versions are passed through as given.
    query = {}
    for key, version in (
        ("from_version", from_version),
        ("to_version", to_version),
    ):
        if isinstance(version, datetime.datetime):
            query[key] = version.isoformat()
        else:
            query[key] = version

    response = self.session.get(
        f"{self.api_url}/datasets/{dsid}/versions/diff",
        headers=self._headers,
        params=query,
    )
    ls_utils.raise_for_status_with_text(response)
    return ls_schemas.DatasetDiffInfo(**response.json())

def read_dataset_openai_finetuning(
self, dataset_id: Optional[str] = None, *, dataset_name: Optional[str] = None
) -> list:
Expand Down
17 changes: 17 additions & 0 deletions python/langsmith/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,3 +669,20 @@ class TimeDeltaInput(TypedDict, total=False):
"""Number of hours."""
minutes: int
"""Number of minutes."""


class DatasetDiffInfo(BaseModel):
    """Represents the difference between two versions of a dataset.

    Attributes:
        examples_modified (List[UUID]): A list of UUIDs representing
            the modified examples.
        examples_added (List[UUID]): A list of UUIDs representing
            the added examples.
        examples_removed (List[UUID]): A list of UUIDs representing
            the removed examples.
    """

    examples_modified: List[UUID]
    examples_added: List[UUID]
    examples_removed: List[UUID]
2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langsmith"
version = "0.1.21"
version = "0.1.22"
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
authors = ["LangChain <support@langchain.dev>"]
license = "MIT"
Expand Down
40 changes: 35 additions & 5 deletions python/tests/integration_tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def wait_for(

@pytest.fixture
def langchain_client(monkeypatch: pytest.MonkeyPatch) -> Client:
    """Return a ``Client`` for the integration tests.

    The endpoint is pinned to the production API for the duration of the test.
    No API key is set here: credentials must never be committed to source, so
    LANGCHAIN_API_KEY is expected to be supplied by the environment running
    the tests (local shell or CI secret store).
    """
    monkeypatch.setenv("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com")
    return Client()


Expand Down Expand Up @@ -196,24 +198,52 @@ def test_create_project(
langchain_client.delete_project(project_name=project_name)


@freeze_time("2023-01-01")
def test_create_dataset(
    monkeypatch: pytest.MonkeyPatch, langchain_client: Client
) -> None:
    """Test dataset creation, versioned example listing and version diffs."""
    # NOTE(review): no endpoint or API key is set here. The client arrives
    # already constructed from the ``langchain_client`` fixture, and
    # credentials must come from the environment, never from source.

    # A short random suffix keeps dataset names from colliding between runs.
    dataset_name = "__test_create_dataset" + uuid4().hex[:4]
    if langchain_client.has_dataset(dataset_name=dataset_name):
        langchain_client.delete_dataset(dataset_name=dataset_name)
    dataset = langchain_client.create_dataset(dataset_name, data_type=DataType.llm)
    ground_truth = "bcde"
    example = langchain_client.create_example(
        inputs={"input": "hello world"},
        outputs={"output": ground_truth},
        dataset_id=dataset.id,
    )
    # The first example's modification time serves as the "initial" version.
    initial_version = example.modified_at
    loaded_dataset = langchain_client.read_dataset(dataset_name=dataset_name)
    assert loaded_dataset.data_type == DataType.llm

    # Change the dataset after the initial version: add one example and
    # update the original one.
    example_2 = langchain_client.create_example(
        inputs={"input": "hello world 2"},
        outputs={"output": "fghi"},
        dataset_id=dataset.id,
    )
    langchain_client.update_example(
        example_id=example.id,
        inputs={"input": "hello world"},
        outputs={"output": "bcde"},
    )

    # Listing as of the initial version must only see the first example.
    initial_examples = list(
        langchain_client.list_examples(dataset_id=dataset.id, as_of=initial_version)
    )
    assert len(initial_examples) == 1
    latest_examples = list(langchain_client.list_examples(dataset_id=dataset.id))
    assert len(latest_examples) == 2
    # The "latest" tag must match an unversioned listing.
    latest_tagged_examples = list(
        langchain_client.list_examples(dataset_id=dataset.id, as_of="latest")
    )
    assert len(latest_tagged_examples) == 2
    assert latest_tagged_examples == latest_examples

    diffs = langchain_client.diff_dataset_versions(
        loaded_dataset.id, from_version=initial_version, to_version="latest"
    )
    assert diffs.examples_added == [example_2.id]
    assert diffs.examples_removed == []
    assert diffs.examples_modified == [example.id]
    langchain_client.delete_dataset(dataset_id=dataset.id)


Expand Down

0 comments on commit 4952ae7

Please sign in to comment.