Skip to content

Commit

Permalink
feat(python, js): add anonymizer API (#788)
Browse files Browse the repository at this point in the history
Expected TypeScript usage:
```ts
import { createAnonymizer } from "langsmith/anonymizer"

// provide list of regex and replacement values
const anonymizer = createAnonymizer([
  { pattern: "...", replace: "[value]" }
])

const client = new Client({ anonymizer })

// alternatively, provide a function that replaces string values
const fnAnonymizer = createAnonymizer((value) => value.replace("...", "[value]"))
```

Expected Python usage:
```python
from langsmith.anonymizer import create_anonymizer

# provide list of regex and replacement values
anonymizer = create_anonymizer([{"pattern": r"...", "replace": "[value]"}])

client = Client(anonymizer=anonymizer)

# alternatively, provide a function that replaces string values (requires `import re`)
anonymizer = create_anonymizer(lambda text: re.sub(r"...", "[value]", text))
```
  • Loading branch information
dqbd committed Jun 19, 2024
2 parents 31c3a40 + e9c321e commit 96ddd94
Show file tree
Hide file tree
Showing 15 changed files with 661 additions and 14 deletions.
4 changes: 4 additions & 0 deletions js/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ Chinook_Sqlite.sql
/wrappers.js
/wrappers.d.ts
/wrappers.d.cts
/anonymizer.cjs
/anonymizer.js
/anonymizer.d.ts
/anonymizer.d.cts
/wrappers/openai.cjs
/wrappers/openai.js
/wrappers/openai.d.ts
Expand Down
19 changes: 17 additions & 2 deletions js/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "langsmith",
"version": "0.1.31",
"version": "0.1.33",
"description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.",
"packageManager": "yarn@1.22.19",
"files": [
Expand Down Expand Up @@ -37,6 +37,10 @@
"wrappers.js",
"wrappers.d.ts",
"wrappers.d.cts",
"anonymizer.cjs",
"anonymizer.js",
"anonymizer.d.ts",
"anonymizer.d.cts",
"wrappers/openai.cjs",
"wrappers/openai.js",
"wrappers/openai.d.ts",
Expand Down Expand Up @@ -91,6 +95,7 @@
"dependencies": {
"@types/uuid": "^9.0.1",
"commander": "^10.0.1",
"lodash.set": "^4.3.2",
"p-queue": "^6.6.2",
"p-retry": "4",
"uuid": "^9.0.0"
Expand All @@ -104,6 +109,7 @@
"@langchain/langgraph": "^0.0.19",
"@tsconfig/recommended": "^1.0.2",
"@types/jest": "^29.5.1",
"@types/lodash.set": "^4.3.9",
"@typescript-eslint/eslint-plugin": "^5.59.8",
"@typescript-eslint/parser": "^5.59.8",
"babel-jest": "^29.5.0",
Expand Down Expand Up @@ -228,6 +234,15 @@
"import": "./wrappers.js",
"require": "./wrappers.cjs"
},
"./anonymizer": {
"types": {
"import": "./anonymizer.d.ts",
"require": "./anonymizer.d.cts",
"default": "./anonymizer.d.ts"
},
"import": "./anonymizer.js",
"require": "./anonymizer.cjs"
},
"./wrappers/openai": {
"types": {
"import": "./wrappers/openai.d.ts",
Expand All @@ -248,4 +263,4 @@
},
"./package.json": "./package.json"
}
}
}
2 changes: 2 additions & 0 deletions js/scripts/create-entrypoints.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@ const entrypoints = {
schemas: "schemas",
langchain: "langchain",
wrappers: "wrappers/index",
anonymizer: "anonymizer/index",
"wrappers/openai": "wrappers/openai",
"singletons/traceable": "singletons/traceable",
};

const updateJsonFile = (relativePath, updateFunction) => {
const contents = fs.readFileSync(relativePath).toString();
const res = updateFunction(JSON.parse(contents));
Expand Down
139 changes: 139 additions & 0 deletions js/src/anonymizer/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import set from "lodash.set";

/**
 * A string leaf found while walking a value, together with its location.
 */
export interface StringNode {
  /** The string found at `path`. */
  value: string;
  /**
   * Location of the string relative to the root value. Object keys are joined
   * with `.`, array indices are written as `[i]`, and the root itself is `""`.
   */
  path: string;
}

/**
 * Walks `data` breadth-first and collects every string it contains.
 *
 * @param data - Value to scan; may be a string, an array, an object or any
 *   other value (values that are neither strings nor containers are ignored).
 * @param options - `maxDepth` limits how deep containers are expanded
 *   (defaults to 10). A container sitting at `maxDepth` or deeper is skipped.
 * @returns The discovered string nodes, in breadth-first order.
 */
function extractStringNodes(data: unknown, options: { maxDepth?: number }) {
  const maxDepth = options.maxDepth ?? 10;

  const queue: [value: unknown, depth: number, path: string][] = [
    [data, 0, ""],
  ];
  const result: StringNode[] = [];

  // A read cursor is used instead of Array#shift so that dequeuing stays O(1).
  for (let head = 0; head < queue.length; head++) {
    const task = queue[head];
    if (task == null) continue;
    const [value, depth, path] = task;

    if (typeof value === "string") {
      result.push({ value, path });
    } else if (Array.isArray(value)) {
      // Arrays must be tested before the generic object branch: they are
      // objects too, and would otherwise never receive `[i]` style paths.
      if (depth >= maxDepth) continue;
      for (let i = 0; i < value.length; i++) {
        queue.push([value[i], depth + 1, `${path}[${i}]`]);
      }
    } else if (typeof value === "object" && value != null) {
      if (depth >= maxDepth) continue;
      for (const [key, nestedValue] of Object.entries(value)) {
        queue.push([nestedValue, depth + 1, path ? `${path}.${key}` : key]);
      }
    }
  }

  return result;
}

/**
 * Produces a deep copy of `data`.
 *
 * Uses the runtime's `structuredClone` when the global exposes it, and falls
 * back to a JSON round-trip otherwise.
 */
function deepClone<T>(data: T): T {
  const hasNativeClone = "structuredClone" in globalThis;

  return hasNativeClone
    ? globalThis.structuredClone(data)
    : JSON.parse(JSON.stringify(data));
}

/**
 * Custom masking strategy accepted by `createAnonymizer` as a replacer.
 */
export interface StringNodeProcessor {
/**
 * Receives the string nodes extracted from the input and returns the nodes
 * that should be written back: each returned node's `value` is stored at its
 * `path` in the result.
 */
maskNodes: (nodes: StringNode[]) => StringNode[];
}

/**
 * Declarative search-and-replace rule accepted (as a list) by `createAnonymizer`.
 */
export interface StringNodeRule {
// Optional discriminator; `createAnonymizer` throws "Invalid anonymizer type"
// for any value other than "pattern".
type?: "pattern";
// Text to search for. A string is compiled by `createAnonymizer` into a RegExp
// with the global ("g") flag; a RegExp instance is used as given.
pattern: RegExp | string;
// Replacement text; `createAnonymizer` falls back to "[redacted]" when omitted.
replace?: string;
}

/**
 * The forms of replacer understood by `createAnonymizer`:
 * - a function called with each string value (and its path) that returns the
 *   replacement string,
 * - a list of pattern rules applied in order to each string value,
 * - a `StringNodeProcessor` that decides which nodes to rewrite itself.
 */
export type ReplacerType =
| ((value: string, path?: string) => string)
| StringNodeRule[]
| StringNodeProcessor;

/**
 * Normalizes any supported replacer into a `StringNodeProcessor`.
 *
 * @throws Error "Invalid anonymizer type" when a rule has a `type` other than
 *   "pattern", and "No replacers provided" when the rule list is empty.
 */
function buildProcessor(replacer: ReplacerType): StringNodeProcessor {
  if (Array.isArray(replacer)) {
    const replacers: [regex: RegExp, replace: string][] = replacer.map(
      ({ pattern, type, replace }) => {
        if (type != null && type !== "pattern")
          throw new Error("Invalid anonymizer type");
        return [
          typeof pattern === "string" ? new RegExp(pattern, "g") : pattern,
          replace ?? "[redacted]",
        ];
      }
    );

    if (replacers.length === 0) throw new Error("No replacers provided");

    return {
      maskNodes: (nodes: StringNode[]) => {
        const changed: StringNode[] = [];
        for (const item of nodes) {
          let newValue = item.value;
          for (const [regex, replace] of replacers) {
            newValue = newValue.replace(regex, replace);

            // make sure we reset the state of regex
            regex.lastIndex = 0;
          }

          // only report nodes whose value actually changed
          if (newValue !== item.value) {
            changed.push({ value: newValue, path: item.path });
          }
        }
        return changed;
      },
    };
  }

  if (typeof replacer === "function") {
    // Keep the narrowed function type available inside the closure below.
    const replaceFn = replacer;
    return {
      maskNodes: (nodes: StringNode[]) => {
        const changed: StringNode[] = [];
        for (const item of nodes) {
          const newValue = replaceFn(item.value, item.path);
          if (newValue !== item.value) {
            changed.push({ value: newValue, path: item.path });
          }
        }
        return changed;
      },
    };
  }

  return replacer;
}

/**
 * Creates a function that masks string values found anywhere inside its input.
 *
 * @param replacer - A replacement function, a list of pattern rules, or a
 *   custom `StringNodeProcessor`.
 * @param options - `maxDepth` limits how deep the input is scanned;
 *   `deepClone` makes the anonymizer work on a copy instead of mutating the
 *   input in place (in-place mutation is the default, for performance).
 * @returns A function that takes a value and returns it with the matching
 *   strings replaced.
 * @throws Error (when the returned function is called) "Invalid anonymizer
 *   type" for a rule whose `type` is not "pattern", or "No replacers provided"
 *   for an empty rule list.
 */
export function createAnonymizer(
  replacer: ReplacerType,
  options?: {
    maxDepth?: number;
    deepClone?: boolean;
  }
) {
  // Built on first use and then reused, so rules are validated and compiled
  // once instead of on every call. A failed build is retried (and fails again)
  // on the next call, which keeps errors surfacing at call time.
  let processor: StringNodeProcessor | undefined;

  return <T>(data: T): T => {
    const nodes = extractStringNodes(data, {
      maxDepth: options?.maxDepth,
    });

    // by default we opt-in to mutate the value directly
    // to improve performance
    let mutateValue = options?.deepClone ? deepClone(data) : data;

    if (processor == null) processor = buildProcessor(replacer);

    const toUpdate = processor.maskNodes(nodes);
    for (const node of toUpdate) {
      const isContainer =
        typeof mutateValue === "object" && mutateValue !== null;

      if (node.path === "" && !isContainer) {
        // The input itself is the string: replace the root value.
        mutateValue = node.value as unknown as T;
      } else {
        // For containers an empty path denotes the top-level key "", which
        // must be written into the object rather than replacing it.
        set(mutateValue as unknown as object, node.path, node.value);
      }
    }

    return mutateValue;
  };
}
13 changes: 9 additions & 4 deletions js/src/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ interface ClientConfig {
callerOptions?: AsyncCallerParams;
timeout_ms?: number;
webUrl?: string;
hideInputs?: boolean;
hideOutputs?: boolean;
anonymizer?: (values: KVMap) => KVMap;
hideInputs?: boolean | ((inputs: KVMap) => KVMap);
hideOutputs?: boolean | ((outputs: KVMap) => KVMap);
autoBatchTracing?: boolean;
pendingAutoBatchedRunLimit?: number;
fetchOptions?: RequestInit;
Expand Down Expand Up @@ -429,8 +430,12 @@ export class Client {
...(config.callerOptions ?? {}),
onFailedResponseHook: handle429,
});
this.hideInputs = config.hideInputs ?? defaultConfig.hideInputs;
this.hideOutputs = config.hideOutputs ?? defaultConfig.hideOutputs;

this.hideInputs =
config.hideInputs ?? config.anonymizer ?? defaultConfig.hideInputs;
this.hideOutputs =
config.hideOutputs ?? config.anonymizer ?? defaultConfig.hideOutputs;

this.autoBatchTracing = config.autoBatchTracing ?? this.autoBatchTracing;
this.pendingAutoBatchedRunLimit =
config.pendingAutoBatchedRunLimit ?? this.pendingAutoBatchedRunLimit;
Expand Down
2 changes: 1 addition & 1 deletion js/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ export type {
export { RunTree, type RunTreeConfig } from "./run_trees.js";

// Update using yarn bump-version
export const __version__ = "0.1.31";
export const __version__ = "0.1.33";
131 changes: 131 additions & 0 deletions js/src/tests/anonymizer.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import { StringNodeRule, createAnonymizer } from "../anonymizer/index.js";
import { v4 as uuid } from "uuid";
import { traceable } from "../traceable.js";
import { BaseMessage, SystemMessage } from "@langchain/core/messages";
import { mockClient } from "./utils/mock_client.js";
import { getAssumedTreeFromCalls } from "./utils/tree.js";

// Matches e-mail addresses. The dot before the top-level domain is escaped so
// that it only matches a literal "." instead of any character.
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
// Matches UUID-shaped strings (8-4-4-4-12 hexadecimal digits).
const UUID_REGEX =
  /[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}/g;

// Covers the function form of the replacer: every string value is passed
// through a callback that masks e-mail addresses and UUIDs.
describe("replacer", () => {
  function replacer(text: string): string {
    const withoutEmails = text.replace(EMAIL_REGEX, "[email address]");
    return withoutEmails.replace(UUID_REGEX, "[uuid]");
  }

  test("object", () => {
    const anonymize = createAnonymizer(replacer);
    const input = {
      message: "Hello, this is my email: hello@example.com",
      metadata: uuid(),
    };

    const masked = anonymize(input);

    expect(masked).toEqual({
      message: "Hello, this is my email: [email address]",
      metadata: "[uuid]",
    });
  });

  test("array", () => {
    const anonymize = createAnonymizer(replacer);

    const masked = anonymize(["human", "hello@example.com"]);

    expect(masked).toEqual(["human", "[email address]"]);
  });

  test("string", () => {
    const anonymize = createAnonymizer(replacer);

    const masked = anonymize("hello@example.com");

    expect(masked).toEqual("[email address]");
  });
});

// Covers the declarative form of the replacer: a list of pattern rules that
// are applied to every string value.
describe("declared", () => {
  const emailRule: StringNodeRule = {
    pattern: EMAIL_REGEX,
    replace: "[email address]",
  };
  const uuidRule: StringNodeRule = { pattern: UUID_REGEX, replace: "[uuid]" };
  const replacers: StringNodeRule[] = [emailRule, uuidRule];

  test("object", () => {
    const anonymize = createAnonymizer(replacers);
    const input = {
      message: "Hello, this is my email: hello@example.com",
      metadata: uuid(),
    };

    const masked = anonymize(input);

    expect(masked).toEqual({
      message: "Hello, this is my email: [email address]",
      metadata: "[uuid]",
    });
  });

  test("array", () => {
    const anonymize = createAnonymizer(replacers);

    const masked = anonymize(["human", "hello@example.com"]);

    expect(masked).toEqual(["human", "[email address]"]);
  });

  test("string", () => {
    const anonymize = createAnonymizer(replacers);

    const masked = anonymize("hello@example.com");

    expect(masked).toEqual("[email address]");
  });
});

// Checks that an anonymizer passed to the client masks the inputs and outputs
// recorded for traced runs, while the value returned to the caller is untouched.
describe("client", () => {
test("messages", async () => {
const anonymizer = createAnonymizer([
{ pattern: EMAIL_REGEX, replace: "[email]" },
{ pattern: UUID_REGEX, replace: "[uuid]" },
]);

// The mock client is created with the anonymizer; callSpy records its calls.
const { client, callSpy } = mockClient({ anonymizer });

const id = uuid();
// Inner traced function: joins the message contents and the "key: value"
// pairs of `values` into a single newline-separated string.
const child = traceable(
(value: { messages: BaseMessage[]; values: Record<string, unknown> }) => {
return [
...value.messages.map((message) => message.content.toString()),
...Object.entries(value.values).map((lst) => lst.join(": ")),
].join("\n");
},
{ name: "child" }
);

// Outer traced function: wraps the generated UUID in a SystemMessage and
// delegates to `child`.
const evaluate = traceable(
(values: Record<string, unknown>) => {
const messages = [new SystemMessage(`UUID: ${id}`)];
return child({ messages, values });
},
{ client, name: "evaluate", tracingEnabled: true }
);

const result = await evaluate({ email: "hello@example.com" });

// The caller still receives the original, unmasked values.
expect(result).toEqual(
[`UUID: ${id}`, `email: hello@example.com`].join("\n")
);

// The recorded run payloads must contain only the masked values.
expect(getAssumedTreeFromCalls(callSpy.mock.calls)).toMatchObject({
nodes: ["evaluate:0", "child:1"],
data: {
"evaluate:0": {
inputs: { email: "[email]" },
outputs: { outputs: [`UUID: [uuid]`, `email: [email]`].join("\n") },
},
"child:1": {
inputs: {
messages: [
{
lc: 1,
type: "constructor",
id: ["langchain_core", "messages", "SystemMessage"],
kwargs: { content: "UUID: [uuid]" },
},
],
values: { email: "[email]" },
},
outputs: { outputs: [`UUID: [uuid]`, `email: [email]`].join("\n") },
},
},
});
});
});
5 changes: 3 additions & 2 deletions js/src/tests/utils/mock_client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import { jest } from "@jest/globals";
import { Client } from "../../index.js";

export const mockClient = () => {
const client = new Client({ autoBatchTracing: false });
type ClientParams = Exclude<ConstructorParameters<typeof Client>[0], undefined>;
export const mockClient = (config?: Omit<ClientParams, "autoBatchTracing">) => {
const client = new Client({ ...config, autoBatchTracing: false });
const callSpy = jest
.spyOn((client as any).caller, "call")
.mockResolvedValue({ ok: true, text: () => "" });
Expand Down
Loading

0 comments on commit 96ddd94

Please sign in to comment.