-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(python, js): add anonymizer API (#788)
Expected TypeScript usage:

```ts
import { createAnonymizer } from "langsmith/anonymizer"

// provide list of regex and replacement values
const anonymizer = createAnonymizer([
  { pattern: "...", replace: "[value]" }
])
const client = new Client({ anonymizer })

// or replace string values with a function
const fnAnonymizer = createAnonymizer((value) => value.replace("...", "[value]"))
```

Expected Python usage:

```python
import re
from langsmith.anonymizer import create_anonymizer

# provide list of regex and replacement values
anonymizer = create_anonymizer([{"pattern": r"...", "replace": "[value]"}])
client = Client(anonymizer=anonymizer)

# or replace string values with a function
anonymizer = create_anonymizer(lambda text: re.sub(r"...", "[value]", text))
```
- Loading branch information
Showing
15 changed files
with
661 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
import set from "lodash.set"; | ||
|
||
/**
 * A string value discovered inside a larger structure, paired with the path
 * that locates it.
 */
export interface StringNode {
  /** The string found at `path`. */
  value: string;
  /**
   * Location of the string relative to the traversal root, e.g. `a.b[0].c`.
   * Empty when the root value itself is the string.
   */
  path: string;
}

/**
 * Collects every string reachable from `data`, in breadth-first order.
 *
 * Object keys are appended to the path as `.key` (or just `key` at the root)
 * and array elements as `[index]`. Non-string primitives are ignored.
 *
 * @param data - Value to traverse; may be a string, array, object or anything else.
 * @param options - `maxDepth` limits how deep containers are expanded
 *   (default 10). A container sitting at depth `maxDepth` or deeper is not
 *   descended into, so strings inside it are not reported.
 * @returns The string leaves that were found, each with its path.
 */
function extractStringNodes(
  data: unknown,
  options: { maxDepth?: number }
): StringNode[] {
  const maxDepth = options.maxDepth ?? 10;

  const queue: [value: unknown, depth: number, path: string][] = [
    [data, 0, ""],
  ];
  const result: StringNode[] = [];

  // Advance a cursor instead of calling `shift()`, which re-indexes the whole
  // array on every dequeue and makes the walk quadratic.
  for (let cursor = 0; cursor < queue.length; cursor++) {
    const task = queue[cursor];
    if (task == null) continue;
    const [value, depth, path] = task;

    if (typeof value === "string") {
      result.push({ value, path });
    } else if (Array.isArray(value)) {
      // Arrays must be tested before the generic object case:
      // `typeof [] === "object"`, so the opposite order never reaches here.
      if (depth >= maxDepth) continue;
      for (let i = 0; i < value.length; i++) {
        queue.push([value[i], depth + 1, `${path}[${i}]`]);
      }
    } else if (typeof value === "object" && value !== null) {
      if (depth >= maxDepth) continue;
      for (const [key, nestedValue] of Object.entries(value)) {
        queue.push([nestedValue, depth + 1, path ? `${path}.${key}` : key]);
      }
    }
  }

  return result;
}
|
||
/**
 * Returns a deep copy of `data`.
 *
 * Uses the runtime's `structuredClone` when the global exposes it and falls
 * back to a JSON round-trip otherwise.
 */
function deepClone<T>(data: T): T {
  const hasNativeClone = "structuredClone" in globalThis;

  return hasNativeClone
    ? globalThis.structuredClone(data)
    : JSON.parse(JSON.stringify(data));
}
|
||
/**
 * Custom masking strategy: receives every extracted string node and returns
 * the nodes whose values should be written back.
 */
export interface StringNodeProcessor {
  maskNodes: (nodes: StringNode[]) => StringNode[];
}

/** Declarative masking rule: replace whatever `pattern` matches. */
export interface StringNodeRule {
  /** Optional discriminator; any value other than `"pattern"` is rejected. */
  type?: "pattern";
  /**
   * What to search for. A string is compiled to a `RegExp` with the `g` flag;
   * a `RegExp` instance is used as provided, with its own flags.
   */
  pattern: RegExp | string;
  /** Replacement text; defaults to `"[redacted]"`. */
  replace?: string;
}

/**
 * Accepted ways of describing how strings are masked: a function applied to
 * each string, a list of pattern rules, or a full node processor.
 */
export type ReplacerType =
  | ((value: string, path?: string) => string)
  | StringNodeRule[]
  | StringNodeProcessor;

/** Runs `mask` over each node and keeps only the nodes whose value changed. */
function maskChangedNodes(
  nodes: StringNode[],
  mask: (node: StringNode) => string
): StringNode[] {
  const changed: StringNode[] = [];
  for (const node of nodes) {
    const masked = mask(node);
    if (masked !== node.value) {
      changed.push({ value: masked, path: node.path });
    }
  }
  return changed;
}

/** Validates the rules, compiles their patterns and wraps them in a processor. */
function createRuleProcessor(rules: StringNodeRule[]): StringNodeProcessor {
  const replacers: [regex: RegExp, replace: string][] = rules.map(
    ({ pattern, type, replace }) => {
      if (type != null && type !== "pattern")
        throw new Error("Invalid anonymizer type");
      return [
        typeof pattern === "string" ? new RegExp(pattern, "g") : pattern,
        replace ?? "[redacted]",
      ];
    }
  );

  if (replacers.length === 0) throw new Error("No replacers provided");

  return {
    maskNodes: (nodes: StringNode[]) =>
      maskChangedNodes(nodes, (node) =>
        // Apply the rules in declaration order, each on the previous output.
        replacers.reduce((value, [regex, replace]) => {
          const result = value.replace(regex, replace);

          // make sure we reset the state of regex
          regex.lastIndex = 0;

          return result;
        }, node.value)
      ),
  };
}

/** Normalises any supported replacer shape into a `StringNodeProcessor`. */
function resolveProcessor(replacer: ReplacerType): StringNodeProcessor {
  if (Array.isArray(replacer)) return createRuleProcessor(replacer);

  if (typeof replacer === "function") {
    const replaceValue = replacer;
    return {
      maskNodes: (nodes: StringNode[]) =>
        maskChangedNodes(nodes, (node) => replaceValue(node.value, node.path)),
    };
  }

  return replacer;
}

/**
 * Builds a function that masks string values found anywhere inside its input.
 *
 * @param replacer - How to mask strings: a function, a list of pattern rules,
 *   or a custom `StringNodeProcessor`.
 * @param options - `maxDepth` bounds how deep the input is traversed;
 *   `deepClone`, when truthy, makes the anonymizer work on a deep copy.
 *   Without it the input is mutated in place.
 * @returns A function that returns the anonymized value. For a root-level
 *   string the replaced string is returned; otherwise the (possibly cloned)
 *   container with the masked values written into it.
 * @throws Error When the returned function is invoked with a rule list that is
 *   empty (`"No replacers provided"`) or contains an unsupported `type`
 *   (`"Invalid anonymizer type"`).
 */
export function createAnonymizer(
  replacer: ReplacerType,
  options?: {
    maxDepth?: number;
    deepClone?: boolean;
  }
) {
  // Built once on first use instead of on every call, so rule validation and
  // regex compilation are not repeated. Building lazily keeps configuration
  // errors surfacing at call time, as before; a failed build is never cached.
  let processor: StringNodeProcessor | undefined;

  return <T>(data: T): T => {
    const nodes = extractStringNodes(data, {
      maxDepth: options?.maxDepth,
    });

    // by default we opt-in to mutate the value directly
    // to improve performance
    let mutateValue = options?.deepClone ? deepClone(data) : data;

    if (processor == null) processor = resolveProcessor(replacer);

    const toUpdate = processor.maskNodes(nodes);
    for (const node of toUpdate) {
      if (node.path === "") {
        // The root itself is a string: there is nothing to write into.
        mutateValue = node.value as unknown as T;
      } else {
        set(mutateValue as unknown as object, node.path, node.value);
      }
    }

    return mutateValue;
  };
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
import { StringNodeRule, createAnonymizer } from "../anonymizer/index.js"; | ||
import { v4 as uuid } from "uuid"; | ||
import { traceable } from "../traceable.js"; | ||
import { BaseMessage, SystemMessage } from "@langchain/core/messages"; | ||
import { mockClient } from "./utils/mock_client.js"; | ||
import { getAssumedTreeFromCalls } from "./utils/tree.js"; | ||
|
||
// Matches e-mail addresses. The dot before the top-level domain is escaped so
// that it only matches a literal "." rather than any character.
const EMAIL_REGEX = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
// Matches UUIDs written in the 8-4-4-4-12 hexadecimal form.
const UUID_REGEX =
  /[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}/g;
|
||
describe("replacer", () => {
  // Function-style replacer: masks e-mail addresses first, then UUIDs.
  function replacer(text: string): string {
    const withoutEmails = text.replace(EMAIL_REGEX, "[email address]");
    return withoutEmails.replace(UUID_REGEX, "[uuid]");
  }

  test("object", () => {
    const input = {
      message: "Hello, this is my email: hello@example.com",
      metadata: uuid(),
    };

    const anonymized = createAnonymizer(replacer)(input);

    expect(anonymized).toEqual({
      message: "Hello, this is my email: [email address]",
      metadata: "[uuid]",
    });
  });

  test("array", () => {
    const anonymized = createAnonymizer(replacer)([
      "human",
      "hello@example.com",
    ]);

    expect(anonymized).toEqual(["human", "[email address]"]);
  });

  test("string", () => {
    const anonymized = createAnonymizer(replacer)("hello@example.com");

    expect(anonymized).toEqual("[email address]");
  });
});
|
||
describe("declared", () => {
  // Rule-style configuration: one pattern rule per kind of sensitive value.
  const emailRule: StringNodeRule = {
    pattern: EMAIL_REGEX,
    replace: "[email address]",
  };
  const uuidRule: StringNodeRule = { pattern: UUID_REGEX, replace: "[uuid]" };
  const replacers: StringNodeRule[] = [emailRule, uuidRule];

  test("object", () => {
    const input = {
      message: "Hello, this is my email: hello@example.com",
      metadata: uuid(),
    };

    const anonymized = createAnonymizer(replacers)(input);

    expect(anonymized).toEqual({
      message: "Hello, this is my email: [email address]",
      metadata: "[uuid]",
    });
  });

  test("array", () => {
    const anonymized = createAnonymizer(replacers)([
      "human",
      "hello@example.com",
    ]);

    expect(anonymized).toEqual(["human", "[email address]"]);
  });

  test("string", () => {
    const anonymized = createAnonymizer(replacers)("hello@example.com");

    expect(anonymized).toEqual("[email address]");
  });
});
|
||
describe("client", () => {
  test("messages", async () => {
    // Anonymizer handed to the client: masks e-mails, then UUIDs.
    const rules: StringNodeRule[] = [
      { pattern: EMAIL_REGEX, replace: "[email]" },
      { pattern: UUID_REGEX, replace: "[uuid]" },
    ];
    const anonymizer = createAnonymizer(rules);

    const { client, callSpy } = mockClient({ anonymizer });

    const id = uuid();

    // Inner traced function: joins message contents and "key: value" pairs.
    const child = traceable(
      (value: { messages: BaseMessage[]; values: Record<string, unknown> }) => {
        const contents = value.messages.map((message) =>
          message.content.toString()
        );
        const pairs = Object.entries(value.values).map((lst) =>
          lst.join(": ")
        );
        return [...contents, ...pairs].join("\n");
      },
      { name: "child" }
    );

    // Outer traced function: the only one given the client explicitly.
    const evaluate = traceable(
      (values: Record<string, unknown>) => {
        const messages = [new SystemMessage(`UUID: ${id}`)];
        return child({ messages, values });
      },
      { client, name: "evaluate", tracingEnabled: true }
    );

    const result = await evaluate({ email: "hello@example.com" });

    // The value returned to the caller is not anonymized...
    const rawOutput = [`UUID: ${id}`, `email: hello@example.com`].join("\n");
    expect(result).toEqual(rawOutput);

    // ...but everything recorded through the client is.
    const maskedOutput = [`UUID: [uuid]`, `email: [email]`].join("\n");
    const recordedTree = getAssumedTreeFromCalls(callSpy.mock.calls);

    expect(recordedTree).toMatchObject({
      nodes: ["evaluate:0", "child:1"],
      data: {
        "evaluate:0": {
          inputs: { email: "[email]" },
          outputs: { outputs: maskedOutput },
        },
        "child:1": {
          inputs: {
            messages: [
              {
                lc: 1,
                type: "constructor",
                id: ["langchain_core", "messages", "SystemMessage"],
                kwargs: { content: "UUID: [uuid]" },
              },
            ],
            values: { email: "[email]" },
          },
          outputs: { outputs: maskedOutput },
        },
      },
    });
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.