From 5a4258aaba25120a3fbcda443f509f5b039d87e0 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 6 Nov 2025 22:34:42 +0000 Subject: [PATCH 1/3] fix: Include the AI Judge Config key with tracked metrics --- packages/sdk/server-ai/src/LDAIClientImpl.ts | 4 +-- .../server-ai/src/LDAIConfigTrackerImpl.ts | 7 ++++- .../src/api/config/LDAIConfigUtils.ts | 29 +++++++++++++------ .../sdk/server-ai/src/api/config/types.ts | 4 +++ packages/sdk/server-ai/src/api/judge/Judge.ts | 1 + packages/sdk/server-ai/src/api/judge/types.ts | 2 ++ 6 files changed, 35 insertions(+), 12 deletions(-) diff --git a/packages/sdk/server-ai/src/LDAIClientImpl.ts b/packages/sdk/server-ai/src/LDAIClientImpl.ts index 54cb054fe..6955e1a05 100644 --- a/packages/sdk/server-ai/src/LDAIClientImpl.ts +++ b/packages/sdk/server-ai/src/LDAIClientImpl.ts @@ -63,7 +63,7 @@ export class LDAIClientImpl implements LDAIClient { this._logger?.warn( `AI Config mode mismatch for ${key}: expected ${mode}, got ${flagMode}. Returning disabled config.`, ); - return LDAIConfigUtils.createDisabledConfig(mode); + return LDAIConfigUtils.createDisabledConfig(key, mode); } const tracker = new LDAIConfigTrackerImpl( @@ -78,7 +78,7 @@ export class LDAIClientImpl implements LDAIClient { context, ); - const config = LDAIConfigUtils.fromFlagValue(value, tracker); + const config = LDAIConfigUtils.fromFlagValue(key, value, tracker); // Apply variable interpolation (always needed for ldctx) return this._applyInterpolation(config, context, variables); diff --git a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts index f4e6624a7..5dcdf4405 100644 --- a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts +++ b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts @@ -78,7 +78,12 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker { trackEvalScores(scores: Record) { // Track each evaluation score individually Object.entries(scores).forEach(([metricKey, evalScore]) => { - this._ldClient.track(metricKey, this._context, this.getTrackData(), evalScore.score); + this._ldClient.track( + metricKey, + this._context, + { ...this.getTrackData(), judgeConfigKey: evalScore.judgeConfigKey }, + evalScore.score, + ); }); } diff --git a/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts b/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts index b03ee2882..cd943be6c 100644 --- a/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts +++ b/packages/sdk/server-ai/src/api/config/LDAIConfigUtils.ts @@ -82,19 +82,23 @@ export class LDAIConfigUtils { * @param tracker The tracker to add to the config * @returns The appropriate AI configuration type */ - static fromFlagValue(flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker): LDAIConfigKind { + static fromFlagValue( + key: string, + flagValue: LDAIConfigFlagValue, + tracker: LDAIConfigTracker, + ): LDAIConfigKind { // Determine the actual mode from flag value // eslint-disable-next-line no-underscore-dangle const flagValueMode = flagValue._ldMeta?.mode; switch (flagValueMode) { case 'agent': - return this.toAgentConfig(flagValue, tracker); + return this.toAgentConfig(key, flagValue, tracker); case 'judge': - return this.toJudgeConfig(flagValue, tracker); + return this.toJudgeConfig(key, flagValue, tracker); case 'completion': default: - return this.toCompletionConfig(flagValue, tracker); + return this.toCompletionConfig(key, flagValue, tracker); } } @@ -104,15 +108,17 @@ export class LDAIConfigUtils { * @param mode The mode for the disabled config * @returns A disabled config of the appropriate type */ - static createDisabledConfig(mode: LDAIConfigMode): LDAIConfigKind { + static createDisabledConfig(key: string, mode: LDAIConfigMode): LDAIConfigKind { switch (mode) { case 'agent': return { + key, enabled: false, tracker: undefined, } as LDAIAgentConfig; case 'judge': return { + key, enabled: false, tracker: undefined, evaluationMetricKeys: [], @@ -121,6 +127,7 @@ export class LDAIConfigUtils { default: // Default to completion config for completion mode or any unexpected mode return { + key, enabled: false, tracker: undefined, } as LDAICompletionConfig; @@ -133,8 +140,9 @@ export class LDAIConfigUtils { * @param flagValue The flag value from LaunchDarkly * @returns Base configuration object */ - private static _toBaseConfig(flagValue: LDAIConfigFlagValue) { + private static _toBaseConfig(key: string, flagValue: LDAIConfigFlagValue) { return { + key, // eslint-disable-next-line no-underscore-dangle enabled: flagValue._ldMeta?.enabled ?? false, model: flagValue.model, @@ -150,11 +158,12 @@ export class LDAIConfigUtils { * @returns A completion configuration */ static toCompletionConfig( + key: string, flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker, ): LDAICompletionConfig { return { - ...this._toBaseConfig(flagValue), + ...this._toBaseConfig(key, flagValue), tracker, messages: flagValue.messages, judgeConfiguration: flagValue.judgeConfiguration, @@ -169,11 +178,12 @@ export class LDAIConfigUtils { * @returns An agent configuration */ static toAgentConfig( + key: string, flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker, ): LDAIAgentConfig { return { - ...this._toBaseConfig(flagValue), + ...this._toBaseConfig(key, flagValue), tracker, instructions: flagValue.instructions, judgeConfiguration: flagValue.judgeConfiguration, @@ -188,11 +198,12 @@ export class LDAIConfigUtils { * @returns A judge configuration */ static toJudgeConfig( + key: string, flagValue: LDAIConfigFlagValue, tracker: LDAIConfigTracker, ): LDAIJudgeConfig { return { - ...this._toBaseConfig(flagValue), + ...this._toBaseConfig(key, flagValue), tracker, messages: flagValue.messages, evaluationMetricKeys: flagValue.evaluationMetricKeys || [], diff --git a/packages/sdk/server-ai/src/api/config/types.ts b/packages/sdk/server-ai/src/api/config/types.ts index d6682676e..ade099037 100644 --- a/packages/sdk/server-ai/src/api/config/types.ts +++ b/packages/sdk/server-ai/src/api/config/types.ts @@ -95,6 +95,10 @@ export interface LDAIConfigDefault { * Base AI Config interface without mode-specific fields. */ export interface LDAIConfig extends Omit { + /** + * The key of the AI Config. + */ + key: string; /** * Whether the configuration is enabled. */ diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index 581a36d57..3762519fe 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -208,6 +208,7 @@ export class Judge { results[metricKey] = { score: evalData.score, reasoning: evalData.reasoning, + judgeConfigKey: this._aiConfig.key, }; }); diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts index a265506b1..ea61f8bca 100644 --- a/packages/sdk/server-ai/src/api/judge/types.ts +++ b/packages/sdk/server-ai/src/api/judge/types.ts @@ -24,6 +24,8 @@ export interface EvalScore { score: number; /** Reasoning behind the provided score for this metric */ reasoning: string; + /** The key of the judge configuration that was used to evaluate this metric */ + judgeConfigKey?: string; } /** From 6cf398039196facc6a44792834e9f5a3da1398da Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Thu, 6 Nov 2025 23:25:37 +0000 Subject: [PATCH 2/3] fix unit tests --- .../sdk/server-ai/__tests__/Judge.test.ts | 49 ++++++++++++++----- .../__tests__/LDAIClientImpl.test.ts | 5 ++ .../server-ai/__tests__/TrackedChat.test.ts | 1 + 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index ed04f1fc4..23efd3eec 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -39,6 +39,7 @@ describe('Judge', () => { // Create a basic judge config judgeConfig = { + key: 'test-judge', enabled: true, messages: [ { role: 'system', content: 'You are a helpful judge that evaluates AI responses.' }, @@ -106,9 +107,21 @@ describe('Judge', () => { expect(result).toEqual({ evals: { - relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, + relevance: { + score: 0.8, + reasoning: 'The response is relevant to the question', + judgeConfigKey: 'test-judge', + }, + accuracy: { + score: 0.9, + reasoning: 'The response is factually accurate', + judgeConfigKey: 'test-judge', + }, + helpfulness: { + score: 0.7, + reasoning: 'The response provides helpful information', + judgeConfigKey: 'test-judge', + }, }, success: true, }); @@ -254,8 +267,8 @@ describe('Judge', () => { // When one metric is missing, it returns the partial evals it has with success: false expect(result).toEqual({ evals: { - relevance: { score: 0.8, reasoning: 'Good' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + relevance: { score: 0.8, reasoning: 'Good', judgeConfigKey: 'test-judge' }, + helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, }, success: false, }); @@ -364,9 +377,21 @@ describe('Judge', () => { expect(result).toEqual({ evals: { - relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - accuracy: { score: 0.9, reasoning: 'The response is factually accurate' }, - helpfulness: { score: 0.7, reasoning: 'The response provides helpful information' }, + relevance: { + score: 0.8, + reasoning: 'The response is relevant to the question', + judgeConfigKey: 'test-judge', + }, + accuracy: { + score: 0.9, + reasoning: 'The response is factually accurate', + judgeConfigKey: 'test-judge', + }, + helpfulness: { + score: 0.7, + reasoning: 'The response provides helpful information', + judgeConfigKey: 'test-judge', + }, }, success: true, }); @@ -454,9 +479,9 @@ describe('Judge', () => { const result = parseResponse(responseData); expect(result).toEqual({ - relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + relevance: { score: 0.8, reasoning: 'Good', judgeConfigKey: 'test-judge' }, + accuracy: { score: 0.9, reasoning: 'Accurate', judgeConfigKey: 'test-judge' }, + helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, }); }); @@ -489,7 +514,7 @@ describe('Judge', () => { // Only helpfulness passes validation, relevance and accuracy are skipped expect(result).toEqual({ - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, }); }); }); diff --git a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts index 2f99ed3a4..bfb5e13ff 100644 --- a/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts +++ b/packages/sdk/server-ai/__tests__/LDAIClientImpl.test.ts @@ -372,6 +372,7 @@ describe('agents method', () => { expect(result).toEqual({ 'research-agent': { + key: 'research-agent', model: { name: 'research-model', parameters: { temperature: 0.3, maxTokens: 2048 }, @@ -382,6 +383,7 @@ describe('agents method', () => { enabled: true, }, 'writing-agent': { + key: 'writing-agent', model: { name: 'writing-model', parameters: { temperature: 0.7, maxTokens: 1024 }, @@ -482,6 +484,7 @@ describe('createJudge method', () => { }; const mockJudgeConfig = { + key: 'test-judge', enabled: true, model: { name: 'gpt-4' }, provider: { name: 'openai' }, @@ -525,6 +528,7 @@ describe('createJudge method', () => { }; const mockJudgeConfig = { + key: 'test-judge', enabled: false, evaluationMetricKeys: [], }; @@ -548,6 +552,7 @@ describe('createJudge method', () => { }; const mockJudgeConfig = { + key: 'test-judge', enabled: true, model: { name: 'gpt-4' }, provider: { name: 'openai' }, diff --git a/packages/sdk/server-ai/__tests__/TrackedChat.test.ts b/packages/sdk/server-ai/__tests__/TrackedChat.test.ts index e70835e4f..d750a47e6 100644 --- a/packages/sdk/server-ai/__tests__/TrackedChat.test.ts +++ b/packages/sdk/server-ai/__tests__/TrackedChat.test.ts @@ -33,6 +33,7 @@ describe('TrackedChat', () => { // Create a basic AI config aiConfig = { + key: 'test-config', enabled: true, messages: [{ role: 'system', content: 'You are a helpful assistant.' }], model: { name: 'gpt-4' }, From a5e82d0a2f06f236f1d9cc9638f3a24fb423fcc7 Mon Sep 17 00:00:00 2001 From: jsonbailey Date: Fri, 7 Nov 2025 17:57:09 +0000 Subject: [PATCH 3/3] move key to judge response --- .../sdk/server-ai/__tests__/Judge.test.ts | 24 +++++++++---------- .../server-ai/src/LDAIConfigTrackerImpl.ts | 11 ++++++--- .../sdk/server-ai/src/api/chat/TrackedChat.ts | 14 +++++++---- .../src/api/config/LDAIConfigTracker.ts | 9 ++++++- packages/sdk/server-ai/src/api/judge/Judge.ts | 3 ++- packages/sdk/server-ai/src/api/judge/types.ts | 4 ++-- 6 files changed, 41 insertions(+), 24 deletions(-) diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index 23efd3eec..248509574 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -110,20 +110,18 @@ describe('Judge', () => { relevance: { score: 0.8, reasoning: 'The response is relevant to the question', - judgeConfigKey: 'test-judge', }, accuracy: { score: 0.9, reasoning: 'The response is factually accurate', - judgeConfigKey: 'test-judge', }, helpfulness: { score: 0.7, reasoning: 'The response provides helpful information', - judgeConfigKey: 'test-judge', }, }, success: true, + judgeConfigKey: 'test-judge', }); expect(mockProvider.invokeStructuredModel).toHaveBeenCalledWith( @@ -267,10 +265,11 @@ describe('Judge', () => { // When one metric is missing, it returns the partial evals it has with success: false expect(result).toEqual({ evals: { - relevance: { score: 0.8, reasoning: 'Good', judgeConfigKey: 'test-judge' }, - helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, + relevance: { score: 0.8, reasoning: 'Good' }, + helpfulness: { score: 0.7, reasoning: 'Helpful' }, }, success: false, + judgeConfigKey: 'test-judge', }); }); @@ -302,6 +301,7 @@ describe('Judge', () => { expect(result).toEqual({ evals: {}, success: false, + judgeConfigKey: 'test-judge', }); }); @@ -315,6 +315,7 @@ describe('Judge', () => { evals: {}, success: false, error: 'Provider error', + judgeConfigKey: 'test-judge', }); expect(mockLogger.error).toHaveBeenCalledWith('Judge evaluation failed:', error); }); @@ -328,6 +329,7 @@ describe('Judge', () => { evals: {}, success: false, error: 'Unknown error', + judgeConfigKey: 'test-judge', }); }); }); @@ -380,20 +382,18 @@ describe('Judge', () => { relevance: { score: 0.8, reasoning: 'The response is relevant to the question', - judgeConfigKey: 'test-judge', }, accuracy: { score: 0.9, reasoning: 'The response is factually accurate', - judgeConfigKey: 'test-judge', }, helpfulness: { score: 0.7, reasoning: 'The response provides helpful information', - judgeConfigKey: 'test-judge', }, }, success: true, + judgeConfigKey: 'test-judge', }); expect(mockProvider.invokeStructuredModel).toHaveBeenCalledWith( @@ -479,9 +479,9 @@ describe('Judge', () => { const result = parseResponse(responseData); expect(result).toEqual({ - relevance: { score: 0.8, reasoning: 'Good', judgeConfigKey: 'test-judge' }, - accuracy: { score: 0.9, reasoning: 'Accurate', judgeConfigKey: 'test-judge' }, - helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, + relevance: { score: 0.8, reasoning: 'Good' }, + accuracy: { score: 0.9, reasoning: 'Accurate' }, + helpfulness: { score: 0.7, reasoning: 'Helpful' }, }); }); @@ -514,7 +514,7 @@ describe('Judge', () => { // Only helpfulness passes validation, relevance and accuracy are skipped expect(result).toEqual({ - helpfulness: { score: 0.7, reasoning: 'Helpful', judgeConfigKey: 'test-judge' }, + helpfulness: { score: 0.7, reasoning: 'Helpful' }, }); }); }); diff --git a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts index 5dcdf4405..582b03224 100644 --- a/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts +++ b/packages/sdk/server-ai/src/LDAIConfigTrackerImpl.ts @@ -3,7 +3,7 @@ import { LDContext } from '@launchdarkly/js-server-sdk-common'; import { name as aiSdkName, version as aiSdkVersion } from '../package.json'; import { LDAIConfigTracker } from './api/config'; import { LDAIMetricSummary } from './api/config/LDAIConfigTracker'; -import { EvalScore } from './api/judge/types'; +import { EvalScore, JudgeResponse } from './api/judge/types'; import { createBedrockTokenUsage, createOpenAiUsage, @@ -76,12 +76,17 @@ export class LDAIConfigTrackerImpl implements LDAIConfigTracker { } trackEvalScores(scores: Record) { - // Track each evaluation score individually Object.entries(scores).forEach(([metricKey, evalScore]) => { + this._ldClient.track(metricKey, this._context, this.getTrackData(), evalScore.score); + }); + } + + trackJudgeResponse(response: JudgeResponse) { + Object.entries(response.evals).forEach(([metricKey, evalScore]) => { this._ldClient.track( metricKey, this._context, - { ...this.getTrackData(), judgeConfigKey: evalScore.judgeConfigKey }, + { ...this.getTrackData(), judgeConfigKey: response.judgeConfigKey }, evalScore.score, ); }); diff --git a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts index dca7f8488..542547bff 100644 --- a/packages/sdk/server-ai/src/api/chat/TrackedChat.ts +++ b/packages/sdk/server-ai/src/api/chat/TrackedChat.ts @@ -84,13 +84,17 @@ export class TrackedChat { return undefined; } - const evalResult = await judge.evaluateMessages(messages, response, judgeConfig.samplingRate); - - if (evalResult && evalResult.success) { - this.tracker.trackEvalScores(evalResult.evals); + const judgeResponse = await judge.evaluateMessages( + messages, + response, + judgeConfig.samplingRate, + ); + + if (judgeResponse && judgeResponse.success) { + this.tracker.trackJudgeResponse(judgeResponse); } - return evalResult; + return judgeResponse; }); // ensure all evaluations complete even if some fail diff --git a/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts b/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts index eb9beb258..4c7436642 100644 --- a/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts +++ b/packages/sdk/server-ai/src/api/config/LDAIConfigTracker.ts @@ -1,4 +1,4 @@ -import { EvalScore } from '../judge/types'; +import { EvalScore, JudgeResponse } from '../judge/types'; import { LDAIMetrics, LDFeedbackKind, LDTokenUsage } from '../metrics'; /** @@ -94,6 +94,13 @@ export interface LDAIConfigTracker { */ trackEvalScores(scores: Record): void; + /** + * Track a judge response containing evaluation scores and judge configuration key. + * + * @param response Judge response containing evaluation scores and judge configuration key + */ + trackJudgeResponse(response: JudgeResponse): void; + /** * Track the duration of execution of the provided function. * diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index 3762519fe..e608743ac 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -91,6 +91,7 @@ export class Judge { return { evals, success, + judgeConfigKey: this._aiConfig.key, }; } catch (error) { this._logger?.error('Judge evaluation failed:', error); @@ -98,6 +99,7 @@ export class Judge { evals: {}, success: false, error: error instanceof Error ? error.message : 'Unknown error', + judgeConfigKey: this._aiConfig.key, }; } } @@ -208,7 +210,6 @@ export class Judge { results[metricKey] = { score: evalData.score, reasoning: evalData.reasoning, - judgeConfigKey: this._aiConfig.key, }; }); diff --git a/packages/sdk/server-ai/src/api/judge/types.ts b/packages/sdk/server-ai/src/api/judge/types.ts index ea61f8bca..68ad141c8 100644 --- a/packages/sdk/server-ai/src/api/judge/types.ts +++ b/packages/sdk/server-ai/src/api/judge/types.ts @@ -24,14 +24,14 @@ export interface EvalScore { score: number; /** Reasoning behind the provided score for this metric */ reasoning: string; - /** The key of the judge configuration that was used to evaluate this metric */ - judgeConfigKey?: string; } /** * Response from a judge evaluation containing scores and reasoning for multiple metrics. */ export interface JudgeResponse { + /** The key of the judge configuration that was used to generate this response */ + judgeConfigKey?: string; /** Dictionary where keys are metric names and values contain score and reasoning */ evals: Record; /** Whether the evaluation completed successfully */