From a2321fe75cc7e7abe1448b0cbe884477c99a3b39 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 27 Apr 2026 06:44:07 +0000 Subject: [PATCH 1/3] feat(metrics): add playbackLatency metric on assistant ChatMessages Ports livekit/agents#5524 to TypeScript. Adds a new `playbackLatency` field on `MetricsReport` measuring the delay (in seconds) between forwarding the first audio frame and the `AudioOutput` reporting that playback started. - `_AudioOut` tracks `startedForwardingAt` (ms) inside `forwardAudio` - pipeline-reply and tts-say paths in `AgentActivity` capture `audioOut.startedForwardingAt` in their `onFirstFrame` callback and populate `playbackLatency = (startedSpeakingAt - startedForwardingAt) / 1000` on the assistant `ChatMessage` metrics. Near-zero for the default room output; meaningful when a remote avatar worker is in the chain and reports playback via `lk.playback_started` RPC. --- .changeset/sweet-hippos-render.md | 5 +++++ agents/src/llm/chat_context.ts | 11 +++++++++++ agents/src/voice/agent_activity.ts | 31 ++++++++++++++++++++++++------ agents/src/voice/generation.ts | 13 +++++++++++++ 4 files changed, 54 insertions(+), 6 deletions(-) create mode 100644 .changeset/sweet-hippos-render.md diff --git a/.changeset/sweet-hippos-render.md b/.changeset/sweet-hippos-render.md new file mode 100644 index 000000000..3e9812f70 --- /dev/null +++ b/.changeset/sweet-hippos-render.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents": patch +--- + +feat(metrics): add `playbackLatency` metric on assistant `ChatMessage`s diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index 35794277b..ad8d546f6 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -89,6 +89,17 @@ export interface MetricsReport { onUserTurnCompletedDelay?: number; llmNodeTtft?: number; ttsNodeTtfb?: number; + /** + * Delay (in seconds) between forwarding the first audio frame and the `AudioOutput` + * reporting playback started. Near-zero for the default room output (self-reported + * when the frame is pushed to the track, so it doesn't account for network delivery + * to the client); meaningful when a remote avatar worker is in the chain and reports + * playback via the `lk.playback_started` RPC. + * + * Assistant `ChatMessage` only. + */ + // Ref: python livekit-agents/livekit/agents/llm/chat_context.py - 294-301 lines + playbackLatency?: number; e2eLatency?: number; } diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index b0ddf9d30..71f79f5b5 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -1829,10 +1829,13 @@ export class AgentActivity implements RecognitionHooks { } let replyStartedSpeakingAt: number | undefined; + let replyStartedForwardingAt: number | undefined; let replyTtsGenData: _TTSGenerationData | null = null; - const onFirstFrame = (startedSpeakingAt?: number) => { + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2195-2208 lines + const onFirstFrame = (audioOut: _AudioOut | null, startedSpeakingAt?: number) => { replyStartedSpeakingAt = startedSpeakingAt ?? Date.now(); + replyStartedForwardingAt = audioOut?.startedForwardingAt ?? replyStartedSpeakingAt; this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, @@ -1846,7 +1849,7 @@ export class AgentActivity implements RecognitionHooks { if (!audioOutput) { if (textOut) { textOut.firstTextFut.await - .then(() => onFirstFrame()) + .then(() => onFirstFrame(null)) .catch(() => this.logger.debug('firstTextFut cancelled before first frame')); } } else { @@ -1881,8 +1884,9 @@ export class AgentActivity implements RecognitionHooks { tasks.push(forwardTask); audioOut = _audioOut; } + const audioOutForCb = audioOut; audioOut.firstFrameFut.await - .then((ts) => onFirstFrame(ts)) + .then((ts) => onFirstFrame(audioOutForCb, ts)) .catch(() => this.logger.debug('firstFrameFut cancelled before first frame')); } @@ -1910,6 +1914,12 @@ export class AgentActivity implements RecognitionHooks { if (replyStartedSpeakingAt !== undefined) { replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds + + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2320-2323 lines + if (replyStartedForwardingAt !== undefined) { + replyAssistantMetrics.playbackLatency = + (replyStartedSpeakingAt - replyStartedForwardingAt) / 1000; // ms -> seconds + } } const message = ChatMessage.create({ @@ -2107,8 +2117,11 @@ export class AgentActivity implements RecognitionHooks { } let agentStartedSpeakingAt: number | undefined; - const onFirstFrame = (startedSpeakingAt?: number) => { + let agentStartedForwardingAt: number | undefined; + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2526-2548 lines + const onFirstFrame = (audioOutRef: _AudioOut | null, startedSpeakingAt?: number) => { agentStartedSpeakingAt = startedSpeakingAt ?? Date.now(); + agentStartedForwardingAt = audioOutRef?.startedForwardingAt ?? agentStartedSpeakingAt; this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, @@ -2130,14 +2143,14 @@ export class AgentActivity implements RecognitionHooks { audioOut = _audioOut; tasks.push(forwardTask); audioOut.firstFrameFut.await - .then((ts) => onFirstFrame(ts)) + .then((ts) => onFirstFrame(audioOut, ts)) .catch(() => this.logger.debug('firstFrameFut cancelled before first frame')); } else { throw Error('ttsGenData is null when audioOutput is enabled'); } } else { textOut?.firstTextFut.await - .then(() => onFirstFrame()) + .then(() => onFirstFrame(null)) .catch(() => this.logger.debug('firstTextFut cancelled before first frame')); } @@ -2186,6 +2199,12 @@ export class AgentActivity implements RecognitionHooks { assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds + // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2645-2647 lines + if (agentStartedForwardingAt !== undefined) { + assistantMetrics.playbackLatency = + (agentStartedSpeakingAt - agentStartedForwardingAt) / 1000; // ms -> seconds + } + if (userMetrics?.stoppedSpeakingAt !== undefined) { const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt; assistantMetrics.e2eLatency = e2eLatency; diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index cc24fead1..42adedb68 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -786,6 +786,15 @@ export function performTextForwarding( export interface _AudioOut { audio: Array; firstFrameFut: Future; + /** + * Timestamp (ms, `Date.now()`) when the first audio frame was forwarded to the + * `AudioOutput`. Set by `forwardAudio` as soon as the first TTS frame is + * appended; remains `undefined` until then. Used together with the playback-started + * timestamp from `firstFrameFut` to derive the assistant's `playbackLatency` + * metric. + */ + // Ref: python livekit-agents/livekit/agents/voice/generation.py - 380 lines + startedForwardingAt?: number; } async function forwardAudio( @@ -822,6 +831,10 @@ async function forwardAudio( if (done) break; out.audio.push(frame); + // Ref: python livekit-agents/livekit/agents/voice/generation.py - 414-416 lines + if (out.startedForwardingAt === undefined) { + out.startedForwardingAt = Date.now(); + } if ( !out.firstFrameFut.done && From 81de3d7804db80996c6326df69089ef150184675 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 28 Apr 2026 08:08:03 +0000 Subject: [PATCH 2/3] chore: remove // Ref: comments per review feedback --- agents/src/llm/chat_context.ts | 1 - agents/src/voice/agent_activity.ts | 4 ---- agents/src/voice/generation.ts | 2 -- 3 files changed, 7 deletions(-) diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index ad8d546f6..c1341af7d 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -98,7 +98,6 @@ export interface MetricsReport { * * Assistant `ChatMessage` only. */ - // Ref: python livekit-agents/livekit/agents/llm/chat_context.py - 294-301 lines playbackLatency?: number; e2eLatency?: number; } diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 71f79f5b5..9b7482373 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -1832,7 +1832,6 @@ export class AgentActivity implements RecognitionHooks { let replyStartedForwardingAt: number | undefined; let replyTtsGenData: _TTSGenerationData | null = null; - // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2195-2208 lines const onFirstFrame = (audioOut: _AudioOut | null, startedSpeakingAt?: number) => { replyStartedSpeakingAt = startedSpeakingAt ?? Date.now(); replyStartedForwardingAt = audioOut?.startedForwardingAt ?? replyStartedSpeakingAt; @@ -1915,7 +1914,6 @@ export class AgentActivity implements RecognitionHooks { replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds - // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2320-2323 lines if (replyStartedForwardingAt !== undefined) { replyAssistantMetrics.playbackLatency = (replyStartedSpeakingAt - replyStartedForwardingAt) / 1000; // ms -> seconds @@ -2118,7 +2116,6 @@ export class AgentActivity implements RecognitionHooks { let agentStartedSpeakingAt: number | undefined; let agentStartedForwardingAt: number | undefined; - // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2526-2548 lines const onFirstFrame = (audioOutRef: _AudioOut | null, startedSpeakingAt?: number) => { agentStartedSpeakingAt = startedSpeakingAt ?? Date.now(); agentStartedForwardingAt = audioOutRef?.startedForwardingAt ?? agentStartedSpeakingAt; @@ -2199,7 +2196,6 @@ export class AgentActivity implements RecognitionHooks { assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds - // Ref: python livekit-agents/livekit/agents/voice/agent_activity.py - 2645-2647 lines if (agentStartedForwardingAt !== undefined) { assistantMetrics.playbackLatency = (agentStartedSpeakingAt - agentStartedForwardingAt) / 1000; // ms -> seconds diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index 42adedb68..6e8158638 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -793,7 +793,6 @@ export interface _AudioOut { * timestamp from `firstFrameFut` to derive the assistant's `playbackLatency` * metric. */ - // Ref: python livekit-agents/livekit/agents/voice/generation.py - 380 lines startedForwardingAt?: number; } @@ -831,7 +830,6 @@ async function forwardAudio( if (done) break; out.audio.push(frame); - // Ref: python livekit-agents/livekit/agents/voice/generation.py - 414-416 lines if (out.startedForwardingAt === undefined) { out.startedForwardingAt = Date.now(); } From 1a154eae3c7869004407f8e852c089d2c350f35f Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 28 Apr 2026 08:17:56 +0000 Subject: [PATCH 3/3] fix(telemetry): include playbackLatency in ProtoMetricsReport Without this, the metric is silently dropped when ChatItems are serialized into OTel traces. Addresses Devin Review finding on PR #1323. --- agents/src/telemetry/traces.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/agents/src/telemetry/traces.ts b/agents/src/telemetry/traces.ts index 5e992de69..276f48ebb 100644 --- a/agents/src/telemetry/traces.ts +++ b/agents/src/telemetry/traces.ts @@ -307,6 +307,7 @@ interface ProtoMetricsReport { onUserTurnCompletedDelay?: number; llmNodeTtft?: number; ttsNodeTtfb?: number; + playbackLatency?: number; e2eLatency?: number; } @@ -403,6 +404,9 @@ function chatItemToProto(item: ChatItem): ProtoChatItem { if (metrics.ttsNodeTtfb !== undefined) { protoMetrics.ttsNodeTtfb = metrics.ttsNodeTtfb; } + if (metrics.playbackLatency !== undefined) { + protoMetrics.playbackLatency = metrics.playbackLatency; + } if (metrics.e2eLatency !== undefined) { protoMetrics.e2eLatency = metrics.e2eLatency; }