diff --git a/.changeset/sweet-hippos-render.md b/.changeset/sweet-hippos-render.md new file mode 100644 index 000000000..3e9812f70 --- /dev/null +++ b/.changeset/sweet-hippos-render.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents": patch +--- + +feat(metrics): add `playbackLatency` metric on assistant `ChatMessage`s diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index 35794277b..c1341af7d 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -89,6 +89,16 @@ export interface MetricsReport { onUserTurnCompletedDelay?: number; llmNodeTtft?: number; ttsNodeTtfb?: number; + /** + * Delay (in seconds) between forwarding the first audio frame and the `AudioOutput` + * reporting playback started. Near-zero for the default room output (self-reported + * when the frame is pushed to the track, so it doesn't account for network delivery + * to the client); meaningful when a remote avatar worker is in the chain and reports + * playback via the `lk.playback_started` RPC. + * + * Assistant `ChatMessage` only. + */ + playbackLatency?: number; e2eLatency?: number; } diff --git a/agents/src/telemetry/traces.ts b/agents/src/telemetry/traces.ts index 5e992de69..276f48ebb 100644 --- a/agents/src/telemetry/traces.ts +++ b/agents/src/telemetry/traces.ts @@ -307,6 +307,7 @@ interface ProtoMetricsReport { onUserTurnCompletedDelay?: number; llmNodeTtft?: number; ttsNodeTtfb?: number; + playbackLatency?: number; e2eLatency?: number; } @@ -403,6 +404,9 @@ function chatItemToProto(item: ChatItem): ProtoChatItem { if (metrics.ttsNodeTtfb !== undefined) { protoMetrics.ttsNodeTtfb = metrics.ttsNodeTtfb; } + if (metrics.playbackLatency !== undefined) { + protoMetrics.playbackLatency = metrics.playbackLatency; + } if (metrics.e2eLatency !== undefined) { protoMetrics.e2eLatency = metrics.e2eLatency; } diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index b0ddf9d30..9b7482373 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -1829,10 +1829,12 @@ export class AgentActivity implements RecognitionHooks { } let replyStartedSpeakingAt: number | undefined; + let replyStartedForwardingAt: number | undefined; let replyTtsGenData: _TTSGenerationData | null = null; - const onFirstFrame = (startedSpeakingAt?: number) => { + const onFirstFrame = (audioOut: _AudioOut | null, startedSpeakingAt?: number) => { replyStartedSpeakingAt = startedSpeakingAt ?? Date.now(); + replyStartedForwardingAt = audioOut?.startedForwardingAt ?? replyStartedSpeakingAt; this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, @@ -1846,7 +1848,7 @@ export class AgentActivity implements RecognitionHooks { if (!audioOutput) { if (textOut) { textOut.firstTextFut.await - .then(() => onFirstFrame()) + .then(() => onFirstFrame(null)) .catch(() => this.logger.debug('firstTextFut cancelled before first frame')); } } else { @@ -1881,8 +1883,9 @@ export class AgentActivity implements RecognitionHooks { tasks.push(forwardTask); audioOut = _audioOut; } + const audioOutForCb = audioOut; audioOut.firstFrameFut.await - .then((ts) => onFirstFrame(ts)) + .then((ts) => onFirstFrame(audioOutForCb, ts)) .catch(() => this.logger.debug('firstFrameFut cancelled before first frame')); } @@ -1910,6 +1913,11 @@ export class AgentActivity implements RecognitionHooks { if (replyStartedSpeakingAt !== undefined) { replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds + + if (replyStartedForwardingAt !== undefined) { + replyAssistantMetrics.playbackLatency = + (replyStartedSpeakingAt - replyStartedForwardingAt) / 1000; // ms -> seconds + } } const message = ChatMessage.create({ @@ -2107,8 +2115,10 @@ export class AgentActivity implements RecognitionHooks { } let agentStartedSpeakingAt: number | undefined; - const onFirstFrame = (startedSpeakingAt?: number) => { + let agentStartedForwardingAt: number | undefined; + const onFirstFrame = (audioOutRef: _AudioOut | null, startedSpeakingAt?: number) => { agentStartedSpeakingAt = startedSpeakingAt ?? Date.now(); + agentStartedForwardingAt = audioOutRef?.startedForwardingAt ?? agentStartedSpeakingAt; this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, @@ -2130,14 +2140,14 @@ export class AgentActivity implements RecognitionHooks { audioOut = _audioOut; tasks.push(forwardTask); audioOut.firstFrameFut.await - .then((ts) => onFirstFrame(ts)) + .then((ts) => onFirstFrame(audioOut, ts)) .catch(() => this.logger.debug('firstFrameFut cancelled before first frame')); } else { throw Error('ttsGenData is null when audioOutput is enabled'); } } else { textOut?.firstTextFut.await - .then(() => onFirstFrame()) + .then(() => onFirstFrame(null)) .catch(() => this.logger.debug('firstTextFut cancelled before first frame')); } @@ -2186,6 +2196,11 @@ export class AgentActivity implements RecognitionHooks { assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds + if (agentStartedForwardingAt !== undefined) { + assistantMetrics.playbackLatency = + (agentStartedSpeakingAt - agentStartedForwardingAt) / 1000; // ms -> seconds + } + if (userMetrics?.stoppedSpeakingAt !== undefined) { const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt; assistantMetrics.e2eLatency = e2eLatency; diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index cc24fead1..6e8158638 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -786,6 +786,14 @@ export function performTextForwarding( export interface _AudioOut { audio: Array; firstFrameFut: Future; + /** + * Timestamp (ms, `Date.now()`) when the first audio frame was forwarded to the + * `AudioOutput`. Set by `forwardAudio` as soon as the first TTS frame is + * appended; remains `undefined` until then. Used together with the playback-started + * timestamp from `firstFrameFut` to derive the assistant's `playbackLatency` + * metric. + */ + startedForwardingAt?: number; } async function forwardAudio( @@ -822,6 +830,9 @@ async function forwardAudio( if (done) break; out.audio.push(frame); + if (out.startedForwardingAt === undefined) { + out.startedForwardingAt = Date.now(); + } if ( !out.firstFrameFut.done &&