diff --git a/.changeset/sweet-hippos-render.md b/.changeset/sweet-hippos-render.md
new file mode 100644
index 000000000..3e9812f70
--- /dev/null
+++ b/.changeset/sweet-hippos-render.md
@@ -0,0 +1,5 @@
+---
+"@livekit/agents": patch
+---
+
+feat(metrics): add `playbackLatency` metric on assistant `ChatMessage`s
diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts
index 35794277b..c1341af7d 100644
--- a/agents/src/llm/chat_context.ts
+++ b/agents/src/llm/chat_context.ts
@@ -89,6 +89,16 @@ export interface MetricsReport {
   onUserTurnCompletedDelay?: number;
   llmNodeTtft?: number;
   ttsNodeTtfb?: number;
+  /**
+   * Delay (in seconds) between forwarding the first audio frame and the `AudioOutput`
+   * reporting playback started. Near-zero for the default room output (self-reported
+   * when the frame is pushed to the track, so it doesn't account for network delivery
+   * to the client); meaningful when a remote avatar worker is in the chain and reports
+   * playback via the `lk.playback_started` RPC.
+   *
+   * Assistant `ChatMessage` only.
+   */
+  playbackLatency?: number;
   e2eLatency?: number;
 }
 
diff --git a/agents/src/telemetry/traces.ts b/agents/src/telemetry/traces.ts
index 5e992de69..276f48ebb 100644
--- a/agents/src/telemetry/traces.ts
+++ b/agents/src/telemetry/traces.ts
@@ -307,6 +307,7 @@ interface ProtoMetricsReport {
   onUserTurnCompletedDelay?: number;
   llmNodeTtft?: number;
   ttsNodeTtfb?: number;
+  playbackLatency?: number;
   e2eLatency?: number;
 }
 
@@ -403,6 +404,9 @@ function chatItemToProto(item: ChatItem): ProtoChatItem {
       if (metrics.ttsNodeTtfb !== undefined) {
         protoMetrics.ttsNodeTtfb = metrics.ttsNodeTtfb;
       }
+      if (metrics.playbackLatency !== undefined) {
+        protoMetrics.playbackLatency = metrics.playbackLatency;
+      }
       if (metrics.e2eLatency !== undefined) {
         protoMetrics.e2eLatency = metrics.e2eLatency;
       }
diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts
index b0ddf9d30..9b7482373 100644
--- a/agents/src/voice/agent_activity.ts
+++ b/agents/src/voice/agent_activity.ts
@@ -1829,10 +1829,12 @@ export class AgentActivity implements RecognitionHooks {
     }
 
     let replyStartedSpeakingAt: number | undefined;
+    let replyStartedForwardingAt: number | undefined;
     let replyTtsGenData: _TTSGenerationData | null = null;
 
-    const onFirstFrame = (startedSpeakingAt?: number) => {
+    const onFirstFrame = (audioOut: _AudioOut | null, startedSpeakingAt?: number) => {
       replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
+      replyStartedForwardingAt = audioOut?.startedForwardingAt ?? replyStartedSpeakingAt;
       this.agentSession._updateAgentState('speaking', {
         startTime: startedSpeakingAt,
         otelContext: speechHandle._agentTurnContext,
@@ -1846,7 +1848,7 @@ export class AgentActivity implements RecognitionHooks {
     if (!audioOutput) {
       if (textOut) {
         textOut.firstTextFut.await
-          .then(() => onFirstFrame())
+          .then(() => onFirstFrame(null))
           .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
       }
     } else {
@@ -1881,8 +1883,9 @@ export class AgentActivity implements RecognitionHooks {
         tasks.push(forwardTask);
         audioOut = _audioOut;
       }
+      const audioOutForCb = audioOut;
       audioOut.firstFrameFut.await
-        .then((ts) => onFirstFrame(ts))
+        .then((ts) => onFirstFrame(audioOutForCb, ts))
         .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
     }
 
@@ -1910,6 +1913,11 @@ export class AgentActivity implements RecognitionHooks {
       if (replyStartedSpeakingAt !== undefined) {
         replyAssistantMetrics.startedSpeakingAt = replyStartedSpeakingAt / 1000; // ms -> seconds
         replyAssistantMetrics.stoppedSpeakingAt = replyStoppedSpeakingAt / 1000; // ms -> seconds
+
+        if (replyStartedForwardingAt !== undefined) {
+          replyAssistantMetrics.playbackLatency =
+            (replyStartedSpeakingAt - replyStartedForwardingAt) / 1000; // ms -> seconds
+        }
       }
 
       const message = ChatMessage.create({
@@ -2107,8 +2115,10 @@ export class AgentActivity implements RecognitionHooks {
     }
 
     let agentStartedSpeakingAt: number | undefined;
-    const onFirstFrame = (startedSpeakingAt?: number) => {
+    let agentStartedForwardingAt: number | undefined;
+    const onFirstFrame = (audioOutRef: _AudioOut | null, startedSpeakingAt?: number) => {
       agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
+      agentStartedForwardingAt = audioOutRef?.startedForwardingAt ?? agentStartedSpeakingAt;
       this.agentSession._updateAgentState('speaking', {
         startTime: startedSpeakingAt,
         otelContext: speechHandle._agentTurnContext,
@@ -2130,14 +2140,14 @@ export class AgentActivity implements RecognitionHooks {
         audioOut = _audioOut;
         tasks.push(forwardTask);
         audioOut.firstFrameFut.await
-          .then((ts) => onFirstFrame(ts))
+          .then((ts) => onFirstFrame(audioOut, ts))
           .catch(() => this.logger.debug('firstFrameFut cancelled before first frame'));
       } else {
         throw Error('ttsGenData is null when audioOutput is enabled');
       }
     } else {
       textOut?.firstTextFut.await
-        .then(() => onFirstFrame())
+        .then(() => onFirstFrame(null))
         .catch(() => this.logger.debug('firstTextFut cancelled before first frame'));
     }
 
@@ -2186,6 +2196,11 @@ export class AgentActivity implements RecognitionHooks {
       assistantMetrics.startedSpeakingAt = agentStartedSpeakingAt / 1000; // ms -> seconds
       assistantMetrics.stoppedSpeakingAt = agentStoppedSpeakingAt / 1000; // ms -> seconds
 
+      if (agentStartedForwardingAt !== undefined) {
+        assistantMetrics.playbackLatency =
+          (agentStartedSpeakingAt - agentStartedForwardingAt) / 1000; // ms -> seconds
+      }
+
       if (userMetrics?.stoppedSpeakingAt !== undefined) {
         const e2eLatency = agentStartedSpeakingAt / 1000 - userMetrics.stoppedSpeakingAt;
         assistantMetrics.e2eLatency = e2eLatency;
diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts
index cc24fead1..6e8158638 100644
--- a/agents/src/voice/generation.ts
+++ b/agents/src/voice/generation.ts
@@ -786,6 +786,14 @@ export function performTextForwarding(
 export interface _AudioOut {
   audio: Array<AudioFrame>;
   firstFrameFut: Future<number>;
+  /**
+   * Timestamp (ms, `Date.now()`) when the first audio frame was forwarded to the
+   * `AudioOutput`. Set by `forwardAudio` as soon as the first TTS frame is
+   * appended; remains `undefined` until then. Used together with the playback-started
+   * timestamp from `firstFrameFut` to derive the assistant's `playbackLatency`
+   * metric.
+   */
+  startedForwardingAt?: number;
 }
 
 async function forwardAudio(
@@ -822,6 +830,9 @@ async function forwardAudio(
       if (done) break;
 
       out.audio.push(frame);
+      if (out.startedForwardingAt === undefined) {
+        out.startedForwardingAt = Date.now();
+      }
 
       if (
         !out.firstFrameFut.done &&