From 356dfb757a8980da6978c0c38e1053601b9f6614 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Wed, 6 Aug 2025 17:02:08 -0400 Subject: [PATCH 01/51] feat: add wip hacked together start to an agents sdk --- agent-sdk/index.tsx | 661 ++++++++++++++++++ components/app.tsx | 33 +- .../agent-control-bar/agent-control-bar.tsx | 6 +- .../hooks/use-agent-control-bar.ts | 14 +- components/livekit/chat/chat-entry.tsx | 13 +- components/session-view.tsx | 66 +- 6 files changed, 726 insertions(+), 67 deletions(-) create mode 100644 agent-sdk/index.tsx diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx new file mode 100644 index 000000000..6f3862282 --- /dev/null +++ b/agent-sdk/index.tsx @@ -0,0 +1,661 @@ +import * as React from "react"; +import { useContext, useEffect, useState, useCallback } from "react"; +import { parallelMerge } from 'streaming-iterables'; +import { ConnectionState, LocalParticipant, Participant, ParticipantEvent, RemoteParticipant, Room, RoomEvent, Track, TrackEvent, TrackPublication, TranscriptionSegment } from "livekit-client"; +import { EventEmitter } from "stream"; +import { addMediaTimestampToTranscription, dedupeSegments, participantTrackEvents, TrackReference } from '@livekit/components-core'; +import { getParticipantTrackRefs } from '@livekit/components-core/src/observables/track'; +import { ParticipantEventCallbacks, ParticipantKind } from "livekit-client/dist/src/room/participant/Participant"; +import { TRACK_TRANSCRIPTION_DEFAULTS } from "../hooks"; +import { Future } from "livekit-client/dist/src/room/utils"; + +// --------------------- +// REACT +// --------------------- + +const AgentSessionContext = React.createContext(null); +export const AgentSessionProvider: React.FunctionComponent> = ({ agentSession, children }) => ( + + {children} + +); + +export function useAgentSession() { + const agentSession = useContext(AgentSessionContext); + if (!agentSession) { + throw new Error('useAgentSession not used within AgentSessionContext!'); + } + 
return agentSession; +} + +export function useAgentMessages() { + const agentSession = useAgentSession(); + + const [messages, setMessages] = useState< + Array + >(agentSession.messages); + useEffect(() => { + agentSession.on(AgentSessionEvent.MessagesChanged, setMessages); + }, [agentSession]); + + const send = useCallback(async (message: OutboundMessage) => { + return agentSession.sendMessage(message); + }, [agentSession]); + + return { messages, send }; +} + +export function useAgentSessionEvent( + eventName: AgentSessionEvent, + callback: (data: any /* FIXME: types */) => void, + dependencies: React.DependencyList, +) { + const agentSession = useAgentSession(); + + // FIXME: is doing this memoiztion here a good idea? Maybe useAgentSessionEvent(..., useCallback(...)) is preferrable? + const memoizedCallback = useCallback(callback, dependencies); + + useEffect(() => { + agentSession.on(eventName, memoizedCallback); + return () => { + agentSession.off(eventName, memoizedCallback); + }; + }, [eventName, memoizedCallback]); +} + +export function useAgentState() { + const agentSession = useAgentSession(); + const [agentState, setAgentState] = useState(agentSession.state); + const [isAvailable, setIsAvailable] = useState(agentSession.isAvailable); + + useAgentSessionEvent(AgentSessionEvent.AgentStateChanged, (newAgentState: AgentState) => { + setAgentState(newAgentState); + setIsAvailable(agentSession.isAvailable); + }, []); + + return { state: agentState, isAvailable }; +} + +function useParticipantEvents

( + participant: P, + eventNames: Array, + callback: (data: any /* FIXME: types */) => void, + dependencies: React.DependencyList, +) { + // FIXME: is doing this memoiztion here a good idea? Maybe useAgentSessionEvent(..., useCallback(...)) is preferrable? + const memoizedCallback = useCallback(callback, dependencies); + + useEffect(() => { + for (const eventName of eventNames) { + participant.on(eventName, memoizedCallback); + } + return () => { + for (const eventName of eventNames) { + participant.off(eventName, memoizedCallback); + } + }; + }, [participant, eventNames, memoizedCallback]); +} + +export function useAgentLocalParticipant() { + const agentSession = useAgentSession(); + + const [localParticipant, setLocalParticipant] = React.useState(agentSession.localParticipant); + const [microphoneTrack, setMicrophoneTrack] = React.useState(null); + + const participantObserver = useParticipantEvents(agentSession.localParticipant, [ + ParticipantEvent.TrackMuted, + ParticipantEvent.TrackUnmuted, + ParticipantEvent.ParticipantPermissionsChanged, + // ParticipantEvent.IsSpeakingChanged, + ParticipantEvent.TrackPublished, + ParticipantEvent.TrackUnpublished, + ParticipantEvent.LocalTrackPublished, + ParticipantEvent.LocalTrackUnpublished, + ParticipantEvent.MediaDevicesError, + ParticipantEvent.TrackSubscriptionStatusChanged, + // ParticipantEvent.ConnectionQualityChanged, + ], (p: LocalParticipant) => { + setLocalParticipant(p); + // FIXME: is the rest of this stuff needed? + // const { isMicrophoneEnabled, isCameraEnabled, isScreenShareEnabled } = p; + const microphoneTrack = p.getTrackPublication(Track.Source.Microphone); + setMicrophoneTrack(microphoneTrack ?? 
null); + // const cameraTrack = p.getTrackPublication(Track.Source.Camera); + // const participantMedia: ParticipantMedia = { + // isCameraEnabled, + // isMicrophoneEnabled, + // isScreenShareEnabled, + // cameraTrack, + // microphoneTrack, + // participant: p, + // }; + // return participantMedia; + }, []); + + return { localParticipant, microphoneTrack }; +} + +// hook ideas: +// useAgentTracks? (video) +// useAgentControls? (control bar stuff) + +// --------------------- +// BASE +// --------------------- + +const stateAttribute = 'lk.agent.state'; + +export type AgentState = + | 'disconnected' + | 'connecting' + | 'initializing' + | 'listening' + | 'thinking' + | 'speaking'; + +enum AgentParticipantEvent { + VideoTrackChanged = 'videoTrackChanged', + AudioTrackChanged = 'videoTrackChanged', + AgentAttributesChanged = 'agentAttributesChanged', + AgentTranscriptionsChanged = 'agentTranscriptionsChanged', +} + +// Goal: some sort of abstraction layer to provide information specific to the agent's interactions +// like video stream / audio stream / transcriptions / underlying participant attributes / etc, +// since it doesn't just come from one RemoteParticipant +// FIXME: maybe this could be named better? ... 
+class AgentParticipant extends EventEmitter { + private room: Room; + + private agentParticipant: RemoteParticipant | null = null; + private workerParticipant: RemoteParticipant | null = null; + audioTrack: TrackReference | null = null; + videoTrack: TrackReference | null = null; + + audioTrackSyncTime: { timestamp: number, rtpTimestamp?: number } | null = null; + + attributes: Record = {}; + + transcriptions: Array = []; + transcriptionBufferSize: number = TRACK_TRANSCRIPTION_DEFAULTS.bufferSize; + + constructor(room: Room) { + super(); + this.room = room; + + this.room.on(RoomEvent.ParticipantConnected, this.handleParticipantConnected); + this.room.on(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); + } + + teardown() { + this.room.off(RoomEvent.ParticipantConnected, this.handleParticipantConnected); + this.room.off(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); + } + + private handleParticipantConnected = () => { + this.updateParticipants(); + } + private handleParticipantDisconnected = () => { + this.updateParticipants(); + } + + private updateParticipants() { + const newAgentParticipant = this.roomRemoteParticipants.find( + (p) => p.kind === ParticipantKind.AGENT && !('lk.publish_on_behalf' in p.attributes), + ) ?? null; + const newWorkerParticipant = newAgentParticipant ? ( + this.roomRemoteParticipants.find( + (p) => + p.kind === ParticipantKind.AGENT && p.attributes['lk.publish_on_behalf'] === newAgentParticipant.identity, + ) ?? null + ) : null; + + // Keep this.agentParticipant / this.workerParticipant up to date + for (const event of participantTrackEvents) { + if (this.agentParticipant !== newAgentParticipant) { + this.agentParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + // FIXME: emit AgentParticipantChanged? 
+ newAgentParticipant?.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + this.agentParticipant = newAgentParticipant; + } + if (this.workerParticipant !== newWorkerParticipant) { + this.workerParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + // FIXME: emit WorkerParticipantChanged? + newWorkerParticipant?.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + this.workerParticipant = newWorkerParticipant; + } + } + + if (this.agentParticipant !== newAgentParticipant) { + this.agentParticipant?.off(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); + // FIXME: emit AgentAttributesChanged? + newAgentParticipant?.on(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); + this.agentParticipant = newAgentParticipant; + } + } + + private handleUpdateTracks = () => { + const newVideoTrack = ( + this.agentTracks.find((t) => t.source === Track.Source.Camera) ?? + this.workerTracks.find((t) => t.source === Track.Source.Camera) ?? null + ); + if (this.videoTrack !== newVideoTrack) { + this.videoTrack = newVideoTrack; + this.emit(AgentParticipantEvent.VideoTrackChanged, newVideoTrack); + } + + const newAudioTrack = ( + this.agentTracks.find((t) => t.source === Track.Source.Microphone) ?? + this.workerTracks.find((t) => t.source === Track.Source.Microphone) ?? 
null + ); + if (this.audioTrack !== newAudioTrack) { + this.audioTrack = newAudioTrack; + this.audioTrack?.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); + + this.audioTrackSyncTime = { + timestamp: Date.now(), + rtpTimestamp: this.audioTrack?.publication.track?.rtpTimestamp, + }; + this.audioTrack?.publication.track?.on(TrackEvent.TimeSyncUpdate, this.handleTimeSyncUpdate); + + this.emit(AgentParticipantEvent.AudioTrackChanged, newAudioTrack); + } + }; + + private handleAttributesChanged = (attributes: Record) => { + this.attributes = attributes; + this.emit(AgentParticipantEvent.AgentAttributesChanged, attributes); + }; + + private handleTranscriptionReceived = (event: Array>) => { + const segments = event[0]; + if (!segments) { + return; + } + if (!this.audioTrackSyncTime) { + throw new Error('AgentParticipant - audioTrackSyncTime missing'); + } + const audioTrackSyncTime = this.audioTrackSyncTime; + + this.transcriptions = dedupeSegments( + this.transcriptions, + // when first receiving a segment, add the current media timestamp to it + segments.map((s) => addMediaTimestampToTranscription(s, audioTrackSyncTime)), + this.transcriptionBufferSize, + ); + this.emit(AgentParticipantEvent.AgentTranscriptionsChanged, this.transcriptions); + } + + private handleTimeSyncUpdate = (update: { timestamp: number; rtpTimestamp: number }) => { + this.audioTrackSyncTime = update; + }; + + private get roomRemoteParticipants() { + return Array.from(this.room.remoteParticipants.values()); + } + + private get agentTracks() { + if (!this.agentParticipant) { + return []; + } + return getParticipantTrackRefs( + this.agentParticipant, + { sources: [Track.Source.Microphone, Track.Source.Camera] } + ); + } + + private get workerTracks() { + if (!this.workerParticipant) { + return []; + } + return getParticipantTrackRefs( + this.workerParticipant, + { sources: [Track.Source.Microphone, Track.Source.Camera] } + ); + } +} + +abstract class BaseContent { + complete: 
boolean = false; +} + +export class TextContent extends BaseContent { + // TODO: some sort of id / `key`able field? + data: string; + + constructor(data: string) { + super(); + this.data = data; + this.complete = true; + } +} + +class TranscriptionContent extends BaseContent { + // TODO: some sort of id / `key`able field? How does this get generated / where does this come + // from? + data: string; + segmentId: TranscriptionSegment['id']; + + constructor(segment: TranscriptionSegment) { + super(); + this.segmentId = segment.id; + this.data = segment.text; + } +} + + + + + +abstract class BaseMessage { + id: string; + timestamp: Date; + metadata: Record = {}; + + constructor(id: string, timestamp: Date) { + this.id = id; + this.timestamp = timestamp; + } +} + +// TODO: images? attachments? rpc? +type InboundMessageContent = TranscriptionContent; + +export class InboundMessage extends BaseMessage { + contents: Array = []; + + constructor( + contents: Array, + id: string, + timestamp: Date = new Date(), + ) { + super(id, timestamp); + this.contents = contents; + } + + get complete() { + return this.contents.every(c => c.complete); + } +} + +type OutboundMessageContent = TextContent; +export class OutboundMessage extends BaseMessage { + contents: Array = []; + + constructor( + contents: Array, + id: string, + timestamp: Date = new Date() + ) { + super(id, timestamp); + this.contents = contents; + } +} + + + +enum MessageReceiverEvents { + NewIncomingMessage = 'newIncomingMessage' +} + +class MessageReceiverTerminationError extends Error {} + +abstract class MessageReceiver extends EventEmitter { + private signallingFuture = new Future(); + private queue: Array = []; + + // This returns cleanup function like useEffect maybe? That could be a good pattern? 
+ abstract start(): Promise void)>; + + /** Submit new IncomingMessages to be received by anybody reading from messages() */ + protected enqueue(...messages: Array) { + for (const message of messages) { + this.queue.push(message); + this.emit(MessageReceiverEvents.NewIncomingMessage, message); + } + const oldSignallingFuture = this.signallingFuture; + this.signallingFuture = new Future(); + oldSignallingFuture.resolve?.(null); + } + + /** Terminate the messages() iteration from out of band */ + close() { + const name: string = (this as any).constructor.name ?? 'MessageReceiver'; + this.signallingFuture.reject?.( + new MessageReceiverTerminationError(`${name} terminated messages() iteration`) + ); + } + + /** A stream of newly generated `IncomingMessage`s */ + async *messages(): AsyncGenerator { + const cleanup = await this.start(); + try { + while (true) { + await this.signallingFuture.promise; + yield* this.queue; + this.queue = []; + } + } catch (err) { + if (err instanceof MessageReceiverTerminationError) { + cleanup?.(); + return; + } + } finally { + cleanup?.(); + } + } +} + +abstract class MessageSender { + abstract send(message: OutboundMessage): Promise; +} + +class TranscriptionMessageReceiver extends MessageReceiver { + agentParticipant: AgentParticipant; + + constructor(agentParticipant: AgentParticipant) { + super(); + this.agentParticipant = agentParticipant; + } + + async start() { + const handleAgentTranscriptionsChanged = (newTranscriptionSegments: Array) => { + for (const segment of newTranscriptionSegments) { + this.enqueue(new InboundMessage([ + new TranscriptionContent(segment), + ], new Date(segment.startTime))); + } + }; + + this.agentParticipant.on( + AgentParticipantEvent.AgentTranscriptionsChanged, + handleAgentTranscriptionsChanged, + ); + return () => { + this.agentParticipant.off( + AgentParticipantEvent.AgentTranscriptionsChanged, + handleAgentTranscriptionsChanged, + ); + }; + } +} + + + + +/** + * A `MessageReceiver` which takes a 
list of other `MessageReceiver`s and forwards along their `InboundMessage`s + * Conceptually, think `Promise.race` being run across each async iterator iteration. + */ +class CombinedMessageReceiver extends MessageReceiver { + private messageReceivers: Array; + + constructor(...messageReceivers: Array) { + super(); + this.messageReceivers = messageReceivers; + } + + async start() { + for await (const inboundMessage of parallelMerge(...this.messageReceivers.map(mr => mr.messages()))) { + this.enqueue(inboundMessage); + } + + return () => { + for (const messageReceiver of this.messageReceivers) { + messageReceiver.close(); + } + }; + } +} + + +export enum AgentSessionEvent { + AgentStateChanged = 'agentStateChanged', + AudioTrackChanged = 'audioTrackChanged', + VideoTrackChanged = 'videoTrackChanged', + AgentAttributesChanged = 'agentAttributesChanged', + MessagesChanged = 'messagesChanged', + AgentConnectionFailure = 'AgentConnectionFailure', +} + +export class AgentSession extends EventEmitter { + private room: Room; + state: AgentState = 'disconnected'; + + agentParticipant: AgentParticipant | null = null; + messageReceiver: MessageReceiver | null = null; + messages: Array = []; + // private transcriptionMessageReceiver: TranscriptionMessageReceiver; + // this.transcriptionMessageReceiver = new TranscriptionMessageReceiver(agentParticipant); + // this.transcriptionMessageReceiver.messages(), + // /* more `MessageReceiver`s here later */ + + constructor() { + super(); + + this.room = new Room(); + this.room.on(RoomEvent.Connected, this.handleRoomConnected); + this.room.on(RoomEvent.Disconnected, this.handleRoomDisconnected); + this.room.on(RoomEvent.ConnectionStateChanged, this.handleConnectionStateChanged); + } + + async connect(url: string, token: string) { + // FIXME: catch connection errors here and reraise? idk + await Promise.all([ + this.room.connect(url, token), + // FIXME: make it so the preconenct buffer thing can be disabled? 
+ this.room.localParticipant.setMicrophoneEnabled(true, undefined, { preConnectBuffer: true }), + ]); + } + async disconnect() { + await this.room.disconnect(); + } + + private handleRoomConnected = () => { + this.agentParticipant = new AgentParticipant(this.room); + this.agentParticipant.on(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); + + this.messageReceiver = new CombinedMessageReceiver( + new TranscriptionMessageReceiver(this.agentParticipant), + ); + this.messageReceiver.on(MessageReceiverEvents.NewIncomingMessage, this.handleIncomingMessage); + + this.startAgentConnectedTimeout(); + } + + private handleRoomDisconnected = () => { + this.agentParticipant?.teardown(); + this.agentParticipant = null; + + this.messageReceiver?.off(MessageReceiverEvents.NewIncomingMessage, this.handleIncomingMessage); + this.messageReceiver?.close(); + this.messageReceiver = null; + + if (this.agentConnectedTimeout) { + clearTimeout(this.agentConnectedTimeout); + this.agentConnectedTimeout = null; + } + } + + private agentConnectedTimeout: NodeJS.Timeout | null = null; + private startAgentConnectedTimeout = () => { + this.agentConnectedTimeout = setTimeout(() => { + if (!this.isAvailable) { + const reason = + this.state === 'connecting' + ? 'Agent did not join the room. ' + : 'Agent connected but did not complete initializing. '; + + this.emit(AgentSessionEvent.AgentConnectionFailure, reason); + this.room.disconnect(); + } + }, 10_000); + } + + private handleConnectionStateChanged = async () => { + this.updateAgentState(); + } + + private handleAgentAttributesChanged = () => { + this.updateAgentState(); + } + + private handleIncomingMessage = (incomingMessage: InboundMessage) => { + // FIXME: Do message accumulation here? Or maybe add some other entity to handle it? 
+ this.messages.push(incomingMessage); + this.emit(AgentSessionEvent.MessagesChanged, this.messages); + } + + private updateAgentState = () => { + if (!this.agentParticipant) { + throw new Error('AgentSession.agentParticipant is unset'); + } + const agentParticipantAttributes = this.agentParticipant.attributes; + const connectionState = this.room.state; + + let newAgentState: AgentState | null = null; + if (connectionState === ConnectionState.Disconnected) { + newAgentState = 'disconnected'; + } else if ( + connectionState === ConnectionState.Connecting || + !this.agentParticipant || + !agentParticipantAttributes?.[stateAttribute] + ) { + newAgentState = 'connecting'; + } else { + newAgentState = agentParticipantAttributes[stateAttribute] as AgentState; + } + + if (this.state !== newAgentState) { + this.state = newAgentState; + this.emit(AgentSessionEvent.AgentStateChanged, newAgentState); + } + } + + get isAvailable() { + return this.state == 'listening' || this.state == 'thinking' || this.state == 'speaking'; + } + + get localParticipant() { + return this.room?.localParticipant ?? null; + } + + // Mesasges: + // - transcriptions are probably how agent generated messages come into being? + // - lk.chat data channel messages also exist + async sendMessage(message: OutboundMessage) { + /* TODO */ + } + + generateReply() {} +} + + +// Proposal: +// Copy of LiveKitRoom, but for agents (LiveKitAgentSession?) +// - This exposes a context like RoomContext +// Hooks that replicate a lot of useVoiceAssistant functionality which tap into agent context: +// - useAgent gets raw AgentSession +// - useAgentMessages? 
+// - useAgentSend diff --git a/components/app.tsx b/components/app.tsx index 725170b28..8e1e8a03f 100644 --- a/components/app.tsx +++ b/components/app.tsx @@ -10,6 +10,7 @@ import { Toaster } from '@/components/ui/sonner'; import { Welcome } from '@/components/welcome'; import useConnectionDetails from '@/hooks/useConnectionDetails'; import type { AppConfig } from '@/lib/types'; +import { AgentSessionProvider } from '@/agent-sdk'; const MotionWelcome = motion.create(Welcome); const MotionSessionView = motion.create(SessionView); @@ -19,7 +20,7 @@ interface AppProps { } export function App({ appConfig }: AppProps) { - const room = useMemo(() => new Room(), []); + const agentSession = useMemo(() => new AgentSession(), []); const [sessionStarted, setSessionStarted] = useState(false); const { connectionDetails, refreshConnectionDetails } = useConnectionDetails(); @@ -34,23 +35,21 @@ export function App({ appConfig }: AppProps) { description: `${error.name}: ${error.message}`, }); }; - room.on(RoomEvent.MediaDevicesError, onMediaDevicesError); - room.on(RoomEvent.Disconnected, onDisconnected); + agentSession.room.on(RoomEvent.MediaDevicesError, onMediaDevicesError); + agentSession.room.on(RoomEvent.Disconnected, onDisconnected); return () => { - room.off(RoomEvent.Disconnected, onDisconnected); - room.off(RoomEvent.MediaDevicesError, onMediaDevicesError); + agentSession.room.off(RoomEvent.Disconnected, onDisconnected); + agentSession.room.off(RoomEvent.MediaDevicesError, onMediaDevicesError); }; - }, [room, refreshConnectionDetails]); + }, [agentSession, refreshConnectionDetails]); useEffect(() => { let aborted = false; - if (sessionStarted && room.state === 'disconnected' && connectionDetails) { - Promise.all([ - room.localParticipant.setMicrophoneEnabled(true, undefined, { - preConnectBuffer: appConfig.isPreConnectBufferEnabled, - }), - room.connect(connectionDetails.serverUrl, connectionDetails.participantToken), - ]).catch((error) => { + if (sessionStarted && 
agentSession.state === 'disconnected' && connectionDetails) { + agentSession.connect( + connectionDetails.serverUrl, + connectionDetails.participantToken, + ).catch((error) => { if (aborted) { // Once the effect has cleaned up after itself, drop any errors // @@ -68,9 +67,9 @@ export function App({ appConfig }: AppProps) { } return () => { aborted = true; - room.disconnect(); + agentSession.disconnect(); }; - }, [room, sessionStarted, connectionDetails, appConfig.isPreConnectBufferEnabled]); + }, [agentSession, sessionStarted, connectionDetails /* , appConfig.isPreConnectBufferEnabled */]); const { startButtonText } = appConfig; @@ -86,7 +85,7 @@ export function App({ appConfig }: AppProps) { transition={{ duration: 0.5, ease: 'linear', delay: sessionStarted ? 0 : 0.5 }} /> - + {/* --- */} @@ -103,7 +102,7 @@ export function App({ appConfig }: AppProps) { delay: sessionStarted ? 0.5 : 0, }} /> - + diff --git a/components/livekit/agent-control-bar/agent-control-bar.tsx b/components/livekit/agent-control-bar/agent-control-bar.tsx index 3f5051c58..2cd127c00 100644 --- a/components/livekit/agent-control-bar/agent-control-bar.tsx +++ b/components/livekit/agent-control-bar/agent-control-bar.tsx @@ -13,6 +13,7 @@ import { cn } from '@/lib/utils'; import { DeviceSelect } from '../device-select'; import { TrackToggle } from '../track-toggle'; import { UseAgentControlBarProps, useAgentControlBar } from './hooks/use-agent-control-bar'; +import { useAgentSession, useAgentState } from '@/agent-sdk'; export interface AgentControlBarProps extends React.HTMLAttributes, @@ -38,11 +39,12 @@ export function AgentControlBar({ onDeviceError, ...props }: AgentControlBarProps) { - const participants = useRemoteParticipants(); + // const participants = useRemoteParticipants(); const [chatOpen, setChatOpen] = React.useState(false); const [isSendingMessage, setIsSendingMessage] = React.useState(false); - const isAgentAvailable = participants.some((p) => p.isAgent); + // const 
isAgentAvailable = participants.some((p) => p.isAgent); + const { isAvailable: isAgentAvailable } = useAgentState(); const isInputDisabled = !chatOpen || !isAgentAvailable || isSendingMessage; const [isDisconnecting, setIsDisconnecting] = React.useState(false); diff --git a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts index 2c1495e8f..7716df56e 100644 --- a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts +++ b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts @@ -8,6 +8,7 @@ import { useTrackToggle, } from '@livekit/components-react'; import { usePublishPermissions } from './use-publish-permissions'; +import { useAgentLocalParticipant } from '@/agent-sdk'; export interface ControlBarControls { microphone?: boolean; @@ -40,19 +41,20 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen leave: true, ...controls, }; - const { microphoneTrack, localParticipant } = useLocalParticipant(); - const publishPermissions = usePublishPermissions(); + // const { microphoneTrack, localParticipant } = useLocalParticipant(); + const { microphoneTrack, localParticipant } = useAgentLocalParticipant(); + const publishPermissions = usePublishPermissions(); // FIXME: replace this hook? const room = useRoomContext(); - const microphoneToggle = useTrackToggle({ + const microphoneToggle = useTrackToggle({ // FIXME: replace this hook? source: Track.Source.Microphone, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Microphone, error }), }); - const cameraToggle = useTrackToggle({ + const cameraToggle = useTrackToggle({ // FIXME: replace this hook? source: Track.Source.Camera, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Camera, error }), }); - const screenShareToggle = useTrackToggle({ + const screenShareToggle = useTrackToggle({ // FIXME: replace this hook? 
source: Track.Source.ScreenShare, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.ScreenShare, error }), }); @@ -75,7 +77,7 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen saveAudioInputDeviceId, saveVideoInputEnabled, saveVideoInputDeviceId, - } = usePersistentUserChoices({ + } = usePersistentUserChoices({ // FIXME: replace this hook? preventSave: !saveUserChoices, }); diff --git a/components/livekit/chat/chat-entry.tsx b/components/livekit/chat/chat-entry.tsx index 1ad1ab849..3939fdd81 100644 --- a/components/livekit/chat/chat-entry.tsx +++ b/components/livekit/chat/chat-entry.tsx @@ -2,10 +2,11 @@ import * as React from 'react'; import type { MessageFormatter, ReceivedChatMessage } from '@livekit/components-react'; import { cn } from '@/lib/utils'; import { useChatMessage } from './hooks/utils'; +import { InboundMessage, OutboundMessage } from '@/agent-sdk'; export interface ChatEntryProps extends React.HTMLAttributes { /** The chat massage object to display. */ - entry: ReceivedChatMessage; + entry: InboundMessage | OutboundMessage; /** Hide sender name. Useful when displaying multiple consecutive chat messages from the same person. */ hideName?: boolean; /** Hide message timestamp. */ @@ -22,9 +23,15 @@ export const ChatEntry = ({ className, ...props }: ChatEntryProps) => { - const { message, hasBeenEdited, time, locale, name } = useChatMessage(entry, messageFormatter); + // FIXME: Where would this kind of metadata come from for real? + // const { message, hasBeenEdited, time, locale, name } = useChatMessage(entry, messageFormatter); + const message = entry.contents.map(c => c.data).join(''); + const hasBeenEdited = false; + const time = entry.timestamp; + const locale = typeof navigator !== 'undefined' ? navigator.language : 'en-US'; + const name = entry instanceof OutboundMessage ? 'User' : 'Agent'; - const isUser = entry.from?.isLocal ?? 
false; + const isUser = entry instanceof OutboundMessage;//entry.from?.isLocal ?? false; const messageOrigin = isUser ? 'remote' : 'local'; return ( diff --git a/components/session-view.tsx b/components/session-view.tsx index ae6019933..a553ef8ea 100644 --- a/components/session-view.tsx +++ b/components/session-view.tsx @@ -17,6 +17,7 @@ import useChatAndTranscription from '@/hooks/useChatAndTranscription'; import { useDebugMode } from '@/hooks/useDebug'; import type { AppConfig } from '@/lib/types'; import { cn } from '@/lib/utils'; +import { AgentSessionEvent, OutboundMessage, TextContent, useAgentMessages, useAgentSession, useAgentSessionEvent } from '@/agent-sdk'; function isAgentAvailable(agentState: AgentState) { return agentState == 'listening' || agentState == 'thinking' || agentState == 'speaking'; @@ -34,50 +35,36 @@ export const SessionView = ({ sessionStarted, ref, }: React.ComponentProps<'div'> & SessionViewProps) => { - const { state: agentState } = useVoiceAssistant(); + const agentSession = useAgentSession(); const [chatOpen, setChatOpen] = useState(false); - const { messages, send } = useChatAndTranscription(); - const room = useRoomContext(); + + const { messages, send } = useAgentMessages(); useDebugMode(); async function handleSendMessage(message: string) { - await send(message); + await send(new OutboundMessage([new TextContent(message)], `${Math.random()}` /* FIXME: fix id generation */)); } - useEffect(() => { - if (sessionStarted) { - const timeout = setTimeout(() => { - if (!isAgentAvailable(agentState)) { - const reason = - agentState === 'connecting' - ? 'Agent did not join the room. ' - : 'Agent connected but did not complete initializing. '; - - toastAlert({ - title: 'Session ended', - description: ( -

- {reason} - - See quickstart guide - - . -

- ), - }); - room.disconnect(); - } - }, 10_000); - - return () => clearTimeout(timeout); - } - }, [agentState, sessionStarted, room]); + useAgentSessionEvent(AgentSessionEvent.AgentConnectionFailure, (reason: string) => { + toastAlert({ + title: 'Session ended', + description: ( +

+ {reason} + + See quickstart guide + + . +

+ ), + }); + }, []); const { supportsChatInput, supportsVideoInput, supportsScreenShare } = appConfig; const capabilities = { @@ -104,7 +91,7 @@ export const SessionView = ({ >
- {messages.map((message: ReceivedChatMessage) => ( + {messages.map((message) => (
- + {/* FIXME: add video back in! */} + {/* */}
Date: Thu, 7 Aug 2025 09:24:21 -0400 Subject: [PATCH 02/51] fix: address imports --- agent-sdk/index.tsx | 12 ++++++------ components/app.tsx | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index 6f3862282..34f1c689f 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -4,10 +4,10 @@ import { parallelMerge } from 'streaming-iterables'; import { ConnectionState, LocalParticipant, Participant, ParticipantEvent, RemoteParticipant, Room, RoomEvent, Track, TrackEvent, TrackPublication, TranscriptionSegment } from "livekit-client"; import { EventEmitter } from "stream"; import { addMediaTimestampToTranscription, dedupeSegments, participantTrackEvents, TrackReference } from '@livekit/components-core'; -import { getParticipantTrackRefs } from '@livekit/components-core/src/observables/track'; -import { ParticipantEventCallbacks, ParticipantKind } from "livekit-client/dist/src/room/participant/Participant"; -import { TRACK_TRANSCRIPTION_DEFAULTS } from "../hooks"; -import { Future } from "livekit-client/dist/src/room/utils"; +import { getParticipantTrackRefs } from '@livekit/components/src/observables/track'; +import { ParticipantEventCallbacks, ParticipantKind } from "../node_modules/livekit-client/src/room/participant/Participant"; +// import { TRACK_TRANSCRIPTION_DEFAULTS } from "../hooks"; +import { Future } from "../node_modules/livekit-client/src/room/utils"; // --------------------- // REACT @@ -178,7 +178,7 @@ class AgentParticipant extends EventEmitter { attributes: Record = {}; transcriptions: Array = []; - transcriptionBufferSize: number = TRACK_TRANSCRIPTION_DEFAULTS.bufferSize; + transcriptionBufferSize: number = 100//TRACK_TRANSCRIPTION_DEFAULTS.bufferSize; constructor(room: Room) { super(); @@ -520,7 +520,7 @@ export enum AgentSessionEvent { } export class AgentSession extends EventEmitter { - private room: Room; + room: Room; // FIXME: should this be private? 
state: AgentState = 'disconnected'; agentParticipant: AgentParticipant | null = null; diff --git a/components/app.tsx b/components/app.tsx index 8e1e8a03f..455356db9 100644 --- a/components/app.tsx +++ b/components/app.tsx @@ -10,7 +10,7 @@ import { Toaster } from '@/components/ui/sonner'; import { Welcome } from '@/components/welcome'; import useConnectionDetails from '@/hooks/useConnectionDetails'; import type { AppConfig } from '@/lib/types'; -import { AgentSessionProvider } from '@/agent-sdk'; +import { AgentSession, AgentSessionProvider } from '@/agent-sdk'; const MotionWelcome = motion.create(Welcome); const MotionSessionView = motion.create(SessionView); From ec879818c64e1c5d3d97a137e82eae8e7857a4ca Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 7 Aug 2025 10:08:57 -0400 Subject: [PATCH 03/51] feat: get agent sdk to compile --- agent-sdk/index.tsx | 165 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 149 insertions(+), 16 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index 34f1c689f..d87f9679d 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -1,13 +1,148 @@ import * as React from "react"; import { useContext, useEffect, useState, useCallback } from "react"; import { parallelMerge } from 'streaming-iterables'; -import { ConnectionState, LocalParticipant, Participant, ParticipantEvent, RemoteParticipant, Room, RoomEvent, Track, TrackEvent, TrackPublication, TranscriptionSegment } from "livekit-client"; -import { EventEmitter } from "stream"; -import { addMediaTimestampToTranscription, dedupeSegments, participantTrackEvents, TrackReference } from '@livekit/components-core'; -import { getParticipantTrackRefs } from '@livekit/components/src/observables/track'; -import { ParticipantEventCallbacks, ParticipantKind } from "../node_modules/livekit-client/src/room/participant/Participant"; +import { + ConnectionState, + LocalParticipant, + Participant, + ParticipantEvent, + RemoteParticipant, + Room, + RoomEvent, + 
Track, + TrackEvent, + TrackPublication, + TranscriptionSegment, + ParticipantKind, +} from "livekit-client"; +import { EventEmitter } from "events"; +// import { addMediaTimestampToTranscription, dedupeSegments, ReceivedTranscriptionSegment } from '@livekit/components-core'; +// import { getParticipantTrackRefs } from '@livekit/components/src/observables/track'; +import { ParticipantEventCallbacks } from "../node_modules/livekit-client/src/room/participant/Participant"; +import { ParticipantTrackIdentifier } from "@livekit/components-core"; // import { TRACK_TRANSCRIPTION_DEFAULTS } from "../hooks"; -import { Future } from "../node_modules/livekit-client/src/room/utils"; +// import { Future } from "../node_modules/livekit-client/src/room/utils"; + +/* FROM LIVEKIT-CLIENT */ +class Future { + promise: Promise; + + resolve?: (arg: T) => void; + + reject?: (e: any) => void; + + onFinally?: () => void; + + get isResolved(): boolean { + return this._isResolved; + } + + private _isResolved: boolean = false; + + constructor( + futureBase?: (resolve: (arg: T) => void, reject: (e: any) => void) => void, + onFinally?: () => void, + ) { + this.onFinally = onFinally; + this.promise = new Promise(async (resolve, reject) => { + this.resolve = resolve; + this.reject = reject; + if (futureBase) { + await futureBase(resolve, reject); + } + }).finally(() => { + this._isResolved = true; + this.onFinally?.(); + }); + } +} +/* END FROM LIVEKIT CLIENT */ + +/* FROM COMPONENTS JS: */ +/** @public */ +type TrackReference = { + participant: Participant; + publication: TrackPublication; + source: Track.Source; +}; + +const participantTrackEvents = [ + ParticipantEvent.TrackPublished, + ParticipantEvent.TrackUnpublished, + ParticipantEvent.TrackMuted, + ParticipantEvent.TrackUnmuted, + ParticipantEvent.TrackStreamStateChanged, + ParticipantEvent.TrackSubscribed, + ParticipantEvent.TrackUnsubscribed, + ParticipantEvent.TrackSubscriptionPermissionChanged, + 
ParticipantEvent.TrackSubscriptionFailed, + ParticipantEvent.LocalTrackPublished, + ParticipantEvent.LocalTrackUnpublished, +]; + +type ReceivedTranscriptionSegment = TranscriptionSegment & { + receivedAtMediaTimestamp: number; + receivedAt: number; +}; + +function addMediaTimestampToTranscription( + segment: TranscriptionSegment, + timestamps: { timestamp: number; rtpTimestamp?: number }, +): ReceivedTranscriptionSegment { + return { + ...segment, + receivedAtMediaTimestamp: timestamps.rtpTimestamp ?? 0, + receivedAt: timestamps.timestamp, + }; +} + +/** + * @returns An array of unique (by id) `TranscriptionSegment`s. Latest wins. If the resulting array would be longer than `windowSize`, the array will be reduced to `windowSize` length + */ +function dedupeSegments( + prevSegments: T[], + newSegments: T[], + windowSize: number, +) { + return [...prevSegments, ...newSegments] + .reduceRight((acc, segment) => { + if (!acc.find((val) => val.id === segment.id)) { + acc.unshift(segment); + } + return acc; + }, [] as Array) + .slice(0 - windowSize); +} + +/** + * Create `TrackReferences` for all tracks that are included in the sources property. + * */ +function getParticipantTrackRefs( + participant: Participant, + identifier: ParticipantTrackIdentifier, + onlySubscribedTracks = false, +): TrackReference[] { + const { sources, kind, name } = identifier; + const sourceReferences = Array.from(participant.trackPublications.values()) + .filter( + (pub) => + (!sources || sources.includes(pub.source)) && + (!kind || pub.kind === kind) && + (!name || pub.trackName === name) && + // either return all or only the ones that are subscribed + (!onlySubscribedTracks || pub.track), + ) + .map((track): TrackReference => { + return { + participant: participant, + publication: track, + source: track.source, + }; + }); + + return sourceReferences; +} +/* END FROM COMPONENTS JS: */ // --------------------- // REACT @@ -87,11 +222,11 @@ function useParticipantEvents

( useEffect(() => { for (const eventName of eventNames) { - participant.on(eventName, memoizedCallback); + participant.on(eventName as keyof ParticipantEventCallbacks, memoizedCallback); } return () => { for (const eventName of eventNames) { - participant.off(eventName, memoizedCallback); + participant.off(eventName as keyof ParticipantEventCallbacks, memoizedCallback); } }; }, [participant, eventNames, memoizedCallback]); @@ -103,7 +238,7 @@ export function useAgentLocalParticipant() { const [localParticipant, setLocalParticipant] = React.useState(agentSession.localParticipant); const [microphoneTrack, setMicrophoneTrack] = React.useState(null); - const participantObserver = useParticipantEvents(agentSession.localParticipant, [ + useParticipantEvents(agentSession.localParticipant, [ ParticipantEvent.TrackMuted, ParticipantEvent.TrackUnmuted, ParticipantEvent.ParticipantPermissionsChanged, @@ -250,8 +385,9 @@ class AgentParticipant extends EventEmitter { this.workerTracks.find((t) => t.source === Track.Source.Microphone) ?? 
null ); if (this.audioTrack !== newAudioTrack) { + this.audioTrack?.publication.off(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); this.audioTrack = newAudioTrack; - this.audioTrack?.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); + this.audioTrack?.publication.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); this.audioTrackSyncTime = { timestamp: Date.now(), @@ -268,11 +404,7 @@ class AgentParticipant extends EventEmitter { this.emit(AgentParticipantEvent.AgentAttributesChanged, attributes); }; - private handleTranscriptionReceived = (event: Array>) => { - const segments = event[0]; - if (!segments) { - return; - } + private handleTranscriptionReceived = (segments: Array) => { if (!this.audioTrackSyncTime) { throw new Error('AgentParticipant - audioTrackSyncTime missing'); } @@ -526,7 +658,7 @@ export class AgentSession extends EventEmitter { agentParticipant: AgentParticipant | null = null; messageReceiver: MessageReceiver | null = null; messages: Array = []; - // private transcriptionMessageReceiver: TranscriptionMessageReceiver; + private transcriptionMessageReceiver: TranscriptionMessageReceiver; // this.transcriptionMessageReceiver = new TranscriptionMessageReceiver(agentParticipant); // this.transcriptionMessageReceiver.messages(), // /* more `MessageReceiver`s here later */ @@ -565,6 +697,7 @@ export class AgentSession extends EventEmitter { } private handleRoomDisconnected = () => { + this.agentParticipant?.off(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); this.agentParticipant?.teardown(); this.agentParticipant = null; From ed4a803b69306f2a98a4f4b32d58304e712cd590 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 7 Aug 2025 11:12:35 -0400 Subject: [PATCH 04/51] feat: mostly get AgentParticipant working --- agent-sdk/index.tsx | 191 ++++++++++++------ components/app.tsx | 36 ++-- .../agent-control-bar/agent-control-bar.tsx | 6 +- 
.../hooks/use-agent-control-bar.ts | 14 +- components/livekit/chat/chat-entry.tsx | 13 +- components/session-view.tsx | 21 +- 6 files changed, 170 insertions(+), 111 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index d87f9679d..f3988a864 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -14,12 +14,14 @@ import { TrackPublication, TranscriptionSegment, ParticipantKind, + TextStreamReader, + // TextStreamInfo, } from "livekit-client"; import { EventEmitter } from "events"; // import { addMediaTimestampToTranscription, dedupeSegments, ReceivedTranscriptionSegment } from '@livekit/components-core'; // import { getParticipantTrackRefs } from '@livekit/components/src/observables/track'; import { ParticipantEventCallbacks } from "../node_modules/livekit-client/src/room/participant/Participant"; -import { ParticipantTrackIdentifier } from "@livekit/components-core"; +// import { DataTopic /* , ParticipantTrackIdentifier */ } from "@livekit/components-core"; // import { TRACK_TRANSCRIPTION_DEFAULTS } from "../hooks"; // import { Future } from "../node_modules/livekit-client/src/room/utils"; @@ -119,7 +121,7 @@ function dedupeSegments( * */ function getParticipantTrackRefs( participant: Participant, - identifier: ParticipantTrackIdentifier, + identifier: any/* ParticipantTrackIdentifier */, onlySubscribedTracks = false, ): TrackReference[] { const { sources, kind, name } = identifier; @@ -142,6 +144,17 @@ function getParticipantTrackRefs( return sourceReferences; } + +interface TextStreamData { + text: string; + participantInfo: { identity: string }; // Replace with the correct type from livekit-client + streamInfo: any /* TextStreamInfo */; +} + +const DataTopic = { + CHAT: 'lk.chat', + TRANSCRIPTION: 'lk.transcription', +} as const; /* END FROM COMPONENTS JS: */ // --------------------- @@ -346,27 +359,37 @@ class AgentParticipant extends EventEmitter { ) ?? 
null ) : null; - // Keep this.agentParticipant / this.workerParticipant up to date - for (const event of participantTrackEvents) { - if (this.agentParticipant !== newAgentParticipant) { - this.agentParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); - // FIXME: emit AgentParticipantChanged? - newAgentParticipant?.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); - this.agentParticipant = newAgentParticipant; - } - if (this.workerParticipant !== newWorkerParticipant) { - this.workerParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); - // FIXME: emit WorkerParticipantChanged? - newWorkerParticipant?.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); - this.workerParticipant = newWorkerParticipant; + const oldAgentParticipant = this.agentParticipant; + const oldWorkerParticipant = this.workerParticipant; + this.agentParticipant = newAgentParticipant; + this.workerParticipant = newWorkerParticipant; + + // 1. Listen for attribute changes + if (oldAgentParticipant !== this.agentParticipant) { + oldAgentParticipant?.off(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); + + if (this.agentParticipant) { + this.agentParticipant.on(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); + this.handleAttributesChanged(this.agentParticipant.attributes); } } - if (this.agentParticipant !== newAgentParticipant) { - this.agentParticipant?.off(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); - // FIXME: emit AgentAttributesChanged? - newAgentParticipant?.on(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); - this.agentParticipant = newAgentParticipant; + // 2. 
Listen for track updates + for (const event of participantTrackEvents) { + if (oldAgentParticipant !== this.agentParticipant) { + oldAgentParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + if (this.agentParticipant) { + this.agentParticipant.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + this.handleUpdateTracks(); + } + } + if (oldWorkerParticipant !== this.workerParticipant) { + oldWorkerParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + if (this.workerParticipant) { + this.workerParticipant.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + this.handleUpdateTracks(); + } + } } } @@ -385,6 +408,7 @@ class AgentParticipant extends EventEmitter { this.workerTracks.find((t) => t.source === Track.Source.Microphone) ?? null ); if (this.audioTrack !== newAudioTrack) { + console.log('!! audio track changed', this.audioTrack?.publication); this.audioTrack?.publication.off(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); this.audioTrack = newAudioTrack; this.audioTrack?.publication.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); @@ -405,6 +429,7 @@ class AgentParticipant extends EventEmitter { }; private handleTranscriptionReceived = (segments: Array) => { + console.log('!! TRANSCRIPTION', segments, this.audioTrackSyncTime); if (!this.audioTrackSyncTime) { throw new Error('AgentParticipant - audioTrackSyncTime missing'); } @@ -420,6 +445,7 @@ class AgentParticipant extends EventEmitter { } private handleTimeSyncUpdate = (update: { timestamp: number; rtpTimestamp: number }) => { + console.log('!! 
TIME SYNC UPDATE', update); this.audioTrackSyncTime = update; }; @@ -527,13 +553,9 @@ export class OutboundMessage extends BaseMessage { -enum MessageReceiverEvents { - NewIncomingMessage = 'newIncomingMessage' -} - class MessageReceiverTerminationError extends Error {} -abstract class MessageReceiver extends EventEmitter { +abstract class MessageReceiver { private signallingFuture = new Future(); private queue: Array = []; @@ -544,7 +566,6 @@ abstract class MessageReceiver extends EventEmitter { protected enqueue(...messages: Array) { for (const message of messages) { this.queue.push(message); - this.emit(MessageReceiverEvents.NewIncomingMessage, message); } const oldSignallingFuture = this.signallingFuture; this.signallingFuture = new Future(); @@ -583,32 +604,64 @@ abstract class MessageSender { abstract send(message: OutboundMessage): Promise; } +const segmentAttribute = 'lk.segment_id'; class TranscriptionMessageReceiver extends MessageReceiver { - agentParticipant: AgentParticipant; + room: Room; - constructor(agentParticipant: AgentParticipant) { + constructor(room: Room) { super(); - this.agentParticipant = agentParticipant; + this.room = room; } async start() { - const handleAgentTranscriptionsChanged = (newTranscriptionSegments: Array) => { - for (const segment of newTranscriptionSegments) { - this.enqueue(new InboundMessage([ - new TranscriptionContent(segment), - ], new Date(segment.startTime))); + console.log('!! START!'); + const textStreamHandler = async (reader: TextStreamReader, participantInfo: { identity: string }) => { + const id = `${Math.random()}`; // FIXME: somehow generate an id? 
+ + const isTranscription = Boolean(reader.info.attributes?.[segmentAttribute]); + let textStreams: Array = []; + + let accumulatedText: string = ''; + for await (const chunk of reader) { + accumulatedText += chunk; + + // Find and update the stream in our array + const index = textStreams.findIndex((stream) => { + if (stream.streamInfo.id === reader.info.id) { + return true; + } + if (isTranscription && + stream.streamInfo.attributes?.[segmentAttribute] === + reader.info.attributes?.[segmentAttribute]) { + return true; + } + return false; + }); + + if (index >= 0) { + textStreams[index] = { + ...textStreams[index], + text: accumulatedText, + }; + } else { + // Handle case where stream ID wasn't found (new stream) + textStreams.push({ + text: accumulatedText, + participantInfo, + streamInfo: reader.info, + }); + } + + console.log('!! TEXT STREAMS:', textStreams); + // this.enqueue(new InboundMessage([ + // new TranscriptionContent(chunk), + // ], id)); } }; + this.room.registerTextStreamHandler(DataTopic.TRANSCRIPTION, textStreamHandler); - this.agentParticipant.on( - AgentParticipantEvent.AgentTranscriptionsChanged, - handleAgentTranscriptionsChanged, - ); return () => { - this.agentParticipant.off( - AgentParticipantEvent.AgentTranscriptionsChanged, - handleAgentTranscriptionsChanged, - ); + this.room.unregisterTextStreamHandler(DataTopic.TRANSCRIPTION); }; } } @@ -657,11 +710,8 @@ export class AgentSession extends EventEmitter { agentParticipant: AgentParticipant | null = null; messageReceiver: MessageReceiver | null = null; + messageReceiverCleanup: (() => void) | undefined = undefined; messages: Array = []; - private transcriptionMessageReceiver: TranscriptionMessageReceiver; - // this.transcriptionMessageReceiver = new TranscriptionMessageReceiver(agentParticipant); - // this.transcriptionMessageReceiver.messages(), - // /* more `MessageReceiver`s here later */ constructor() { super(); @@ -685,23 +735,30 @@ export class AgentSession extends EventEmitter { } 
private handleRoomConnected = () => { + console.log('!! CONNECTED'); this.agentParticipant = new AgentParticipant(this.room); this.agentParticipant.on(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); + this.updateAgentState(); - this.messageReceiver = new CombinedMessageReceiver( - new TranscriptionMessageReceiver(this.agentParticipant), - ); - this.messageReceiver.on(MessageReceiverEvents.NewIncomingMessage, this.handleIncomingMessage); + // this.messageReceiver = new CombinedMessageReceiver( + this.messageReceiver = new TranscriptionMessageReceiver(this.room); + (async () => { + // FIXME: is this sort of pattern a better idea than just making MessageReceiver an EventEmitter? + for await (const message of this.messageReceiver!.messages()) { + this.handleIncomingMessage(message); + } + })(); this.startAgentConnectedTimeout(); } private handleRoomDisconnected = () => { + console.log('!! DISCONNECTED'); this.agentParticipant?.off(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); this.agentParticipant?.teardown(); this.agentParticipant = null; + this.updateAgentState(); - this.messageReceiver?.off(MessageReceiverEvents.NewIncomingMessage, this.handleIncomingMessage); this.messageReceiver?.close(); this.messageReceiver = null; @@ -731,6 +788,7 @@ export class AgentSession extends EventEmitter { } private handleAgentAttributesChanged = () => { + console.log('!! 
ATTRIB CHANGED:', this.agentParticipant?.attributes) this.updateAgentState(); } @@ -741,24 +799,27 @@ export class AgentSession extends EventEmitter { } private updateAgentState = () => { - if (!this.agentParticipant) { - throw new Error('AgentSession.agentParticipant is unset'); - } - const agentParticipantAttributes = this.agentParticipant.attributes; - const connectionState = this.room.state; - let newAgentState: AgentState | null = null; - if (connectionState === ConnectionState.Disconnected) { + if (!this.agentParticipant) { + // throw new Error('AgentSession.agentParticipant is unset'); newAgentState = 'disconnected'; - } else if ( - connectionState === ConnectionState.Connecting || - !this.agentParticipant || - !agentParticipantAttributes?.[stateAttribute] - ) { - newAgentState = 'connecting'; } else { - newAgentState = agentParticipantAttributes[stateAttribute] as AgentState; + const agentParticipantAttributes = this.agentParticipant.attributes; + const connectionState = this.room.state; + + if (connectionState === ConnectionState.Disconnected) { + newAgentState = 'disconnected'; + } else if ( + connectionState === ConnectionState.Connecting || + !this.agentParticipant || + !agentParticipantAttributes?.[stateAttribute] + ) { + newAgentState = 'connecting'; + } else { + newAgentState = agentParticipantAttributes[stateAttribute] as AgentState; + } } + console.log('!! 
STATE:', newAgentState, this.agentParticipant?.attributes); if (this.state !== newAgentState) { this.state = newAgentState; diff --git a/components/app.tsx b/components/app.tsx index 455356db9..7b65e3463 100644 --- a/components/app.tsx +++ b/components/app.tsx @@ -1,7 +1,7 @@ 'use client'; import { useEffect, useMemo, useState } from 'react'; -import { Room, RoomEvent } from 'livekit-client'; +import { RoomEvent } from 'livekit-client'; import { motion } from 'motion/react'; import { RoomAudioRenderer, RoomContext, StartAudio } from '@livekit/components-react'; import { toastAlert } from '@/components/alert-toast'; @@ -86,22 +86,24 @@ export function App({ appConfig }: AppProps) { /> - - - {/* --- */} - + + + + {/* --- */} + + diff --git a/components/livekit/agent-control-bar/agent-control-bar.tsx b/components/livekit/agent-control-bar/agent-control-bar.tsx index 2cd127c00..3f5051c58 100644 --- a/components/livekit/agent-control-bar/agent-control-bar.tsx +++ b/components/livekit/agent-control-bar/agent-control-bar.tsx @@ -13,7 +13,6 @@ import { cn } from '@/lib/utils'; import { DeviceSelect } from '../device-select'; import { TrackToggle } from '../track-toggle'; import { UseAgentControlBarProps, useAgentControlBar } from './hooks/use-agent-control-bar'; -import { useAgentSession, useAgentState } from '@/agent-sdk'; export interface AgentControlBarProps extends React.HTMLAttributes, @@ -39,12 +38,11 @@ export function AgentControlBar({ onDeviceError, ...props }: AgentControlBarProps) { - // const participants = useRemoteParticipants(); + const participants = useRemoteParticipants(); const [chatOpen, setChatOpen] = React.useState(false); const [isSendingMessage, setIsSendingMessage] = React.useState(false); - // const isAgentAvailable = participants.some((p) => p.isAgent); - const { isAvailable: isAgentAvailable } = useAgentState(); + const isAgentAvailable = participants.some((p) => p.isAgent); const isInputDisabled = !chatOpen || !isAgentAvailable || 
isSendingMessage; const [isDisconnecting, setIsDisconnecting] = React.useState(false); diff --git a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts index 7716df56e..2c1495e8f 100644 --- a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts +++ b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts @@ -8,7 +8,6 @@ import { useTrackToggle, } from '@livekit/components-react'; import { usePublishPermissions } from './use-publish-permissions'; -import { useAgentLocalParticipant } from '@/agent-sdk'; export interface ControlBarControls { microphone?: boolean; @@ -41,20 +40,19 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen leave: true, ...controls, }; - // const { microphoneTrack, localParticipant } = useLocalParticipant(); - const { microphoneTrack, localParticipant } = useAgentLocalParticipant(); - const publishPermissions = usePublishPermissions(); // FIXME: replace this hook? + const { microphoneTrack, localParticipant } = useLocalParticipant(); + const publishPermissions = usePublishPermissions(); const room = useRoomContext(); - const microphoneToggle = useTrackToggle({ // FIXME: replace this hook? + const microphoneToggle = useTrackToggle({ source: Track.Source.Microphone, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Microphone, error }), }); - const cameraToggle = useTrackToggle({ // FIXME: replace this hook? + const cameraToggle = useTrackToggle({ source: Track.Source.Camera, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Camera, error }), }); - const screenShareToggle = useTrackToggle({ // FIXME: replace this hook? 
+ const screenShareToggle = useTrackToggle({ source: Track.Source.ScreenShare, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.ScreenShare, error }), }); @@ -77,7 +75,7 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen saveAudioInputDeviceId, saveVideoInputEnabled, saveVideoInputDeviceId, - } = usePersistentUserChoices({ // FIXME: replace this hook? + } = usePersistentUserChoices({ preventSave: !saveUserChoices, }); diff --git a/components/livekit/chat/chat-entry.tsx b/components/livekit/chat/chat-entry.tsx index 3939fdd81..1ad1ab849 100644 --- a/components/livekit/chat/chat-entry.tsx +++ b/components/livekit/chat/chat-entry.tsx @@ -2,11 +2,10 @@ import * as React from 'react'; import type { MessageFormatter, ReceivedChatMessage } from '@livekit/components-react'; import { cn } from '@/lib/utils'; import { useChatMessage } from './hooks/utils'; -import { InboundMessage, OutboundMessage } from '@/agent-sdk'; export interface ChatEntryProps extends React.HTMLAttributes { /** The chat massage object to display. */ - entry: InboundMessage | OutboundMessage; + entry: ReceivedChatMessage; /** Hide sender name. Useful when displaying multiple consecutive chat messages from the same person. */ hideName?: boolean; /** Hide message timestamp. */ @@ -23,15 +22,9 @@ export const ChatEntry = ({ className, ...props }: ChatEntryProps) => { - // FIXME: Where would this kind of metadata come from for real? - // const { message, hasBeenEdited, time, locale, name } = useChatMessage(entry, messageFormatter); - const message = entry.contents.map(c => c.data).join(''); - const hasBeenEdited = false; - const time = entry.timestamp; - const locale = typeof navigator !== 'undefined' ? navigator.language : 'en-US'; - const name = entry instanceof OutboundMessage ? 
'User' : 'Agent'; + const { message, hasBeenEdited, time, locale, name } = useChatMessage(entry, messageFormatter); - const isUser = entry instanceof OutboundMessage;//entry.from?.isLocal ?? false; + const isUser = entry.from?.isLocal ?? false; const messageOrigin = isUser ? 'remote' : 'local'; return ( diff --git a/components/session-view.tsx b/components/session-view.tsx index a553ef8ea..886d353e8 100644 --- a/components/session-view.tsx +++ b/components/session-view.tsx @@ -17,7 +17,7 @@ import useChatAndTranscription from '@/hooks/useChatAndTranscription'; import { useDebugMode } from '@/hooks/useDebug'; import type { AppConfig } from '@/lib/types'; import { cn } from '@/lib/utils'; -import { AgentSessionEvent, OutboundMessage, TextContent, useAgentMessages, useAgentSession, useAgentSessionEvent } from '@/agent-sdk'; +import { AgentSessionEvent, OutboundMessage, TextContent, useAgentMessages, useAgentSession, useAgentSessionEvent, useAgentState } from '@/agent-sdk'; function isAgentAvailable(agentState: AgentState) { return agentState == 'listening' || agentState == 'thinking' || agentState == 'speaking'; @@ -36,14 +36,22 @@ export const SessionView = ({ ref, }: React.ComponentProps<'div'> & SessionViewProps) => { const agentSession = useAgentSession(); - const [chatOpen, setChatOpen] = useState(false); - + const { state: agentState } = useAgentState(); const { messages, send } = useAgentMessages(); + // const { state: agentState } = useVoiceAssistant(); + const [chatOpen, setChatOpen] = useState(false); + // const { messages, send } = useChatAndTranscription(); + const room = useRoomContext(); + useDebugMode(); async function handleSendMessage(message: string) { - await send(new OutboundMessage([new TextContent(message)], `${Math.random()}` /* FIXME: fix id generation */)); + await send(new OutboundMessage( + [new TextContent(message)], + `${Math.random()}` /* FIXME: fix id generation */ + )); + // await send(message); } 
useAgentSessionEvent(AgentSessionEvent.AgentConnectionFailure, (reason: string) => { @@ -91,7 +99,7 @@ export const SessionView = ({ >

- {messages.map((message) => ( + {messages.map((message: ReceivedChatMessage) => (
- {/* FIXME: add video back in! */} - {/* */} +
Date: Thu, 7 Aug 2025 11:16:12 -0400 Subject: [PATCH 05/51] feat: comment out old transcription stuff --- agent-sdk/index.tsx | 56 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index f3988a864..b1e00dfce 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -408,16 +408,16 @@ class AgentParticipant extends EventEmitter { this.workerTracks.find((t) => t.source === Track.Source.Microphone) ?? null ); if (this.audioTrack !== newAudioTrack) { - console.log('!! audio track changed', this.audioTrack?.publication); - this.audioTrack?.publication.off(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); + // console.log('!! audio track changed', this.audioTrack?.publication); + // this.audioTrack?.publication.off(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); this.audioTrack = newAudioTrack; - this.audioTrack?.publication.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); + // this.audioTrack?.publication.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); - this.audioTrackSyncTime = { - timestamp: Date.now(), - rtpTimestamp: this.audioTrack?.publication.track?.rtpTimestamp, - }; - this.audioTrack?.publication.track?.on(TrackEvent.TimeSyncUpdate, this.handleTimeSyncUpdate); + // this.audioTrackSyncTime = { + // timestamp: Date.now(), + // rtpTimestamp: this.audioTrack?.publication.track?.rtpTimestamp, + // }; + // this.audioTrack?.publication.track?.on(TrackEvent.TimeSyncUpdate, this.handleTimeSyncUpdate); this.emit(AgentParticipantEvent.AudioTrackChanged, newAudioTrack); } @@ -428,26 +428,26 @@ class AgentParticipant extends EventEmitter { this.emit(AgentParticipantEvent.AgentAttributesChanged, attributes); }; - private handleTranscriptionReceived = (segments: Array) => { - console.log('!! 
TRANSCRIPTION', segments, this.audioTrackSyncTime); - if (!this.audioTrackSyncTime) { - throw new Error('AgentParticipant - audioTrackSyncTime missing'); - } - const audioTrackSyncTime = this.audioTrackSyncTime; - - this.transcriptions = dedupeSegments( - this.transcriptions, - // when first receiving a segment, add the current media timestamp to it - segments.map((s) => addMediaTimestampToTranscription(s, audioTrackSyncTime)), - this.transcriptionBufferSize, - ); - this.emit(AgentParticipantEvent.AgentTranscriptionsChanged, this.transcriptions); - } - - private handleTimeSyncUpdate = (update: { timestamp: number; rtpTimestamp: number }) => { - console.log('!! TIME SYNC UPDATE', update); - this.audioTrackSyncTime = update; - }; + // private handleTranscriptionReceived = (segments: Array) => { + // console.log('!! TRANSCRIPTION', segments, this.audioTrackSyncTime); + // if (!this.audioTrackSyncTime) { + // throw new Error('AgentParticipant - audioTrackSyncTime missing'); + // } + // const audioTrackSyncTime = this.audioTrackSyncTime; + + // this.transcriptions = dedupeSegments( + // this.transcriptions, + // // when first receiving a segment, add the current media timestamp to it + // segments.map((s) => addMediaTimestampToTranscription(s, audioTrackSyncTime)), + // this.transcriptionBufferSize, + // ); + // this.emit(AgentParticipantEvent.AgentTranscriptionsChanged, this.transcriptions); + // } + + // private handleTimeSyncUpdate = (update: { timestamp: number; rtpTimestamp: number }) => { + // console.log('!! 
TIME SYNC UPDATE', update); + // this.audioTrackSyncTime = update; + // }; private get roomRemoteParticipants() { return Array.from(this.room.remoteParticipants.values()); From 809e32250c5cde94250cdac1d3cac7099e7ce26d Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 7 Aug 2025 12:06:24 -0400 Subject: [PATCH 06/51] feat: mostly get the transcriptions incoming pipeline working --- agent-sdk/index.tsx | 225 ++++++++++++------------- components/livekit/chat/chat-entry.tsx | 13 +- components/session-view.tsx | 12 +- 3 files changed, 126 insertions(+), 124 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index b1e00dfce..ee6b196b9 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -58,6 +58,21 @@ class Future { }); } } + +interface BaseStreamInfo { + id: string; + mimeType: string; + topic: string; + timestamp: number; + /** total size in bytes for finite streams and undefined for streams of unknown size */ + size?: number; + attributes?: Record; +} +interface ByteStreamInfo extends BaseStreamInfo { + name: string; +} + +interface TextStreamInfo extends BaseStreamInfo {} /* END FROM LIVEKIT CLIENT */ /* FROM COMPONENTS JS: */ @@ -180,13 +195,16 @@ export function useAgentMessages() { const agentSession = useAgentSession(); const [messages, setMessages] = useState< - Array + Array >(agentSession.messages); useEffect(() => { agentSession.on(AgentSessionEvent.MessagesChanged, setMessages); + return () => { + agentSession.off(AgentSessionEvent.MessagesChanged, setMessages); + }; }, [agentSession]); - const send = useCallback(async (message: OutboundMessage) => { + const send = useCallback(async (message: SentMessage) => { return agentSession.sendMessage(message); }, [agentSession]); @@ -474,82 +492,32 @@ class AgentParticipant extends EventEmitter { } } -abstract class BaseContent { - complete: boolean = false; -} - -export class TextContent extends BaseContent { - // TODO: some sort of id / `key`able field? 
- data: string; - - constructor(data: string) { - super(); - this.data = data; - this.complete = true; - } -} - -class TranscriptionContent extends BaseContent { - // TODO: some sort of id / `key`able field? How does this get generated / where does this come - // from? - data: string; - segmentId: TranscriptionSegment['id']; - - constructor(segment: TranscriptionSegment) { - super(); - this.segmentId = segment.id; - this.data = segment.text; - } -} - - -abstract class BaseMessage { - id: string; +type BaseMessageId = string; +type BaseMessage = { + id: BaseMessageId; + direction: Direction; timestamp: Date; - metadata: Record = {}; - - constructor(id: string, timestamp: Date) { - this.id = id; - this.timestamp = timestamp; - } -} - -// TODO: images? attachments? rpc? -type InboundMessageContent = TranscriptionContent; - -export class InboundMessage extends BaseMessage { - contents: Array = []; - - constructor( - contents: Array, - id: string, - timestamp: Date = new Date(), - ) { - super(id, timestamp); - this.contents = contents; - } + content: Content; +}; - get complete() { - return this.contents.every(c => c.complete); - } -} +type TranscriptionReceivedMessage = BaseMessage<'inbound', { + type: 'transcription'; + text: string; + participantInfo: { identity: string }; + streamInfo: TextStreamInfo; +}>; -type OutboundMessageContent = TextContent; -export class OutboundMessage extends BaseMessage { - contents: Array = []; +export type ReceivedMessage = + | TranscriptionReceivedMessage; + // TODO: images? attachments? rpc? 
- constructor( - contents: Array, - id: string, - timestamp: Date = new Date() - ) { - super(id, timestamp); - this.contents = contents; - } -} +export type SentMessage = BaseMessage< + 'outbound', + | { type: 'text', text: string } +>; @@ -557,13 +525,13 @@ class MessageReceiverTerminationError extends Error {} abstract class MessageReceiver { private signallingFuture = new Future(); - private queue: Array = []; + private queue: Array = []; // This returns cleanup function like useEffect maybe? That could be a good pattern? abstract start(): Promise void)>; /** Submit new IncomingMessages to be received by anybody reading from messages() */ - protected enqueue(...messages: Array) { + protected enqueue(...messages: Array) { for (const message of messages) { this.queue.push(message); } @@ -581,7 +549,7 @@ abstract class MessageReceiver { } /** A stream of newly generated `IncomingMessage`s */ - async *messages(): AsyncGenerator { + async *messages(): AsyncGenerator { const cleanup = await this.start(); try { while (true) { @@ -601,12 +569,17 @@ abstract class MessageReceiver { } abstract class MessageSender { - abstract send(message: OutboundMessage): Promise; + abstract send(message: SentMessage): Promise; } -const segmentAttribute = 'lk.segment_id'; +enum TranscriptionAttributes { + Final = "lk.transcription_final", + Segment = "lk.segment_id", + TrackId = "lk.transcribed_track_id", +}; class TranscriptionMessageReceiver extends MessageReceiver { room: Room; + inFlightMessages: Array = []; constructor(room: Room) { super(); @@ -614,48 +587,54 @@ class TranscriptionMessageReceiver extends MessageReceiver { } async start() { - console.log('!! START!'); const textStreamHandler = async (reader: TextStreamReader, participantInfo: { identity: string }) => { - const id = `${Math.random()}`; // FIXME: somehow generate an id? 
- - const isTranscription = Boolean(reader.info.attributes?.[segmentAttribute]); - let textStreams: Array = []; + const transcriptionSegmentId = reader.info.attributes?.[TranscriptionAttributes.Segment]; + const isTranscription = Boolean(transcriptionSegmentId); + const isFinal = reader.info.attributes?.[TranscriptionAttributes.Final]; + + // Find and update the stream in our array + const messageIndex = this.inFlightMessages.findIndex((message) => { + if (message.content.streamInfo.id === reader.info.id) { + return true; + } + if (isTranscription && transcriptionSegmentId === reader.info.attributes?.[TranscriptionAttributes.Segment]) { + return true; + } + return false; + }); - let accumulatedText: string = ''; + let accumulatedText: string = this.inFlightMessages[messageIndex]?.content.text ?? ''; for await (const chunk of reader) { accumulatedText += chunk; - // Find and update the stream in our array - const index = textStreams.findIndex((stream) => { - if (stream.streamInfo.id === reader.info.id) { - return true; - } - if (isTranscription && - stream.streamInfo.attributes?.[segmentAttribute] === - reader.info.attributes?.[segmentAttribute]) { - return true; - } - return false; - }); - - if (index >= 0) { - textStreams[index] = { - ...textStreams[index], - text: accumulatedText, + if (messageIndex >= 0) { + this.inFlightMessages[messageIndex] = { + ...this.inFlightMessages[messageIndex], + content: { + ...this.inFlightMessages[messageIndex].content, + text: accumulatedText, + }, }; + this.enqueue(this.inFlightMessages[messageIndex]); } else { - // Handle case where stream ID wasn't found (new stream) - textStreams.push({ - text: accumulatedText, - participantInfo, - streamInfo: reader.info, - }); + // Handle case where stream ID wasn't found (new message) + const message: ReceivedMessage = { + id: reader.info.id, + timestamp: new Date(reader.info.timestamp), + content: { + type: 'transcription', + text: accumulatedText, + participantInfo, + streamInfo: 
reader.info, + }, + }; + this.inFlightMessages.push(message); + this.enqueue(message); } + } - console.log('!! TEXT STREAMS:', textStreams); - // this.enqueue(new InboundMessage([ - // new TranscriptionContent(chunk), - // ], id)); + if (isFinal) { + this.inFlightMessages.splice(messageIndex, 1); } }; this.room.registerTextStreamHandler(DataTopic.TRANSCRIPTION, textStreamHandler); @@ -694,7 +673,6 @@ class CombinedMessageReceiver extends MessageReceiver { } } - export enum AgentSessionEvent { AgentStateChanged = 'agentStateChanged', AudioTrackChanged = 'audioTrackChanged', @@ -710,8 +688,10 @@ export class AgentSession extends EventEmitter { agentParticipant: AgentParticipant | null = null; messageReceiver: MessageReceiver | null = null; - messageReceiverCleanup: (() => void) | undefined = undefined; - messages: Array = []; + + // FIXME: maybe make an OrderedMessageList with these two fields in it? + messageById: Map = new Map(); + messageIds: Array = []; constructor() { super(); @@ -792,9 +772,13 @@ export class AgentSession extends EventEmitter { this.updateAgentState(); } - private handleIncomingMessage = (incomingMessage: InboundMessage) => { - // FIXME: Do message accumulation here? Or maybe add some other entity to handle it? - this.messages.push(incomingMessage); + private handleIncomingMessage = (incomingMessage: ReceivedMessage) => { + // Upsert the message into the list + this.messageById.set(incomingMessage.id, incomingMessage); + if (!this.messageIds.includes(incomingMessage.id)) { + this.messageIds.push(incomingMessage.id); + } + this.emit(AgentSessionEvent.MessagesChanged, this.messages); } @@ -835,10 +819,19 @@ export class AgentSession extends EventEmitter { return this.room?.localParticipant ?? null; } + get messages() { + return ( + this.messageIds + .map(id => this.messageById.get(id)) + // FIXME: can I get rid of the filter somehow? 
+ .filter((message): message is SentMessage | ReceivedMessage => typeof message !== 'undefined') + ); + } + // Mesasges: // - transcriptions are probably how agent generated messages come into being? // - lk.chat data channel messages also exist - async sendMessage(message: OutboundMessage) { + async sendMessage(message: SentMessage) { /* TODO */ } diff --git a/components/livekit/chat/chat-entry.tsx b/components/livekit/chat/chat-entry.tsx index 1ad1ab849..30b83b484 100644 --- a/components/livekit/chat/chat-entry.tsx +++ b/components/livekit/chat/chat-entry.tsx @@ -2,10 +2,11 @@ import * as React from 'react'; import type { MessageFormatter, ReceivedChatMessage } from '@livekit/components-react'; import { cn } from '@/lib/utils'; import { useChatMessage } from './hooks/utils'; +import { ReceivedMessage, SentMessage } from '@/agent-sdk'; export interface ChatEntryProps extends React.HTMLAttributes { /** The chat massage object to display. */ - entry: ReceivedChatMessage; + entry: ReceivedMessage | SentMessage; /** Hide sender name. Useful when displaying multiple consecutive chat messages from the same person. */ hideName?: boolean; /** Hide message timestamp. */ @@ -22,9 +23,15 @@ export const ChatEntry = ({ className, ...props }: ChatEntryProps) => { - const { message, hasBeenEdited, time, locale, name } = useChatMessage(entry, messageFormatter); + // FIXME: Where would this kind of metadata come from for real? + // const { message, hasBeenEdited, time, locale, name } = useChatMessage(entry, messageFormatter); + const message = entry.content.text; + const hasBeenEdited = false; + const time = entry.timestamp; + const locale = typeof navigator !== 'undefined' ? navigator.language : 'en-US'; + const name = entry.direction === 'outbound' ? 'User' : 'Agent'; - const isUser = entry.from?.isLocal ?? false; + const isUser = entry.direction === 'outbound';//entry.from?.isLocal ?? false; const messageOrigin = isUser ? 
'remote' : 'local'; return ( diff --git a/components/session-view.tsx b/components/session-view.tsx index 886d353e8..9ec8b2697 100644 --- a/components/session-view.tsx +++ b/components/session-view.tsx @@ -17,7 +17,7 @@ import useChatAndTranscription from '@/hooks/useChatAndTranscription'; import { useDebugMode } from '@/hooks/useDebug'; import type { AppConfig } from '@/lib/types'; import { cn } from '@/lib/utils'; -import { AgentSessionEvent, OutboundMessage, TextContent, useAgentMessages, useAgentSession, useAgentSessionEvent, useAgentState } from '@/agent-sdk'; +import { AgentSessionEvent, useAgentMessages, useAgentSession, useAgentSessionEvent, useAgentState } from '@/agent-sdk'; function isAgentAvailable(agentState: AgentState) { return agentState == 'listening' || agentState == 'thinking' || agentState == 'speaking'; @@ -47,10 +47,12 @@ export const SessionView = ({ useDebugMode(); async function handleSendMessage(message: string) { - await send(new OutboundMessage( - [new TextContent(message)], - `${Math.random()}` /* FIXME: fix id generation */ - )); + // FIXME: add some sort of builder for SentMessage here so it's not just a raw object? 
+ await send({ + id: `${Math.random()}`, /* FIXME: fix id generation */ + timestamp: new Date(), + content: { type: 'text', text: message }, + }); // await send(message); } From 7760aec4e00d0b2a68093e1315a27c8cedc77749 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 7 Aug 2025 12:18:46 -0400 Subject: [PATCH 07/51] feat: get CombinedMessageReceiver working --- agent-sdk/index.tsx | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index ee6b196b9..c583e5d3f 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -548,6 +548,10 @@ abstract class MessageReceiver { ); } + closeWithError(error: Error) { + this.signallingFuture.reject?.(error); + } + /** A stream of newly generated `IncomingMessage`s */ async *messages(): AsyncGenerator { const cleanup = await this.start(); @@ -576,7 +580,7 @@ enum TranscriptionAttributes { Final = "lk.transcription_final", Segment = "lk.segment_id", TrackId = "lk.transcribed_track_id", -}; +} class TranscriptionMessageReceiver extends MessageReceiver { room: Room; inFlightMessages: Array = []; @@ -604,6 +608,8 @@ class TranscriptionMessageReceiver extends MessageReceiver { }); let accumulatedText: string = this.inFlightMessages[messageIndex]?.content.text ?? 
''; + // FIXME: I think there may need to be some error handling logic to ensure the below for await + // properly exposes errors via `this.closeWithError` for await (const chunk of reader) { accumulatedText += chunk; @@ -620,6 +626,7 @@ class TranscriptionMessageReceiver extends MessageReceiver { // Handle case where stream ID wasn't found (new message) const message: ReceivedMessage = { id: reader.info.id, + direction: 'inbound', timestamp: new Date(reader.info.timestamp), content: { type: 'transcription', @@ -650,7 +657,7 @@ class TranscriptionMessageReceiver extends MessageReceiver { /** * A `MessageReceiver` which takes a list of other `MessageReceiver`s and forwards along their `InboundMessage`s - * Conceptually, think `Promise.race` being run across each async iterator iteration. + * Conceptually, think `Promise.race` being run across all passed `MessageReceiver`s on each async iterator iteration. */ class CombinedMessageReceiver extends MessageReceiver { private messageReceivers: Array; @@ -661,9 +668,14 @@ class CombinedMessageReceiver extends MessageReceiver { } async start() { - for await (const inboundMessage of parallelMerge(...this.messageReceivers.map(mr => mr.messages()))) { - this.enqueue(inboundMessage); - } + const messagesAsyncIterators = this.messageReceivers.map(mr => mr.messages()); + (async () => { + for await (const inboundMessage of parallelMerge(...messagesAsyncIterators)) { + this.enqueue(inboundMessage); + } + })().catch(err => { + this.closeWithError(err); + }); return () => { for (const messageReceiver of this.messageReceivers) { @@ -720,8 +732,10 @@ export class AgentSession extends EventEmitter { this.agentParticipant.on(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); this.updateAgentState(); - // this.messageReceiver = new CombinedMessageReceiver( - this.messageReceiver = new TranscriptionMessageReceiver(this.room); + this.messageReceiver = new CombinedMessageReceiver( + new 
TranscriptionMessageReceiver(this.room), + // TODO: images? attachments? rpc? + ); (async () => { // FIXME: is this sort of pattern a better idea than just making MessageReceiver an EventEmitter? for await (const message of this.messageReceiver!.messages()) { From 5e35853f0b895019f652dcf54b04b084c6053b02 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 7 Aug 2025 13:17:28 -0400 Subject: [PATCH 08/51] feat: got transcriptions aggregating properly, it's way way more nuanced than I thought... --- agent-sdk/index.tsx | 87 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 15 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index c583e5d3f..e663fc711 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -581,6 +581,36 @@ enum TranscriptionAttributes { Segment = "lk.segment_id", TrackId = "lk.transcribed_track_id", } + +/** + * Processes new `lk.transcription` data stream events generated by the agent for both user and + * LLM generated speach and generates corresponding `TranscriptionReceivedMessage`s. + * + * For agent messages, a new text stream is emitted for each message, and the stream is closed when the message is finalized. + * Each agent message is delivered in chunks which must be accumulated and published into the message stream. + * + * For user messages, the full transcription is sent each time, but may be updated until finalized. + * + * The `lk.segment_id` attribute is stable and unique across the lifetime of the message. + * + * Example agent generated transcriptions: + * ``` + * { segment_id: "1", content: "Hello" } + * { segment_id: "1", content: " world" } + * { segment_id: "1", content: "!" } + * { segment_id: "2", content: "Hello" } + * { segment_id: "2", content: " Apple" } + * { segment_id: "2", content: "!" } + * ``` + * + * Example user generated transcriptions: + * ``` + * { segment_id: "3", content: "Hello" } + * { segment_id: "3", content: "Hello world!" 
} + * { segment_id: "4", content: "Hello" } + * { segment_id: "4", content: "Hello Apple!" } + * ``` + */ class TranscriptionMessageReceiver extends MessageReceiver { room: Room; inFlightMessages: Array = []; @@ -594,34 +624,36 @@ class TranscriptionMessageReceiver extends MessageReceiver { const textStreamHandler = async (reader: TextStreamReader, participantInfo: { identity: string }) => { const transcriptionSegmentId = reader.info.attributes?.[TranscriptionAttributes.Segment]; const isTranscription = Boolean(transcriptionSegmentId); - const isFinal = reader.info.attributes?.[TranscriptionAttributes.Final]; + const isFinal = reader.info.attributes?.[TranscriptionAttributes.Final] === 'true'; + + let currentStreamId = reader.info.id; // Find and update the stream in our array - const messageIndex = this.inFlightMessages.findIndex((message) => { + let messageIndex = this.inFlightMessages.findIndex((message) => { if (message.content.streamInfo.id === reader.info.id) { return true; } - if (isTranscription && transcriptionSegmentId === reader.info.attributes?.[TranscriptionAttributes.Segment]) { + if (isTranscription && transcriptionSegmentId === message.content.streamInfo.attributes?.[TranscriptionAttributes.Segment]) { return true; } return false; }); - let accumulatedText: string = this.inFlightMessages[messageIndex]?.content.text ?? 
''; // FIXME: I think there may need to be some error handling logic to ensure the below for await // properly exposes errors via `this.closeWithError` for await (const chunk of reader) { - accumulatedText += chunk; + const existingMessage = this.inFlightMessages[messageIndex]; + if (existingMessage) { + if (existingMessage.content.streamInfo.id === currentStreamId) { + // Stream hasn't changed, just append content + const updatedMessage = this.appendInFlightMessageText(messageIndex, chunk, reader.info); + this.enqueue(updatedMessage); + } else { + // Stream has changed, so fully replace content + const updatedMessage = this.replaceInFlightMessageText(messageIndex, chunk, reader.info); + this.enqueue(updatedMessage); + } - if (messageIndex >= 0) { - this.inFlightMessages[messageIndex] = { - ...this.inFlightMessages[messageIndex], - content: { - ...this.inFlightMessages[messageIndex].content, - text: accumulatedText, - }, - }; - this.enqueue(this.inFlightMessages[messageIndex]); } else { // Handle case where stream ID wasn't found (new message) const message: ReceivedMessage = { @@ -630,18 +662,20 @@ class TranscriptionMessageReceiver extends MessageReceiver { timestamp: new Date(reader.info.timestamp), content: { type: 'transcription', - text: accumulatedText, + text: chunk, participantInfo, streamInfo: reader.info, }, }; this.inFlightMessages.push(message); + messageIndex = this.inFlightMessages.length-1; this.enqueue(message); } } if (isFinal) { this.inFlightMessages.splice(messageIndex, 1); + console.log('!! 
MESSAGE DONE!', this.inFlightMessages); } }; this.room.registerTextStreamHandler(DataTopic.TRANSCRIPTION, textStreamHandler); @@ -650,6 +684,29 @@ class TranscriptionMessageReceiver extends MessageReceiver { this.room.unregisterTextStreamHandler(DataTopic.TRANSCRIPTION); }; } + + private replaceInFlightMessageText(messageIndex: number, text: string, streamInfo: TextStreamInfo) { + this.inFlightMessages[messageIndex] = { + ...this.inFlightMessages[messageIndex], + content: { + ...this.inFlightMessages[messageIndex].content, + text, + streamInfo, + }, + }; + return this.inFlightMessages[messageIndex]; + } + private appendInFlightMessageText(messageIndex: number, text: string, streamInfo: TextStreamInfo) { + this.inFlightMessages[messageIndex] = { + ...this.inFlightMessages[messageIndex], + content: { + ...this.inFlightMessages[messageIndex].content, + text: this.inFlightMessages[messageIndex].content.text + text, + streamInfo, + }, + }; + return this.inFlightMessages[messageIndex]; + } } From 3978348cd3cb7d4dee29b32219b7fcaf41c52c10 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 7 Aug 2025 13:47:21 -0400 Subject: [PATCH 09/51] feat: add MessageSender and ChatMessageSender / CombinedMessageSender like the receive end --- agent-sdk/index.tsx | 88 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 19 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index e663fc711..d6b29b973 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -503,7 +503,7 @@ type BaseMessage = { content: Content; }; -type TranscriptionReceivedMessage = BaseMessage<'inbound', { +type ReceivedTranscriptionMessage = BaseMessage<'inbound', { type: 'transcription'; text: string; participantInfo: { identity: string }; @@ -511,27 +511,26 @@ type TranscriptionReceivedMessage = BaseMessage<'inbound', { }>; export type ReceivedMessage = - | TranscriptionReceivedMessage; + | ReceivedTranscriptionMessage; // TODO: images? attachments? rpc? 
-export type SentMessage = BaseMessage< - 'outbound', - | { type: 'text', text: string } ->; +type SentChatMessage = BaseMessage<'outbound', | { type: 'chat', text: string }>; +export type SentMessage = + | SentChatMessage; class MessageReceiverTerminationError extends Error {} -abstract class MessageReceiver { +abstract class MessageReceiver { private signallingFuture = new Future(); - private queue: Array = []; + private queue: Array = []; // This returns cleanup function like useEffect maybe? That could be a good pattern? abstract start(): Promise void)>; /** Submit new IncomingMessages to be received by anybody reading from messages() */ - protected enqueue(...messages: Array) { + protected enqueue(...messages: Array) { for (const message of messages) { this.queue.push(message); } @@ -553,7 +552,7 @@ abstract class MessageReceiver { } /** A stream of newly generated `IncomingMessage`s */ - async *messages(): AsyncGenerator { + async *messages(): AsyncGenerator { const cleanup = await this.start(); try { while (true) { @@ -572,8 +571,53 @@ abstract class MessageReceiver { } } -abstract class MessageSender { - abstract send(message: SentMessage): Promise; +abstract class MessageSender { + /** Can this MessageSender handle sending the given message? */ + abstract canSend(message: SentMessage): message is Message + abstract send(message: Message): Promise; +} + +class ChatMessageSender extends MessageSender { + private localParticipant: LocalParticipant; + + constructor(localParticipant: LocalParticipant) { + super(); + this.localParticipant = localParticipant; + } + + canSend(message: SentMessage): message is SentChatMessage { + return message.content.type === 'chat'; + } + + async send(message: SentChatMessage) { + await this.localParticipant.sendText(message.content.text, /* FIXME: options here? 
*/); + } +} + +class CombinedMessageSender extends MessageSender { + private messageSenders: Array; + + constructor(...messageSenders: Array) { + super(); + this.messageSenders = messageSenders; + } + + canSend(message: SentMessage): message is SentMessage { + return true; + } + + async send(message: SentMessage) { + for (const sender of this.messageSenders) { + // FIXME: an open question - should this only ever send with one MessageSender or potentially + // multiple? It doesn't matter with only ChatMessageSender but I'm not sure the right long term call. + if (sender.canSend(message)) { + await sender.send(message); + return; + } + } + + throw new Error(`CombinedMessageSender - cannot find a MessageSender to send message ${message}`); + } } enum TranscriptionAttributes { @@ -613,7 +657,7 @@ enum TranscriptionAttributes { */ class TranscriptionMessageReceiver extends MessageReceiver { room: Room; - inFlightMessages: Array = []; + inFlightMessages: Array = []; constructor(room: Room) { super(); @@ -756,6 +800,7 @@ export class AgentSession extends EventEmitter { state: AgentState = 'disconnected'; agentParticipant: AgentParticipant | null = null; + messageSender: MessageSender | null = null; messageReceiver: MessageReceiver | null = null; // FIXME: maybe make an OrderedMessageList with these two fields in it? @@ -789,6 +834,11 @@ export class AgentSession extends EventEmitter { this.agentParticipant.on(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); this.updateAgentState(); + this.messageSender = new CombinedMessageSender( + new ChatMessageSender(this.localParticipant), + // TODO: other types of messages that can be sent + ); + this.messageReceiver = new CombinedMessageReceiver( new TranscriptionMessageReceiver(this.room), // TODO: images? attachments? rpc? @@ -899,14 +949,14 @@ export class AgentSession extends EventEmitter { ); } - // Mesasges: - // - transcriptions are probably how agent generated messages come into being? 
- // - lk.chat data channel messages also exist + // FIXME: maybe there should be a special case where if message is `string` it is converted into + // a `SentChatMessage`? async sendMessage(message: SentMessage) { - /* TODO */ + if (!this.messageSender) { + throw new Error('AgentSession.sendMessage - cannot send message until room is connected and MessageSender initialized!'); + } + await this.messageSender.send(message); } - - generateReply() {} } From b2355fcb43539b009c21c72fc5ce562b9e5c541b Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 7 Aug 2025 14:22:23 -0400 Subject: [PATCH 10/51] feat: add loopback message receiver to ChatMessageSender --- agent-sdk/index.tsx | 60 ++++++++++++++++++++++++++++++++++--- components/session-view.tsx | 5 ++-- 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index d6b29b973..58c0a04ae 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -510,8 +510,11 @@ type ReceivedTranscriptionMessage = BaseMessage<'inbound', { streamInfo: TextStreamInfo; }>; +type ReceivedChatLoopbackMessage = BaseMessage<'inbound', { type: 'chat'; text: string }>; + export type ReceivedMessage = - | ReceivedTranscriptionMessage; + | ReceivedTranscriptionMessage + | ReceivedChatLoopbackMessage; // TODO: images? attachments? rpc? type SentChatMessage = BaseMessage<'outbound', | { type: 'chat', text: string }>; @@ -579,6 +582,7 @@ abstract class MessageSender { class ChatMessageSender extends MessageSender { private localParticipant: LocalParticipant; + private loopbackReceiverCallbacks: Set<(incomingMessage: SentChatMessage) => void> = new Set(); constructor(localParticipant: LocalParticipant) { super(); @@ -590,7 +594,52 @@ class ChatMessageSender extends MessageSender { } async send(message: SentChatMessage) { + for (const callback of this.loopbackReceiverCallbacks) { + callback(message); + } + await this.localParticipant.sendText(message.content.text, /* FIXME: options here? 
*/); + + // const legacyChatMsg: LegacyChatMessage = { + // id: message.id, + // timestamp: message.timestamp.getTime(), + // message: message.content.text, + // }; + // const encodeLegacyMsg = (message: LegacyChatMessage) => new TextEncoder().encode(JSON.stringify(message)); + // await this.localParticipant.publishData(encodeLegacyMsg(legacyChatMsg), { + // topic: "lk-chat-topic",//LegacyDataTopic.CHAT, + // reliable: true, + // }); + } + + /** + * Generates a corresponding MessageReceiver which will emit "received" versions of each chat + * message, that can be correspondingly merged into the message list. + * + * FIXME: should this be on the MessageSender instead, so this can be done for any sender? + */ + generateLoopbackMessageReceiver() { + const chatMessageSender = this; + class ChatMessageLoopbackReceiver extends MessageReceiver { + async start() { + const callback = (incomingMessage: SentChatMessage) => { + const outgoingMessage: ReceivedChatLoopbackMessage = { + id: incomingMessage.id, + direction: 'inbound', + timestamp: incomingMessage.timestamp, + content: { type: 'chat', text: incomingMessage.content.text }, + }; + this.enqueue(outgoingMessage); + }; + + chatMessageSender.loopbackReceiverCallbacks.add(callback); + return () => { + chatMessageSender.loopbackReceiverCallbacks.delete(callback); + }; + } + } + + return new ChatMessageLoopbackReceiver(); } } @@ -609,14 +658,15 @@ class CombinedMessageSender extends MessageSender { async send(message: SentMessage) { for (const sender of this.messageSenders) { // FIXME: an open question - should this only ever send with one MessageSender or potentially - // multiple? It doesn't matter with only ChatMessageSender but I'm not sure the right long term call. + // multiple? It doesn't matter now given there is only one MessageSender (ChatMessageSender) + // but I'm not sure the right long term call. 
if (sender.canSend(message)) { await sender.send(message); return; } } - throw new Error(`CombinedMessageSender - cannot find a MessageSender to send message ${message}`); + throw new Error(`CombinedMessageSender - cannot find a MessageSender to send message ${JSON.stringify(message)}`); } } @@ -834,13 +884,15 @@ export class AgentSession extends EventEmitter { this.agentParticipant.on(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); this.updateAgentState(); + const chatMessageSender = new ChatMessageSender(this.localParticipant); this.messageSender = new CombinedMessageSender( - new ChatMessageSender(this.localParticipant), + chatMessageSender, // TODO: other types of messages that can be sent ); this.messageReceiver = new CombinedMessageReceiver( new TranscriptionMessageReceiver(this.room), + chatMessageSender.generateLoopbackMessageReceiver(), // TODO: images? attachments? rpc? ); (async () => { diff --git a/components/session-view.tsx b/components/session-view.tsx index 9ec8b2697..58fb87ff8 100644 --- a/components/session-view.tsx +++ b/components/session-view.tsx @@ -50,8 +50,9 @@ export const SessionView = ({ // FIXME: add some sort of builder for SentMessage here so it's not just a raw object? await send({ id: `${Math.random()}`, /* FIXME: fix id generation */ + direction: 'outbound', timestamp: new Date(), - content: { type: 'text', text: message }, + content: { type: 'chat', text: message }, }); // await send(message); } @@ -101,7 +102,7 @@ export const SessionView = ({ >
- {messages.map((message: ReceivedChatMessage) => ( + {messages.map((message) => ( Date: Thu, 7 Aug 2025 15:50:12 -0400 Subject: [PATCH 11/51] docs: add more comments --- agent-sdk/index.tsx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index 58c0a04ae..f56dc22cd 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -21,6 +21,7 @@ import { EventEmitter } from "events"; // import { addMediaTimestampToTranscription, dedupeSegments, ReceivedTranscriptionSegment } from '@livekit/components-core'; // import { getParticipantTrackRefs } from '@livekit/components/src/observables/track'; import { ParticipantEventCallbacks } from "../node_modules/livekit-client/src/room/participant/Participant"; +// import { LegacyChatMessage } from "@livekit/components-core"; // import { DataTopic /* , ParticipantTrackIdentifier */ } from "@livekit/components-core"; // import { TRACK_TRANSCRIPTION_DEFAULTS } from "../hooks"; // import { Future } from "../node_modules/livekit-client/src/room/utils"; @@ -897,6 +898,7 @@ export class AgentSession extends EventEmitter { ); (async () => { // FIXME: is this sort of pattern a better idea than just making MessageReceiver an EventEmitter? 
+ // FIXME: this probably doesn't handle errors properly right now for await (const message of this.messageReceiver!.messages()) { this.handleIncomingMessage(message); } From abe202637db5692a494ea30de357afe044bf8a2e Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 8 Aug 2025 10:26:40 -0400 Subject: [PATCH 12/51] feat: start migrating pre-existing hooks to agent alternatives --- agent-sdk/index.tsx | 37 +++++++++++++++---- .../agent-control-bar/agent-control-bar.tsx | 4 +- components/livekit/media-tiles.tsx | 13 +++++-- components/session-view.tsx | 3 +- 4 files changed, 41 insertions(+), 16 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index f56dc22cd..b6614ff1b 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -78,7 +78,7 @@ interface TextStreamInfo extends BaseStreamInfo {} /* FROM COMPONENTS JS: */ /** @public */ -type TrackReference = { +export type TrackReference = { participant: Participant; publication: TrackPublication; source: Track.Source; @@ -243,6 +243,17 @@ export function useAgentState() { return { state: agentState, isAvailable }; } +export function useAgentTracks() { + const agentSession = useAgentSession(); + + const [audioTrack, setAudioTrack] = useState(agentSession.agentParticipant?.audioTrack ?? null); + useAgentSessionEvent(AgentSessionEvent.AudioTrackChanged, setAudioTrack, []); + const [videoTrack, setVideoTrack] = useState(agentSession.agentParticipant?.videoTrack ?? null); + useAgentSessionEvent(AgentSessionEvent.VideoTrackChanged, setVideoTrack, []); + + return { audioTrack, videoTrack }; +} + function useParticipantEvents

( participant: P, eventNames: Array, @@ -268,7 +279,8 @@ export function useAgentLocalParticipant() { const agentSession = useAgentSession(); const [localParticipant, setLocalParticipant] = React.useState(agentSession.localParticipant); - const [microphoneTrack, setMicrophoneTrack] = React.useState(null); + const [microphoneTrack, setMicrophoneTrack] = React.useState(null); + const [cameraTrack, setCameraTrack] = React.useState(null); useParticipantEvents(agentSession.localParticipant, [ ParticipantEvent.TrackMuted, @@ -282,13 +294,22 @@ export function useAgentLocalParticipant() { ParticipantEvent.MediaDevicesError, ParticipantEvent.TrackSubscriptionStatusChanged, // ParticipantEvent.ConnectionQualityChanged, - ], (p: LocalParticipant) => { - setLocalParticipant(p); + ], () => { + setLocalParticipant(agentSession.localParticipant); // FIXME: is the rest of this stuff needed? // const { isMicrophoneEnabled, isCameraEnabled, isScreenShareEnabled } = p; - const microphoneTrack = p.getTrackPublication(Track.Source.Microphone); - setMicrophoneTrack(microphoneTrack ?? null); - // const cameraTrack = p.getTrackPublication(Track.Source.Camera); + const microphoneTrack = agentSession.localParticipant.getTrackPublication(Track.Source.Microphone); + setMicrophoneTrack(microphoneTrack ? { + source: Track.Source.Microphone, + participant: localParticipant, + publication: microphoneTrack, + } : null); + const cameraTrack = agentSession.localParticipant.getTrackPublication(Track.Source.Camera); + setCameraTrack(cameraTrack ? 
{ + source: Track.Source.Camera, + participant: localParticipant, + publication: cameraTrack, + } : null); // const participantMedia: ParticipantMedia = { // isCameraEnabled, // isMicrophoneEnabled, @@ -300,7 +321,7 @@ export function useAgentLocalParticipant() { // return participantMedia; }, []); - return { localParticipant, microphoneTrack }; + return { localParticipant, microphoneTrack, cameraTrack }; } // hook ideas: diff --git a/components/livekit/agent-control-bar/agent-control-bar.tsx b/components/livekit/agent-control-bar/agent-control-bar.tsx index 3f5051c58..289a3ce9f 100644 --- a/components/livekit/agent-control-bar/agent-control-bar.tsx +++ b/components/livekit/agent-control-bar/agent-control-bar.tsx @@ -38,7 +38,7 @@ export function AgentControlBar({ onDeviceError, ...props }: AgentControlBarProps) { - const participants = useRemoteParticipants(); + const participants = useRemoteParticipants(); // FIXME: replace with agent alternative const [chatOpen, setChatOpen] = React.useState(false); const [isSendingMessage, setIsSendingMessage] = React.useState(false); @@ -56,7 +56,7 @@ export function AgentControlBar({ handleAudioDeviceChange, handleVideoDeviceChange, handleDisconnect, - } = useAgentControlBar({ + } = useAgentControlBar({ // FIXME: replace with agent alternative controls, saveUserChoices, }); diff --git a/components/livekit/media-tiles.tsx b/components/livekit/media-tiles.tsx index 7b7cedd69..f28dc3b62 100644 --- a/components/livekit/media-tiles.tsx +++ b/components/livekit/media-tiles.tsx @@ -11,6 +11,7 @@ import { cn } from '@/lib/utils'; import { AgentTile } from './agent-tile'; import { AvatarTile } from './avatar-tile'; import { VideoTile } from './video-tile'; +import { useAgentLocalParticipant, useAgentState, useAgentTracks } from '@/agent-sdk'; const MotionVideoTile = motion.create(VideoTile); const MotionAgentTile = motion.create(AgentTile); @@ -91,13 +92,17 @@ interface MediaTilesProps { } export function MediaTiles({ chatOpen }: 
MediaTilesProps) { + const { state: agentState } = useAgentState(); + // const { audioTrack: agentAudioTrack, videoTrack: agentVideoTrack } = useAgentTracks(); const { - state: agentState, + // state: agentState, audioTrack: agentAudioTrack, videoTrack: agentVideoTrack, } = useVoiceAssistant(); - const [screenShareTrack] = useTracks([Track.Source.ScreenShare]); - const cameraTrack: TrackReference | undefined = useLocalTrackRef(Track.Source.Camera); + // console.log('TRACKS:', agentAudioTrack, agentVideoTrack); + const [screenShareTrack] = useTracks([Track.Source.ScreenShare]); // FIXME: replace with agent alternative + // const cameraTrack: TrackReference | undefined = useLocalTrackRef(Track.Source.Camera); // FIXME: replace with agent alternative + const { cameraTrack } = useAgentLocalParticipant(); const isCameraEnabled = cameraTrack && !cameraTrack.publication.isMuted; const isScreenShareEnabled = screenShareTrack && !screenShareTrack.publication.isMuted; @@ -119,7 +124,7 @@ export function MediaTiles({ chatOpen }: MediaTilesProps) { const agentLayoutTransition = transition; const avatarLayoutTransition = transition; - const isAvatar = agentVideoTrack !== undefined; + const isAvatar = Boolean(agentVideoTrack); return (

diff --git a/components/session-view.tsx b/components/session-view.tsx index 58fb87ff8..19c121de3 100644 --- a/components/session-view.tsx +++ b/components/session-view.tsx @@ -36,13 +36,12 @@ export const SessionView = ({ ref, }: React.ComponentProps<'div'> & SessionViewProps) => { const agentSession = useAgentSession(); - const { state: agentState } = useAgentState(); const { messages, send } = useAgentMessages(); // const { state: agentState } = useVoiceAssistant(); const [chatOpen, setChatOpen] = useState(false); // const { messages, send } = useChatAndTranscription(); - const room = useRoomContext(); + // const room = useRoomContext(); useDebugMode(); From 4c1c024177af8acfbd6ca38842744963cc634709 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 8 Aug 2025 11:48:07 -0400 Subject: [PATCH 13/51] refactor: break up existing code into many files --- agent-sdk/agent-session/AgentParticipant.ts | 183 ++++ agent-sdk/agent-session/AgentSession.ts | 207 ++++ agent-sdk/agent-session/message/index.ts | 36 + .../receive/CombinedMessageReceiver.ts | 31 + .../message/receive/MessageReceiver.ts | 57 ++ .../receive/TranscriptionMessageReceiver.ts | 139 +++ .../message/send/ChatMessageSender.ts | 71 ++ .../message/send/CombinedMessageSender.ts | 33 + .../message/send/MessageSender.ts | 7 + agent-sdk/external-deps/client-sdk-js.tsx | 21 + agent-sdk/external-deps/components-js.tsx | 102 ++ agent-sdk/index.tsx | 920 +----------------- agent-sdk/lib/future.ts | 59 ++ 13 files changed, 982 insertions(+), 884 deletions(-) create mode 100644 agent-sdk/agent-session/AgentParticipant.ts create mode 100644 agent-sdk/agent-session/AgentSession.ts create mode 100644 agent-sdk/agent-session/message/index.ts create mode 100644 agent-sdk/agent-session/message/receive/CombinedMessageReceiver.ts create mode 100644 agent-sdk/agent-session/message/receive/MessageReceiver.ts create mode 100644 agent-sdk/agent-session/message/receive/TranscriptionMessageReceiver.ts create mode 100644 
agent-sdk/agent-session/message/send/ChatMessageSender.ts create mode 100644 agent-sdk/agent-session/message/send/CombinedMessageSender.ts create mode 100644 agent-sdk/agent-session/message/send/MessageSender.ts create mode 100644 agent-sdk/external-deps/client-sdk-js.tsx create mode 100644 agent-sdk/external-deps/components-js.tsx create mode 100644 agent-sdk/lib/future.ts diff --git a/agent-sdk/agent-session/AgentParticipant.ts b/agent-sdk/agent-session/AgentParticipant.ts new file mode 100644 index 000000000..8c1510e96 --- /dev/null +++ b/agent-sdk/agent-session/AgentParticipant.ts @@ -0,0 +1,183 @@ +import type TypedEventEmitter from 'typed-emitter'; +import { EventEmitter } from "events"; +import { ParticipantEvent, ParticipantKind, RemoteParticipant, Room, RoomEvent, Track, TranscriptionSegment } from 'livekit-client'; +import { getParticipantTrackRefs, participantTrackEvents, TrackReference } from '@/agent-sdk/external-deps/components-js'; +import { ParticipantEventCallbacks } from '@/agent-sdk/external-deps/client-sdk-js'; + +export enum AgentParticipantEvent { + VideoTrackChanged = 'videoTrackChanged', + AudioTrackChanged = 'audioTrackChanged', + AgentAttributesChanged = 'agentAttributesChanged', + // AgentTranscriptionsChanged = 'agentTranscriptionsChanged', +} + +export type AgentParticipantCallbacks = { + [AgentParticipantEvent.VideoTrackChanged]: (newTrack: TrackReference | null) => void; + [AgentParticipantEvent.AudioTrackChanged]: (newTrack: TrackReference | null) => void; + [AgentParticipantEvent.AgentAttributesChanged]: (newAttributes: Record) => void; +}; + +// Goal: some sort of abstraction layer to provide information specific to the agent's interactions +// like video stream / audio stream / transcriptions / underlying participant attributes / etc, +// since it doesn't just come from one RemoteParticipant +// FIXME: maybe this could be named better? ...
+export default class AgentParticipant extends (EventEmitter as new () => TypedEventEmitter) { + private room: Room; + + private agentParticipant: RemoteParticipant | null = null; + private workerParticipant: RemoteParticipant | null = null; + audioTrack: TrackReference | null = null; + videoTrack: TrackReference | null = null; + + audioTrackSyncTime: { timestamp: number, rtpTimestamp?: number } | null = null; + + attributes: Record = {}; + + transcriptions: Array = []; + transcriptionBufferSize: number = 100//TRACK_TRANSCRIPTION_DEFAULTS.bufferSize; + + constructor(room: Room) { + super(); + this.room = room; + + this.room.on(RoomEvent.ParticipantConnected, this.handleParticipantConnected); + this.room.on(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); + } + + teardown() { + this.room.off(RoomEvent.ParticipantConnected, this.handleParticipantConnected); + this.room.off(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); + } + + private handleParticipantConnected = () => { + this.updateParticipants(); + } + private handleParticipantDisconnected = () => { + this.updateParticipants(); + } + + private updateParticipants() { + const newAgentParticipant = this.roomRemoteParticipants.find( + (p) => p.kind === ParticipantKind.AGENT && !('lk.publish_on_behalf' in p.attributes), + ) ?? null; + const newWorkerParticipant = newAgentParticipant ? ( + this.roomRemoteParticipants.find( + (p) => + p.kind === ParticipantKind.AGENT && p.attributes['lk.publish_on_behalf'] === newAgentParticipant.identity, + ) ?? null + ) : null; + + const oldAgentParticipant = this.agentParticipant; + const oldWorkerParticipant = this.workerParticipant; + this.agentParticipant = newAgentParticipant; + this.workerParticipant = newWorkerParticipant; + + // 1. 
Listen for attribute changes + if (oldAgentParticipant !== this.agentParticipant) { + oldAgentParticipant?.off(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); + + if (this.agentParticipant) { + this.agentParticipant.on(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); + this.handleAttributesChanged(this.agentParticipant.attributes); + } + } + + // 2. Listen for track updates + for (const event of participantTrackEvents) { + if (oldAgentParticipant !== this.agentParticipant) { + oldAgentParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + if (this.agentParticipant) { + this.agentParticipant.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + this.handleUpdateTracks(); + } + } + if (oldWorkerParticipant !== this.workerParticipant) { + oldWorkerParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + if (this.workerParticipant) { + this.workerParticipant.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); + this.handleUpdateTracks(); + } + } + } + } + + private handleUpdateTracks = () => { + const newVideoTrack = ( + this.agentTracks.find((t) => t.source === Track.Source.Camera) ?? + this.workerTracks.find((t) => t.source === Track.Source.Camera) ?? null + ); + if (this.videoTrack !== newVideoTrack) { + this.videoTrack = newVideoTrack; + this.emit(AgentParticipantEvent.VideoTrackChanged, newVideoTrack); + } + + const newAudioTrack = ( + this.agentTracks.find((t) => t.source === Track.Source.Microphone) ?? + this.workerTracks.find((t) => t.source === Track.Source.Microphone) ?? null + ); + if (this.audioTrack !== newAudioTrack) { + // console.log('!! 
audio track changed', this.audioTrack?.publication); + // this.audioTrack?.publication.off(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); + this.audioTrack = newAudioTrack; + // this.audioTrack?.publication.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); + + // this.audioTrackSyncTime = { + // timestamp: Date.now(), + // rtpTimestamp: this.audioTrack?.publication.track?.rtpTimestamp, + // }; + // this.audioTrack?.publication.track?.on(TrackEvent.TimeSyncUpdate, this.handleTimeSyncUpdate); + + this.emit(AgentParticipantEvent.AudioTrackChanged, newAudioTrack); + } + }; + + private handleAttributesChanged = (attributes: Record) => { + this.attributes = attributes; + this.emit(AgentParticipantEvent.AgentAttributesChanged, attributes); + }; + + // private handleTranscriptionReceived = (segments: Array) => { + // console.log('!! TRANSCRIPTION', segments, this.audioTrackSyncTime); + // if (!this.audioTrackSyncTime) { + // throw new Error('AgentParticipant - audioTrackSyncTime missing'); + // } + // const audioTrackSyncTime = this.audioTrackSyncTime; + + // this.transcriptions = dedupeSegments( + // this.transcriptions, + // // when first receiving a segment, add the current media timestamp to it + // segments.map((s) => addMediaTimestampToTranscription(s, audioTrackSyncTime)), + // this.transcriptionBufferSize, + // ); + // this.emit(AgentParticipantEvent.AgentTranscriptionsChanged, this.transcriptions); + // } + + // private handleTimeSyncUpdate = (update: { timestamp: number; rtpTimestamp: number }) => { + // console.log('!! 
TIME SYNC UPDATE', update); + // this.audioTrackSyncTime = update; + // }; + + private get roomRemoteParticipants() { + return Array.from(this.room.remoteParticipants.values()); + } + + private get agentTracks() { + if (!this.agentParticipant) { + return []; + } + return getParticipantTrackRefs( + this.agentParticipant, + { sources: [Track.Source.Microphone, Track.Source.Camera] } + ); + } + + private get workerTracks() { + if (!this.workerParticipant) { + return []; + } + return getParticipantTrackRefs( + this.workerParticipant, + { sources: [Track.Source.Microphone, Track.Source.Camera] } + ); + } +} diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts new file mode 100644 index 000000000..b909a0f65 --- /dev/null +++ b/agent-sdk/agent-session/AgentSession.ts @@ -0,0 +1,207 @@ +import type TypedEventEmitter from 'typed-emitter'; +import { EventEmitter } from "events"; +import { Room, RoomEvent, ConnectionState } from 'livekit-client'; + +import { + type BaseMessageId, + type ReceivedMessage, + type SentMessage, + MessageSender, + MessageReceiver, + ChatMessageSender, + CombinedMessageSender, + CombinedMessageReceiver, + TranscriptionMessageReceiver, +} from "./message"; +import AgentParticipant, { AgentParticipantEvent } from './AgentParticipant'; + + +export enum AgentSessionEvent { + AgentStateChanged = 'agentStateChanged', + AgentAttributesChanged = 'agentAttributesChanged', + MessagesChanged = 'messagesChanged', + AgentConnectionFailure = 'AgentConnectionFailure', +} + +export type AgentSessionCallbacks = { + [AgentSessionEvent.AgentStateChanged]: (newAgentState: AgentState) => void; + [AgentSessionEvent.MessagesChanged]: (newMessages: Array) => void; + [AgentSessionEvent.AgentConnectionFailure]: (reason: string) => void; +}; + +const stateAttribute = 'lk.agent.state'; + +export type AgentState = + | 'disconnected' + | 'connecting' + | 'initializing' + | 'listening' + | 'thinking' + | 'speaking'; + +export class 
AgentSession extends (EventEmitter as new () => TypedEventEmitter) { + room: Room; // FIXME: should this be private? + state: AgentState = 'disconnected'; + + agentParticipant: AgentParticipant | null = null; + messageSender: MessageSender | null = null; + messageReceiver: MessageReceiver | null = null; + + // FIXME: maybe make an OrderedMessageList with these two fields in it? + messageById: Map = new Map(); + messageIds: Array = []; + + constructor() { + super(); + + this.room = new Room(); + this.room.on(RoomEvent.Connected, this.handleRoomConnected); + this.room.on(RoomEvent.Disconnected, this.handleRoomDisconnected); + this.room.on(RoomEvent.ConnectionStateChanged, this.handleConnectionStateChanged); + } + + async connect(url: string, token: string) { + // FIXME: catch connection errors here and reraise? idk + await Promise.all([ + this.room.connect(url, token), + // FIXME: make it so the preconenct buffer thing can be disabled? + this.room.localParticipant.setMicrophoneEnabled(true, undefined, { preConnectBuffer: true }), + ]); + } + async disconnect() { + await this.room.disconnect(); + } + + private handleRoomConnected = () => { + console.log('!! CONNECTED'); + this.agentParticipant = new AgentParticipant(this.room); + this.agentParticipant.on(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); + this.updateAgentState(); + + const chatMessageSender = new ChatMessageSender(this.localParticipant); + this.messageSender = new CombinedMessageSender( + chatMessageSender, + // TODO: other types of messages that can be sent + ); + + this.messageReceiver = new CombinedMessageReceiver( + new TranscriptionMessageReceiver(this.room), + chatMessageSender.generateLoopbackMessageReceiver(), + // TODO: images? attachments? rpc? + ); + (async () => { + // FIXME: is this sort of pattern a better idea than just making MessageReceiver an EventEmitter? 
+ // FIXME: this probably doesn't handle errors properly right now + for await (const message of this.messageReceiver!.messages()) { + this.handleIncomingMessage(message); + } + })(); + + this.startAgentConnectedTimeout(); + } + + private handleRoomDisconnected = () => { + console.log('!! DISCONNECTED'); + this.agentParticipant?.off(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); + this.agentParticipant?.teardown(); + this.agentParticipant = null; + this.updateAgentState(); + + this.messageReceiver?.close(); + this.messageReceiver = null; + + if (this.agentConnectedTimeout) { + clearTimeout(this.agentConnectedTimeout); + this.agentConnectedTimeout = null; + } + } + + private agentConnectedTimeout: NodeJS.Timeout | null = null; + private startAgentConnectedTimeout = () => { + this.agentConnectedTimeout = setTimeout(() => { + if (!this.isAvailable) { + const reason = + this.state === 'connecting' + ? 'Agent did not join the room. ' + : 'Agent connected but did not complete initializing. '; + + this.emit(AgentSessionEvent.AgentConnectionFailure, reason); + this.room.disconnect(); + } + }, 10_000); + } + + private handleConnectionStateChanged = async () => { + this.updateAgentState(); + } + + private handleAgentAttributesChanged = () => { + console.log('!! 
ATTRIB CHANGED:', this.agentParticipant?.attributes) + this.updateAgentState(); + } + + private handleIncomingMessage = (incomingMessage: ReceivedMessage) => { + // Upsert the message into the list + this.messageById.set(incomingMessage.id, incomingMessage); + if (!this.messageIds.includes(incomingMessage.id)) { + this.messageIds.push(incomingMessage.id); + } + + this.emit(AgentSessionEvent.MessagesChanged, this.messages); + } + + private updateAgentState = () => { + let newAgentState: AgentState | null = null; + if (!this.agentParticipant) { + // throw new Error('AgentSession.agentParticipant is unset'); + newAgentState = 'disconnected'; + } else { + const agentParticipantAttributes = this.agentParticipant.attributes; + const connectionState = this.room.state; + + if (connectionState === ConnectionState.Disconnected) { + newAgentState = 'disconnected'; + } else if ( + connectionState === ConnectionState.Connecting || + !this.agentParticipant || + !agentParticipantAttributes?.[stateAttribute] + ) { + newAgentState = 'connecting'; + } else { + newAgentState = agentParticipantAttributes[stateAttribute] as AgentState; + } + } + console.log('!! STATE:', newAgentState, this.agentParticipant?.attributes); + + if (this.state !== newAgentState) { + this.state = newAgentState; + this.emit(AgentSessionEvent.AgentStateChanged, newAgentState); + } + } + + get isAvailable() { + return this.state == 'listening' || this.state == 'thinking' || this.state == 'speaking'; + } + + get localParticipant() { + return this.room?.localParticipant ?? null; + } + + get messages() { + return ( + this.messageIds + .map(id => this.messageById.get(id)) + // FIXME: can I get rid of the filter somehow? + .filter((message): message is SentMessage | ReceivedMessage => typeof message !== 'undefined') + ); + } + + // FIXME: maybe there should be a special case where if message is `string` it is converted into + // a `SentChatMessage`? 
+ async sendMessage(message: SentMessage) { + if (!this.messageSender) { + throw new Error('AgentSession.sendMessage - cannot send message until room is connected and MessageSender initialized!'); + } + await this.messageSender.send(message); + } +} diff --git a/agent-sdk/agent-session/message/index.ts b/agent-sdk/agent-session/message/index.ts new file mode 100644 index 000000000..4cb90e260 --- /dev/null +++ b/agent-sdk/agent-session/message/index.ts @@ -0,0 +1,36 @@ +import { TextStreamInfo } from '@/agent-sdk/external-deps/client-sdk-js'; + +export type BaseMessageId = string; +export type BaseMessage = { + id: BaseMessageId; + direction: Direction; + timestamp: Date; + content: Content; +}; + +export type ReceivedTranscriptionMessage = BaseMessage<'inbound', { + type: 'transcription'; + text: string; + participantInfo: { identity: string }; + streamInfo: TextStreamInfo; +}>; + +export type ReceivedChatLoopbackMessage = BaseMessage<'inbound', { type: 'chat'; text: string }>; + +export type ReceivedMessage = + | ReceivedTranscriptionMessage + | ReceivedChatLoopbackMessage; + // TODO: images? attachments? rpc? + +export type SentChatMessage = BaseMessage<'outbound', | { type: 'chat', text: string }>; +export type SentMessage = + | SentChatMessage; + +// FIXME: maybe update all these functions to not have default exports as to avoid the duplicate +// names being written here? 
+export { default as MessageSender } from './send/MessageSender'; +export { default as ChatMessageSender } from './send/ChatMessageSender'; +export { default as CombinedMessageSender } from './send/CombinedMessageSender'; +export { default as MessageReceiver } from './receive/MessageReceiver'; +export { default as CombinedMessageReceiver } from './receive/CombinedMessageReceiver'; +export { default as TranscriptionMessageReceiver } from './receive/TranscriptionMessageReceiver'; diff --git a/agent-sdk/agent-session/message/receive/CombinedMessageReceiver.ts b/agent-sdk/agent-session/message/receive/CombinedMessageReceiver.ts new file mode 100644 index 000000000..0e3389868 --- /dev/null +++ b/agent-sdk/agent-session/message/receive/CombinedMessageReceiver.ts @@ -0,0 +1,31 @@ +import { parallelMerge } from "streaming-iterables"; +import MessageReceiver from "./MessageReceiver"; + +/** + * A `MessageReceiver` that zips together multiple underlying `MessageReceiver`s into one unified source. 
+ */ +export default class CombinedMessageReceiver extends MessageReceiver { + private messageReceivers: Array; + + constructor(...messageReceivers: Array) { + super(); + this.messageReceivers = messageReceivers; + } + + async start() { + const messagesAsyncIterators = this.messageReceivers.map(mr => mr.messages()); + (async () => { + for await (const inboundMessage of parallelMerge(...messagesAsyncIterators)) { + this.enqueue(inboundMessage); + } + })().catch(err => { + this.closeWithError(err); + }); + + return () => { + for (const messageReceiver of this.messageReceivers) { + messageReceiver.close(); + } + }; + } +} diff --git a/agent-sdk/agent-session/message/receive/MessageReceiver.ts b/agent-sdk/agent-session/message/receive/MessageReceiver.ts new file mode 100644 index 000000000..9265d5d46 --- /dev/null +++ b/agent-sdk/agent-session/message/receive/MessageReceiver.ts @@ -0,0 +1,57 @@ +import Future from "@/agent-sdk/lib/future"; +import { type ReceivedMessage } from ".."; + +/** Thrown to signal that a MessageReceiver.messages() generator invocation was terminated out of band */ +export class MessageReceiverTerminationError extends Error {} + +/** + * A MessageReceiver acts as a source for all messages in the system. + */ +export default abstract class MessageReceiver { + private signallingFuture = new Future(); + private queue: Array = []; + + // This returns a cleanup function like useEffect maybe? That could be a good pattern? + abstract start(): Promise void)>; + + /** Submit new IncomingMessages to be received by anybody reading from messages() */ + protected enqueue(...messages: Array) { + for (const message of messages) { + this.queue.push(message); + } + const oldSignallingFuture = this.signallingFuture; + this.signallingFuture = new Future(); + oldSignallingFuture.resolve?.(null); + } + + /** Terminate the messages() iteration from an external source */ + close() { + const name: string = (this as any).constructor.name ?? 
'MessageReceiver'; + this.signallingFuture.reject?.( + new MessageReceiverTerminationError(`${name} terminated messages() iteration`) + ); + } + + closeWithError(error: Error) { + this.signallingFuture.reject?.(error); + } + + /** A stream of newly generated `IncomingMessage`s */ + async *messages(): AsyncGenerator { + const cleanup = await this.start(); + try { + while (true) { + await this.signallingFuture.promise; + yield* this.queue; + this.queue = []; + } + } catch (err) { + if (err instanceof MessageReceiverTerminationError) { + return; // the finally block below runs cleanup exactly once + } + throw err; // surface closeWithError() failures instead of silently ending the stream + } finally { + cleanup?.(); + } + } +} diff --git a/agent-sdk/agent-session/message/receive/TranscriptionMessageReceiver.ts b/agent-sdk/agent-session/message/receive/TranscriptionMessageReceiver.ts new file mode 100644 index 000000000..2d823d490 --- /dev/null +++ b/agent-sdk/agent-session/message/receive/TranscriptionMessageReceiver.ts @@ -0,0 +1,139 @@ +import { Room, TextStreamReader } from "livekit-client"; +import { DataTopic } from "@/agent-sdk/external-deps/components-js"; +import { TextStreamInfo } from "@/agent-sdk/external-deps/client-sdk-js"; + +import { type ReceivedMessage, type ReceivedTranscriptionMessage } from ".."; +import MessageReceiver from "./MessageReceiver"; + +export enum TranscriptionAttributes { + Final = "lk.transcription_final", + Segment = "lk.segment_id", + TrackId = "lk.transcribed_track_id", +} + +/** + * Processes new `lk.transcription` data stream events generated by the agent for both user and + * LLM generated speech and generates corresponding `ReceivedTranscriptionMessage`s. + * + * For agent messages, a new text stream is emitted for each message, and the stream is closed when the message is finalized. + * Each agent message is delivered in chunks which must be accumulated and published into the message stream. + * + * For user messages, the full transcription is sent each time, but may be updated until finalized.
+ * + * The `lk.segment_id` attribute is stable and unique across the lifetime of the message. + * + * Example agent generated transcriptions: + * ``` + * { segment_id: "1", content: "Hello" } + * { segment_id: "1", content: " world" } + * { segment_id: "1", content: "!" } + * { segment_id: "2", content: "Hello" } + * { segment_id: "2", content: " Apple" } + * { segment_id: "2", content: "!" } + * ``` + * + * Example user generated transcriptions: + * ``` + * { segment_id: "3", content: "Hello" } + * { segment_id: "3", content: "Hello world!" } + * { segment_id: "4", content: "Hello" } + * { segment_id: "4", content: "Hello Apple!" } + * ``` + */ +export default class TranscriptionMessageReceiver extends MessageReceiver { + room: Room; + inFlightMessages: Array = []; + + constructor(room: Room) { + super(); + this.room = room; + } + + async start() { + const textStreamHandler = async (reader: TextStreamReader, participantInfo: { identity: string }) => { + const transcriptionSegmentId = reader.info.attributes?.[TranscriptionAttributes.Segment]; + const isTranscription = Boolean(transcriptionSegmentId); + const isFinal = reader.info.attributes?.[TranscriptionAttributes.Final] === 'true'; + + let currentStreamId = reader.info.id; + + // Find and update the stream in our array + let messageIndex = this.inFlightMessages.findIndex((message) => { + if (message.content.streamInfo.id === reader.info.id) { + return true; + } + if (isTranscription && transcriptionSegmentId === message.content.streamInfo.attributes?.[TranscriptionAttributes.Segment]) { + return true; + } + return false; + }); + + // FIXME: I think there may need to be some error handling logic to ensure the below for await + // properly exposes errors via `this.closeWithError` + for await (const chunk of reader) { + const existingMessage = this.inFlightMessages[messageIndex]; + if (existingMessage) { + if (existingMessage.content.streamInfo.id === currentStreamId) { + // Stream hasn't changed, just append 
content + const updatedMessage = this.appendInFlightMessageText(messageIndex, chunk, reader.info); + this.enqueue(updatedMessage); + } else { + // Stream has changed, so fully replace content + const updatedMessage = this.replaceInFlightMessageText(messageIndex, chunk, reader.info); + this.enqueue(updatedMessage); + } + + } else { + // Handle case where stream ID wasn't found (new message) + const message: ReceivedMessage = { + id: reader.info.id, + direction: 'inbound', + timestamp: new Date(reader.info.timestamp), + content: { + type: 'transcription', + text: chunk, + participantInfo, + streamInfo: reader.info, + }, + }; + this.inFlightMessages.push(message); + messageIndex = this.inFlightMessages.length-1; + this.enqueue(message); + } + } + + if (isFinal && messageIndex >= 0) { // -1 means no message was tracked; splice(-1, 1) would drop the last entry + this.inFlightMessages.splice(messageIndex, 1); + console.log('!! MESSAGE DONE!', this.inFlightMessages); + } + }; + this.room.registerTextStreamHandler(DataTopic.TRANSCRIPTION, textStreamHandler); + + return () => { + this.room.unregisterTextStreamHandler(DataTopic.TRANSCRIPTION); + }; + } + + private replaceInFlightMessageText(messageIndex: number, text: string, streamInfo: TextStreamInfo) { + this.inFlightMessages[messageIndex] = { + ...this.inFlightMessages[messageIndex], + content: { + ...this.inFlightMessages[messageIndex].content, + text, + streamInfo, + }, + }; + return this.inFlightMessages[messageIndex]; + } + private appendInFlightMessageText(messageIndex: number, text: string, streamInfo: TextStreamInfo) { + this.inFlightMessages[messageIndex] = { + ...this.inFlightMessages[messageIndex], + content: { + ...this.inFlightMessages[messageIndex].content, + text: this.inFlightMessages[messageIndex].content.text + text, + streamInfo, + }, + }; + return this.inFlightMessages[messageIndex]; + } +} diff --git a/agent-sdk/agent-session/message/send/ChatMessageSender.ts b/agent-sdk/agent-session/message/send/ChatMessageSender.ts new file mode 100644 index 000000000..484b7a43f --- /dev/null +++
b/agent-sdk/agent-session/message/send/ChatMessageSender.ts
@@ -0,0 +1,91 @@
+import { LocalParticipant } from "livekit-client";
+
+import { type ReceivedChatLoopbackMessage, type SentChatMessage, type SentMessage } from "..";
+import MessageSender from "./MessageSender";
+import MessageReceiver from "../receive/MessageReceiver";
+
+
+/**
+ * A `MessageSender` for sending chat messages via the `lk.chat` datastream topic.
+ *
+ * Each message passed to `send` is also handed to every loopback receiver created by
+ * `generateLoopbackMessageReceiver` (once that receiver has been started).
+ */
+export default class ChatMessageSender extends MessageSender<SentChatMessage> {
+  private localParticipant: LocalParticipant;
+  /** Callbacks registered by started loopback receivers; invoked once per sent message. */
+  private loopbackReceiverCallbacks: Set<(incomingMessage: SentChatMessage) => void> = new Set();
+
+  /** @param localParticipant - participant whose `sendText` is used to publish each message */
+  constructor(localParticipant: LocalParticipant) {
+    super();
+    this.localParticipant = localParticipant;
+  }
+
+  /** Accepts only messages whose content type is `'chat'`. */
+  canSend(message: SentMessage): message is SentChatMessage {
+    return message.content.type === 'chat';
+  }
+
+  /**
+   * Hands the message to every registered loopback callback, then sends its text through the
+   * local participant.
+   *
+   * The loopback callbacks run before `sendText` is awaited, so a message is looped back even
+   * when the send itself rejects; that rejection propagates to the caller.
+   */
+  async send(message: SentChatMessage) {
+    for (const callback of this.loopbackReceiverCallbacks) {
+      callback(message);
+    }
+
+    await this.localParticipant.sendText(message.content.text, /* FIXME: options here? */);
+
+    // FIXME: do I need to handle sending legacy chat messages too?
+    // const legacyChatMsg: LegacyChatMessage = {
+    //   id: message.id,
+    //   timestamp: message.timestamp.getTime(),
+    //   message: message.content.text,
+    // };
+    // const encodeLegacyMsg = (message: LegacyChatMessage) => new TextEncoder().encode(JSON.stringify(message));
+    // await this.localParticipant.publishData(encodeLegacyMsg(legacyChatMsg), {
+    //   topic: "lk-chat-topic",//LegacyDataTopic.CHAT,
+    //   reliable: true,
+    // });
+  }
+
+  /**
+   * Generates a corresponding MessageReceiver which will emit "received" versions of each chat
+   * message, that can be correspondingly merged into the message list.
+   *
+   * The receiver registers itself with this sender in `start()` and unregisters in the cleanup
+   * function that `start()` returns.
+   *
+   * FIXME: should this be on the MessageSender instead, so this can be done for any sender?
+   */
+  generateLoopbackMessageReceiver() {
+    const chatMessageSender = this;
+    class ChatMessageLoopbackReceiver extends MessageReceiver {
+      async start() {
+        // Re-shape each outbound chat message into its inbound "loopback" form, keeping the
+        // same id, timestamp and text.
+        const callback = (incomingMessage: SentChatMessage) => {
+          const outgoingMessage: ReceivedChatLoopbackMessage = {
+            id: incomingMessage.id,
+            direction: 'inbound',
+            timestamp: incomingMessage.timestamp,
+            content: { type: 'chat', text: incomingMessage.content.text },
+          };
+          this.enqueue(outgoingMessage);
+        };
+
+        chatMessageSender.loopbackReceiverCallbacks.add(callback);
+        return () => {
+          chatMessageSender.loopbackReceiverCallbacks.delete(callback);
+        };
+      }
+    }
+
+    return new ChatMessageLoopbackReceiver();
+  }
+}
diff --git a/agent-sdk/agent-session/message/send/CombinedMessageSender.ts b/agent-sdk/agent-session/message/send/CombinedMessageSender.ts
new file mode 100644
index 000000000..d119c5fe1
--- /dev/null
+++ b/agent-sdk/agent-session/message/send/CombinedMessageSender.ts
@@ -0,0 +1,43 @@
+import { type SentMessage } from "..";
+import MessageSender from "./MessageSender";
+
+/**
+ * A `MessageSender` that routes any `SentMessage` to the first underlying `MessageSender` which
+ * can accept it.
+ */
+export default class CombinedMessageSender extends MessageSender<SentMessage> {
+  /** Underlying senders, consulted in the order they were passed to the constructor. */
+  private messageSenders: Array<MessageSender<SentMessage>>;
+
+  constructor(...messageSenders: Array<MessageSender<SentMessage>>) {
+    super();
+    this.messageSenders = messageSenders;
+  }
+
+  /**
+   * Reports whether at least one underlying sender accepts the message, i.e. whether `send`
+   * would find a sender to route it to.
+   */
+  canSend(message: SentMessage): message is SentMessage {
+    return this.messageSenders.some((sender) => sender.canSend(message));
+  }
+
+  /**
+   * Sends the message with the first underlying sender whose `canSend` accepts it.
+   *
+   * @throws Error when no underlying sender accepts the message.
+   */
+  async send(message: SentMessage) {
+    for (const sender of this.messageSenders) {
+      // FIXME: an open question - should this only ever send with one MessageSender or potentially
+      // multiple? It doesn't matter now given there is only one MessageSender (ChatMessageSender)
+      // but I'm not sure the right long term call.
+      if (sender.canSend(message)) {
+        await sender.send(message);
+        return;
+      }
+    }
+
+    throw new Error(`CombinedMessageSender - cannot find a MessageSender to send message ${JSON.stringify(message)}`);
+  }
+}
diff --git a/agent-sdk/agent-session/message/send/MessageSender.ts b/agent-sdk/agent-session/message/send/MessageSender.ts
new file mode 100644
index 000000000..d00904a6c
--- /dev/null
+++ b/agent-sdk/agent-session/message/send/MessageSender.ts
@@ -0,0 +1,13 @@
+import { type SentMessage } from "..";
+
+/**
+ * Base class for anything that can send a particular kind of `SentMessage`.
+ *
+ * `Message` is the subset of `SentMessage` that a concrete sender is able to send.
+ */
+export default abstract class MessageSender<Message extends SentMessage = SentMessage> {
+  /** Can this MessageSender handle sending the given message? */
+  abstract canSend(message: SentMessage): message is Message
+  /** Sends a message of the kind this sender handles. */
+  abstract send(message: Message): Promise<void>;
+}
diff --git a/agent-sdk/external-deps/client-sdk-js.tsx b/agent-sdk/external-deps/client-sdk-js.tsx
new file mode 100644
index 000000000..3f50b8880
--- /dev/null
+++ b/agent-sdk/external-deps/client-sdk-js.tsx
@@ -0,0 +1,21 @@
+// This file contains pieces copied and pasted from the livekit-client package, largely internal
+// things that aren't currently being exported.
+//
+// FIXME: export this stuff in livekit-client or explicitly vendor this stuff into the agents sdk
+
+export interface BaseStreamInfo {
+  id: string;
+  mimeType: string;
+  topic: string;
+  timestamp: number;
+  /** total size in bytes for finite streams and undefined for streams of unknown size */
+  size?: number;
+  attributes?: Record<string, string>;
+}
+export interface ByteStreamInfo extends BaseStreamInfo {
+  name: string;
+}
+
+export interface TextStreamInfo extends BaseStreamInfo {}
+
+export { type ParticipantEventCallbacks } from "../../node_modules/livekit-client/src/room/participant/Participant";
diff --git a/agent-sdk/external-deps/components-js.tsx b/agent-sdk/external-deps/components-js.tsx
new file mode 100644
index 000000000..424dfa05d
--- /dev/null
+++ b/agent-sdk/external-deps/components-js.tsx
@@ -0,0 +1,132 @@
+import { Participant, ParticipantEvent, Track, TrackPublication, TranscriptionSegment } from "livekit-client";
+
+// This file contains pieces copied and pasted from the components-js repository
+// Something is messed up with my local development environment and I can't figure out how to import
+// these properly
+//
+// FIXME: figure out what is going on here or explicitly vendor this stuff into the agents sdk
+
+/** @public */
+export type TrackReference = {
+  participant: Participant;
+  publication: TrackPublication;
+  source: Track.Source;
+};
+
+/** Participant events concerning track publication, subscription, mute and stream state. */
+export const participantTrackEvents = [
+  ParticipantEvent.TrackPublished,
+  ParticipantEvent.TrackUnpublished,
+  ParticipantEvent.TrackMuted,
+  ParticipantEvent.TrackUnmuted,
+  ParticipantEvent.TrackStreamStateChanged,
+  ParticipantEvent.TrackSubscribed,
+  ParticipantEvent.TrackUnsubscribed,
+  ParticipantEvent.TrackSubscriptionPermissionChanged,
+  ParticipantEvent.TrackSubscriptionFailed,
+  ParticipantEvent.LocalTrackPublished,
+  ParticipantEvent.LocalTrackUnpublished,
+];
+
+/** A `TranscriptionSegment` extended with the timestamps recorded when it was received. */
+export type ReceivedTranscriptionSegment = TranscriptionSegment & {
+  receivedAtMediaTimestamp: number;
+  receivedAt: number;
+};
+
+/**
+ * Returns a copy of `segment` with `receivedAt` set to `timestamps.timestamp` and
+ * `receivedAtMediaTimestamp` set to `timestamps.rtpTimestamp`, or `0` when that is absent.
+ */
+export function addMediaTimestampToTranscription(
+  segment: TranscriptionSegment,
+  timestamps: { timestamp: number; rtpTimestamp?: number },
+): ReceivedTranscriptionSegment {
+  return {
+    ...segment,
+    receivedAtMediaTimestamp: timestamps.rtpTimestamp ?? 0,
+    receivedAt: timestamps.timestamp,
+  };
+}
+
+/**
+ * Merges `prevSegments` and `newSegments` into a list that holds one entry per segment `id`.
+ *
+ * When an id occurs more than once, the occurrence that comes last in
+ * `[...prevSegments, ...newSegments]` wins and keeps its position relative to the other
+ * surviving entries.
+ *
+ * @param windowSize - maximum length of the result; when more unique segments exist, only the
+ *   last `windowSize` are returned. A `windowSize` of `0` or less yields an empty array.
+ * @returns An array of unique (by id) segments. Latest wins.
+ */
+export function dedupeSegments<T extends { id: string }>(
+  prevSegments: T[],
+  newSegments: T[],
+  windowSize: number,
+): T[] {
+  if (windowSize <= 0) {
+    // `slice(-0)` is `slice(0)`, which would return every segment rather than none.
+    return [];
+  }
+
+  const combined = [...prevSegments, ...newSegments];
+  const seenIds = new Set<string>();
+  const latestFirst: T[] = [];
+  // Walk from the end so that the last occurrence of each id is the one that is kept.
+  for (let index = combined.length - 1; index >= 0; index -= 1) {
+    const segment = combined[index];
+    if (!seenIds.has(segment.id)) {
+      seenIds.add(segment.id);
+      latestFirst.push(segment);
+    }
+  }
+
+  return latestFirst.reverse().slice(-windowSize);
+}
+
+/**
+ * Create `TrackReferences` for all tracks that are included in the sources property.
+ *
+ * @param identifier - object with optional `sources`, `kind` and `name` filters; a filter that
+ *   is missing or falsy matches every publication.
+ * @param onlySubscribedTracks - when true, publications without a `track` are left out.
+ */
+export function getParticipantTrackRefs(
+  participant: Participant,
+  identifier: any/* ParticipantTrackIdentifier */,
+  onlySubscribedTracks = false,
+): TrackReference[] {
+  const { sources, kind, name } = identifier;
+  const sourceReferences = Array.from(participant.trackPublications.values())
+    .filter(
+      (pub) =>
+        (!sources || sources.includes(pub.source)) &&
+        (!kind || pub.kind === kind) &&
+        (!name || pub.trackName === name) &&
+        // either return all or only the ones that are subscribed
+        (!onlySubscribedTracks || pub.track),
+    )
+    .map((track): TrackReference => {
+      return {
+        participant: participant,
+        publication: track,
+        source: track.source,
+      };
+    });
+
+  return sourceReferences;
+}
+
+/** Text paired with the participant info and stream info it arrived with. */
+export interface TextStreamData {
+  text: string;
+  participantInfo: { identity: string }; // Replace with the correct type from livekit-client
+  streamInfo: any /* TextStreamInfo */;
+}
+
+/** Topic names of the chat and transcription data streams. */
+export const DataTopic = {
+  CHAT: 'lk.chat',
+  TRANSCRIPTION: 'lk.transcription',
+} as const;
diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx
index
b6614ff1b..d8643c9d9 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -1,177 +1,16 @@ import * as React from "react"; import { useContext, useEffect, useState, useCallback } from "react"; -import { parallelMerge } from 'streaming-iterables'; import { - ConnectionState, - LocalParticipant, Participant, ParticipantEvent, - RemoteParticipant, - Room, - RoomEvent, Track, - TrackEvent, - TrackPublication, - TranscriptionSegment, - ParticipantKind, - TextStreamReader, // TextStreamInfo, } from "livekit-client"; -import { EventEmitter } from "events"; -// import { addMediaTimestampToTranscription, dedupeSegments, ReceivedTranscriptionSegment } from '@livekit/components-core'; -// import { getParticipantTrackRefs } from '@livekit/components/src/observables/track'; +import { TrackReference } from "@/agent-sdk/external-deps/components-js"; import { ParticipantEventCallbacks } from "../node_modules/livekit-client/src/room/participant/Participant"; -// import { LegacyChatMessage } from "@livekit/components-core"; -// import { DataTopic /* , ParticipantTrackIdentifier */ } from "@livekit/components-core"; -// import { TRACK_TRANSCRIPTION_DEFAULTS } from "../hooks"; -// import { Future } from "../node_modules/livekit-client/src/room/utils"; - -/* FROM LIVEKIT-CLIENT */ -class Future { - promise: Promise; - - resolve?: (arg: T) => void; - - reject?: (e: any) => void; - - onFinally?: () => void; - - get isResolved(): boolean { - return this._isResolved; - } - - private _isResolved: boolean = false; - - constructor( - futureBase?: (resolve: (arg: T) => void, reject: (e: any) => void) => void, - onFinally?: () => void, - ) { - this.onFinally = onFinally; - this.promise = new Promise(async (resolve, reject) => { - this.resolve = resolve; - this.reject = reject; - if (futureBase) { - await futureBase(resolve, reject); - } - }).finally(() => { - this._isResolved = true; - this.onFinally?.(); - }); - } -} - -interface BaseStreamInfo { - id: string; - mimeType: string; - 
topic: string; - timestamp: number; - /** total size in bytes for finite streams and undefined for streams of unknown size */ - size?: number; - attributes?: Record; -} -interface ByteStreamInfo extends BaseStreamInfo { - name: string; -} - -interface TextStreamInfo extends BaseStreamInfo {} -/* END FROM LIVEKIT CLIENT */ - -/* FROM COMPONENTS JS: */ -/** @public */ -export type TrackReference = { - participant: Participant; - publication: TrackPublication; - source: Track.Source; -}; - -const participantTrackEvents = [ - ParticipantEvent.TrackPublished, - ParticipantEvent.TrackUnpublished, - ParticipantEvent.TrackMuted, - ParticipantEvent.TrackUnmuted, - ParticipantEvent.TrackStreamStateChanged, - ParticipantEvent.TrackSubscribed, - ParticipantEvent.TrackUnsubscribed, - ParticipantEvent.TrackSubscriptionPermissionChanged, - ParticipantEvent.TrackSubscriptionFailed, - ParticipantEvent.LocalTrackPublished, - ParticipantEvent.LocalTrackUnpublished, -]; - -type ReceivedTranscriptionSegment = TranscriptionSegment & { - receivedAtMediaTimestamp: number; - receivedAt: number; -}; - -function addMediaTimestampToTranscription( - segment: TranscriptionSegment, - timestamps: { timestamp: number; rtpTimestamp?: number }, -): ReceivedTranscriptionSegment { - return { - ...segment, - receivedAtMediaTimestamp: timestamps.rtpTimestamp ?? 0, - receivedAt: timestamps.timestamp, - }; -} - -/** - * @returns An array of unique (by id) `TranscriptionSegment`s. Latest wins. If the resulting array would be longer than `windowSize`, the array will be reduced to `windowSize` length - */ -function dedupeSegments( - prevSegments: T[], - newSegments: T[], - windowSize: number, -) { - return [...prevSegments, ...newSegments] - .reduceRight((acc, segment) => { - if (!acc.find((val) => val.id === segment.id)) { - acc.unshift(segment); - } - return acc; - }, [] as Array) - .slice(0 - windowSize); -} - -/** - * Create `TrackReferences` for all tracks that are included in the sources property. 
- * */ -function getParticipantTrackRefs( - participant: Participant, - identifier: any/* ParticipantTrackIdentifier */, - onlySubscribedTracks = false, -): TrackReference[] { - const { sources, kind, name } = identifier; - const sourceReferences = Array.from(participant.trackPublications.values()) - .filter( - (pub) => - (!sources || sources.includes(pub.source)) && - (!kind || pub.kind === kind) && - (!name || pub.trackName === name) && - // either return all or only the ones that are subscribed - (!onlySubscribedTracks || pub.track), - ) - .map((track): TrackReference => { - return { - participant: participant, - publication: track, - source: track.source, - }; - }); - - return sourceReferences; -} - -interface TextStreamData { - text: string; - participantInfo: { identity: string }; // Replace with the correct type from livekit-client - streamInfo: any /* TextStreamInfo */; -} - -const DataTopic = { - CHAT: 'lk.chat', - TRANSCRIPTION: 'lk.transcription', -} as const; -/* END FROM COMPONENTS JS: */ +import { AgentSession, AgentSessionCallbacks, AgentSessionEvent } from "./agent-session/AgentSession"; +import { ReceivedMessage, SentMessage } from "./agent-session/message"; +import { AgentParticipantCallbacks, AgentParticipantEvent } from "./agent-session/AgentParticipant"; // --------------------- // REACT @@ -212,9 +51,9 @@ export function useAgentMessages() { return { messages, send }; } -export function useAgentSessionEvent( - eventName: AgentSessionEvent, - callback: (data: any /* FIXME: types */) => void, +export function useAgentSessionEvent( + eventName: EventName, + callback: AgentSessionCallbacks[EventName], dependencies: React.DependencyList, ) { const agentSession = useAgentSession(); @@ -230,12 +69,35 @@ export function useAgentSessionEvent( }, [eventName, memoizedCallback]); } +export function useAgentParticipantEvent( + eventName: EventName, + callback: AgentParticipantCallbacks[EventName], + dependencies: React.DependencyList, +) { + const 
agentSession = useAgentSession(); + + // FIXME: is doing this memoiztion here a good idea? Maybe useAgentSessionEvent(..., useCallback(...)) is preferrable? + const memoizedCallback = useCallback(callback, dependencies); + + useEffect(() => { + if (!agentSession.agentParticipant) { + return; + } + + const agentParticipant = agentSession.agentParticipant; + agentParticipant.on(eventName, memoizedCallback); + return () => { + agentParticipant.off(eventName, memoizedCallback); + }; + }, [agentSession.agentParticipant, eventName, memoizedCallback]); +} + export function useAgentState() { const agentSession = useAgentSession(); const [agentState, setAgentState] = useState(agentSession.state); const [isAvailable, setIsAvailable] = useState(agentSession.isAvailable); - useAgentSessionEvent(AgentSessionEvent.AgentStateChanged, (newAgentState: AgentState) => { + useAgentSessionEvent(AgentSessionEvent.AgentStateChanged, (newAgentState) => { setAgentState(newAgentState); setIsAvailable(agentSession.isAvailable); }, []); @@ -247,9 +109,9 @@ export function useAgentTracks() { const agentSession = useAgentSession(); const [audioTrack, setAudioTrack] = useState(agentSession.agentParticipant?.audioTrack ?? null); - useAgentSessionEvent(AgentSessionEvent.AudioTrackChanged, setAudioTrack, []); + useAgentParticipantEvent(AgentParticipantEvent.AudioTrackChanged, setAudioTrack, []); const [videoTrack, setVideoTrack] = useState(agentSession.agentParticipant?.videoTrack ?? null); - useAgentSessionEvent(AgentSessionEvent.VideoTrackChanged, setVideoTrack, []); + useAgentParticipantEvent(AgentParticipantEvent.VideoTrackChanged, setVideoTrack, []); return { audioTrack, videoTrack }; } @@ -328,717 +190,7 @@ export function useAgentLocalParticipant() { // useAgentTracks? (video) // useAgentControls? 
(control bar stuff) -// --------------------- -// BASE -// --------------------- - -const stateAttribute = 'lk.agent.state'; - -export type AgentState = - | 'disconnected' - | 'connecting' - | 'initializing' - | 'listening' - | 'thinking' - | 'speaking'; - -enum AgentParticipantEvent { - VideoTrackChanged = 'videoTrackChanged', - AudioTrackChanged = 'videoTrackChanged', - AgentAttributesChanged = 'agentAttributesChanged', - AgentTranscriptionsChanged = 'agentTranscriptionsChanged', -} - -// Goal: some sort of abstraction layer to provide information specific to the agent's interactions -// like video stream / audio stream / transcriptions / underlying participant attributes / etc, -// since it doesn't just come from one RemoteParticipant -// FIXME: maybe this could be named better? ... -class AgentParticipant extends EventEmitter { - private room: Room; - - private agentParticipant: RemoteParticipant | null = null; - private workerParticipant: RemoteParticipant | null = null; - audioTrack: TrackReference | null = null; - videoTrack: TrackReference | null = null; - - audioTrackSyncTime: { timestamp: number, rtpTimestamp?: number } | null = null; - - attributes: Record = {}; - - transcriptions: Array = []; - transcriptionBufferSize: number = 100//TRACK_TRANSCRIPTION_DEFAULTS.bufferSize; - - constructor(room: Room) { - super(); - this.room = room; - - this.room.on(RoomEvent.ParticipantConnected, this.handleParticipantConnected); - this.room.on(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); - } - - teardown() { - this.room.off(RoomEvent.ParticipantConnected, this.handleParticipantConnected); - this.room.off(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); - } - - private handleParticipantConnected = () => { - this.updateParticipants(); - } - private handleParticipantDisconnected = () => { - this.updateParticipants(); - } - - private updateParticipants() { - const newAgentParticipant = this.roomRemoteParticipants.find( - 
(p) => p.kind === ParticipantKind.AGENT && !('lk.publish_on_behalf' in p.attributes), - ) ?? null; - const newWorkerParticipant = newAgentParticipant ? ( - this.roomRemoteParticipants.find( - (p) => - p.kind === ParticipantKind.AGENT && p.attributes['lk.publish_on_behalf'] === newAgentParticipant.identity, - ) ?? null - ) : null; - - const oldAgentParticipant = this.agentParticipant; - const oldWorkerParticipant = this.workerParticipant; - this.agentParticipant = newAgentParticipant; - this.workerParticipant = newWorkerParticipant; - - // 1. Listen for attribute changes - if (oldAgentParticipant !== this.agentParticipant) { - oldAgentParticipant?.off(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); - - if (this.agentParticipant) { - this.agentParticipant.on(ParticipantEvent.AttributesChanged, this.handleAttributesChanged); - this.handleAttributesChanged(this.agentParticipant.attributes); - } - } - - // 2. Listen for track updates - for (const event of participantTrackEvents) { - if (oldAgentParticipant !== this.agentParticipant) { - oldAgentParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); - if (this.agentParticipant) { - this.agentParticipant.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); - this.handleUpdateTracks(); - } - } - if (oldWorkerParticipant !== this.workerParticipant) { - oldWorkerParticipant?.off(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); - if (this.workerParticipant) { - this.workerParticipant.on(event as keyof ParticipantEventCallbacks, this.handleUpdateTracks); - this.handleUpdateTracks(); - } - } - } - } - - private handleUpdateTracks = () => { - const newVideoTrack = ( - this.agentTracks.find((t) => t.source === Track.Source.Camera) ?? - this.workerTracks.find((t) => t.source === Track.Source.Camera) ?? 
null - ); - if (this.videoTrack !== newVideoTrack) { - this.videoTrack = newVideoTrack; - this.emit(AgentParticipantEvent.VideoTrackChanged, newVideoTrack); - } - - const newAudioTrack = ( - this.agentTracks.find((t) => t.source === Track.Source.Microphone) ?? - this.workerTracks.find((t) => t.source === Track.Source.Microphone) ?? null - ); - if (this.audioTrack !== newAudioTrack) { - // console.log('!! audio track changed', this.audioTrack?.publication); - // this.audioTrack?.publication.off(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); - this.audioTrack = newAudioTrack; - // this.audioTrack?.publication.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); - - // this.audioTrackSyncTime = { - // timestamp: Date.now(), - // rtpTimestamp: this.audioTrack?.publication.track?.rtpTimestamp, - // }; - // this.audioTrack?.publication.track?.on(TrackEvent.TimeSyncUpdate, this.handleTimeSyncUpdate); - - this.emit(AgentParticipantEvent.AudioTrackChanged, newAudioTrack); - } - }; - - private handleAttributesChanged = (attributes: Record) => { - this.attributes = attributes; - this.emit(AgentParticipantEvent.AgentAttributesChanged, attributes); - }; - - // private handleTranscriptionReceived = (segments: Array) => { - // console.log('!! TRANSCRIPTION', segments, this.audioTrackSyncTime); - // if (!this.audioTrackSyncTime) { - // throw new Error('AgentParticipant - audioTrackSyncTime missing'); - // } - // const audioTrackSyncTime = this.audioTrackSyncTime; - - // this.transcriptions = dedupeSegments( - // this.transcriptions, - // // when first receiving a segment, add the current media timestamp to it - // segments.map((s) => addMediaTimestampToTranscription(s, audioTrackSyncTime)), - // this.transcriptionBufferSize, - // ); - // this.emit(AgentParticipantEvent.AgentTranscriptionsChanged, this.transcriptions); - // } - - // private handleTimeSyncUpdate = (update: { timestamp: number; rtpTimestamp: number }) => { - // console.log('!! 
TIME SYNC UPDATE', update); - // this.audioTrackSyncTime = update; - // }; - - private get roomRemoteParticipants() { - return Array.from(this.room.remoteParticipants.values()); - } - - private get agentTracks() { - if (!this.agentParticipant) { - return []; - } - return getParticipantTrackRefs( - this.agentParticipant, - { sources: [Track.Source.Microphone, Track.Source.Camera] } - ); - } - - private get workerTracks() { - if (!this.workerParticipant) { - return []; - } - return getParticipantTrackRefs( - this.workerParticipant, - { sources: [Track.Source.Microphone, Track.Source.Camera] } - ); - } -} - - - - -type BaseMessageId = string; -type BaseMessage = { - id: BaseMessageId; - direction: Direction; - timestamp: Date; - content: Content; +export { + AgentSession, + AgentSessionEvent, }; - -type ReceivedTranscriptionMessage = BaseMessage<'inbound', { - type: 'transcription'; - text: string; - participantInfo: { identity: string }; - streamInfo: TextStreamInfo; -}>; - -type ReceivedChatLoopbackMessage = BaseMessage<'inbound', { type: 'chat'; text: string }>; - -export type ReceivedMessage = - | ReceivedTranscriptionMessage - | ReceivedChatLoopbackMessage; - // TODO: images? attachments? rpc? - -type SentChatMessage = BaseMessage<'outbound', | { type: 'chat', text: string }>; -export type SentMessage = - | SentChatMessage; - - - -class MessageReceiverTerminationError extends Error {} - -abstract class MessageReceiver { - private signallingFuture = new Future(); - private queue: Array = []; - - // This returns cleanup function like useEffect maybe? That could be a good pattern? 
- abstract start(): Promise void)>; - - /** Submit new IncomingMessages to be received by anybody reading from messages() */ - protected enqueue(...messages: Array) { - for (const message of messages) { - this.queue.push(message); - } - const oldSignallingFuture = this.signallingFuture; - this.signallingFuture = new Future(); - oldSignallingFuture.resolve?.(null); - } - - /** Terminate the messages() iteration from out of band */ - close() { - const name: string = (this as any).constructor.name ?? 'MessageReceiver'; - this.signallingFuture.reject?.( - new MessageReceiverTerminationError(`${name} terminated messages() iteration`) - ); - } - - closeWithError(error: Error) { - this.signallingFuture.reject?.(error); - } - - /** A stream of newly generated `IncomingMessage`s */ - async *messages(): AsyncGenerator { - const cleanup = await this.start(); - try { - while (true) { - await this.signallingFuture.promise; - yield* this.queue; - this.queue = []; - } - } catch (err) { - if (err instanceof MessageReceiverTerminationError) { - cleanup?.(); - return; - } - } finally { - cleanup?.(); - } - } -} - -abstract class MessageSender { - /** Can this MessageSender handle sending the given message? */ - abstract canSend(message: SentMessage): message is Message - abstract send(message: Message): Promise; -} - -class ChatMessageSender extends MessageSender { - private localParticipant: LocalParticipant; - private loopbackReceiverCallbacks: Set<(incomingMessage: SentChatMessage) => void> = new Set(); - - constructor(localParticipant: LocalParticipant) { - super(); - this.localParticipant = localParticipant; - } - - canSend(message: SentMessage): message is SentChatMessage { - return message.content.type === 'chat'; - } - - async send(message: SentChatMessage) { - for (const callback of this.loopbackReceiverCallbacks) { - callback(message); - } - - await this.localParticipant.sendText(message.content.text, /* FIXME: options here? 
*/); - - // const legacyChatMsg: LegacyChatMessage = { - // id: message.id, - // timestamp: message.timestamp.getTime(), - // message: message.content.text, - // }; - // const encodeLegacyMsg = (message: LegacyChatMessage) => new TextEncoder().encode(JSON.stringify(message)); - // await this.localParticipant.publishData(encodeLegacyMsg(legacyChatMsg), { - // topic: "lk-chat-topic",//LegacyDataTopic.CHAT, - // reliable: true, - // }); - } - - /** - * Generates a corresponding MessageReceiver which will emit "received" versions of each chat - * message, that can be correspondingly merged into the message list. - * - * FIXME: should this be on the MessageSender instead, so this can be done for any sender? - */ - generateLoopbackMessageReceiver() { - const chatMessageSender = this; - class ChatMessageLoopbackReceiver extends MessageReceiver { - async start() { - const callback = (incomingMessage: SentChatMessage) => { - const outgoingMessage: ReceivedChatLoopbackMessage = { - id: incomingMessage.id, - direction: 'inbound', - timestamp: incomingMessage.timestamp, - content: { type: 'chat', text: incomingMessage.content.text }, - }; - this.enqueue(outgoingMessage); - }; - - chatMessageSender.loopbackReceiverCallbacks.add(callback); - return () => { - chatMessageSender.loopbackReceiverCallbacks.delete(callback); - }; - } - } - - return new ChatMessageLoopbackReceiver(); - } -} - -class CombinedMessageSender extends MessageSender { - private messageSenders: Array; - - constructor(...messageSenders: Array) { - super(); - this.messageSenders = messageSenders; - } - - canSend(message: SentMessage): message is SentMessage { - return true; - } - - async send(message: SentMessage) { - for (const sender of this.messageSenders) { - // FIXME: an open question - should this only ever send with one MessageSender or potentially - // multiple? It doesn't matter now given there is only one MessageSender (ChatMessageSender) - // but I'm not sure the right long term call. 
- if (sender.canSend(message)) { - await sender.send(message); - return; - } - } - - throw new Error(`CombinedMessageSender - cannot find a MessageSender to send message ${JSON.stringify(message)}`); - } -} - -enum TranscriptionAttributes { - Final = "lk.transcription_final", - Segment = "lk.segment_id", - TrackId = "lk.transcribed_track_id", -} - -/** - * Processes new `lk.transcription` data stream events generated by the agent for both user and - * LLM generated speach and generates corresponding `TranscriptionReceivedMessage`s. - * - * For agent messages, a new text stream is emitted for each message, and the stream is closed when the message is finalized. - * Each agent message is delivered in chunks which must be accumulated and published into the message stream. - * - * For user messages, the full transcription is sent each time, but may be updated until finalized. - * - * The `lk.segment_id` attribute is stable and unique across the lifetime of the message. - * - * Example agent generated transcriptions: - * ``` - * { segment_id: "1", content: "Hello" } - * { segment_id: "1", content: " world" } - * { segment_id: "1", content: "!" } - * { segment_id: "2", content: "Hello" } - * { segment_id: "2", content: " Apple" } - * { segment_id: "2", content: "!" } - * ``` - * - * Example user generated transcriptions: - * ``` - * { segment_id: "3", content: "Hello" } - * { segment_id: "3", content: "Hello world!" } - * { segment_id: "4", content: "Hello" } - * { segment_id: "4", content: "Hello Apple!" 
} - * ``` - */ -class TranscriptionMessageReceiver extends MessageReceiver { - room: Room; - inFlightMessages: Array = []; - - constructor(room: Room) { - super(); - this.room = room; - } - - async start() { - const textStreamHandler = async (reader: TextStreamReader, participantInfo: { identity: string }) => { - const transcriptionSegmentId = reader.info.attributes?.[TranscriptionAttributes.Segment]; - const isTranscription = Boolean(transcriptionSegmentId); - const isFinal = reader.info.attributes?.[TranscriptionAttributes.Final] === 'true'; - - let currentStreamId = reader.info.id; - - // Find and update the stream in our array - let messageIndex = this.inFlightMessages.findIndex((message) => { - if (message.content.streamInfo.id === reader.info.id) { - return true; - } - if (isTranscription && transcriptionSegmentId === message.content.streamInfo.attributes?.[TranscriptionAttributes.Segment]) { - return true; - } - return false; - }); - - // FIXME: I think there may need to be some error handling logic to ensure the below for await - // properly exposes errors via `this.closeWithError` - for await (const chunk of reader) { - const existingMessage = this.inFlightMessages[messageIndex]; - if (existingMessage) { - if (existingMessage.content.streamInfo.id === currentStreamId) { - // Stream hasn't changed, just append content - const updatedMessage = this.appendInFlightMessageText(messageIndex, chunk, reader.info); - this.enqueue(updatedMessage); - } else { - // Stream has changed, so fully replace content - const updatedMessage = this.replaceInFlightMessageText(messageIndex, chunk, reader.info); - this.enqueue(updatedMessage); - } - - } else { - // Handle case where stream ID wasn't found (new message) - const message: ReceivedMessage = { - id: reader.info.id, - direction: 'inbound', - timestamp: new Date(reader.info.timestamp), - content: { - type: 'transcription', - text: chunk, - participantInfo, - streamInfo: reader.info, - }, - }; - 
this.inFlightMessages.push(message); - messageIndex = this.inFlightMessages.length-1; - this.enqueue(message); - } - } - - if (isFinal) { - this.inFlightMessages.splice(messageIndex, 1); - console.log('!! MESSAGE DONE!', this.inFlightMessages); - } - }; - this.room.registerTextStreamHandler(DataTopic.TRANSCRIPTION, textStreamHandler); - - return () => { - this.room.unregisterTextStreamHandler(DataTopic.TRANSCRIPTION); - }; - } - - private replaceInFlightMessageText(messageIndex: number, text: string, streamInfo: TextStreamInfo) { - this.inFlightMessages[messageIndex] = { - ...this.inFlightMessages[messageIndex], - content: { - ...this.inFlightMessages[messageIndex].content, - text, - streamInfo, - }, - }; - return this.inFlightMessages[messageIndex]; - } - private appendInFlightMessageText(messageIndex: number, text: string, streamInfo: TextStreamInfo) { - this.inFlightMessages[messageIndex] = { - ...this.inFlightMessages[messageIndex], - content: { - ...this.inFlightMessages[messageIndex].content, - text: this.inFlightMessages[messageIndex].content.text + text, - streamInfo, - }, - }; - return this.inFlightMessages[messageIndex]; - } -} - - - - -/** - * A `MessageReceiver` which takes a list of other `MessageReceiver`s and forwards along their `InboundMessage`s - * Conceptually, think `Promise.race` being run across all passed `MessageReceiver`s on each async iterator iteration. 
- */ -class CombinedMessageReceiver extends MessageReceiver { - private messageReceivers: Array; - - constructor(...messageReceivers: Array) { - super(); - this.messageReceivers = messageReceivers; - } - - async start() { - const messagesAsyncIterators = this.messageReceivers.map(mr => mr.messages()); - (async () => { - for await (const inboundMessage of parallelMerge(...messagesAsyncIterators)) { - this.enqueue(inboundMessage); - } - })().catch(err => { - this.closeWithError(err); - }); - - return () => { - for (const messageReceiver of this.messageReceivers) { - messageReceiver.close(); - } - }; - } -} - -export enum AgentSessionEvent { - AgentStateChanged = 'agentStateChanged', - AudioTrackChanged = 'audioTrackChanged', - VideoTrackChanged = 'videoTrackChanged', - AgentAttributesChanged = 'agentAttributesChanged', - MessagesChanged = 'messagesChanged', - AgentConnectionFailure = 'AgentConnectionFailure', -} - -export class AgentSession extends EventEmitter { - room: Room; // FIXME: should this be private? - state: AgentState = 'disconnected'; - - agentParticipant: AgentParticipant | null = null; - messageSender: MessageSender | null = null; - messageReceiver: MessageReceiver | null = null; - - // FIXME: maybe make an OrderedMessageList with these two fields in it? - messageById: Map = new Map(); - messageIds: Array = []; - - constructor() { - super(); - - this.room = new Room(); - this.room.on(RoomEvent.Connected, this.handleRoomConnected); - this.room.on(RoomEvent.Disconnected, this.handleRoomDisconnected); - this.room.on(RoomEvent.ConnectionStateChanged, this.handleConnectionStateChanged); - } - - async connect(url: string, token: string) { - // FIXME: catch connection errors here and reraise? idk - await Promise.all([ - this.room.connect(url, token), - // FIXME: make it so the preconenct buffer thing can be disabled? 
- this.room.localParticipant.setMicrophoneEnabled(true, undefined, { preConnectBuffer: true }), - ]); - } - async disconnect() { - await this.room.disconnect(); - } - - private handleRoomConnected = () => { - console.log('!! CONNECTED'); - this.agentParticipant = new AgentParticipant(this.room); - this.agentParticipant.on(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); - this.updateAgentState(); - - const chatMessageSender = new ChatMessageSender(this.localParticipant); - this.messageSender = new CombinedMessageSender( - chatMessageSender, - // TODO: other types of messages that can be sent - ); - - this.messageReceiver = new CombinedMessageReceiver( - new TranscriptionMessageReceiver(this.room), - chatMessageSender.generateLoopbackMessageReceiver(), - // TODO: images? attachments? rpc? - ); - (async () => { - // FIXME: is this sort of pattern a better idea than just making MessageReceiver an EventEmitter? - // FIXME: this probably doesn't handle errors properly right now - for await (const message of this.messageReceiver!.messages()) { - this.handleIncomingMessage(message); - } - })(); - - this.startAgentConnectedTimeout(); - } - - private handleRoomDisconnected = () => { - console.log('!! DISCONNECTED'); - this.agentParticipant?.off(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); - this.agentParticipant?.teardown(); - this.agentParticipant = null; - this.updateAgentState(); - - this.messageReceiver?.close(); - this.messageReceiver = null; - - if (this.agentConnectedTimeout) { - clearTimeout(this.agentConnectedTimeout); - this.agentConnectedTimeout = null; - } - } - - private agentConnectedTimeout: NodeJS.Timeout | null = null; - private startAgentConnectedTimeout = () => { - this.agentConnectedTimeout = setTimeout(() => { - if (!this.isAvailable) { - const reason = - this.state === 'connecting' - ? 'Agent did not join the room. ' - : 'Agent connected but did not complete initializing. 
'; - - this.emit(AgentSessionEvent.AgentConnectionFailure, reason); - this.room.disconnect(); - } - }, 10_000); - } - - private handleConnectionStateChanged = async () => { - this.updateAgentState(); - } - - private handleAgentAttributesChanged = () => { - console.log('!! ATTRIB CHANGED:', this.agentParticipant?.attributes) - this.updateAgentState(); - } - - private handleIncomingMessage = (incomingMessage: ReceivedMessage) => { - // Upsert the message into the list - this.messageById.set(incomingMessage.id, incomingMessage); - if (!this.messageIds.includes(incomingMessage.id)) { - this.messageIds.push(incomingMessage.id); - } - - this.emit(AgentSessionEvent.MessagesChanged, this.messages); - } - - private updateAgentState = () => { - let newAgentState: AgentState | null = null; - if (!this.agentParticipant) { - // throw new Error('AgentSession.agentParticipant is unset'); - newAgentState = 'disconnected'; - } else { - const agentParticipantAttributes = this.agentParticipant.attributes; - const connectionState = this.room.state; - - if (connectionState === ConnectionState.Disconnected) { - newAgentState = 'disconnected'; - } else if ( - connectionState === ConnectionState.Connecting || - !this.agentParticipant || - !agentParticipantAttributes?.[stateAttribute] - ) { - newAgentState = 'connecting'; - } else { - newAgentState = agentParticipantAttributes[stateAttribute] as AgentState; - } - } - console.log('!! STATE:', newAgentState, this.agentParticipant?.attributes); - - if (this.state !== newAgentState) { - this.state = newAgentState; - this.emit(AgentSessionEvent.AgentStateChanged, newAgentState); - } - } - - get isAvailable() { - return this.state == 'listening' || this.state == 'thinking' || this.state == 'speaking'; - } - - get localParticipant() { - return this.room?.localParticipant ?? null; - } - - get messages() { - return ( - this.messageIds - .map(id => this.messageById.get(id)) - // FIXME: can I get rid of the filter somehow? 
- .filter((message): message is SentMessage | ReceivedMessage => typeof message !== 'undefined') - ); - } - - // FIXME: maybe there should be a special case where if message is `string` it is converted into - // a `SentChatMessage`? - async sendMessage(message: SentMessage) { - if (!this.messageSender) { - throw new Error('AgentSession.sendMessage - cannot send message until room is connected and MessageSender initialized!'); - } - await this.messageSender.send(message); - } -} - - -// Proposal: -// Copy of LiveKitRoom, but for agents (LiveKitAgentSession?) -// - This exposes a context like RoomContext -// Hooks that replicate a lot of useVoiceAssistant functionality which tap into agent context: -// - useAgent gets raw AgentSession -// - useAgentMessages? -// - useAgentSend diff --git a/agent-sdk/lib/future.ts b/agent-sdk/lib/future.ts new file mode 100644 index 000000000..992a5f1e3 --- /dev/null +++ b/agent-sdk/lib/future.ts @@ -0,0 +1,59 @@ +/** A Future represents a serialized version of a new Promise(...) call, exposing the promise plus + * corresponding resolve and reject functions to be used as an async execution management building + * block. + * + * @example + * ```ts + * const future = new Future(); + * + * async function startA() { + * setTimeout(() => future.resolve(123), 5000); + * } + * + * async function waitForA() { + * await future.promise; + * } + * + * async function main() { + * startA(); + * const result = await waitForA(); + * console.log(result); // logs 123 + * } + * ``` + * */ +export default class Future { + promise: Promise; + + // NOTE: these `throw`s shouldn't ever happen in practice, `new Promise` runs its callback + // syncronusly. 
+ resolve: (arg: T) => void = () => { throw new Error('Future not yet initialized!') }; + reject: (e: any) => void = () => { throw new Error('Future not yet initialized!') }; + + onFinally?: () => void; + + get isResolved(): boolean { + return this._isResolved; + } + + private _isResolved: boolean = false; + + constructor( + futureBase?: (resolve: (arg: T) => void, reject: (e: any) => void) => void | Promise, + onFinally?: () => void, + ) { + this.onFinally = onFinally; + this.promise = new Promise(async (resolve, reject) => { + this.resolve = resolve; + this.reject = reject; + if (futureBase) { + const futureBaseReturn = futureBase(resolve, reject); + if (futureBaseReturn instanceof Promise) { + await futureBaseReturn; + } + } + }).finally(() => { + this._isResolved = true; + this.onFinally?.(); + }); + } +} From 8fcd2edc2ab0b65f9cb28eb393a107ce584ff471 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 8 Aug 2025 11:58:38 -0400 Subject: [PATCH 14/51] docs: add initial AgentSession docs --- agent-sdk/agent-session/AgentSession.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index b909a0f65..d9151f288 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -39,6 +39,10 @@ export type AgentState = | 'thinking' | 'speaking'; +/** + * AgentSession represents a connection to a LiveKit Agent, providing abstractions to make 1:1 + * agent/participant rooms easier to work with. + */ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter) { room: Room; // FIXME: should this be private? 
state: AgentState = 'disconnected'; From 9425833935d58c6c82247c95a742082086c8a509 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 8 Aug 2025 12:35:31 -0400 Subject: [PATCH 15/51] feat: add OrderedMessageList --- agent-sdk/agent-session/AgentSession.ts | 25 +++++++----------- agent-sdk/lib/ordered-message-list.ts | 35 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 15 deletions(-) create mode 100644 agent-sdk/lib/ordered-message-list.ts diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index d9151f288..68ce5c424 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -3,7 +3,6 @@ import { EventEmitter } from "events"; import { Room, RoomEvent, ConnectionState } from 'livekit-client'; import { - type BaseMessageId, type ReceivedMessage, type SentMessage, MessageSender, @@ -14,6 +13,7 @@ import { TranscriptionMessageReceiver, } from "./message"; import AgentParticipant, { AgentParticipantEvent } from './AgentParticipant'; +import OrderedMessageList from '@/agent-sdk/lib/ordered-message-list'; export enum AgentSessionEvent { @@ -50,10 +50,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter = new Map(); - messageIds: Array = []; + messageList: OrderedMessageList | null = null; constructor() { super(); @@ -101,6 +98,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter TypedEventEmitter { - // Upsert the message into the list - this.messageById.set(incomingMessage.id, incomingMessage); - if (!this.messageIds.includes(incomingMessage.id)) { - this.messageIds.push(incomingMessage.id); + if (!this.messageList) { + throw new Error('AgentSession.messageList is unset'); } + this.messageList.upsert(incomingMessage); this.emit(AgentSessionEvent.MessagesChanged, this.messages); } @@ -192,12 +192,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter 
this.messageById.get(id)) - // FIXME: can I get rid of the filter somehow? - .filter((message): message is SentMessage | ReceivedMessage => typeof message !== 'undefined') - ); + return this.messageList?.toArray() ?? []; } // FIXME: maybe there should be a special case where if message is `string` it is converted into diff --git a/agent-sdk/lib/ordered-message-list.ts b/agent-sdk/lib/ordered-message-list.ts new file mode 100644 index 000000000..582e270ef --- /dev/null +++ b/agent-sdk/lib/ordered-message-list.ts @@ -0,0 +1,35 @@ +import { BaseMessage } from "../agent-session/message"; + +/** A container for storing an ordered list of messages that can be easily changed */ +export default class OrderedMessageList> { + private messageById: Map = new Map(); + private messageIds: Array = []; + + constructor(input?: Array) { + if (input) { + this.messageById = new Map(input.map(message => [message.id, message])); + this.messageIds = input.map(message => message.id); + } + } + + upsert(message: Message) { + this.messageById.set(message.id, message); + if (!this.messageIds.includes(message.id)) { + this.messageIds.push(message.id); + } + } + + *[Symbol.iterator]() { + for (const id of this.messageIds) { + const message = this.messageById.get(id); + if (!message) { + continue; + } + yield message; + } + } + + toArray() { + return Array.from(this); + } +} From 4443c04432d703e9f943fa0f8a677d2b46086391 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 8 Aug 2025 14:34:16 -0400 Subject: [PATCH 16/51] feat: add more agent specific react hook coverage --- agent-sdk/external-deps/components-js.tsx | 14 ++++++ agent-sdk/index.tsx | 43 +++++++++++++++---- .../agent-control-bar/agent-control-bar.tsx | 6 ++- .../hooks/use-agent-control-bar.ts | 18 +++++--- 4 files changed, 65 insertions(+), 16 deletions(-) diff --git a/agent-sdk/external-deps/components-js.tsx b/agent-sdk/external-deps/components-js.tsx index 424dfa05d..5b3ae9a94 100644 --- 
a/agent-sdk/external-deps/components-js.tsx +++ b/agent-sdk/external-deps/components-js.tsx @@ -100,3 +100,17 @@ export const DataTopic = { CHAT: 'lk.chat', TRANSCRIPTION: 'lk.transcription', } as const; + +export const trackSourceToProtocol = (source: Track.Source) => { + // NOTE: this mapping avoids importing the protocol package as that leads to a significant bundle size increase + switch (source) { + case Track.Source.Camera: + return 1; + case Track.Source.Microphone: + return 2; + case Track.Source.ScreenShare: + return 3; + default: + return 0; + } +}; diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index d8643c9d9..8f78ac34a 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -1,16 +1,18 @@ import * as React from "react"; -import { useContext, useEffect, useState, useCallback } from "react"; +import { useContext, useEffect, useState, useCallback, useMemo } from "react"; import { + LocalParticipant, Participant, ParticipantEvent, Track, // TextStreamInfo, } from "livekit-client"; -import { TrackReference } from "@/agent-sdk/external-deps/components-js"; +import { TrackReference, trackSourceToProtocol } from "@/agent-sdk/external-deps/components-js"; import { ParticipantEventCallbacks } from "../node_modules/livekit-client/src/room/participant/Participant"; import { AgentSession, AgentSessionCallbacks, AgentSessionEvent } from "./agent-session/AgentSession"; import { ReceivedMessage, SentMessage } from "./agent-session/message"; import { AgentParticipantCallbacks, AgentParticipantEvent } from "./agent-session/AgentParticipant"; +import { ParticipantPermission } from "livekit-server-sdk"; // --------------------- // REACT @@ -116,10 +118,10 @@ export function useAgentTracks() { return { audioTrack, videoTrack }; } -function useParticipantEvents

( +function useParticipantEvents

( participant: P, - eventNames: Array, - callback: (data: any /* FIXME: types */) => void, + eventNames: Array, + callback: ParticipantEventCallbacks[EventName], dependencies: React.DependencyList, ) { // FIXME: is doing this memoiztion here a good idea? Maybe useAgentSessionEvent(..., useCallback(...)) is preferrable? @@ -127,11 +129,11 @@ function useParticipantEvents

( useEffect(() => { for (const eventName of eventNames) { - participant.on(eventName as keyof ParticipantEventCallbacks, memoizedCallback); + participant.on(eventName, memoizedCallback); } return () => { for (const eventName of eventNames) { - participant.off(eventName as keyof ParticipantEventCallbacks, memoizedCallback); + participant.off(eventName, memoizedCallback); } }; }, [participant, eventNames, memoizedCallback]); @@ -143,6 +145,7 @@ export function useAgentLocalParticipant() { const [localParticipant, setLocalParticipant] = React.useState(agentSession.localParticipant); const [microphoneTrack, setMicrophoneTrack] = React.useState(null); const [cameraTrack, setCameraTrack] = React.useState(null); + const [permissions, setPermissions] = React.useState(null); useParticipantEvents(agentSession.localParticipant, [ ParticipantEvent.TrackMuted, @@ -158,6 +161,8 @@ export function useAgentLocalParticipant() { // ParticipantEvent.ConnectionQualityChanged, ], () => { setLocalParticipant(agentSession.localParticipant); + setPermissions(agentSession.localParticipant.permissions ?? null); + // FIXME: is the rest of this stuff needed? // const { isMicrophoneEnabled, isCameraEnabled, isScreenShareEnabled } = p; const microphoneTrack = agentSession.localParticipant.getTrackPublication(Track.Source.Microphone); @@ -183,7 +188,29 @@ export function useAgentLocalParticipant() { // return participantMedia; }, []); - return { localParticipant, microphoneTrack, cameraTrack }; + const publishPermissions = useMemo(() => { + const canPublishSource = (source: Track.Source) => { + return ( + permissions?.canPublish && + (permissions.canPublishSources.length === 0 || + permissions.canPublishSources.includes(trackSourceToProtocol(source))) + ); + }; + + return { + camera: canPublishSource(Track.Source.Camera), + microphone: canPublishSource(Track.Source.Microphone), + screenShare: canPublishSource(Track.Source.ScreenShare), + data: permissions?.canPublishData ?? 
false, + }; + }, [permissions]); + + return { + localParticipant, + microphoneTrack, + cameraTrack, + publishPermissions, + }; } // hook ideas: diff --git a/components/livekit/agent-control-bar/agent-control-bar.tsx b/components/livekit/agent-control-bar/agent-control-bar.tsx index 289a3ce9f..3127f0c88 100644 --- a/components/livekit/agent-control-bar/agent-control-bar.tsx +++ b/components/livekit/agent-control-bar/agent-control-bar.tsx @@ -13,6 +13,7 @@ import { cn } from '@/lib/utils'; import { DeviceSelect } from '../device-select'; import { TrackToggle } from '../track-toggle'; import { UseAgentControlBarProps, useAgentControlBar } from './hooks/use-agent-control-bar'; +import { useAgentState } from '@/agent-sdk'; export interface AgentControlBarProps extends React.HTMLAttributes, @@ -38,11 +39,12 @@ export function AgentControlBar({ onDeviceError, ...props }: AgentControlBarProps) { - const participants = useRemoteParticipants(); // FIXME: replace with agent alternative const [chatOpen, setChatOpen] = React.useState(false); const [isSendingMessage, setIsSendingMessage] = React.useState(false); - const isAgentAvailable = participants.some((p) => p.isAgent); + // const participants = useRemoteParticipants(); + // const isAgentAvailable = participants.some((p) => p.isAgent); + const { isAvailable: isAgentAvailable } = useAgentState(); const isInputDisabled = !chatOpen || !isAgentAvailable || isSendingMessage; const [isDisconnecting, setIsDisconnecting] = React.useState(false); diff --git a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts index 2c1495e8f..f928a7db2 100644 --- a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts +++ b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts @@ -8,6 +8,7 @@ import { useTrackToggle, } from '@livekit/components-react'; import { usePublishPermissions } from './use-publish-permissions'; +import { 
useAgentLocalParticipant } from '@/agent-sdk'; export interface ControlBarControls { microphone?: boolean; @@ -40,19 +41,24 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen leave: true, ...controls, }; - const { microphoneTrack, localParticipant } = useLocalParticipant(); - const publishPermissions = usePublishPermissions(); + // const { microphoneTrack, localParticipant } = useLocalParticipant(); // FIXME: replace with agent alternative + const { + microphoneTrack, + localParticipant, + publishPermissions, + } = useAgentLocalParticipant(); + // const publishPermissions = usePublishPermissions(); // FIXME: replace with agent alternative const room = useRoomContext(); - const microphoneToggle = useTrackToggle({ + const microphoneToggle = useTrackToggle({ // FIXME: replace with agent alternative source: Track.Source.Microphone, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Microphone, error }), }); - const cameraToggle = useTrackToggle({ + const cameraToggle = useTrackToggle({ // FIXME: replace with agent alternative source: Track.Source.Camera, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Camera, error }), }); - const screenShareToggle = useTrackToggle({ + const screenShareToggle = useTrackToggle({ // FIXME: replace with agent alternative source: Track.Source.ScreenShare, onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.ScreenShare, error }), }); @@ -75,7 +81,7 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen saveAudioInputDeviceId, saveVideoInputEnabled, saveVideoInputDeviceId, - } = usePersistentUserChoices({ + } = usePersistentUserChoices({ // FIXME: replace with agent alternative preventSave: !saveUserChoices, }); From 994b2094eec86e9be8de463cdedf85196f624f66 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 8 Aug 2025 15:01:28 -0400 Subject: [PATCH 17/51] feat; add stubs for a log of additional functionality lukas 
proposed on the root AgentSession --- agent-sdk/agent-session/AgentSession.ts | 33 ++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 68ce5c424..32efe4149 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -20,13 +20,15 @@ export enum AgentSessionEvent { AgentStateChanged = 'agentStateChanged', AgentAttributesChanged = 'agentAttributesChanged', MessagesChanged = 'messagesChanged', - AgentConnectionFailure = 'AgentConnectionFailure', + AgentConnectionFailure = 'agentConnectionFailure', + AudioPlaybackStatusChanged = 'AudioPlaybackStatusChanged', } export type AgentSessionCallbacks = { [AgentSessionEvent.AgentStateChanged]: (newAgentState: AgentState) => void; [AgentSessionEvent.MessagesChanged]: (newMessages: Array) => void; [AgentSessionEvent.AgentConnectionFailure]: (reason: string) => void; + [AgentSessionEvent.AudioPlaybackStatusChanged]: (audioPlaybackPermitted: boolean) => void; }; const stateAttribute = 'lk.agent.state'; @@ -59,6 +61,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter { + this.emit(AgentSessionEvent.AudioPlaybackStatusChanged, this.room.canPlaybackAudio); + }; + private handleAgentAttributesChanged = () => { console.log('!! 
ATTRIB CHANGED:', this.agentParticipant?.attributes) this.updateAgentState(); @@ -203,4 +210,28 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter void) => void | undefined; + + // TODO: RPC stuff + // registerRpcHandler: ( + // method: string, + // handler: (data: RpcInvocationData) => Promise, + // ) => void; + // performRpc: (method: string, payload: string) => Promise; + + // TODO: Client media controls + // setCameraEnabled: (enabled: boolean) => Promise; + // setMicrophoneEnabled: (enabled: boolean) => Promise; + // setScreenShareEnabled: (enabled: boolean) => Promise; + // setCameraInput: (deviceId: string) => Promise; + // setMicrophoneInput: (deviceId: string) => Promise; + + // Media Playback + async startAudioPlayback() { + await this.room.startAudio(); + + // FIXME: add audio track to audio element / etc + // This probably needs to contain much of the logic in RoomAudioRenderer? + // And then make a similar type of component that then uses this function internally? 
+ } } From 1658dc9fec4bf70c00e250b343b02ab5ef290f9d Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 8 Aug 2025 15:15:03 -0400 Subject: [PATCH 18/51] feat: add waitUntilAgentIsAvailable --- agent-sdk/agent-session/AgentSession.ts | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 32efe4149..3484b53d1 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -71,6 +71,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter((resolve, reject) => { + const stateChangedHandler = () => { + if (!this.isAvailable) { + return; + } + cleanup(); + resolve(); + }; + const abortHandler = () => { + cleanup(); + reject(new Error('AgentSession.waitUntilAgentIsAvailable - signal aborted')); + }; + + const cleanup = () => { + this.off(AgentSessionEvent.AgentStateChanged, stateChangedHandler); + signal?.removeEventListener('abort', abortHandler); + }; + + this.on(AgentSessionEvent.AgentStateChanged, stateChangedHandler); + signal?.addEventListener('abort', abortHandler); + }); + } + get localParticipant() { return this.room?.localParticipant ?? 
null; } From 76a3962b47463c4435090d41de628486ada4f701 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 8 Aug 2025 17:09:20 -0400 Subject: [PATCH 19/51] feat: implement message aggregator idea --- agent-sdk/agent-session/AgentSession.ts | 96 ++++++++++++-- .../message/ReceivedMessageAggregator.ts | 120 ++++++++++++++++++ agent-sdk/agent-session/message/index.ts | 5 + agent-sdk/index.tsx | 22 +++- agent-sdk/lib/ordered-message-list.ts | 35 ----- 5 files changed, 232 insertions(+), 46 deletions(-) create mode 100644 agent-sdk/agent-session/message/ReceivedMessageAggregator.ts delete mode 100644 agent-sdk/lib/ordered-message-list.ts diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 3484b53d1..5f826eadb 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -11,9 +11,11 @@ import { CombinedMessageSender, CombinedMessageReceiver, TranscriptionMessageReceiver, + ReceivedMessageAggregator, + type ReceivedMessageAggregatorOptions, + ReceivedMessageAggregatorEvent, } from "./message"; import AgentParticipant, { AgentParticipantEvent } from './AgentParticipant'; -import OrderedMessageList from '@/agent-sdk/lib/ordered-message-list'; export enum AgentSessionEvent { @@ -52,7 +54,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter | null = null; + defaultAggregator: ReceivedMessageAggregator | null = null; + aggregators: Array> | null = null; constructor() { super(); @@ -103,7 +106,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter TypedEventEmitter { - if (!this.messageList) { - throw new Error('AgentSession.messageList is unset'); + if (!this.defaultAggregator) { + throw new Error('AgentSession.defaultAggregator is unset'); + } + if (!this.aggregators) { + throw new Error('AgentSession.aggregators is unset'); + } + + this.defaultAggregator.upsert(incomingMessage); + for (const aggregator of 
this.aggregators) { + aggregator.upsert(incomingMessage); } - this.messageList.upsert(incomingMessage); this.emit(AgentSessionEvent.MessagesChanged, this.messages); } @@ -221,12 +237,76 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter((resolve, reject) => { + const onceRoomConnected = () => { + cleanup(); + resolve(); + }; + const abortHandler = () => { + cleanup(); + reject(new Error('AgentSession.waitUntilRoomConnected - signal aborted')); + }; + + const cleanup = () => { + this.room.off(RoomEvent.Connected, onceRoomConnected); + signal?.removeEventListener('abort', abortHandler); + }; + + this.room.on(RoomEvent.Connected, onceRoomConnected); + signal?.addEventListener('abort', abortHandler); + }); + } + get localParticipant() { return this.room?.localParticipant ?? null; } get messages() { - return this.messageList?.toArray() ?? []; + // return this.messageReceiver.messages(); + return this.defaultAggregator?.toArray() ?? []; + } + + async createMessageAggregator(options: { startsAt?: 'beginning' | 'now' } & ReceivedMessageAggregatorOptions = {}) { + await this.waitUntilRoomConnected(); + if (!this.aggregators) { + throw new Error('AgentSession.aggregators is unset'); + } + const aggregators = this.aggregators; // FIXME: this caching could lead to issues if this.aggregators changed reference? + + const { startsAt, ...aggregatorOptions } = { + startsAt: 'beginning' as const, + ...options, + }; + + let aggregator; + switch (startsAt) { + case 'now': + aggregator = new ReceivedMessageAggregator(aggregatorOptions); + break; + + case 'beginning': + aggregator = ReceivedMessageAggregator.fromIterator(this.defaultAggregator ?? 
[], aggregatorOptions); + break; + } + + aggregators.push(aggregator); + const closeHandler = () => { + const aggregatorIndex = aggregators.indexOf(aggregator); + if (aggregatorIndex < 0) { + throw new Error(`Index of aggregator was non integer (found ${aggregatorIndex}), has this aggregator already been closed previously?`); + } + aggregators.splice(aggregatorIndex, 1); + + aggregator.off(ReceivedMessageAggregatorEvent.Close, closeHandler); + }; + aggregator.on(ReceivedMessageAggregatorEvent.Close, closeHandler); + + return aggregator; } // FIXME: maybe there should be a special case where if message is `string` it is converted into diff --git a/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts b/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts new file mode 100644 index 000000000..38e7fac56 --- /dev/null +++ b/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts @@ -0,0 +1,120 @@ +import { EventEmitter } from "events"; +import TypedEventEmitter from "typed-emitter"; +import { ReceivedMessage } from "."; + +export type ReceivedMessageAggregatorOptions = { + /** + * Number of messages to buffer internally before old messages are discarded. If not set, the + * buffer size is unlimited. + */ + bufferSize?: number; + + // FIXME: other options? 
+}; + +export enum ReceivedMessageAggregatorEvent { + Updated = 'updated', + Close = 'close', +} + +type ReceivedMessageAggregatorCallbacks = { + [ReceivedMessageAggregatorEvent.Updated]: () => void; + [ReceivedMessageAggregatorEvent.Close]: () => void; +}; + +/** A container for storing an ordered list of messages that can be easily changed */ +export default class ReceivedMessageAggregator extends (EventEmitter as new () => TypedEventEmitter) { + private messageById: Map = new Map(); + private messageIds: Array = []; + + private options: ReceivedMessageAggregatorOptions; + private closed: boolean = false; + + constructor(options?: ReceivedMessageAggregatorOptions) { + super(); + this.options = options ?? {}; + } + + /** Create a new aggregator pre-populated with the included messages */ + static fromIterator(input: Iterable, options?: ReceivedMessageAggregatorOptions) { + const aggregator = new this(options); + aggregator.extend(input); + return aggregator; + } + + upsert(message: Message) { + this.internalBulkUpsert([message]); + this.emit(ReceivedMessageAggregatorEvent.Updated); + } + + delete(message: Message) { + this.internalBulkDelete([message.id]); + this.emit(ReceivedMessageAggregatorEvent.Updated); + } + + extend(input: Iterable) { + this.internalBulkUpsert(input); + this.emit(ReceivedMessageAggregatorEvent.Updated); + } + + clear() { + this.messageById.clear(); + this.messageIds = []; + } + + private internalBulkUpsert(messages: Iterable) { + if (this.closed) { + throw new Error('ReceivedMessageAggregator is closed and is now immutable, no more messages can be ingested!'); + } + + // FIXME: think through this scenario: + // 1. Message a is upserted + // 2. `options.bufferSize` messages are upserted, evicting message a + // 3. Another message a upsert happens, should this somehow get rejected (via bloom filter / etc?) + // or just end up in the list again as a seemingly brand new message? 
+ for (const message of messages) { + this.messageById.set(message.id, message); + if (!this.messageIds.includes(message.id)) { + this.messageIds.push(message.id); + } + + // Truncate message buffer if it is now too large + const numberOfMessagesToRemove = typeof this.options.bufferSize === 'number' ? ( + this.messageIds.length - this.options.bufferSize + ) : 0; + if (numberOfMessagesToRemove > 0) { + const idsToDelete = this.messageIds.slice(0, numberOfMessagesToRemove); + this.internalBulkDelete(idsToDelete); + } + } + } + private internalBulkDelete(messageIdsToDelete: Array) { + if (this.closed) { + throw new Error('ReceivedMessageAggregator is closed and is now immutable, no more messages can be deleted!'); + } + + for (const id of messageIdsToDelete) { + this.messageById.delete(id); + } + this.messageIds = this.messageIds.filter(id => !messageIdsToDelete.includes(id)); + } + + *[Symbol.iterator]() { + for (const id of this.messageIds) { + const message = this.messageById.get(id); + if (!message) { + continue; + } + yield message; + } + } + + toArray() { + return Array.from(this); + } + + close() { + this.closed = true; + this.emit(ReceivedMessageAggregatorEvent.Close); + } +} diff --git a/agent-sdk/agent-session/message/index.ts b/agent-sdk/agent-session/message/index.ts index 4cb90e260..fc5ce4303 100644 --- a/agent-sdk/agent-session/message/index.ts +++ b/agent-sdk/agent-session/message/index.ts @@ -34,3 +34,8 @@ export { default as CombinedMessageSender } from './send/CombinedMessageSender'; export { default as MessageReceiver } from './receive/MessageReceiver'; export { default as CombinedMessageReceiver } from './receive/CombinedMessageReceiver'; export { default as TranscriptionMessageReceiver } from './receive/TranscriptionMessageReceiver'; +export { + default as ReceivedMessageAggregator, + type ReceivedMessageAggregatorOptions, + ReceivedMessageAggregatorEvent, +} from './ReceivedMessageAggregator'; diff --git a/agent-sdk/index.tsx 
b/agent-sdk/index.tsx index 8f78ac34a..aeca36da1 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -10,7 +10,7 @@ import { import { TrackReference, trackSourceToProtocol } from "@/agent-sdk/external-deps/components-js"; import { ParticipantEventCallbacks } from "../node_modules/livekit-client/src/room/participant/Participant"; import { AgentSession, AgentSessionCallbacks, AgentSessionEvent } from "./agent-session/AgentSession"; -import { ReceivedMessage, SentMessage } from "./agent-session/message"; +import { ReceivedMessage, ReceivedMessageAggregator, ReceivedMessageAggregatorEvent, SentMessage } from "./agent-session/message"; import { AgentParticipantCallbacks, AgentParticipantEvent } from "./agent-session/AgentParticipant"; import { ParticipantPermission } from "livekit-server-sdk"; @@ -40,9 +40,25 @@ export function useAgentMessages() { Array >(agentSession.messages); useEffect(() => { - agentSession.on(AgentSessionEvent.MessagesChanged, setMessages); + let aggregator: ReceivedMessageAggregator | null = null; + + const handleUpdated = () => { + if (!aggregator) { + return; + } + setMessages(aggregator.toArray()) + }; + + agentSession.createMessageAggregator({ startsAt: 'beginning' }).then(agg => { + aggregator = agg; + aggregator.on(ReceivedMessageAggregatorEvent.Updated, handleUpdated); + }).catch(err => { + // FIXME: how should this error be handled? 
+ console.error('Error creating message aggregator:', err); + }); + return () => { - agentSession.off(AgentSessionEvent.MessagesChanged, setMessages); + aggregator?.off(ReceivedMessageAggregatorEvent.Updated, handleUpdated); }; }, [agentSession]); diff --git a/agent-sdk/lib/ordered-message-list.ts b/agent-sdk/lib/ordered-message-list.ts deleted file mode 100644 index 582e270ef..000000000 --- a/agent-sdk/lib/ordered-message-list.ts +++ /dev/null @@ -1,35 +0,0 @@ -import { BaseMessage } from "../agent-session/message"; - -/** A container for storing an ordered list of messages that can be easily changed */ -export default class OrderedMessageList> { - private messageById: Map = new Map(); - private messageIds: Array = []; - - constructor(input?: Array) { - if (input) { - this.messageById = new Map(input.map(message => [message.id, message])); - this.messageIds = input.map(message => message.id); - } - } - - upsert(message: Message) { - this.messageById.set(message.id, message); - if (!this.messageIds.includes(message.id)) { - this.messageIds.push(message.id); - } - } - - *[Symbol.iterator]() { - for (const id of this.messageIds) { - const message = this.messageById.get(id); - if (!message) { - continue; - } - yield message; - } - } - - toArray() { - return Array.from(this); - } -} From fb01009ff890cbd764339b288b6a96528643334c Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 09:40:30 -0400 Subject: [PATCH 20/51] docs: add docs for message aggregator idea --- agent-sdk/agent-session/AgentSession.ts | 4 ++++ agent-sdk/index.tsx | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 5f826eadb..e1670f6da 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -271,6 +271,10 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter >(agentSession.messages); useEffect(() => { - let 
aggregator: ReceivedMessageAggregator | null = null; + let aggregator: ReceivedMessageAggregator | null = null; const handleUpdated = () => { if (!aggregator) { @@ -58,6 +58,7 @@ export function useAgentMessages() { }); return () => { + aggregator?.close(); aggregator?.off(ReceivedMessageAggregatorEvent.Updated, handleUpdated); }; }, [agentSession]); From b4ba41feff4a2b15e398123b3cd51b8b716209b9 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 09:56:09 -0400 Subject: [PATCH 21/51] feat: move agent state into AgentParticipant --- agent-sdk/agent-session/AgentParticipant.ts | 46 +++++++++++++++- agent-sdk/agent-session/AgentSession.ts | 60 +++------------------ 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/agent-sdk/agent-session/AgentParticipant.ts b/agent-sdk/agent-session/AgentParticipant.ts index 8c1510e96..3fb355b2d 100644 --- a/agent-sdk/agent-session/AgentParticipant.ts +++ b/agent-sdk/agent-session/AgentParticipant.ts @@ -1,13 +1,24 @@ import type TypedEventEmitter from 'typed-emitter'; import { EventEmitter } from "events"; -import { ParticipantEvent, ParticipantKind, RemoteParticipant, Room, RoomEvent, Track, TranscriptionSegment } from 'livekit-client'; +import { ConnectionState, ParticipantEvent, ParticipantKind, RemoteParticipant, Room, RoomEvent, Track, TranscriptionSegment } from 'livekit-client'; import { getParticipantTrackRefs, participantTrackEvents, TrackReference } from '@/agent-sdk/external-deps/components-js'; import { ParticipantEventCallbacks } from '@/agent-sdk/external-deps/client-sdk-js'; +const stateAttribute = 'lk.agent.state'; + +export type AgentState = + | 'disconnected' + | 'connecting' + | 'initializing' + | 'listening' + | 'thinking' + | 'speaking'; + export enum AgentParticipantEvent { VideoTrackChanged = 'videoTrackChanged', AudioTrackChanged = 'videoTrackChanged', AgentAttributesChanged = 'agentAttributesChanged', + AgentStateChanged = 'agentStateChanged', // AgentTranscriptionsChanged = 
'agentTranscriptionsChanged', } @@ -15,6 +26,7 @@ export type AgentParticipantCallbacks = { [AgentParticipantEvent.VideoTrackChanged]: (newTrack: TrackReference | null) => void; [AgentParticipantEvent.AudioTrackChanged]: (newTrack: TrackReference | null) => void; [AgentParticipantEvent.AgentAttributesChanged]: (newAttributes: Record) => void; + [AgentParticipantEvent.AgentStateChanged]: (newState: AgentState) => void; }; // Goal: some sort of abstraction layer to provide information specific to the agent's interactions @@ -23,6 +35,7 @@ export type AgentParticipantCallbacks = { // FIXME: maybe this could be named better? ... export default class AgentParticipant extends (EventEmitter as new () => TypedEventEmitter) { private room: Room; + state: AgentState = 'disconnected'; private agentParticipant: RemoteParticipant | null = null; private workerParticipant: RemoteParticipant | null = null; @@ -42,11 +55,14 @@ export default class AgentParticipant extends (EventEmitter as new () => TypedEv this.room.on(RoomEvent.ParticipantConnected, this.handleParticipantConnected); this.room.on(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); + this.room.on(RoomEvent.ConnectionStateChanged, this.handleConnectionStateChanged); + this.updateAgentState(); } teardown() { this.room.off(RoomEvent.ParticipantConnected, this.handleParticipantConnected); this.room.off(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); + this.room.off(RoomEvent.ConnectionStateChanged, this.handleConnectionStateChanged); } private handleParticipantConnected = () => { @@ -56,6 +72,10 @@ export default class AgentParticipant extends (EventEmitter as new () => TypedEv this.updateParticipants(); } + private handleConnectionStateChanged = () => { + this.updateAgentState(); + } + private updateParticipants() { const newAgentParticipant = this.roomRemoteParticipants.find( (p) => p.kind === ParticipantKind.AGENT && !('lk.publish_on_behalf' in p.attributes), @@ -134,8 
/**
 * Stores the new attribute map, re-broadcasts it as `AgentAttributesChanged`, and then
 * re-derives the agent state from it.
 */
private handleAttributesChanged = (attributes: Record<string, string>) => {
  this.attributes = attributes;
  this.emit(AgentParticipantEvent.AgentAttributesChanged, attributes);
  this.updateAgentState();
};

/**
 * Re-derives `this.state` from the room connection state and the `stateAttribute` entry of
 * `this.attributes`, emitting `AgentStateChanged` only when the value actually changes.
 *
 * - room disconnected -> 'disconnected'
 * - room still connecting, no agent participant, or no state attribute -> 'connecting'
 * - otherwise -> whatever the state attribute holds (not validated, just cast to `AgentState`)
 */
private updateAgentState() {
  const deriveState = (): AgentState => {
    const roomState = this.room.state;
    if (roomState === ConnectionState.Disconnected) {
      return 'disconnected';
    }

    // The agent is only considered "up" once the room is past connecting, an agent participant
    // is known, and it has published a (non-empty) state attribute.
    const agentIsReporting =
      roomState !== ConnectionState.Connecting &&
      Boolean(this.agentParticipant) &&
      Boolean(this.attributes[stateAttribute]);

    return agentIsReporting ? (this.attributes[stateAttribute] as AgentState) : 'connecting';
  };

  const nextState = deriveState();
  console.log('!! STATE:', nextState, this.agentParticipant?.attributes);

  if (this.state === nextState) {
    return;
  }
  this.state = nextState;
  this.emit(AgentParticipantEvent.AgentStateChanged, nextState);
}
TRANSCRIPTION', segments, this.audioTrackSyncTime); // if (!this.audioTrackSyncTime) { diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index e1670f6da..d1c856308 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -15,8 +15,7 @@ import { type ReceivedMessageAggregatorOptions, ReceivedMessageAggregatorEvent, } from "./message"; -import AgentParticipant, { AgentParticipantEvent } from './AgentParticipant'; - +import AgentParticipant, { AgentParticipantEvent, AgentState } from './AgentParticipant'; export enum AgentSessionEvent { AgentStateChanged = 'agentStateChanged', @@ -33,15 +32,6 @@ export type AgentSessionCallbacks = { [AgentSessionEvent.AudioPlaybackStatusChanged]: (audioPlaybackPermitted: boolean) => void; }; -const stateAttribute = 'lk.agent.state'; - -export type AgentState = - | 'disconnected' - | 'connecting' - | 'initializing' - | 'listening' - | 'thinking' - | 'speaking'; /** * AgentSession represents a connection to a LiveKit Agent, providing abstractions to make 1:1 @@ -49,7 +39,6 @@ export type AgentState = */ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter) { room: Room; // FIXME: should this be private? - state: AgentState = 'disconnected'; agentParticipant: AgentParticipant | null = null; messageSender: MessageSender | null = null; @@ -63,7 +52,6 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter { console.log('!! 
CONNECTED'); this.agentParticipant = new AgentParticipant(this.room); - this.agentParticipant.on(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); - this.updateAgentState(); + this.agentParticipant.on(AgentParticipantEvent.AgentStateChanged, this.handleAgentStateChanged); const chatMessageSender = new ChatMessageSender(this.localParticipant); this.messageSender = new CombinedMessageSender( @@ -114,10 +101,9 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { console.log('!! DISCONNECTED'); - this.agentParticipant?.off(AgentParticipantEvent.AgentAttributesChanged, this.handleAgentAttributesChanged); + this.agentParticipant?.off(AgentParticipantEvent.AgentStateChanged, this.handleAgentStateChanged); this.agentParticipant?.teardown(); this.agentParticipant = null; - this.updateAgentState(); this.messageReceiver?.close(); this.messageReceiver = null; @@ -150,19 +136,14 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { - this.updateAgentState(); - } + private handleAgentStateChanged = async (newAgentState: AgentState) => { + this.emit(AgentSessionEvent.AgentStateChanged, newAgentState); + }; private handleAudioPlaybackStatusChanged = async () => { this.emit(AgentSessionEvent.AudioPlaybackStatusChanged, this.room.canPlaybackAudio); }; - private handleAgentAttributesChanged = () => { - console.log('!! 
ATTRIB CHANGED:', this.agentParticipant?.attributes) - this.updateAgentState(); - } - private handleIncomingMessage = (incomingMessage: ReceivedMessage) => { if (!this.defaultAggregator) { throw new Error('AgentSession.defaultAggregator is unset'); @@ -179,33 +160,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { - let newAgentState: AgentState | null = null; - if (!this.agentParticipant) { - // throw new Error('AgentSession.agentParticipant is unset'); - newAgentState = 'disconnected'; - } else { - const agentParticipantAttributes = this.agentParticipant.attributes; - const connectionState = this.room.state; - - if (connectionState === ConnectionState.Disconnected) { - newAgentState = 'disconnected'; - } else if ( - connectionState === ConnectionState.Connecting || - !this.agentParticipant || - !agentParticipantAttributes?.[stateAttribute] - ) { - newAgentState = 'connecting'; - } else { - newAgentState = agentParticipantAttributes[stateAttribute] as AgentState; - } - } - console.log('!! STATE:', newAgentState, this.agentParticipant?.attributes); - - if (this.state !== newAgentState) { - this.state = newAgentState; - this.emit(AgentSessionEvent.AgentStateChanged, newAgentState); - } + get state() { + return this.agentParticipant?.state ?? 
'disconnected'; } get isAvailable() { From b192a46266bc852cc4d0e5b6ffe62959b01b9e8c Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 10:09:07 -0400 Subject: [PATCH 22/51] feat: fix useAgentLocalParticipant local participant mic track not showing up in agent starter app --- agent-sdk/index.tsx | 39 +++++++++++++------ .../hooks/use-agent-control-bar.ts | 18 ++------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index d6092d6f9..eee9ceb4a 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -5,6 +5,7 @@ import { Participant, ParticipantEvent, Track, + TrackPublication, // TextStreamInfo, } from "livekit-client"; import { TrackReference, trackSourceToProtocol } from "@/agent-sdk/external-deps/components-js"; @@ -160,8 +161,8 @@ export function useAgentLocalParticipant() { const agentSession = useAgentSession(); const [localParticipant, setLocalParticipant] = React.useState(agentSession.localParticipant); - const [microphoneTrack, setMicrophoneTrack] = React.useState(null); - const [cameraTrack, setCameraTrack] = React.useState(null); + const [microphoneTrackPublication, setMicrophoneTrackPublication] = React.useState(null); + const [cameraTrackPublication, setCameraTrackPublication] = React.useState(null); const [permissions, setPermissions] = React.useState(null); useParticipantEvents(agentSession.localParticipant, [ @@ -183,17 +184,9 @@ export function useAgentLocalParticipant() { // FIXME: is the rest of this stuff needed? // const { isMicrophoneEnabled, isCameraEnabled, isScreenShareEnabled } = p; const microphoneTrack = agentSession.localParticipant.getTrackPublication(Track.Source.Microphone); - setMicrophoneTrack(microphoneTrack ? { - source: Track.Source.Microphone, - participant: localParticipant, - publication: microphoneTrack, - } : null); + setMicrophoneTrackPublication(microphoneTrack ?? 
null); const cameraTrack = agentSession.localParticipant.getTrackPublication(Track.Source.Camera); - setCameraTrack(cameraTrack ? { - source: Track.Source.Camera, - participant: localParticipant, - publication: cameraTrack, - } : null); + setCameraTrackPublication(cameraTrack ?? null); // const participantMedia: ParticipantMedia = { // isCameraEnabled, // isMicrophoneEnabled, @@ -222,6 +215,28 @@ export function useAgentLocalParticipant() { }; }, [permissions]); + const microphoneTrack: TrackReference | null = React.useMemo(() => { + if (!microphoneTrackPublication) { + return null; + } + return { + participant: localParticipant, + source: Track.Source.Microphone, + publication: microphoneTrackPublication, + }; + }, [localParticipant, microphoneTrackPublication]); + + const cameraTrack: TrackReference | null = React.useMemo(() => { + if (!cameraTrackPublication) { + return null; + } + return { + participant: localParticipant, + source: Track.Source.Camera, + publication: cameraTrackPublication, + }; + }, [localParticipant, cameraTrackPublication]); + return { localParticipant, microphoneTrack, diff --git a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts index f928a7db2..c2734c3bb 100644 --- a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts +++ b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts @@ -41,12 +41,8 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen leave: true, ...controls, }; - // const { microphoneTrack, localParticipant } = useLocalParticipant(); // FIXME: replace with agent alternative - const { - microphoneTrack, - localParticipant, - publishPermissions, - } = useAgentLocalParticipant(); + // const { microphoneTrack, /* localParticipant */ } = useLocalParticipant(); // FIXME: replace with agent alternative + const { microphoneTrack, publishPermissions } = useAgentLocalParticipant(); // 
const publishPermissions = usePublishPermissions(); // FIXME: replace with agent alternative const room = useRoomContext(); @@ -63,14 +59,6 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.ScreenShare, error }), }); - const micTrackRef = React.useMemo(() => { - return { - participant: localParticipant, - source: Track.Source.Microphone, - publication: microphoneTrack, - }; - }, [localParticipant, microphoneTrack]); - visibleControls.microphone ??= publishPermissions.microphone; visibleControls.screenShare ??= publishPermissions.screenShare; visibleControls.camera ??= publishPermissions.camera; @@ -137,7 +125,7 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen ); return { - micTrackRef, + micTrackRef: microphoneTrack, visibleControls, cameraToggle: { ...cameraToggle, From 6b9dfc52eefa7061667d02e70a49b3a5e9a8cee3 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 10:12:59 -0400 Subject: [PATCH 23/51] feat: remove dead code from AgentParticipant --- agent-sdk/agent-session/AgentParticipant.ts | 44 +-------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/agent-sdk/agent-session/AgentParticipant.ts b/agent-sdk/agent-session/AgentParticipant.ts index 3fb355b2d..1b1ec9d72 100644 --- a/agent-sdk/agent-session/AgentParticipant.ts +++ b/agent-sdk/agent-session/AgentParticipant.ts @@ -1,6 +1,6 @@ import type TypedEventEmitter from 'typed-emitter'; import { EventEmitter } from "events"; -import { ConnectionState, ParticipantEvent, ParticipantKind, RemoteParticipant, Room, RoomEvent, Track, TranscriptionSegment } from 'livekit-client'; +import { ConnectionState, ParticipantEvent, ParticipantKind, RemoteParticipant, Room, RoomEvent, Track } from 'livekit-client'; import { getParticipantTrackRefs, participantTrackEvents, TrackReference } from '@/agent-sdk/external-deps/components-js'; import { 
ParticipantEventCallbacks } from '@/agent-sdk/external-deps/client-sdk-js'; @@ -19,7 +19,6 @@ export enum AgentParticipantEvent { AudioTrackChanged = 'videoTrackChanged', AgentAttributesChanged = 'agentAttributesChanged', AgentStateChanged = 'agentStateChanged', - // AgentTranscriptionsChanged = 'agentTranscriptionsChanged', } export type AgentParticipantCallbacks = { @@ -29,10 +28,7 @@ export type AgentParticipantCallbacks = { [AgentParticipantEvent.AgentStateChanged]: (newState: AgentState) => void; }; -// Goal: some sort of abstraction layer to provide information specific to the agent's interactions -// like video stream / audio stream / transcriptions / underlying participant attributes / etc, -// since it doesn't just come from one RemoteParticipant -// FIXME: maybe this could be named better? ... +/** Encapsulates all agent state / complexity */ export default class AgentParticipant extends (EventEmitter as new () => TypedEventEmitter) { private room: Room; state: AgentState = 'disconnected'; @@ -42,13 +38,8 @@ export default class AgentParticipant extends (EventEmitter as new () => TypedEv audioTrack: TrackReference | null = null; videoTrack: TrackReference | null = null; - audioTrackSyncTime: { timestamp: number, rtpTimestamp?: number } | null = null; - attributes: Record = {}; - transcriptions: Array = []; - transcriptionBufferSize: number = 100//TRACK_TRANSCRIPTION_DEFAULTS.bufferSize; - constructor(room: Room) { super(); this.room = room; @@ -136,17 +127,7 @@ export default class AgentParticipant extends (EventEmitter as new () => TypedEv this.workerTracks.find((t) => t.source === Track.Source.Microphone) ?? null ); if (this.audioTrack !== newAudioTrack) { - // console.log('!! 
audio track changed', this.audioTrack?.publication); - // this.audioTrack?.publication.off(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); this.audioTrack = newAudioTrack; - // this.audioTrack?.publication.on(TrackEvent.TranscriptionReceived, this.handleTranscriptionReceived); - - // this.audioTrackSyncTime = { - // timestamp: Date.now(), - // rtpTimestamp: this.audioTrack?.publication.track?.rtpTimestamp, - // }; - // this.audioTrack?.publication.track?.on(TrackEvent.TimeSyncUpdate, this.handleTimeSyncUpdate); - this.emit(AgentParticipantEvent.AudioTrackChanged, newAudioTrack); } }; @@ -180,27 +161,6 @@ export default class AgentParticipant extends (EventEmitter as new () => TypedEv } } - // private handleTranscriptionReceived = (segments: Array) => { - // console.log('!! TRANSCRIPTION', segments, this.audioTrackSyncTime); - // if (!this.audioTrackSyncTime) { - // throw new Error('AgentParticipant - audioTrackSyncTime missing'); - // } - // const audioTrackSyncTime = this.audioTrackSyncTime; - - // this.transcriptions = dedupeSegments( - // this.transcriptions, - // // when first receiving a segment, add the current media timestamp to it - // segments.map((s) => addMediaTimestampToTranscription(s, audioTrackSyncTime)), - // this.transcriptionBufferSize, - // ); - // this.emit(AgentParticipantEvent.AgentTranscriptionsChanged, this.transcriptions); - // } - - // private handleTimeSyncUpdate = (update: { timestamp: number; rtpTimestamp: number }) => { - // console.log('!! 
TIME SYNC UPDATE', update); - // this.audioTrackSyncTime = update; - // }; - private get roomRemoteParticipants() { return Array.from(this.room.remoteParticipants.values()); } From 58a5ad373eb0b30d447549c06bae87f218b338d4 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 10:16:45 -0400 Subject: [PATCH 24/51] feat: rename AgentParticipant to Agent --- .../{AgentParticipant.ts => Agent.ts} | 22 ++++++++-------- agent-sdk/agent-session/AgentSession.ts | 16 ++++++------ agent-sdk/index.tsx | 25 +++++++++---------- 3 files changed, 31 insertions(+), 32 deletions(-) rename agent-sdk/agent-session/{AgentParticipant.ts => Agent.ts} (87%) diff --git a/agent-sdk/agent-session/AgentParticipant.ts b/agent-sdk/agent-session/Agent.ts similarity index 87% rename from agent-sdk/agent-session/AgentParticipant.ts rename to agent-sdk/agent-session/Agent.ts index 1b1ec9d72..646b6fb83 100644 --- a/agent-sdk/agent-session/AgentParticipant.ts +++ b/agent-sdk/agent-session/Agent.ts @@ -14,22 +14,22 @@ export type AgentState = | 'thinking' | 'speaking'; -export enum AgentParticipantEvent { +export enum AgentEvent { VideoTrackChanged = 'videoTrackChanged', AudioTrackChanged = 'videoTrackChanged', AgentAttributesChanged = 'agentAttributesChanged', AgentStateChanged = 'agentStateChanged', } -export type AgentParticipantCallbacks = { - [AgentParticipantEvent.VideoTrackChanged]: (newTrack: TrackReference | null) => void; - [AgentParticipantEvent.AudioTrackChanged]: (newTrack: TrackReference | null) => void; - [AgentParticipantEvent.AgentAttributesChanged]: (newAttributes: Record) => void; - [AgentParticipantEvent.AgentStateChanged]: (newState: AgentState) => void; +export type AgentCallbacks = { + [AgentEvent.VideoTrackChanged]: (newTrack: TrackReference | null) => void; + [AgentEvent.AudioTrackChanged]: (newTrack: TrackReference | null) => void; + [AgentEvent.AgentAttributesChanged]: (newAttributes: Record) => void; + [AgentEvent.AgentStateChanged]: (newState: AgentState) => 
void; }; /** Encapsulates all agent state / complexity */ -export default class AgentParticipant extends (EventEmitter as new () => TypedEventEmitter) { +export default class Agent extends (EventEmitter as new () => TypedEventEmitter) { private room: Room; state: AgentState = 'disconnected'; @@ -119,7 +119,7 @@ export default class AgentParticipant extends (EventEmitter as new () => TypedEv ); if (this.videoTrack !== newVideoTrack) { this.videoTrack = newVideoTrack; - this.emit(AgentParticipantEvent.VideoTrackChanged, newVideoTrack); + this.emit(AgentEvent.VideoTrackChanged, newVideoTrack); } const newAudioTrack = ( @@ -128,13 +128,13 @@ export default class AgentParticipant extends (EventEmitter as new () => TypedEv ); if (this.audioTrack !== newAudioTrack) { this.audioTrack = newAudioTrack; - this.emit(AgentParticipantEvent.AudioTrackChanged, newAudioTrack); + this.emit(AgentEvent.AudioTrackChanged, newAudioTrack); } }; private handleAttributesChanged = (attributes: Record) => { this.attributes = attributes; - this.emit(AgentParticipantEvent.AgentAttributesChanged, attributes); + this.emit(AgentEvent.AgentAttributesChanged, attributes); this.updateAgentState(); }; @@ -157,7 +157,7 @@ export default class AgentParticipant extends (EventEmitter as new () => TypedEv if (this.state !== newAgentState) { this.state = newAgentState; - this.emit(AgentParticipantEvent.AgentStateChanged, newAgentState); + this.emit(AgentEvent.AgentStateChanged, newAgentState); } } diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index d1c856308..3e7534a80 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -15,7 +15,7 @@ import { type ReceivedMessageAggregatorOptions, ReceivedMessageAggregatorEvent, } from "./message"; -import AgentParticipant, { AgentParticipantEvent, AgentState } from './AgentParticipant'; +import Agent, { AgentEvent, AgentState } from './Agent'; export enum AgentSessionEvent 
{ AgentStateChanged = 'agentStateChanged', @@ -40,7 +40,7 @@ export type AgentSessionCallbacks = { export class AgentSession extends (EventEmitter as new () => TypedEventEmitter) { room: Room; // FIXME: should this be private? - agentParticipant: AgentParticipant | null = null; + agent: Agent | null = null; messageSender: MessageSender | null = null; messageReceiver: MessageReceiver | null = null; defaultAggregator: ReceivedMessageAggregator | null = null; @@ -71,8 +71,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { console.log('!! CONNECTED'); - this.agentParticipant = new AgentParticipant(this.room); - this.agentParticipant.on(AgentParticipantEvent.AgentStateChanged, this.handleAgentStateChanged); + this.agent = new Agent(this.room); + this.agent.on(AgentEvent.AgentStateChanged, this.handleAgentStateChanged); const chatMessageSender = new ChatMessageSender(this.localParticipant); this.messageSender = new CombinedMessageSender( @@ -101,9 +101,9 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { console.log('!! 
DISCONNECTED'); - this.agentParticipant?.off(AgentParticipantEvent.AgentStateChanged, this.handleAgentStateChanged); - this.agentParticipant?.teardown(); - this.agentParticipant = null; + this.agent?.off(AgentEvent.AgentStateChanged, this.handleAgentStateChanged); + this.agent?.teardown(); + this.agent = null; this.messageReceiver?.close(); this.messageReceiver = null; @@ -161,7 +161,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter( +export function useAgentEvent( eventName: EventName, - callback: AgentParticipantCallbacks[EventName], + callback: AgentCallbacks[EventName], dependencies: React.DependencyList, ) { const agentSession = useAgentSession(); @@ -100,16 +99,16 @@ export function useAgentParticipantEvent { - if (!agentSession.agentParticipant) { + if (!agentSession.agent) { return; } - const agentParticipant = agentSession.agentParticipant; - agentParticipant.on(eventName, memoizedCallback); + const agent = agentSession.agent; + agent.on(eventName, memoizedCallback); return () => { - agentParticipant.off(eventName, memoizedCallback); + agent.off(eventName, memoizedCallback); }; - }, [agentSession.agentParticipant, eventName, memoizedCallback]); + }, [agentSession.agent, eventName, memoizedCallback]); } export function useAgentState() { @@ -128,10 +127,10 @@ export function useAgentState() { export function useAgentTracks() { const agentSession = useAgentSession(); - const [audioTrack, setAudioTrack] = useState(agentSession.agentParticipant?.audioTrack ?? null); - useAgentParticipantEvent(AgentParticipantEvent.AudioTrackChanged, setAudioTrack, []); - const [videoTrack, setVideoTrack] = useState(agentSession.agentParticipant?.videoTrack ?? null); - useAgentParticipantEvent(AgentParticipantEvent.VideoTrackChanged, setVideoTrack, []); + const [audioTrack, setAudioTrack] = useState(agentSession.agent?.audioTrack ?? 
null); + useAgentEvent(AgentEvent.AudioTrackChanged, setAudioTrack, []); + const [videoTrack, setVideoTrack] = useState(agentSession.agent?.videoTrack ?? null); + useAgentEvent(AgentEvent.VideoTrackChanged, setVideoTrack, []); return { audioTrack, videoTrack }; } From d38ddf6610eac180768260db6df8db1cc1c5b155 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 10:29:22 -0400 Subject: [PATCH 25/51] docs: remove comments --- agent-sdk/agent-session/AgentSession.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 3e7534a80..c7c5e7b89 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -56,7 +56,6 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter Date: Mon, 11 Aug 2025 10:29:40 -0400 Subject: [PATCH 26/51] feat: add "ready" state to useAgentMessages --- agent-sdk/index.tsx | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index c3f9e0de9..93cc6d65b 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -36,9 +36,9 @@ export function useAgentSession() { export function useAgentMessages() { const agentSession = useAgentSession(); - const [messages, setMessages] = useState< - Array - >(agentSession.messages); + const [messagesState, setMessagesState] = useState< + Array | null + >(null); useEffect(() => { let aggregator: ReceivedMessageAggregator | null = null; @@ -46,11 +46,12 @@ export function useAgentMessages() { if (!aggregator) { return; } - setMessages(aggregator.toArray()) + setMessagesState(aggregator.toArray()); }; agentSession.createMessageAggregator({ startsAt: 'beginning' }).then(agg => { aggregator = agg; + setMessagesState(aggregator.toArray()); aggregator.on(ReceivedMessageAggregatorEvent.Updated, handleUpdated); }).catch(err => { // FIXME: how should this error be handled? 
@@ -60,6 +61,7 @@ export function useAgentMessages() { return () => { aggregator?.close(); aggregator?.off(ReceivedMessageAggregatorEvent.Updated, handleUpdated); + setMessagesState(null); }; }, [agentSession]); @@ -67,7 +69,15 @@ export function useAgentMessages() { return agentSession.sendMessage(message); }, [agentSession]); - return { messages, send }; + const { messages, ready } = useMemo(() => { + if (messagesState) { + return { messages: messagesState, ready: true }; + } else { + return { messages: [], ready: false }; + } + }, [messagesState]); + + return { ready, messages, send }; } export function useAgentSessionEvent( From 728258bb84f2bfccc49464af13b03e61b117a410 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 11:47:15 -0400 Subject: [PATCH 27/51] feat: add most of agent control bar behind the scenes logic into useAgentLocalParticipant --- agent-sdk/index.tsx | 186 ++++++++++++++++-- .../agent-control-bar/agent-control-bar.tsx | 4 +- .../hooks/use-agent-control-bar.ts | 178 +++++++++-------- 3 files changed, 271 insertions(+), 97 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index 93cc6d65b..89df4b9e1 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -1,11 +1,14 @@ import * as React from "react"; import { useContext, useEffect, useState, useCallback, useMemo } from "react"; import { + AudioCaptureOptions, Participant, ParticipantEvent, + ScreenShareCaptureOptions, Track, TrackPublication, - // TextStreamInfo, + TrackPublishOptions, + VideoCaptureOptions, } from "livekit-client"; import { TrackReference, trackSourceToProtocol } from "@/agent-sdk/external-deps/components-js"; import { ParticipantEventCallbacks } from "../node_modules/livekit-client/src/room/participant/Participant"; @@ -13,6 +16,7 @@ import { AgentSession, AgentSessionCallbacks, AgentSessionEvent } from "./agent- import { ReceivedMessage, ReceivedMessageAggregator, ReceivedMessageAggregatorEvent, SentMessage } from "./agent-session/message"; 
import { AgentCallbacks, AgentEvent } from "./agent-session/Agent"; import { ParticipantPermission } from "livekit-server-sdk"; +import { usePersistentUserChoices } from "@livekit/components-react"; // --------------------- // REACT @@ -166,12 +170,25 @@ function useParticipantEvents

void; + saveUserTrackEnabledChoices?: boolean; +}) { const agentSession = useAgentSession(); const [localParticipant, setLocalParticipant] = React.useState(agentSession.localParticipant); const [microphoneTrackPublication, setMicrophoneTrackPublication] = React.useState(null); + const [microphoneTrackEnabled, setMicrophoneTrackEnabled] = React.useState(false); + const [microphoneTrackPending, setMicrophoneTrackPending] = React.useState(false); + const [cameraTrackPublication, setCameraTrackPublication] = React.useState(null); + const [cameraTrackEnabled, setCameraTrackEnabled] = React.useState(false); + const [cameraTrackPending, setCameraTrackPending] = React.useState(false); + + const [screenShareTrackPublication, setScreenShareTrackPublication] = React.useState(null); + const [screenShareTrackEnabled, setScreenShareTrackEnabled] = React.useState(false); + const [screenShareTrackPending, setScreenShareTrackPending] = React.useState(false); + const [permissions, setPermissions] = React.useState(null); useParticipantEvents(agentSession.localParticipant, [ @@ -194,17 +211,15 @@ export function useAgentLocalParticipant() { // const { isMicrophoneEnabled, isCameraEnabled, isScreenShareEnabled } = p; const microphoneTrack = agentSession.localParticipant.getTrackPublication(Track.Source.Microphone); setMicrophoneTrackPublication(microphoneTrack ?? null); + setMicrophoneTrackEnabled(localParticipant.isMicrophoneEnabled); + const cameraTrack = agentSession.localParticipant.getTrackPublication(Track.Source.Camera); setCameraTrackPublication(cameraTrack ?? 
null); - // const participantMedia: ParticipantMedia = { - // isCameraEnabled, - // isMicrophoneEnabled, - // isScreenShareEnabled, - // cameraTrack, - // microphoneTrack, - // participant: p, - // }; - // return participantMedia; + setCameraTrackEnabled(localParticipant.isCameraEnabled); + + const screenShareTrack = agentSession.localParticipant.getTrackPublication(Track.Source.ScreenShare); + setScreenShareTrackPublication(screenShareTrack ?? null); + setScreenShareTrackEnabled(localParticipant.isScreenShareEnabled); }, []); const publishPermissions = useMemo(() => { @@ -246,11 +261,156 @@ export function useAgentLocalParticipant() { }; }, [localParticipant, cameraTrackPublication]); + const screenShareTrack: TrackReference | null = React.useMemo(() => { + if (!screenShareTrackPublication) { + return null; + } + return { + participant: localParticipant, + source: Track.Source.ScreenShare, + publication: screenShareTrackPublication, + }; + }, [localParticipant, screenShareTrackPublication]); + + const { + saveAudioInputEnabled, + saveAudioInputDeviceId, + saveVideoInputEnabled, + saveVideoInputDeviceId, + } = usePersistentUserChoices({ // FIXME: replace with agent alternative + preventSave: !options?.saveUserTrackEnabledChoices, + }); + + const setMicrophoneEnabled = useCallback(async ( + enabled: boolean, + captureOptions?: AudioCaptureOptions, + publishOptions?: TrackPublishOptions, + ) => { + setMicrophoneTrackPending(true); + try { + await localParticipant.setMicrophoneEnabled( + enabled, + captureOptions, + publishOptions, + ); + saveAudioInputEnabled(enabled); + setMicrophoneTrackEnabled(enabled); + return localParticipant.isMicrophoneEnabled; + } catch (e) { + if (options?.onDeviceError && e instanceof Error) { + options?.onDeviceError(e, Track.Source.Microphone); + return; + } else { + throw e; + } + } finally { + setMicrophoneTrackPending(false); + } + }, [options?.onDeviceError, setMicrophoneTrackPending, saveAudioInputEnabled, 
setMicrophoneTrackEnabled]); + + const setCameraEnabled = useCallback(async ( + enabled: boolean, + captureOptions?: VideoCaptureOptions, + publishOptions?: TrackPublishOptions, + ) => { + setCameraTrackPending(true); + try { + await localParticipant.setCameraEnabled( + enabled, + captureOptions, + publishOptions, + ); + saveVideoInputEnabled(enabled); + setCameraTrackEnabled(enabled); + return localParticipant.isMicrophoneEnabled; + } catch (e) { + if (options?.onDeviceError && e instanceof Error) { + options?.onDeviceError(e, Track.Source.Camera); + return; + } else { + throw e; + } + } finally { + setCameraTrackPending(false); + } + }, [options?.onDeviceError, setCameraTrackPending, saveVideoInputEnabled, setCameraTrackEnabled]); + + const setScreenShareEnabled = useCallback(async ( + enabled: boolean, + captureOptions?: ScreenShareCaptureOptions, + publishOptions?: TrackPublishOptions, + ) => { + setScreenShareTrackPending(true); + try { + await localParticipant.setScreenShareEnabled( + enabled, + captureOptions, + publishOptions, + ); + setScreenShareEnabled(enabled); + return localParticipant.isMicrophoneEnabled; + } catch (e) { + if (options?.onDeviceError && e instanceof Error) { + options?.onDeviceError(e, Track.Source.ScreenShare); + return; + } else { + throw e; + } + } finally { + setScreenShareTrackPending(false); + } + }, [options?.onDeviceError, setScreenShareTrackPending, setScreenShareTrackEnabled]); + + const changeAudioDevice = useCallback( + (deviceId: string) => { + saveAudioInputDeviceId(deviceId ?? 'default'); + }, + [saveAudioInputDeviceId] + ); + + const changeVideoDevice = useCallback( + (deviceId: string) => { + saveVideoInputDeviceId(deviceId ?? 
'default'); + }, + [saveVideoInputDeviceId] + ); + return { localParticipant, - microphoneTrack, - cameraTrack, publishPermissions, + + microphone: { + track: microphoneTrack, + enabled: microphoneTrackEnabled, + pending: microphoneTrackPending, + set: setMicrophoneEnabled, + toggle: useCallback(( + captureOptions?: AudioCaptureOptions, + publishOptions?: TrackPublishOptions + ) => setMicrophoneEnabled(!microphoneTrackEnabled, captureOptions, publishOptions), [microphoneTrackEnabled, setMicrophoneEnabled]), + changeDevice: changeAudioDevice, + }, + camera: { + track: cameraTrack, + enabled: cameraTrackEnabled, + pending: cameraTrackPending, + set: setCameraEnabled, + toggle: useCallback(( + captureOptions?: VideoCaptureOptions, + publishOptions?: TrackPublishOptions + ) => setCameraEnabled(!cameraTrackEnabled, captureOptions, publishOptions), [cameraTrackEnabled, setCameraEnabled]), + changeDevice: changeVideoDevice, + }, + screenShare: { + track: screenShareTrack, + enabled: screenShareTrackEnabled, + pending: screenShareTrackPending, + set: setScreenShareEnabled, + toggle: useCallback(( + captureOptions?: ScreenShareCaptureOptions, + publishOptions?: TrackPublishOptions + ) => setScreenShareEnabled(!screenShareTrackEnabled, captureOptions, publishOptions), [screenShareTrackEnabled, setScreenShareEnabled]), + }, }; } diff --git a/components/livekit/agent-control-bar/agent-control-bar.tsx b/components/livekit/agent-control-bar/agent-control-bar.tsx index 3127f0c88..f39bdcc1e 100644 --- a/components/livekit/agent-control-bar/agent-control-bar.tsx +++ b/components/livekit/agent-control-bar/agent-control-bar.tsx @@ -3,7 +3,7 @@ import * as React from 'react'; import { useCallback } from 'react'; import { Track } from 'livekit-client'; -import { BarVisualizer, useRemoteParticipants } from '@livekit/components-react'; +import { BarVisualizer, /* useRemoteParticipants */ } from '@livekit/components-react'; import { ChatTextIcon, PhoneDisconnectIcon } from 
'@phosphor-icons/react/dist/ssr'; import { ChatInput } from '@/components/livekit/chat/chat-input'; import { Button } from '@/components/ui/button'; @@ -58,7 +58,7 @@ export function AgentControlBar({ handleAudioDeviceChange, handleVideoDeviceChange, handleDisconnect, - } = useAgentControlBar({ // FIXME: replace with agent alternative + } = useAgentControlBar({ controls, saveUserChoices, }); diff --git a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts index c2734c3bb..f9d56522c 100644 --- a/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts +++ b/components/livekit/agent-control-bar/hooks/use-agent-control-bar.ts @@ -1,4 +1,5 @@ import * as React from 'react'; +import { useCallback } from 'react'; import { Track } from 'livekit-client'; import { type TrackReferenceOrPlaceholder, @@ -8,7 +9,7 @@ import { useTrackToggle, } from '@livekit/components-react'; import { usePublishPermissions } from './use-publish-permissions'; -import { useAgentLocalParticipant } from '@/agent-sdk'; +import { useAgentLocalParticipant, useAgentSession } from '@/agent-sdk'; export interface ControlBarControls { microphone?: boolean; @@ -42,105 +43,118 @@ export function useAgentControlBar(props: UseAgentControlBarProps = {}): UseAgen ...controls, }; // const { microphoneTrack, /* localParticipant */ } = useLocalParticipant(); // FIXME: replace with agent alternative - const { microphoneTrack, publishPermissions } = useAgentLocalParticipant(); + const { + publishPermissions, + microphone, + camera, + screenShare, + } = useAgentLocalParticipant({ + onDeviceError: useCallback((error: Error, source: Track.Source) => props.onDeviceError?.({ source, error }), [props.onDeviceError]), + saveUserTrackEnabledChoices: saveUserChoices, + }); // const publishPermissions = usePublishPermissions(); // FIXME: replace with agent alternative - const room = useRoomContext(); + // const room = 
useRoomContext(); + const agentSession = useAgentSession(); - const microphoneToggle = useTrackToggle({ // FIXME: replace with agent alternative - source: Track.Source.Microphone, - onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Microphone, error }), - }); - const cameraToggle = useTrackToggle({ // FIXME: replace with agent alternative - source: Track.Source.Camera, - onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Camera, error }), - }); - const screenShareToggle = useTrackToggle({ // FIXME: replace with agent alternative - source: Track.Source.ScreenShare, - onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.ScreenShare, error }), - }); + // const microphoneToggle = useTrackToggle({ // FIXME: replace with agent alternative + // source: Track.Source.Microphone, + // onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Microphone, error }), + // }); + // const cameraToggle = useTrackToggle({ // FIXME: replace with agent alternative + // source: Track.Source.Camera, + // onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.Camera, error }), + // }); + // const screenShareToggle = useTrackToggle({ // FIXME: replace with agent alternative + // source: Track.Source.ScreenShare, + // onDeviceError: (error) => props.onDeviceError?.({ source: Track.Source.ScreenShare, error }), + // }); visibleControls.microphone ??= publishPermissions.microphone; visibleControls.screenShare ??= publishPermissions.screenShare; visibleControls.camera ??= publishPermissions.camera; visibleControls.chat ??= publishPermissions.data; - const { - saveAudioInputEnabled, - saveAudioInputDeviceId, - saveVideoInputEnabled, - saveVideoInputDeviceId, - } = usePersistentUserChoices({ // FIXME: replace with agent alternative - preventSave: !saveUserChoices, - }); + // const { + // saveAudioInputEnabled, + // saveAudioInputDeviceId, + // saveVideoInputEnabled, + // saveVideoInputDeviceId, + // } = 
usePersistentUserChoices({ // FIXME: replace with agent alternative + // preventSave: !saveUserChoices, + // }); const handleDisconnect = React.useCallback(async () => { - if (room) { - await room.disconnect(); - } - }, [room]); + // if (room) { + // await room.disconnect(); + // } + await agentSession?.disconnect() + }, [/* room */, agentSession]); - const handleAudioDeviceChange = React.useCallback( - (deviceId: string) => { - saveAudioInputDeviceId(deviceId ?? 'default'); - }, - [saveAudioInputDeviceId] - ); + // const handleAudioDeviceChange = React.useCallback( + // (deviceId: string) => { + // saveAudioInputDeviceId(deviceId ?? 'default'); + // }, + // [saveAudioInputDeviceId] + // ); - const handleVideoDeviceChange = React.useCallback( - (deviceId: string) => { - saveVideoInputDeviceId(deviceId ?? 'default'); - }, - [saveVideoInputDeviceId] - ); + // const handleVideoDeviceChange = React.useCallback( + // (deviceId: string) => { + // saveVideoInputDeviceId(deviceId ?? 'default'); + // }, + // [saveVideoInputDeviceId] + // ); - const handleToggleCamera = React.useCallback( - async (enabled?: boolean) => { - if (screenShareToggle.enabled) { - screenShareToggle.toggle(false); - } - await cameraToggle.toggle(enabled); - // persist video input enabled preference - saveVideoInputEnabled(!cameraToggle.enabled); - }, - [cameraToggle.enabled, screenShareToggle.enabled] - ); + // const handleToggleCamera = React.useCallback( + // async (enabled?: boolean) => { + // if (screenShareToggle.enabled) { + // screenShareToggle.toggle(false); + // } + // await cameraToggle.toggle(enabled); + // // persist video input enabled preference + // saveVideoInputEnabled(!cameraToggle.enabled); + // }, + // [cameraToggle.enabled, screenShareToggle.enabled] + // ); - const handleToggleMicrophone = React.useCallback( - async (enabled?: boolean) => { - await microphoneToggle.toggle(enabled); - // persist audio input enabled preference - saveAudioInputEnabled(!microphoneToggle.enabled); - 
}, - [microphoneToggle.enabled] - ); + // const handleToggleMicrophone = React.useCallback( + // async (enabled?: boolean) => { + // await microphoneToggle.toggle(enabled); + // // persist audio input enabled preference + // saveAudioInputEnabled(!microphoneToggle.enabled); + // }, + // [microphoneToggle.enabled] + // ); - const handleToggleScreenShare = React.useCallback( - async (enabled?: boolean) => { - if (cameraToggle.enabled) { - cameraToggle.toggle(false); - } - await screenShareToggle.toggle(enabled); - }, - [screenShareToggle.enabled, cameraToggle.enabled] - ); + // const handleToggleScreenShare = React.useCallback( + // async (enabled?: boolean) => { + // if (cameraToggle.enabled) { + // cameraToggle.toggle(false); + // } + // await screenShareToggle.toggle(enabled); + // }, + // [screenShareToggle.enabled, cameraToggle.enabled] + // ); return { - micTrackRef: microphoneTrack, + micTrackRef: microphone.track, visibleControls, - cameraToggle: { - ...cameraToggle, - toggle: handleToggleCamera, - }, - microphoneToggle: { - ...microphoneToggle, - toggle: handleToggleMicrophone, - }, - screenShareToggle: { - ...screenShareToggle, - toggle: handleToggleScreenShare, - }, + cameraToggle: { ...camera, buttonProps: {} }, + microphoneToggle: { ...microphone, buttonProps: {} }, + screenShareToggle: { ...screenShare, buttonProps: {} }, + // cameraToggle: { + // ...cameraToggle, + // toggle: handleToggleCamera, + // }, + // microphoneToggle: { + // ...microphoneToggle, + // toggle: handleToggleMicrophone, + // }, + // screenShareToggle: { + // ...screenShareToggle, + // toggle: handleToggleScreenShare, + // }, handleDisconnect, - handleAudioDeviceChange, - handleVideoDeviceChange, + handleAudioDeviceChange: microphone.changeDevice, + handleVideoDeviceChange: camera.changeDevice, }; } From 0d312e3fb0e390971d1e232d49c7718a71a8c974 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 11:47:58 -0400 Subject: [PATCH 28/51] feat: port useDebug to use AgentSession 
--- hooks/useDebug.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hooks/useDebug.ts b/hooks/useDebug.ts index 7e69dab9a..149e5a616 100644 --- a/hooks/useDebug.ts +++ b/hooks/useDebug.ts @@ -1,9 +1,11 @@ import * as React from 'react'; import { LogLevel, setLogLevel } from 'livekit-client'; -import { useRoomContext } from '@livekit/components-react'; +// import { useRoomContext } from '@livekit/components-react'; +import { useAgentSession } from '@/agent-sdk'; export const useDebugMode = ({ logLevel }: { logLevel?: LogLevel } = {}) => { - const room = useRoomContext(); + // const room = useRoomContext(); + const room = useAgentSession().room; React.useEffect(() => { setLogLevel(logLevel ?? 'debug'); From 22282c26fbae11b68a87b31f1464fff1fc45ee45 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 11:49:30 -0400 Subject: [PATCH 29/51] fix: agent timeout should disconnect whole AgentSession --- agent-sdk/agent-session/AgentSession.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index c7c5e7b89..e6faf02d3 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -130,7 +130,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter Date: Mon, 11 Aug 2025 11:59:18 -0400 Subject: [PATCH 30/51] refactor: update docs --- agent-sdk/agent-session/Agent.ts | 6 ++++-- .../agent-session/message/ReceivedMessageAggregator.ts | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/agent-sdk/agent-session/Agent.ts b/agent-sdk/agent-session/Agent.ts index 646b6fb83..45d2e5cbb 100644 --- a/agent-sdk/agent-session/Agent.ts +++ b/agent-sdk/agent-session/Agent.ts @@ -28,13 +28,15 @@ export type AgentCallbacks = { [AgentEvent.AgentStateChanged]: (newState: AgentState) => void; }; -/** Encapsulates all agent state / complexity */ +/** + * Agent encapculates all 
agent state, normalizing some quirks around how LiveKit Agents work. + */ export default class Agent extends (EventEmitter as new () => TypedEventEmitter) { private room: Room; state: AgentState = 'disconnected'; private agentParticipant: RemoteParticipant | null = null; - private workerParticipant: RemoteParticipant | null = null; + private workerParticipant: RemoteParticipant | null = null; // ref: https://docs.livekit.io/agents/integrations/avatar/#avatar-workers audioTrack: TrackReference | null = null; videoTrack: TrackReference | null = null; diff --git a/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts b/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts index 38e7fac56..625fc587b 100644 --- a/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts +++ b/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts @@ -68,10 +68,10 @@ export default class ReceivedMessageAggregator } // FIXME: think through this scenario: - // 1. Message a is upserted - // 2. `options.bufferSize` messages are upserted, evicting message a - // 3. Another message a upsert happens, should this somehow get rejected (via bloom filter / etc?) - // or just end up in the list again as a seemingly brand new message? + // 1. Message `a` is upserted + // 2. `options.bufferSize` messages are upserted, evicting message `a` + // 3. Another message `a` upsert happens, should this somehow get rejected (via bloom filter / etc?) + // or just end up in the list again as a seemingly brand new message? 
for (const message of messages) { this.messageById.set(message.id, message); if (!this.messageIds.includes(message.id)) { From b2602b8c623010b14496380c5660011fb146614b Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 11 Aug 2025 11:59:39 -0400 Subject: [PATCH 31/51] feat: add MessageReceived event, aggregation should be the responsibility of the ReceivedMessageAggregator --- agent-sdk/agent-session/AgentSession.ts | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index e6faf02d3..c6e262b3e 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -20,14 +20,14 @@ import Agent, { AgentEvent, AgentState } from './Agent'; export enum AgentSessionEvent { AgentStateChanged = 'agentStateChanged', AgentAttributesChanged = 'agentAttributesChanged', - MessagesChanged = 'messagesChanged', + MessageReceived = 'messageReceived', AgentConnectionFailure = 'agentConnectionFailure', AudioPlaybackStatusChanged = 'AudioPlaybackStatusChanged', } export type AgentSessionCallbacks = { [AgentSessionEvent.AgentStateChanged]: (newAgentState: AgentState) => void; - [AgentSessionEvent.MessagesChanged]: (newMessages: Array) => void; + [AgentSessionEvent.MessageReceived]: (newMessage: ReceivedMessage) => void; [AgentSessionEvent.AgentConnectionFailure]: (reason: string) => void; [AgentSessionEvent.AudioPlaybackStatusChanged]: (audioPlaybackPermitted: boolean) => void; }; @@ -156,7 +156,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter Date: Mon, 11 Aug 2025 12:00:25 -0400 Subject: [PATCH 32/51] refactor: comment out dead code --- components/session-view.tsx | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/components/session-view.tsx b/components/session-view.tsx index 19c121de3..b818e55c5 100644 --- a/components/session-view.tsx +++ 
b/components/session-view.tsx @@ -1,27 +1,27 @@ 'use client'; -import React, { useEffect, useState } from 'react'; +import React, { useState } from 'react'; import { AnimatePresence, motion } from 'motion/react'; -import { - type AgentState, - type ReceivedChatMessage, - useRoomContext, - useVoiceAssistant, -} from '@livekit/components-react'; +// import { +// type AgentState, +// type ReceivedChatMessage, +// useRoomContext, +// useVoiceAssistant, +// } from '@livekit/components-react'; import { toastAlert } from '@/components/alert-toast'; import { AgentControlBar } from '@/components/livekit/agent-control-bar/agent-control-bar'; import { ChatEntry } from '@/components/livekit/chat/chat-entry'; import { ChatMessageView } from '@/components/livekit/chat/chat-message-view'; import { MediaTiles } from '@/components/livekit/media-tiles'; -import useChatAndTranscription from '@/hooks/useChatAndTranscription'; +// import useChatAndTranscription from '@/hooks/useChatAndTranscription'; import { useDebugMode } from '@/hooks/useDebug'; import type { AppConfig } from '@/lib/types'; import { cn } from '@/lib/utils'; -import { AgentSessionEvent, useAgentMessages, useAgentSession, useAgentSessionEvent, useAgentState } from '@/agent-sdk'; +import { AgentSessionEvent, useAgentMessages, useAgentSessionEvent } from '@/agent-sdk'; -function isAgentAvailable(agentState: AgentState) { - return agentState == 'listening' || agentState == 'thinking' || agentState == 'speaking'; -} +// function isAgentAvailable(agentState: AgentState) { +// return agentState == 'listening' || agentState == 'thinking' || agentState == 'speaking'; +// } interface SessionViewProps { appConfig: AppConfig; @@ -35,7 +35,6 @@ export const SessionView = ({ sessionStarted, ref, }: React.ComponentProps<'div'> & SessionViewProps) => { - const agentSession = useAgentSession(); const { messages, send } = useAgentMessages(); // const { state: agentState } = useVoiceAssistant(); From 
d24be87e52b314451dacda46ff6250003b02a392 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 14 Aug 2025 10:50:13 -0400 Subject: [PATCH 33/51] fix: remove SentMessage from ChatEntryProps ref: https://github.com/livekit-examples/agent-starter-react/pull/237#discussion_r2275657498 --- components/livekit/chat/chat-entry.tsx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/components/livekit/chat/chat-entry.tsx b/components/livekit/chat/chat-entry.tsx index 30b83b484..270e55814 100644 --- a/components/livekit/chat/chat-entry.tsx +++ b/components/livekit/chat/chat-entry.tsx @@ -1,12 +1,11 @@ import * as React from 'react'; -import type { MessageFormatter, ReceivedChatMessage } from '@livekit/components-react'; +import type { MessageFormatter } from '@livekit/components-react'; import { cn } from '@/lib/utils'; -import { useChatMessage } from './hooks/utils'; -import { ReceivedMessage, SentMessage } from '@/agent-sdk'; +import { ReceivedMessage } from '@/agent-sdk/agent-session/message'; export interface ChatEntryProps extends React.HTMLAttributes { /** The chat massage object to display. */ - entry: ReceivedMessage | SentMessage; + entry: ReceivedMessage; /** Hide sender name. Useful when displaying multiple consecutive chat messages from the same person. */ hideName?: boolean; /** Hide message timestamp. 
*/ From 8f6301b1c8d54dd638e5b8eab83c48cc001e53e9 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 14 Aug 2025 12:37:09 -0400 Subject: [PATCH 34/51] feat: add special case for sendMessage string -> SentChatMessage --- agent-sdk/agent-session/AgentSession.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index c6e262b3e..531176271 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -265,11 +265,17 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter void) => void | undefined; From 00674f4a8fb26e5f634724280cd9144a7547cc6d Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Thu, 14 Aug 2025 15:45:11 -0400 Subject: [PATCH 35/51] feat: add ConnectionDetailsProvider to further abstract generating downstream tokens This also facilitates running room.prepareConnection in the constructor --- agent-sdk/agent-session/AgentSession.ts | 23 ++++- .../ConnectionCredentialsProvider.ts | 99 +++++++++++++++++++ components/app.tsx | 17 ++-- hooks/useConnectionDetails.ts | 38 ++++--- package.json | 1 + pnpm-lock.yaml | 17 ++++ 6 files changed, 166 insertions(+), 29 deletions(-) create mode 100644 agent-sdk/agent-session/ConnectionCredentialsProvider.ts diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 531176271..db391830e 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -16,6 +16,7 @@ import { ReceivedMessageAggregatorEvent, } from "./message"; import Agent, { AgentEvent, AgentState } from './Agent'; +import { ConnectionCredentialsProvider } from './ConnectionCredentialsProvider'; export enum AgentSessionEvent { AgentStateChanged = 'agentStateChanged', @@ -46,18 +47,29 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter | null = null; aggregators: Array> | null = null; 
- constructor() { + private connectionCredentialsProvider: ConnectionCredentialsProvider; + + constructor(provider: ConnectionCredentialsProvider) { super(); + this.connectionCredentialsProvider = provider; this.room = new Room(); this.room.on(RoomEvent.Connected, this.handleRoomConnected); this.room.on(RoomEvent.Disconnected, this.handleRoomDisconnected); this.room.on(RoomEvent.AudioPlaybackStatusChanged, this.handleAudioPlaybackStatusChanged); + + this.prepareConnection().catch(err => { + // FIXME: figure out a better logging solution? + console.warn('WARNING: Room.prepareConnection failed:', err); + }); } - async connect(url: string, token: string) { + async connect() { + // await this.waitUntilRoomDisconnected() await Promise.all([ - this.room.connect(url, token), + this.connectionCredentialsProvider.generate().then(connection => ( + this.room.connect(connection.serverUrl, connection.participantToken) + )), // FIXME: make it so the preconenct buffer thing can be disabled? this.room.localParticipant.setMicrophoneEnabled(true, undefined, { preConnectBuffer: true }), ]); @@ -68,6 +80,11 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { console.log('!! CONNECTED'); this.agent = new Agent(this.room); diff --git a/agent-sdk/agent-session/ConnectionCredentialsProvider.ts b/agent-sdk/agent-session/ConnectionCredentialsProvider.ts new file mode 100644 index 000000000..d3c2bb4c7 --- /dev/null +++ b/agent-sdk/agent-session/ConnectionCredentialsProvider.ts @@ -0,0 +1,99 @@ +import { decodeJwt } from 'jose'; + +import { ConnectionDetails } from "@/app/api/connection-details/route"; + +const ONE_MINUTE_IN_MILLISECONDS = 60 * 1000; + +/** + * The ConnectionDetailsProvider handles getting credentials for connecting to a new Room, caching + * the last result and using it until it expires. 
*/ +export abstract class ConnectionCredentialsProvider { + private cachedConnectionDetails: ConnectionDetails | null = null; + + private isCachedConnectionDetailsExpired() { + const token = this.cachedConnectionDetails?.participantToken; + if (!token) { + return true; + } + + const jwtPayload = decodeJwt(token); + if (!jwtPayload.exp) { + return true; + } + const expiresAt = new Date(jwtPayload.exp - ONE_MINUTE_IN_MILLISECONDS); + + const now = new Date(); + return expiresAt >= now; + } + + async generate() { + if (this.isCachedConnectionDetailsExpired()) { + this.refresh(); + } + + return this.cachedConnectionDetails!; + } + + async refresh() { + this.cachedConnectionDetails = await this.fetch(); + } + + protected abstract fetch(): Promise; +}; + +export class ManualConnectionCredentialsProvider extends ConnectionCredentialsProvider { + protected fetch: () => Promise; + + constructor(handler: () => Promise) { + super(); + this.fetch = handler; + } +} + + +type SandboxConnectionCredentialsProviderOptions = { + sandboxId: string; + baseUrl?: string; + + /** The name of the room to join. If omitted, a random new room name will be generated instead. */ + roomName?: string; + + /** The identity of the participant the token should connect as connect as. If omitted, a random + * identity will be used instead. */ + participantName?: string; +}; + +export class SandboxConnectionCredentialsProvider extends ConnectionCredentialsProvider { + protected options: SandboxConnectionCredentialsProviderOptions; + + constructor(options: SandboxConnectionCredentialsProviderOptions) { + super(); + this.options = options; + + if (process.env.NODE_ENV === 'production') { + // FIXME: figure out a better logging solution? + console.warn('WARNING: SandboxConnectionCredentialsProvider is meant for development, and is not security hardened. In production, implement your own token generation solution.'); + } + } + + async fetch() { + const baseUrl = this.options.baseUrl ?? 
"https://cloud-api.livekit.io"; + const response = await fetch(`${baseUrl}/api/sandbox/connection-details`, { + method: "POST", + headers: { + "X-Sandbox-ID": this.options.sandboxId, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + roomName: this.options.roomName, + participantName: this.options.participantName, + }), + }); + + if (!response.ok) { + throw new Error(`Error generting token from sandbox token server: ${response.status} ${await response.text()}`); + } + + return response.json(); + } +} diff --git a/components/app.tsx b/components/app.tsx index 7b65e3463..2069c41d8 100644 --- a/components/app.tsx +++ b/components/app.tsx @@ -20,14 +20,14 @@ interface AppProps { } export function App({ appConfig }: AppProps) { - const agentSession = useMemo(() => new AgentSession(), []); + const { connectionDetailsProvider } = useConnectionDetails(); + const agentSession = useMemo(() => new AgentSession(connectionDetailsProvider), [connectionDetailsProvider]); const [sessionStarted, setSessionStarted] = useState(false); - const { connectionDetails, refreshConnectionDetails } = useConnectionDetails(); useEffect(() => { const onDisconnected = () => { setSessionStarted(false); - refreshConnectionDetails(); + connectionDetailsProvider.refresh(); }; const onMediaDevicesError = (error: Error) => { toastAlert({ @@ -41,15 +41,12 @@ export function App({ appConfig }: AppProps) { agentSession.room.off(RoomEvent.Disconnected, onDisconnected); agentSession.room.off(RoomEvent.MediaDevicesError, onMediaDevicesError); }; - }, [agentSession, refreshConnectionDetails]); + }, [agentSession, connectionDetailsProvider.refresh]); useEffect(() => { let aborted = false; - if (sessionStarted && agentSession.state === 'disconnected' && connectionDetails) { - agentSession.connect( - connectionDetails.serverUrl, - connectionDetails.participantToken, - ).catch((error) => { + if (sessionStarted && agentSession.state === 'disconnected') { + agentSession.connect().catch((error) => 
{ if (aborted) { // Once the effect has cleaned up after itself, drop any errors // @@ -69,7 +66,7 @@ export function App({ appConfig }: AppProps) { aborted = true; agentSession.disconnect(); }; - }, [agentSession, sessionStarted, connectionDetails /* , appConfig.isPreConnectBufferEnabled */]); + }, [agentSession, sessionStarted /* , appConfig.isPreConnectBufferEnabled */]); const { startButtonText } = appConfig; diff --git a/hooks/useConnectionDetails.ts b/hooks/useConnectionDetails.ts index 521d8ce50..b90c57c8e 100644 --- a/hooks/useConnectionDetails.ts +++ b/hooks/useConnectionDetails.ts @@ -1,5 +1,6 @@ -import { useCallback, useEffect, useState } from 'react'; +import { useCallback, useEffect, useMemo } from 'react'; import { ConnectionDetails } from '@/app/api/connection-details/route'; +import { ManualConnectionCredentialsProvider } from '@/agent-sdk/agent-session/ConnectionCredentialsProvider'; export default function useConnectionDetails() { // Generate room connection details, including: @@ -11,27 +12,32 @@ export default function useConnectionDetails() { // In real-world application, you would likely allow the user to specify their // own participant name, and possibly to choose from existing rooms to join. - const [connectionDetails, setConnectionDetails] = useState(null); - - const fetchConnectionDetails = useCallback(() => { - setConnectionDetails(null); + const fetchConnectionDetails = useCallback(async () => { const url = new URL( process.env.NEXT_PUBLIC_CONN_DETAILS_ENDPOINT ?? 
'/api/connection-details', window.location.origin ); - fetch(url.toString()) - .then((res) => res.json()) - .then((data) => { - setConnectionDetails(data); - }) - .catch((error) => { - console.error('Error fetching connection details:', error); - }); + + let data: ConnectionDetails; + try { + const res = await fetch(url.toString()); + data = await res.json(); + } catch (error) { + console.error('Error fetching connection details:', error); + throw new Error('Error fetching connection details!'); + } + + return data; }, []); + const provider = useMemo( + () => new ManualConnectionCredentialsProvider(fetchConnectionDetails), + [fetchConnectionDetails], + ); + useEffect(() => { - fetchConnectionDetails(); - }, [fetchConnectionDetails]); + provider.refresh(); + }, [provider]); - return { connectionDetails, refreshConnectionDetails: fetchConnectionDetails }; + return { connectionDetailsProvider: provider }; } diff --git a/package.json b/package.json index 5cd54b642..69c2bf5f3 100644 --- a/package.json +++ b/package.json @@ -22,6 +22,7 @@ "buffer-image-size": "^0.6.4", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", + "jose": "^6.0.12", "livekit-client": "^2.13.3", "livekit-server-sdk": "^2.13.0", "mime": "^4.0.7", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e97a81c8c..099372a47 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -41,6 +41,9 @@ importers: clsx: specifier: ^2.1.1 version: 2.1.1 + jose: + specifier: ^6.0.12 + version: 6.0.12 livekit-client: specifier: ^2.13.3 version: 2.15.4(@types/dom-mediacapture-record@1.0.22) @@ -68,6 +71,9 @@ importers: sonner: specifier: ^2.0.3 version: 2.0.7(react-dom@19.1.1(react@19.1.1))(react@19.1.1) + streaming-iterables: + specifier: ^8.0.1 + version: 8.0.1 tailwind-merge: specifier: ^3.3.0 version: 3.3.1 @@ -1853,6 +1859,9 @@ packages: jose@5.10.0: resolution: {integrity: sha512-s+3Al/p9g32Iq+oqXxkW//7jk2Vig6FF1CFqzVXoTUXt2qz89YWbL+OwS17NFYEvxC35n0FKeGO2LGYSxeM2Gg==} + jose@6.0.12: + resolution: {integrity: 
sha512-T8xypXs8CpmiIi78k0E+Lk7T2zlK4zDyg+o1CZ4AkOHgDg98ogdP2BeZ61lTFKFyoEwJ9RgAgN+SdM3iPgNonQ==} + js-tokens@4.0.0: resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==} @@ -2445,6 +2454,10 @@ packages: resolution: {integrity: sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==} engines: {node: '>= 0.4'} + streaming-iterables@8.0.1: + resolution: {integrity: sha512-yfQdmUB1b+rGLZkD/r6YisT/eNOjZxBAckXKlzYNmRJnwSzHaiScykD8gsQceFcShtK09qAbLhOqvzIpnBPoDQ==} + engines: {node: '>=18'} + string.prototype.includes@2.0.1: resolution: {integrity: sha512-o7+c9bW6zpAdJHTtujeePODAhkuicdAryFsfVKwA+wGw89wJ4GTY484WTucM9hLtDEOpOvI+aHnzqnC5lHp4Rg==} engines: {node: '>= 0.4'} @@ -4453,6 +4466,8 @@ snapshots: jose@5.10.0: {} + jose@6.0.12: {} + js-tokens@4.0.0: {} js-yaml@4.1.0: @@ -4998,6 +5013,8 @@ snapshots: es-errors: 1.3.0 internal-slot: 1.1.0 + streaming-iterables@8.0.1: {} + string.prototype.includes@2.0.1: dependencies: call-bind: 1.0.8 From 34c36459b2964de58cf3a228416c1ef3f50da8ed Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Fri, 15 Aug 2025 09:25:54 -0400 Subject: [PATCH 36/51] fix: add missing package --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index 69c2bf5f3..209d27c57 100644 --- a/package.json +++ b/package.json @@ -32,6 +32,7 @@ "react": "^19.0.0", "react-dom": "^19.0.0", "sonner": "^2.0.3", + "streaming-iterables": "^8.0.1", "tailwind-merge": "^3.3.0" }, "devDependencies": { From 0d6288ea9c86807ee0c37be11fbbf0440ae284ce Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 12:05:19 -0400 Subject: [PATCH 37/51] feat: remove defaultAggregator and startsAt param to createMessageAggregator This makes createMessageAggregator much simpler / just a wrapper tying some events together --- agent-sdk/agent-session/AgentSession.ts | 55 ++----------------------- 1 file changed, 4 insertions(+), 51 deletions(-) 
diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index db391830e..cc448b17a 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -44,8 +44,6 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter | null = null; - aggregators: Array> | null = null; private connectionCredentialsProvider: ConnectionCredentialsProvider; @@ -109,9 +107,6 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter TypedEventEmitter { - if (!this.defaultAggregator) { - throw new Error('AgentSession.defaultAggregator is unset'); - } - if (!this.aggregators) { - throw new Error('AgentSession.aggregators is unset'); - } - - this.defaultAggregator.upsert(incomingMessage); - for (const aggregator of this.aggregators) { - aggregator.upsert(incomingMessage); - } - this.emit(AgentSessionEvent.MessageReceived, incomingMessage); } @@ -242,37 +218,14 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { - const aggregatorIndex = aggregators.indexOf(aggregator); - if (aggregatorIndex < 0) { - throw new Error(`Index of aggregator was non integer (found ${aggregatorIndex}), has this aggregator already been closed previously?`); - } - aggregators.splice(aggregatorIndex, 1); - + this.off(AgentSessionEvent.MessageReceived, aggregator.upsert); aggregator.off(ReceivedMessageAggregatorEvent.Close, closeHandler); }; aggregator.on(ReceivedMessageAggregatorEvent.Close, closeHandler); From 6fae521fb03c3be39e525999df8ab152f5ed7777 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 12:09:48 -0400 Subject: [PATCH 38/51] feat: add explicit AgentSessionEvent.Disconnected event --- agent-sdk/agent-session/AgentSession.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index cc448b17a..7edc6ba77 100644 --- 
a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -22,6 +22,7 @@ export enum AgentSessionEvent { AgentStateChanged = 'agentStateChanged', AgentAttributesChanged = 'agentAttributesChanged', MessageReceived = 'messageReceived', + Disconnected = 'disconnected', AgentConnectionFailure = 'agentConnectionFailure', AudioPlaybackStatusChanged = 'AudioPlaybackStatusChanged', } @@ -31,6 +32,7 @@ export type AgentSessionCallbacks = { [AgentSessionEvent.MessageReceived]: (newMessage: ReceivedMessage) => void; [AgentSessionEvent.AgentConnectionFailure]: (reason: string) => void; [AgentSessionEvent.AudioPlaybackStatusChanged]: (audioPlaybackPermitted: boolean) => void; + [AgentSessionEvent.Disconnected]: () => void; }; @@ -123,6 +125,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter Date: Mon, 18 Aug 2025 12:10:29 -0400 Subject: [PATCH 39/51] feat: use AgentSessionEvent.Disconnected to close any open `ReceivedMessageAggregator`s --- agent-sdk/agent-session/AgentSession.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 7edc6ba77..243bf49ee 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -227,9 +227,11 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { this.off(AgentSessionEvent.MessageReceived, aggregator.upsert); + this.off(AgentSessionEvent.Disconnected, aggregator.close); aggregator.off(ReceivedMessageAggregatorEvent.Close, closeHandler); }; aggregator.on(ReceivedMessageAggregatorEvent.Close, closeHandler); From e5b1fc00344de4d51d777a1a0d7c431da8e170ef Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 12:15:30 -0400 Subject: [PATCH 40/51] feat: make ReceivedMessageAggregator methods arrow functions so they can be used directly as event handlers ie, this.emit("...", aggregator.upsert) --- 
.../agent-session/message/ReceivedMessageAggregator.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts b/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts index 625fc587b..96b2dce77 100644 --- a/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts +++ b/agent-sdk/agent-session/message/ReceivedMessageAggregator.ts @@ -42,22 +42,22 @@ export default class ReceivedMessageAggregator return aggregator; } - upsert(message: Message) { + upsert = (message: Message) => { this.internalBulkUpsert([message]); this.emit(ReceivedMessageAggregatorEvent.Updated); } - delete(message: Message) { + delete = (message: Message) => { this.internalBulkDelete([message.id]); this.emit(ReceivedMessageAggregatorEvent.Updated); } - extend(input: Iterable) { + extend = (input: Iterable) => { this.internalBulkUpsert(input); this.emit(ReceivedMessageAggregatorEvent.Updated); } - clear() { + clear = () => { this.messageById.clear(); this.messageIds = []; } @@ -113,7 +113,7 @@ export default class ReceivedMessageAggregator return Array.from(this); } - close() { + close = () => { this.closed = true; this.emit(ReceivedMessageAggregatorEvent.Close); } From f4eeace2a22a46c761060c5f1a339281652dc1cd Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 12:16:26 -0400 Subject: [PATCH 41/51] feat: remove startsAt from TBD react layer --- agent-sdk/index.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index 89df4b9e1..dec9cf558 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -41,7 +41,7 @@ export function useAgentMessages() { const agentSession = useAgentSession(); const [messagesState, setMessagesState] = useState< - Array | null + Array | null >(null); useEffect(() => { let aggregator: ReceivedMessageAggregator | null = null; @@ -53,7 +53,7 @@ export function useAgentMessages() { 
setMessagesState(aggregator.toArray()); }; - agentSession.createMessageAggregator({ startsAt: 'beginning' }).then(agg => { + agentSession.createMessageAggregator().then(agg => { aggregator = agg; setMessagesState(aggregator.toArray()); aggregator.on(ReceivedMessageAggregatorEvent.Updated, handleUpdated); From ad4c23626fdffab0c642526d7e68bba424386c4c Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 12:16:50 -0400 Subject: [PATCH 42/51] feat: add types-emitter package (this wasn't installed for some reason?) --- package.json | 3 ++- pnpm-lock.yaml | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index 209d27c57..9cb2fbe6f 100644 --- a/package.json +++ b/package.json @@ -33,7 +33,8 @@ "react-dom": "^19.0.0", "sonner": "^2.0.3", "streaming-iterables": "^8.0.1", - "tailwind-merge": "^3.3.0" + "tailwind-merge": "^3.3.0", + "typed-emitter": "^2.1.0" }, "devDependencies": { "@eslint/eslintrc": "^3", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 099372a47..cc37b1123 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -77,6 +77,9 @@ importers: tailwind-merge: specifier: ^3.3.0 version: 3.3.1 + typed-emitter: + specifier: ^2.1.0 + version: 2.1.0 devDependencies: '@eslint/eslintrc': specifier: ^3 From 730dbeda27d705e0492f081934c4c96215a0f3a0 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 12:38:16 -0400 Subject: [PATCH 43/51] feat: add logic to ensure that `connect` can't be run until underlying room fully disconnected This means that agentSession.disconnect() CAN be put in a useEffect cleanup function --- agent-sdk/agent-session/AgentSession.ts | 39 ++++++++++++++++++++----- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 243bf49ee..2cca4ca66 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -35,6 +35,10 @@ export type AgentSessionCallbacks = 
{ [AgentSessionEvent.Disconnected]: () => void; }; +export type AgentSessionOptions = { + connectSignal?: AbortSignal; +}; + /** * AgentSession represents a connection to a LiveKit Agent, providing abstractions to make 1:1 @@ -64,8 +68,13 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter ( this.room.connect(connection.serverUrl, connection.participantToken) @@ -190,26 +199,42 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter((resolve, reject) => { - const onceRoomConnected = () => { + const onceRoomEventOccurred = () => { cleanup(); resolve(); }; const abortHandler = () => { cleanup(); - reject(new Error('AgentSession.waitUntilRoomConnected - signal aborted')); + reject(new Error(`AgentSession.waitUntilRoomState(${state}, ...) - signal aborted`)); }; const cleanup = () => { - this.room.off(RoomEvent.Connected, onceRoomConnected); + this.room.off(stateMonitoringEvent, onceRoomEventOccurred); signal?.removeEventListener('abort', abortHandler); }; - this.room.on(RoomEvent.Connected, onceRoomConnected); + this.room.on(stateMonitoringEvent, onceRoomEventOccurred); signal?.addEventListener('abort', abortHandler); }); } From 80038a00cc024a3b07c125f8265016d0b4dd7a62 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 12:49:24 -0400 Subject: [PATCH 44/51] feat: add better mechanism to control whether microphone is enabled on agent session connect --- agent-sdk/agent-session/AgentSession.ts | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 2cca4ca66..17ab031b1 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -1,6 +1,6 @@ import type TypedEventEmitter from 'typed-emitter'; import { EventEmitter } from "events"; -import { Room, RoomEvent, ConnectionState } from 'livekit-client'; +import { Room, RoomEvent, ConnectionState, 
TrackPublishOptions } from 'livekit-client'; import { type ReceivedMessage, @@ -37,6 +37,16 @@ export type AgentSessionCallbacks = { export type AgentSessionOptions = { connectSignal?: AbortSignal; + + // FIXME: not sure about this pattern, background thinking is that it would be good to be able to + // abstract away enabling relevant media tracks to the caller so they don't have to interface with + // the room. + tracks?: { + microphone?: { + enabled?: boolean; + publishOptions?: TrackPublishOptions; + }; + }; }; @@ -71,6 +81,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter ( this.room.connect(connection.serverUrl, connection.participantToken) )), - // FIXME: make it so the preconenct buffer thing can be disabled? - this.room.localParticipant.setMicrophoneEnabled(true, undefined, { preConnectBuffer: true }), + + // Start microphone (with preconnect buffer) by default + tracks.microphone?.enabled ? ( + this.room.localParticipant.setMicrophoneEnabled(true, undefined, tracks.microphone?.publishOptions ?? 
 {}) + ) : Promise.resolve(), ]); await this.waitUntilAgentIsAvailable(); From c3993be5bcfac13582d96dabd149f5313955069e Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 13:12:37 -0400 Subject: [PATCH 45/51] feat: parameterize agentConnectTimeoutMilliseconds with default value --- agent-sdk/agent-session/AgentSession.ts | 28 ++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index 17ab031b1..df8fd26ae 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -36,7 +36,19 @@ export type AgentSessionCallbacks = { }; export type AgentSessionOptions = { - connectSignal?: AbortSignal; + /** Optional abort signal which if triggered will stop waiting for the room to be disconnected + * prior to connecting + * + * FIXME: is this a confusing property to expose? Maybe expose one `signal` that universally + * could apply across the whole agentSession.connect(...) call? + */ + waitForDisconnectSignal?: AbortSignal; + + /** + * Amount of time in milliseconds the system will wait for an agent to join the room, before + * emitting an AgentSessionEvent.AgentConnectionFailure event. 
+ */ + agentConnectTimeoutMilliseconds?: number; // FIXME: not sure about this pattern, background thinking is that it would be good to be able to // abstract away enabling relevant media tracks to the caller so they don't have to interface with @@ -49,6 +61,9 @@ export type AgentSessionOptions = { }; }; +// FIXME: make this 10 seconds once room dispatch booting info is discoverable +const DEFAULT_AGENT_CONNECT_TIMEOUT_MILLISECONDS = 20_000; + /** * AgentSession represents a connection to a LiveKit Agent, providing abstractions to make 1:1 @@ -60,8 +75,9 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter ( @@ -164,7 +182,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { From fa6cac4972141d2d0a0ab4c559343ac0aefcc218 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 13:21:21 -0400 Subject: [PATCH 46/51] feat: add dependency to temp react layer so that useAgentMessages reattaches the aggregator on connect / disconnect --- agent-sdk/index.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index dec9cf558..1fe1fdf4c 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -67,7 +67,7 @@ export function useAgentMessages() { aggregator?.off(ReceivedMessageAggregatorEvent.Updated, handleUpdated); setMessagesState(null); }; - }, [agentSession]); + }, [agentSession, agentSession.isAvailable]); const send = useCallback(async (message: SentMessage) => { return agentSession.sendMessage(message); From e47c9af12264c4047b3c6510c86d98ed4469d22d Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 16:21:02 -0400 Subject: [PATCH 47/51] feat: replace AgentState with AgentConnectionState / AgentConversationalState --- agent-sdk/agent-session/Agent.ts | 90 ++++++++++++++++++------- agent-sdk/agent-session/AgentSession.ts | 53 +++++++++++---- agent-sdk/index.tsx | 38 +++++++++-- components/app.tsx | 2 +- 
 components/livekit/media-tiles.tsx | 6 +- 5 files changed, 143 insertions(+), 46 deletions(-) diff --git a/agent-sdk/agent-session/Agent.ts b/agent-sdk/agent-session/Agent.ts index 45d2e5cbb..395153fa6 100644 --- a/agent-sdk/agent-session/Agent.ts +++ b/agent-sdk/agent-session/Agent.ts @@ -6,26 +6,26 @@ import { ParticipantEventCallbacks } from '@/agent-sdk/external-deps/client-sdk- const stateAttribute = 'lk.agent.state'; -export type AgentState = - | 'disconnected' - | 'connecting' - | 'initializing' - | 'listening' - | 'thinking' - | 'speaking'; +/** State representing the current connection status to the server hosted agent */ +export type AgentConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecting' | 'signalReconnecting'; + +/** State representing the current status of the agent, whether it is ready for speech, etc */ +export type AgentConversationalState = 'disconnected' | 'initializing' | 'idle' | 'listening' | 'thinking' | 'speaking'; export enum AgentEvent { VideoTrackChanged = 'videoTrackChanged', AudioTrackChanged = 'videoTrackChanged', AgentAttributesChanged = 'agentAttributesChanged', - AgentStateChanged = 'agentStateChanged', + AgentConnectionStateChanged = 'agentConnectionStateChanged', + AgentConversationalStateChanged = 'agentConversationalStateChanged', } export type AgentCallbacks = { [AgentEvent.VideoTrackChanged]: (newTrack: TrackReference | null) => void; [AgentEvent.AudioTrackChanged]: (newTrack: TrackReference | null) => void; [AgentEvent.AgentAttributesChanged]: (newAttributes: Record) => void; - [AgentEvent.AgentStateChanged]: (newState: AgentState) => void; + [AgentEvent.AgentConnectionStateChanged]: (newAgentConnectionState: AgentConnectionState) => void; + [AgentEvent.AgentConversationalStateChanged]: (newAgentConversationalState: AgentConversationalState) => void; }; /** @@ -33,7 +33,9 @@ export type AgentCallbacks = { */ export default class Agent extends (EventEmitter as new () => TypedEventEmitter) { private 
room: Room; - state: AgentState = 'disconnected'; + + connectionState: AgentConnectionState = 'disconnected'; + conversationalState: AgentConversationalState = 'disconnected'; private agentParticipant: RemoteParticipant | null = null; private workerParticipant: RemoteParticipant | null = null; // ref: https://docs.livekit.io/agents/integrations/avatar/#avatar-workers @@ -49,13 +51,17 @@ export default class Agent extends (EventEmitter as new () => TypedEventEmitter< this.room.on(RoomEvent.ParticipantConnected, this.handleParticipantConnected); this.room.on(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); this.room.on(RoomEvent.ConnectionStateChanged, this.handleConnectionStateChanged); - this.updateAgentState(); + this.room.localParticipant.on(ParticipantEvent.TrackPublished, this.handleLocalParticipantTrackPublished) + + this.updateConnectionState(); + this.updateConversationalState(); } teardown() { this.room.off(RoomEvent.ParticipantConnected, this.handleParticipantConnected); this.room.off(RoomEvent.ParticipantDisconnected, this.handleParticipantDisconnected); this.room.off(RoomEvent.ConnectionStateChanged, this.handleConnectionStateChanged); + this.room.localParticipant.off(ParticipantEvent.TrackPublished, this.handleLocalParticipantTrackPublished) } private handleParticipantConnected = () => { @@ -66,7 +72,12 @@ export default class Agent extends (EventEmitter as new () => TypedEventEmitter< } private handleConnectionStateChanged = () => { - this.updateAgentState(); + this.updateConnectionState(); + this.updateConversationalState(); + } + + private handleLocalParticipantTrackPublished = () => { + this.updateConversationalState(); } private updateParticipants() { @@ -137,29 +148,58 @@ export default class Agent extends (EventEmitter as new () => TypedEventEmitter< private handleAttributesChanged = (attributes: Record) => { this.attributes = attributes; this.emit(AgentEvent.AgentAttributesChanged, attributes); - this.updateAgentState(); + 
this.updateConnectionState(); + this.updateConversationalState(); }; - private updateAgentState() { - let newAgentState: AgentState | null = null; - const connectionState = this.room.state; + private updateConnectionState() { + let newConnectionState: AgentConnectionState; - if (connectionState === ConnectionState.Disconnected) { - newAgentState = 'disconnected'; + const roomConnectionState = this.room.state; + if (roomConnectionState === ConnectionState.Disconnected) { + newConnectionState = 'disconnected'; } else if ( - connectionState === ConnectionState.Connecting || + roomConnectionState === ConnectionState.Connecting || !this.agentParticipant || !this.attributes[stateAttribute] ) { - newAgentState = 'connecting'; + newConnectionState = 'connecting'; } else { - newAgentState = this.attributes[stateAttribute] as AgentState; + newConnectionState = roomConnectionState; + } + console.log('!! CONNECTION STATE:', newConnectionState); + + if (this.connectionState !== newConnectionState) { + this.connectionState = newConnectionState; + this.emit(AgentEvent.AgentConnectionStateChanged, newConnectionState); + } + } + + private updateConversationalState() { + let newConversationalState: AgentConversationalState = 'disconnected'; + + if (this.room.state !== ConnectionState.Disconnected) { + newConversationalState = 'initializing'; + } + + // If the microphone preconnect buffer is active, then the state should be "listening" rather + // than "initializing" + const micTrack = this.room.localParticipant.getTrackPublication(Track.Source.Microphone); + if (micTrack) { + newConversationalState = 'listening'; + } + + if (this.agentParticipant && this.attributes[stateAttribute]) { + // ref: https://github.com/livekit/agents/blob/65170238db197f62f479eb7aaef1c0e18bfad6e7/livekit-agents/livekit/agents/voice/events.py#L97 + const agentState = this.attributes[stateAttribute] as 'initializing' | 'idle' | 'listening' | 'thinking' | 'speaking'; + newConversationalState = agentState; } - 
console.log('!! STATE:', newAgentState, this.agentParticipant?.attributes); - if (this.state !== newAgentState) { - this.state = newAgentState; - this.emit(AgentEvent.AgentStateChanged, newAgentState); + console.log('!! CONVERSATIONAL STATE:', newConversationalState); + + if (this.conversationalState !== newConversationalState) { + this.conversationalState = newConversationalState; + this.emit(AgentEvent.AgentConversationalStateChanged, newConversationalState); } } diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index df8fd26ae..d1e30be9b 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -15,11 +15,12 @@ import { type ReceivedMessageAggregatorOptions, ReceivedMessageAggregatorEvent, } from "./message"; -import Agent, { AgentEvent, AgentState } from './Agent'; +import Agent, { AgentConnectionState, AgentConversationalState, AgentEvent } from './Agent'; import { ConnectionCredentialsProvider } from './ConnectionCredentialsProvider'; export enum AgentSessionEvent { - AgentStateChanged = 'agentStateChanged', + AgentConnectionStateChanged = 'agentConnectionStateChanged', + AgentConversationalStateChanged = 'agentConversationalStateChanged', AgentAttributesChanged = 'agentAttributesChanged', MessageReceived = 'messageReceived', Disconnected = 'disconnected', @@ -28,7 +29,8 @@ export enum AgentSessionEvent { } export type AgentSessionCallbacks = { - [AgentSessionEvent.AgentStateChanged]: (newAgentState: AgentState) => void; + [AgentSessionEvent.AgentConnectionStateChanged]: (newAgentConnectionState: AgentConnectionState) => void; + [AgentSessionEvent.AgentConversationalStateChanged]: (newAgentConversationalState: AgentConversationalState) => void; [AgentSessionEvent.MessageReceived]: (newMessage: ReceivedMessage) => void; [AgentSessionEvent.AgentConnectionFailure]: (reason: string) => void; [AgentSessionEvent.AudioPlaybackStatusChanged]: (audioPlaybackPermitted: boolean) => 
void; @@ -129,7 +131,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { console.log('!! CONNECTED'); this.agent = new Agent(this.room); - this.agent.on(AgentEvent.AgentStateChanged, this.handleAgentStateChanged); + this.agent.on(AgentEvent.AgentConnectionStateChanged, this.handleAgentConnectionStateChanged); + this.agent.on(AgentEvent.AgentConversationalStateChanged, this.handleAgentConversationalStateChanged); const chatMessageSender = new ChatMessageSender(this.localParticipant); this.messageSender = new CombinedMessageSender( @@ -155,7 +158,8 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { console.log('!! DISCONNECTED'); - this.agent?.off(AgentEvent.AgentStateChanged, this.handleAgentStateChanged); + this.agent?.off(AgentEvent.AgentConnectionStateChanged, this.handleAgentConnectionStateChanged); + this.agent?.off(AgentEvent.AgentConversationalStateChanged, this.handleAgentConversationalStateChanged); this.agent?.teardown(); this.agent = null; @@ -175,7 +179,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { if (!this.isAvailable) { const reason = - this.state === 'connecting' + this.connectionState === 'connecting' ? 'Agent did not join the room. ' : 'Agent connected but did not complete initializing. 
'; @@ -185,8 +189,12 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter { - this.emit(AgentSessionEvent.AgentStateChanged, newAgentState); + private handleAgentConnectionStateChanged = async (newConnectionState: AgentConnectionState) => { + this.emit(AgentSessionEvent.AgentConnectionStateChanged, newConnectionState); + }; + + private handleAgentConversationalStateChanged = async (newConversationalState: AgentConversationalState) => { + this.emit(AgentSessionEvent.AgentConversationalStateChanged, newConversationalState); }; private handleAudioPlaybackStatusChanged = async () => { @@ -197,12 +205,29 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter TypedEventEmitter { - this.off(AgentSessionEvent.AgentStateChanged, stateChangedHandler); + this.off(AgentSessionEvent.AgentConnectionStateChanged, stateChangedHandler); + this.off(AgentSessionEvent.AgentConversationalStateChanged, stateChangedHandler); signal?.removeEventListener('abort', abortHandler); }; - this.on(AgentSessionEvent.AgentStateChanged, stateChangedHandler); + this.on(AgentSessionEvent.AgentConnectionStateChanged, stateChangedHandler); + this.on(AgentSessionEvent.AgentConversationalStateChanged, stateChangedHandler); signal?.addEventListener('abort', abortHandler); }); } diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index 1fe1fdf4c..a62c12ccd 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -127,15 +127,45 @@ export function useAgentEvent( export function useAgentState() { const agentSession = useAgentSession(); - const [agentState, setAgentState] = useState(agentSession.state); + const [connectionState, setConnectionState] = useState(agentSession.connectionState); + const [conversationalState, setConversationalState] = useState(agentSession.conversationalState); const [isAvailable, setIsAvailable] = useState(agentSession.isAvailable); + const [isConnected, setIsConnected] = useState(agentSession.isConnected); - 
 useAgentSessionEvent(AgentSessionEvent.AgentStateChanged, (newAgentState) => { - setAgentState(newAgentState); + useAgentSessionEvent(AgentSessionEvent.AgentConnectionStateChanged, (newState) => { + setConnectionState(newState); setIsAvailable(agentSession.isAvailable); + setIsConnected(agentSession.isConnected); + }, []); + useAgentSessionEvent(AgentSessionEvent.AgentConversationalStateChanged, (newState) => { + setConversationalState(newState); + setIsAvailable(agentSession.isAvailable); + setIsConnected(agentSession.isConnected); }, []); - return { state: agentState, isAvailable }; + const legacyState = useMemo((): 'disconnected' | 'connecting' | 'initializing' | 'listening' | 'thinking' | 'speaking' => { + if (connectionState === 'disconnected' || connectionState === 'connecting') { + return connectionState; + } else { + switch (conversationalState) { + case 'initializing': + case 'idle': + return 'initializing'; + + default: + return conversationalState; + } + } + }, [connectionState, conversationalState]); + + return { + connectionState, + conversationalState, + /** @deprecated Use connectionState / conversationalState instead of legacyState */ + legacyState, + isAvailable, + isConnected + }; } export function useAgentTracks() { diff --git a/components/app.tsx b/components/app.tsx index 2069c41d8..b63ff9d33 100644 --- a/components/app.tsx +++ b/components/app.tsx @@ -45,7 +45,7 @@ export function App({ appConfig }: AppProps) { useEffect(() => { let aborted = false; - if (sessionStarted && agentSession.state === 'disconnected') { + if (sessionStarted && agentSession.connectionState === 'disconnected') { agentSession.connect().catch((error) => { if (aborted) { // Once the effect has cleaned up after itself, drop any errors diff --git a/components/livekit/media-tiles.tsx b/components/livekit/media-tiles.tsx index f28dc3b62..c3b7b99d2 100644 --- a/components/livekit/media-tiles.tsx +++ b/components/livekit/media-tiles.tsx @@ -11,7 +11,7 @@ import { cn } from 
'@/lib/utils'; import { AgentTile } from './agent-tile'; import { AvatarTile } from './avatar-tile'; import { VideoTile } from './video-tile'; -import { useAgentLocalParticipant, useAgentState, useAgentTracks } from '@/agent-sdk'; +import { useAgentLocalParticipant, useAgentState } from '@/agent-sdk'; const MotionVideoTile = motion.create(VideoTile); const MotionAgentTile = motion.create(AgentTile); @@ -92,7 +92,7 @@ interface MediaTilesProps { } export function MediaTiles({ chatOpen }: MediaTilesProps) { - const { state: agentState } = useAgentState(); + const { legacyState: agentState } = useAgentState(); // const { audioTrack: agentAudioTrack, videoTrack: agentVideoTrack } = useAgentTracks(); const { // state: agentState, @@ -102,7 +102,7 @@ export function MediaTiles({ chatOpen }: MediaTilesProps) { // console.log('TRACKS:', agentAudioTrack, agentVideoTrack); const [screenShareTrack] = useTracks([Track.Source.ScreenShare]); // FIXME: replace with agent alternative // const cameraTrack: TrackReference | undefined = useLocalTrackRef(Track.Source.Camera); // FIXME: replace with agent alternative - const { cameraTrack } = useAgentLocalParticipant(); + const { camera: { track: cameraTrack } } = useAgentLocalParticipant(); const isCameraEnabled = cameraTrack && !cameraTrack.publication.isMuted; const isScreenShareEnabled = screenShareTrack && !screenShareTrack.publication.isMuted; From 6f12617c578ecaf828b4da1c8bcf0620b082fffd Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 16:21:20 -0400 Subject: [PATCH 48/51] fix: add await as part of refresh so return is blocked until after refresh finishes --- agent-sdk/agent-session/ConnectionCredentialsProvider.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent-sdk/agent-session/ConnectionCredentialsProvider.ts b/agent-sdk/agent-session/ConnectionCredentialsProvider.ts index d3c2bb4c7..277166234 100644 --- a/agent-sdk/agent-session/ConnectionCredentialsProvider.ts +++ 
b/agent-sdk/agent-session/ConnectionCredentialsProvider.ts @@ -28,7 +28,7 @@ export abstract class ConnectionCredentialsProvider { async generate() { if (this.isCachedConnectionDetailsExpired()) { - this.refresh(); + await this.refresh(); } return this.cachedConnectionDetails!; From 1d716d7aaca4a5158aa76061aeac5482e7126874 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Mon, 18 Aug 2025 16:43:17 -0400 Subject: [PATCH 49/51] feat: add centralized participant attributes enum --- agent-sdk/agent-session/Agent.ts | 13 ++++++------- .../message/receive/TranscriptionMessageReceiver.ts | 13 ++++--------- agent-sdk/lib/participant-attributes.ts | 9 +++++++++ 3 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 agent-sdk/lib/participant-attributes.ts diff --git a/agent-sdk/agent-session/Agent.ts b/agent-sdk/agent-session/Agent.ts index 395153fa6..a1805ac28 100644 --- a/agent-sdk/agent-session/Agent.ts +++ b/agent-sdk/agent-session/Agent.ts @@ -3,8 +3,7 @@ import { EventEmitter } from "events"; import { ConnectionState, ParticipantEvent, ParticipantKind, RemoteParticipant, Room, RoomEvent, Track } from 'livekit-client'; import { getParticipantTrackRefs, participantTrackEvents, TrackReference } from '@/agent-sdk/external-deps/components-js'; import { ParticipantEventCallbacks } from '@/agent-sdk/external-deps/client-sdk-js'; - -const stateAttribute = 'lk.agent.state'; +import { ParticipantAttributes } from '@/agent-sdk/lib/participant-attributes'; /** State representing the current connection status to the server hosted agent */ export type AgentConnectionState = 'disconnected' | 'connecting' | 'connected' | 'reconnecting' | 'signalReconnecting'; @@ -82,12 +81,12 @@ export default class Agent extends (EventEmitter as new () => TypedEventEmitter< private updateParticipants() { const newAgentParticipant = this.roomRemoteParticipants.find( - (p) => p.kind === ParticipantKind.AGENT && !('lk.publish_on_behalf' in p.attributes), + (p) => p.kind === 
ParticipantKind.AGENT && !(ParticipantAttributes.publishOnBehalf in p.attributes), ) ?? null; const newWorkerParticipant = newAgentParticipant ? ( this.roomRemoteParticipants.find( (p) => - p.kind === ParticipantKind.AGENT && p.attributes['lk.publish_on_behalf'] === newAgentParticipant.identity, + p.kind === ParticipantKind.AGENT && p.attributes[ParticipantAttributes.publishOnBehalf] === newAgentParticipant.identity, ) ?? null ) : null; @@ -161,7 +160,7 @@ export default class Agent extends (EventEmitter as new () => TypedEventEmitter< } else if ( roomConnectionState === ConnectionState.Connecting || !this.agentParticipant || - !this.attributes[stateAttribute] + !this.attributes[ParticipantAttributes.state] ) { newConnectionState = 'connecting'; } else { @@ -189,9 +188,9 @@ export default class Agent extends (EventEmitter as new () => TypedEventEmitter< newConversationalState = 'listening'; } - if (this.agentParticipant && this.attributes[stateAttribute]) { + if (this.agentParticipant && this.attributes[ParticipantAttributes.state]) { // ref: https://github.com/livekit/agents/blob/65170238db197f62f479eb7aaef1c0e18bfad6e7/livekit-agents/livekit/agents/voice/events.py#L97 - const agentState = this.attributes[stateAttribute] as 'initializing' | 'idle' | 'listening' | 'thinking' | 'speaking'; + const agentState = this.attributes[ParticipantAttributes.state] as 'initializing' | 'idle' | 'listening' | 'thinking' | 'speaking'; newConversationalState = agentState; } diff --git a/agent-sdk/agent-session/message/receive/TranscriptionMessageReceiver.ts b/agent-sdk/agent-session/message/receive/TranscriptionMessageReceiver.ts index 2d823d490..601e54788 100644 --- a/agent-sdk/agent-session/message/receive/TranscriptionMessageReceiver.ts +++ b/agent-sdk/agent-session/message/receive/TranscriptionMessageReceiver.ts @@ -4,12 +4,7 @@ import { TextStreamInfo } from "@/agent-sdk/external-deps/client-sdk-js"; import { type ReceivedMessage, type ReceivedTranscriptionMessage } from 
".."; import MessageReceiver from "./MessageReceiver"; - -export enum TranscriptionAttributes { - Final = "lk.transcription_final", - Segment = "lk.segment_id", - TrackId = "lk.transcribed_track_id", -} +import { ParticipantAttributes } from "@/agent-sdk/lib/participant-attributes"; /** * Processes new `lk.transcription` data stream events generated by the agent for both user and @@ -51,9 +46,9 @@ export default class TranscriptionMessageReceiver extends MessageReceiver { async start() { const textStreamHandler = async (reader: TextStreamReader, participantInfo: { identity: string }) => { - const transcriptionSegmentId = reader.info.attributes?.[TranscriptionAttributes.Segment]; + const transcriptionSegmentId = reader.info.attributes?.[ParticipantAttributes.TranscriptionSegmentId]; const isTranscription = Boolean(transcriptionSegmentId); - const isFinal = reader.info.attributes?.[TranscriptionAttributes.Final] === 'true'; + const isFinal = reader.info.attributes?.[ParticipantAttributes.TranscriptionFinal] === 'true'; let currentStreamId = reader.info.id; @@ -62,7 +57,7 @@ export default class TranscriptionMessageReceiver extends MessageReceiver { if (message.content.streamInfo.id === reader.info.id) { return true; } - if (isTranscription && transcriptionSegmentId === message.content.streamInfo.attributes?.[TranscriptionAttributes.Segment]) { + if (isTranscription && transcriptionSegmentId === message.content.streamInfo.attributes?.[ParticipantAttributes.TranscriptionSegmentId]) { return true; } return false; diff --git a/agent-sdk/lib/participant-attributes.ts b/agent-sdk/lib/participant-attributes.ts new file mode 100644 index 000000000..5fef00128 --- /dev/null +++ b/agent-sdk/lib/participant-attributes.ts @@ -0,0 +1,9 @@ +/** An enum of first party livekit attributes generated by the serverside agents sdk */ +export enum ParticipantAttributes { + state = 'lk.agent.state', + publishOnBehalf = 'lk.publish_on_behalf', + + TranscriptionFinal = 
"lk.transcription_final", + TranscriptionSegmentId = "lk.segment_id", + TranscribedTrackId = "lk.transcribed_track_id", +} From a7ff04f2534d4b6e4ac9e8d29d0765b1397bed05 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Tue, 19 Aug 2025 10:10:05 -0400 Subject: [PATCH 50/51] feat: remove canSend, proxy all messages to all `MessageSender`s --- .../message/send/ChatMessageSender.ts | 6 +----- .../message/send/CombinedMessageSender.ts | 18 +++--------------- .../message/send/MessageSender.ts | 2 -- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/agent-sdk/agent-session/message/send/ChatMessageSender.ts b/agent-sdk/agent-session/message/send/ChatMessageSender.ts index 484b7a43f..4116a77c8 100644 --- a/agent-sdk/agent-session/message/send/ChatMessageSender.ts +++ b/agent-sdk/agent-session/message/send/ChatMessageSender.ts @@ -1,6 +1,6 @@ import { LocalParticipant } from "livekit-client"; -import { type ReceivedChatLoopbackMessage, type SentChatMessage, type SentMessage } from ".."; +import { type ReceivedChatLoopbackMessage, type SentChatMessage } from ".."; import MessageSender from "./MessageSender"; import MessageReceiver from "../receive/MessageReceiver"; @@ -15,10 +15,6 @@ export default class ChatMessageSender extends MessageSender { this.localParticipant = localParticipant; } - canSend(message: SentMessage): message is SentChatMessage { - return message.content.type === 'chat'; - } - async send(message: SentChatMessage) { for (const callback of this.loopbackReceiverCallbacks) { callback(message); diff --git a/agent-sdk/agent-session/message/send/CombinedMessageSender.ts b/agent-sdk/agent-session/message/send/CombinedMessageSender.ts index d119c5fe1..86a26145d 100644 --- a/agent-sdk/agent-session/message/send/CombinedMessageSender.ts +++ b/agent-sdk/agent-session/message/send/CombinedMessageSender.ts @@ -13,21 +13,9 @@ export default class CombinedMessageSender extends MessageSender { this.messageSenders = messageSenders; } - canSend(message: 
SentMessage): message is SentMessage { - return true; - } - async send(message: SentMessage) { - for (const sender of this.messageSenders) { - // FIXME: an open question - should this only ever send with one MessageSender or potentially - // multiple? It doesn't matter now given there is only one MessageSender (ChatMessageSender) - // but I'm not sure the right long term call. - if (sender.canSend(message)) { - await sender.send(message); - return; - } - } - - throw new Error(`CombinedMessageSender - cannot find a MessageSender to send message ${JSON.stringify(message)}`); + await Promise.all(this.messageSenders.map(async (sender) => { + return sender.send(message); + })); } } diff --git a/agent-sdk/agent-session/message/send/MessageSender.ts b/agent-sdk/agent-session/message/send/MessageSender.ts index d00904a6c..083f4823f 100644 --- a/agent-sdk/agent-session/message/send/MessageSender.ts +++ b/agent-sdk/agent-session/message/send/MessageSender.ts @@ -1,7 +1,5 @@ import { type SentMessage } from ".."; export default abstract class MessageSender { - /** Can this MessageSender handle sending the given message? 
*/ - abstract canSend(message: SentMessage): message is Message abstract send(message: Message): Promise; } From 9d7c5b30995fac992c8a88685b3f1fb36a981c12 Mon Sep 17 00:00:00 2001 From: Ryan Gaus Date: Tue, 19 Aug 2025 13:32:07 -0400 Subject: [PATCH 51/51] feat: add chat message options to SentChatMessage --- agent-sdk/agent-session/AgentSession.ts | 11 +++++++---- agent-sdk/agent-session/message/index.ts | 6 ++++++ .../message/send/ChatMessageSender.ts | 19 +++++++++++++++---- .../message/send/CombinedMessageSender.ts | 6 +++--- .../message/send/MessageSender.ts | 6 +++--- agent-sdk/index.tsx | 9 ++++++--- 6 files changed, 40 insertions(+), 17 deletions(-) diff --git a/agent-sdk/agent-session/AgentSession.ts b/agent-sdk/agent-session/AgentSession.ts index d1e30be9b..999d7856c 100644 --- a/agent-sdk/agent-session/AgentSession.ts +++ b/agent-sdk/agent-session/AgentSession.ts @@ -14,6 +14,8 @@ import { ReceivedMessageAggregator, type ReceivedMessageAggregatorOptions, ReceivedMessageAggregatorEvent, + SentMessageOptions, + SentChatMessageOptions, } from "./message"; import Agent, { AgentConnectionState, AgentConversationalState, AgentEvent } from './Agent'; import { ConnectionCredentialsProvider } from './ConnectionCredentialsProvider'; @@ -323,9 +325,10 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter( + message: Message, + options: Message extends SentMessage ? 
SentMessageOptions : SentChatMessageOptions, + ) { if (!this.messageSender) { throw new Error('AgentSession.sendMessage - cannot send message until room is connected and MessageSender initialized!'); } @@ -335,7 +338,7 @@ export class AgentSession extends (EventEmitter as new () => TypedEventEmitter void) => void | undefined; diff --git a/agent-sdk/agent-session/message/index.ts b/agent-sdk/agent-session/message/index.ts index fc5ce4303..371d0d62d 100644 --- a/agent-sdk/agent-session/message/index.ts +++ b/agent-sdk/agent-session/message/index.ts @@ -1,3 +1,4 @@ +import { SendTextOptions } from 'livekit-client'; import { TextStreamInfo } from '@/agent-sdk/external-deps/client-sdk-js'; export type BaseMessageId = string; @@ -23,9 +24,14 @@ export type ReceivedMessage = // TODO: images? attachments? rpc? export type SentChatMessage = BaseMessage<'outbound', | { type: 'chat', text: string }>; +export type SentChatMessageOptions = SendTextOptions | undefined; + export type SentMessage = | SentChatMessage; +export type SentMessageOptions = + | (Message extends SentChatMessage ? SentChatMessageOptions : never); + // FIXME: maybe update all these functions to not have default exports as to avoid the duplicate // names being written here? 
export { default as MessageSender } from './send/MessageSender'; diff --git a/agent-sdk/agent-session/message/send/ChatMessageSender.ts b/agent-sdk/agent-session/message/send/ChatMessageSender.ts index 4116a77c8..aad69b509 100644 --- a/agent-sdk/agent-session/message/send/ChatMessageSender.ts +++ b/agent-sdk/agent-session/message/send/ChatMessageSender.ts @@ -1,12 +1,12 @@ import { LocalParticipant } from "livekit-client"; -import { type ReceivedChatLoopbackMessage, type SentChatMessage } from ".."; +import { SentMessage, SentMessageOptions, type ReceivedChatLoopbackMessage, type SentChatMessage } from ".."; import MessageSender from "./MessageSender"; import MessageReceiver from "../receive/MessageReceiver"; /** A `MessageSender` for sending chat messages via the `lk.chat` datastream topic. */ -export default class ChatMessageSender extends MessageSender { +export default class ChatMessageSender extends MessageSender { private localParticipant: LocalParticipant; private loopbackReceiverCallbacks: Set<(incomingMessage: SentChatMessage) => void> = new Set(); @@ -15,12 +15,23 @@ export default class ChatMessageSender extends MessageSender { this.localParticipant = localParticipant; } - async send(message: SentChatMessage) { + isSentChatMessage(message: SentMessage): message is SentChatMessage { + return message.content.type === 'chat'; + } + + async send(message: SentChatMessage, options: SentMessageOptions) { + if (!this.isSentChatMessage(message)) { + return; + } + // FIXME: maybe there's a more elegant way of doing this, where it also + // gets checked as part of `isSentChatMessage`? + const chatMessageOptions = options as SentMessageOptions; + for (const callback of this.loopbackReceiverCallbacks) { callback(message); } - await this.localParticipant.sendText(message.content.text, /* FIXME: options here? */); + await this.localParticipant.sendText(message.content.text, chatMessageOptions); // FIXME: do I need to handle sending legacy chat messages too? 
// const legacyChatMsg: LegacyChatMessage = { diff --git a/agent-sdk/agent-session/message/send/CombinedMessageSender.ts b/agent-sdk/agent-session/message/send/CombinedMessageSender.ts index 86a26145d..5cba83161 100644 --- a/agent-sdk/agent-session/message/send/CombinedMessageSender.ts +++ b/agent-sdk/agent-session/message/send/CombinedMessageSender.ts @@ -1,4 +1,4 @@ -import { type SentMessage } from ".."; +import { SentMessageOptions, type SentMessage } from ".."; import MessageSender from "./MessageSender"; /** @@ -13,9 +13,9 @@ export default class CombinedMessageSender extends MessageSender { this.messageSenders = messageSenders; } - async send(message: SentMessage) { + async send(message: SentMessage, options: SentMessageOptions) { await Promise.all(this.messageSenders.map(async (sender) => { - return sender.send(message); + return sender.send(message, options); })); } } diff --git a/agent-sdk/agent-session/message/send/MessageSender.ts b/agent-sdk/agent-session/message/send/MessageSender.ts index 083f4823f..6f8a874c5 100644 --- a/agent-sdk/agent-session/message/send/MessageSender.ts +++ b/agent-sdk/agent-session/message/send/MessageSender.ts @@ -1,5 +1,5 @@ -import { type SentMessage } from ".."; +import { SentMessageOptions, type SentMessage } from ".."; -export default abstract class MessageSender { - abstract send(message: Message): Promise; +export default abstract class MessageSender { + abstract send(message: SentMessage, options: SentMessageOptions): Promise; } diff --git a/agent-sdk/index.tsx b/agent-sdk/index.tsx index a62c12ccd..919acb6c8 100644 --- a/agent-sdk/index.tsx +++ b/agent-sdk/index.tsx @@ -13,7 +13,7 @@ import { import { TrackReference, trackSourceToProtocol } from "@/agent-sdk/external-deps/components-js"; import { ParticipantEventCallbacks } from "../node_modules/livekit-client/src/room/participant/Participant"; import { AgentSession, AgentSessionCallbacks, AgentSessionEvent } from "./agent-session/AgentSession"; -import { 
ReceivedMessage, ReceivedMessageAggregator, ReceivedMessageAggregatorEvent, SentMessage } from "./agent-session/message"; +import { ReceivedMessage, ReceivedMessageAggregator, ReceivedMessageAggregatorEvent, SentChatMessageOptions, SentMessage, SentMessageOptions } from "./agent-session/message"; import { AgentCallbacks, AgentEvent } from "./agent-session/Agent"; import { ParticipantPermission } from "livekit-server-sdk"; import { usePersistentUserChoices } from "@livekit/components-react"; @@ -69,8 +69,11 @@ export function useAgentMessages() { }; }, [agentSession, agentSession.isAvailable]); - const send = useCallback(async (message: SentMessage) => { - return agentSession.sendMessage(message); + const send = useCallback(async ( + message: SentMessage | string, + options: Message extends SentMessage ? SentMessageOptions : SentChatMessageOptions, + ) => { + return agentSession.sendMessage(message, options); }, [agentSession]); const { messages, ready } = useMemo(() => {