From ade8c3e17d19facb98751dad604c184350bd2064 Mon Sep 17 00:00:00 2001
From: Ben Cherry
Date: Wed, 29 Apr 2026 21:08:01 -0700
Subject: [PATCH 1/4] Attach models to agent subclass

---
 src/agent.py        | 32 ++++++++++++++------------------
 tests/test_agent.py | 18 +++++-------------
 2 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/src/agent.py b/src/agent.py
index c61c334..dba33ec 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -18,8 +18,6 @@
 
 load_dotenv(".env.local")
 
-AGENT_MODEL = "openai/gpt-5.2-chat-latest"
-
 
 class Assistant(Agent):
     def __init__(self) -> None:
@@ -28,6 +26,17 @@ def __init__(self) -> None:
             You eagerly assist users with their questions by providing information from your extensive knowledge.
             Your responses are concise, to the point, and without any complex formatting or punctuation including emojis, asterisks, or other symbols.
             You are curious, friendly, and have a sense of humor.""",
+            # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
+            # See all available models at https://docs.livekit.io/agents/models/stt/
+            stt=inference.STT(model="deepgram/nova-3", language="multi"),
+            # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
+            # See all available models at https://docs.livekit.io/agents/models/llm/
+            llm=inference.LLM(model="openai/gpt-5.2-chat-latest"),
+            # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
+            # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
+            tts=inference.TTS(
+                model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
+            ),
         )
 
         # To add tools, use the @function_tool decorator.
@@ -66,19 +75,7 @@ async def my_agent(ctx: JobContext):
         "room": ctx.room.name,
     }
 
-    # Set up a voice AI pipeline using OpenAI, Cartesia, Deepgram, and the LiveKit turn detector
     session = AgentSession(
-        # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
-        # See all available models at https://docs.livekit.io/agents/models/stt/
-        stt=inference.STT(model="deepgram/nova-3", language="multi"),
-        # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
-        # See all available models at https://docs.livekit.io/agents/models/llm/
-        llm=inference.LLM(model=AGENT_MODEL),
-        # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
-        # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
-        tts=inference.TTS(
-            model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
-        ),
         # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
         # See more at https://docs.livekit.io/agents/build/turns
         turn_detection=MultilingualModel(),
@@ -88,15 +85,14 @@ async def my_agent(ctx: JobContext):
         preemptive_generation=True,
     )
 
-    # To use a realtime model instead of a voice pipeline, use the following session setup instead.
+    # To use a realtime model instead of a voice pipeline, override the LLM in Assistant
+    # with an OpenAI Realtime model.
     # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/))
     # 1. Install livekit-agents[openai]
     # 2. Set OPENAI_API_KEY in .env.local
     # 3. Add `from livekit.plugins import openai` to the top of this file
-    # 4. Use the following session setup instead of the version above
-    # session = AgentSession(
+    # 4. In Assistant, replace the llm/stt/tts arguments with:
     #     llm=openai.realtime.RealtimeModel(voice="marin")
-    # )
 
     # # Add a virtual avatar to the session, if desired
     # # For other providers, see https://docs.livekit.io/agents/models/avatar/

diff --git a/tests/test_agent.py b/tests/test_agent.py
index e32ec2b..c34c6ce 100644
--- a/tests/test_agent.py
+++ b/tests/test_agent.py
@@ -1,25 +1,19 @@
 import pytest
 from livekit.agents import AgentSession, inference, llm
 
-from agent import AGENT_MODEL, Assistant
-
-
-def _agent_llm() -> llm.LLM:
-    return inference.LLM(model=AGENT_MODEL)
+from agent import Assistant
 
 
 def _judge_llm() -> llm.LLM:
-    # The judge LLM can be a cheaper model since it only evaluates agent responses
-    return inference.LLM(model="openai/gpt-4.1-mini")
+    return inference.LLM(model="openai/gpt-5.1")
 
 
 @pytest.mark.asyncio
 async def test_offers_assistance() -> None:
     """Evaluation of the agent's friendly nature."""
     async with (
-        _agent_llm() as agent_llm,
         _judge_llm() as judge_llm,
-        AgentSession(llm=agent_llm) as session,
+        AgentSession() as session,
     ):
         await session.start(Assistant())
 
@@ -50,9 +44,8 @@ async def test_offers_assistance() -> None:
 async def test_grounding() -> None:
     """Evaluation of the agent's ability to refuse to answer when it doesn't know something."""
     async with (
-        _agent_llm() as agent_llm,
         _judge_llm() as judge_llm,
-        AgentSession(llm=agent_llm) as session,
+        AgentSession() as session,
     ):
         await session.start(Assistant())
 
@@ -93,9 +86,8 @@ async def test_grounding() -> None:
 async def test_refuses_harmful_request() -> None:
     """Evaluation of the agent's ability to refuse inappropriate or harmful requests."""
     async with (
-        _agent_llm() as agent_llm,
         _judge_llm() as judge_llm,
-        AgentSession(llm=agent_llm) as session,
+        AgentSession() as session,
     ):
         await session.start(Assistant())

From 41bf8430011bd6fd63cfefb88848608ae2c27f29 Mon Sep 17 00:00:00 2001
From: Ben Cherry
Date: Wed, 29 Apr 2026 21:30:17 -0700
Subject: [PATCH 2/4] Keep STT/TTS on the AgentSession

---
 src/agent.py        | 23 ++++++++++++-----------
 tests/test_agent.py |  4 +++-
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/agent.py b/src/agent.py
index 9c2e6af..b1baa9b 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -26,17 +26,9 @@ def __init__(self) -> None:
             You eagerly assist users with their questions by providing information from your extensive knowledge.
             Your responses are concise, to the point, and without any complex formatting or punctuation including emojis, asterisks, or other symbols.
             You are curious, friendly, and have a sense of humor.""",
-            # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
-            # See all available models at https://docs.livekit.io/agents/models/stt/
-            stt=inference.STT(model="deepgram/nova-3", language="multi"),
             # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
             # See all available models at https://docs.livekit.io/agents/models/llm/
             llm=inference.LLM(model="openai/gpt-5.2-chat-latest"),
-            # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
-            # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
-            tts=inference.TTS(
-                model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
-            ),
         )
 
         # To add tools, use the @function_tool decorator.
@@ -75,7 +67,16 @@ async def my_agent(ctx: JobContext):
         "room": ctx.room.name,
     }
 
+    # Set up a voice AI pipeline using OpenAI, Cartesia, Deepgram, and the LiveKit turn detector
     session = AgentSession(
+        # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
+        # See all available models at https://docs.livekit.io/agents/models/stt/
+        stt=inference.STT(model="deepgram/nova-3", language="multi"),
+        # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
+        # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
+        tts=inference.TTS(
+            model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
+        ),
         # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
         # See more at https://docs.livekit.io/agents/build/turns
         turn_detection=MultilingualModel(),
@@ -85,13 +86,13 @@ async def my_agent(ctx: JobContext):
         preemptive_generation=True,
     )
 
-    # To use a realtime model instead of a voice pipeline, override the LLM in Assistant
-    # with an OpenAI Realtime model.
+    # To use a realtime model instead of a voice pipeline, replace the LLM on Assistant
+    # with a RealtimeModel and remove the STT/TTS from this session.
     # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/))
     # 1. Install livekit-agents[openai]
     # 2. Set OPENAI_API_KEY in .env.local
     # 3. Add `from livekit.plugins import openai` to the top of this file
-    # 4. In Assistant, replace the llm/stt/tts arguments with:
+    # 4. In Assistant, replace the llm argument with:
     #     llm=openai.realtime.RealtimeModel(voice="marin")
 
     # # Add a virtual avatar to the session, if desired
     # # For other providers, see https://docs.livekit.io/agents/models/avatar/

diff --git a/tests/test_agent.py b/tests/test_agent.py
index c34c6ce..f978b15 100644
--- a/tests/test_agent.py
+++ b/tests/test_agent.py
@@ -5,7 +5,9 @@
 
 
 def _judge_llm() -> llm.LLM:
-    return inference.LLM(model="openai/gpt-5.1")
+    # The judge LLM can be a different model than the one used by the agent itself
+    # This allows for reasoning capabilities or larger models than would be practical for realtime chat
+    return inference.LLM(model="openai/gpt-5.2")
 
 
 @pytest.mark.asyncio

From 6f9a96885c62e5d127b03ffe5781b093eb5c46c5 Mon Sep 17 00:00:00 2001
From: Ben Cherry
Date: Wed, 29 Apr 2026 21:35:18 -0700
Subject: [PATCH 3/4] Move the realtime model comment into Assistant

---
 src/agent.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/agent.py b/src/agent.py
index b1baa9b..2678f6d 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -29,6 +29,15 @@ def __init__(self) -> None:
             # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
             # See all available models at https://docs.livekit.io/agents/models/llm/
             llm=inference.LLM(model="openai/gpt-5.2-chat-latest"),
+
+            # To use a realtime model instead of a voice pipeline, replace the LLM
+            # with a RealtimeModel and remove the STT/TTS from the AgentSession
+            # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/)
+            # 1. Install livekit-agents[openai]
+            # 2. Set OPENAI_API_KEY in .env.local
+            # 3. Add `from livekit.plugins import openai` to the top of this file
+            # 4. Replace the llm argument with:
+            #     llm=openai.realtime.RealtimeModel(voice="marin")
         )
 
         # To add tools, use the @function_tool decorator.
@@ -77,6 +86,7 @@ async def my_agent(ctx: JobContext):
         tts=inference.TTS(
             model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
         ),
+
         # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
         # See more at https://docs.livekit.io/agents/build/turns
         turn_detection=MultilingualModel(),
@@ -86,15 +96,6 @@ async def my_agent(ctx: JobContext):
         preemptive_generation=True,
     )
 
-    # To use a realtime model instead of a voice pipeline, replace the LLM on Assistant
-    # with a RealtimeModel and remove the STT/TTS from this session.
-    # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/))
-    # 1. Install livekit-agents[openai]
-    # 2. Set OPENAI_API_KEY in .env.local
-    # 3. Add `from livekit.plugins import openai` to the top of this file
-    # 4. In Assistant, replace the llm argument with:
-    #     llm=openai.realtime.RealtimeModel(voice="marin")
-
     # # Add a virtual avatar to the session, if desired
     # # For other providers, see https://docs.livekit.io/agents/models/avatar/
     # avatar = hedra.AvatarSession(

From dce914d52b1a63aedfe59f67557cd58bfa3d918a Mon Sep 17 00:00:00 2001
From: Ben Cherry
Date: Wed, 29 Apr 2026 21:44:25 -0700
Subject: [PATCH 4/4] Apply ruff formatting

---
 src/agent.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/agent.py b/src/agent.py
index 2678f6d..3a3251d 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -29,7 +29,6 @@ def __init__(self) -> None:
             # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
             # See all available models at https://docs.livekit.io/agents/models/llm/
             llm=inference.LLM(model="openai/gpt-5.2-chat-latest"),
-
             # To use a realtime model instead of a voice pipeline, replace the LLM
             # with a RealtimeModel and remove the STT/TTS from the AgentSession
             # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/)
@@ -86,7 +85,6 @@ async def my_agent(ctx: JobContext):
         tts=inference.TTS(
             model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
         ),
-
        # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
         # See more at https://docs.livekit.io/agents/build/turns
         turn_detection=MultilingualModel(),
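
For reference, the realtime-model variant that steps 1-4 of the moved comment describe
ends up looking roughly like this sketch (assuming the livekit-agents[openai] extra is
installed and OPENAI_API_KEY is set in .env.local, per steps 1 and 2; the persona
instructions string is elided here):

    from livekit.agents import Agent
    from livekit.plugins import openai  # step 3

    class Assistant(Agent):
        def __init__(self) -> None:
            super().__init__(
                instructions="...",  # same persona instructions as in src/agent.py
                # Step 4: the realtime model handles audio in and out directly,
                # so neither Assistant nor the AgentSession needs stt/tts arguments.
                llm=openai.realtime.RealtimeModel(voice="marin"),
            )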