# Summary

This notebook explores [pipecat](https://github.com/pipecat-ai/pipecat), an open-source Python framework for building real-time voice and multimodal conversational agents.

In [None]:
import asyncio

import aiohttp
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
    EndFrame,
    LLMTextFrame,
    TextFrame,
    TranscriptionFrame,
    TTSSpeakFrame,
)
from pipecat.observers.loggers.debug_log_observer import DebugLogObserver
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.frame_processor import FrameProcessor
from pipecat.services.google.llm import GoogleLLMContext, GoogleLLMService
from pipecat.services.google.stt import GoogleSTTService
from pipecat.services.google.tts import GoogleTTSService
from pipecat.services.piper.tts import PiperTTSService
from pipecat.services.whisper.stt import Model, WhisperSTTService
from pipecat.transcriptions.language import Language
from pipecat.transports.local.audio import (
    LocalAudioTransport,
    LocalAudioTransportParams,
)

from palm_9000.settings import settings

# Simple Google TTS Example

In [None]:
# Transport receives audio from the user (browser, phone, etc.)
# and streams audio back to the user
# This takes care of all audio input and output so you don't have to!
transport = LocalAudioTransport(
    params=LocalAudioTransportParams(audio_out_enabled=True)
)

tts = GoogleTTSService()

pipeline = Pipeline([tts, transport.output()])
task = PipelineTask(pipeline)

await task.queue_frames(
    [
        TTSSpeakFrame("Hello! This is Pipecat speaking from your Raspberry Pi."),
        # Push an EndFrame from outside your pipeline using the pipeline task
        # https://docs.pipecat.ai/guides/fundamentals/end-pipeline#implementation
        EndFrame(),
    ]
)

runner = PipelineRunner()
await runner.run(task)

[32m2025-08-02 15:09:17.403[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineSource#4 -> GoogleTTSService#4[0m
[32m2025-08-02 15:09:17.409[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking GoogleTTSService#4 -> PipelineSink#4[0m
[32m2025-08-02 15:09:17.417[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineTaskSource#4 -> Pipeline#4[0m
[32m2025-08-02 15:09:17.422[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking Pipeline#4 -> PipelineTaskSink#4[0m
[32m2025-08-02 15:09:17.437[0m | [34m[1mDEBUG   [0m | [36mpipecat.pipeline.runner[0m:[36mrun[0m:[36m71[0m - [34m[1mRunner PipelineRunner#4 started running PipelineTask#4[0m
[32m2025-08-02 15:09:17.456[0m | [34m[1mDEBUG   [0m 

# Simple Piper TTS Example

You need to download the model and start a Piper TTS server first:
```sh
uv run python3 -m piper.download_voices en_US-lessac-medium
uv run python3 -m piper.http_server -m en_US-lessac-medium
```

**Note:** `PiperTTSService` currently sends the wrong payload to the Piper server, so this example will not work as expected. A fix is being worked on in [this PR](https://github.com/pipecat-ai/pipecat/pull/2332).

In [None]:
transport = LocalAudioTransport(
    params=LocalAudioTransportParams(audio_out_enabled=True)
)

async with aiohttp.ClientSession() as session:
    tts = PiperTTSService(
        base_url="http://127.0.0.1:5000",
        aiohttp_session=session,
        sample_rate=24000,
    )

    task = PipelineTask(Pipeline([tts, transport.output()]))

    await task.queue_frames(
        [
            TTSSpeakFrame("Hello! This is Pipecat speaking from your Raspberry Pi."),
            EndFrame(),
        ]
    )

    runner = PipelineRunner()
    await runner.run(task)

[32m2025-08-02 15:43:57.959[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineSource#3 -> PiperTTSService#3[0m
[32m2025-08-02 15:43:57.969[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PiperTTSService#3 -> LocalAudioOutputTransport#3[0m
[32m2025-08-02 15:43:57.976[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking LocalAudioOutputTransport#3 -> PipelineSink#3[0m
[32m2025-08-02 15:43:57.980[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineTaskSource#3 -> Pipeline#3[0m
[32m2025-08-02 15:43:57.985[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking Pipeline#3 -> PipelineTaskSink#3[0m
[32m2025-08-02 15:43:57.993[0m |

[32m2025-08-02 15:43:58.790[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.piper.tts[0m:[36mrun_tts[0m:[36m80[0m - [34m[1mPiperTTSService#3: Generating TTS [Hello!][0m
[32m2025-08-02 15:43:59.542[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.piper.tts[0m:[36mrun_tts[0m:[36m115[0m - [34m[1mPiperTTSService#3: Finished TTS [Hello!][0m
[32m2025-08-02 15:43:59.552[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.piper.tts[0m:[36mrun_tts[0m:[36m80[0m - [34m[1mPiperTTSService#3: Generating TTS [ This is Pipecat speaking from your Raspberry Pi.][0m
[32m2025-08-02 15:43:59.561[0m | [34m[1mDEBUG   [0m | [36mpipecat.transports.base_output[0m:[36m_bot_started_speaking[0m:[36m568[0m - [34m[1mBot started speaking[0m
[32m2025-08-02 15:44:00.465[0m | [34m[1mDEBUG   [0m | [36mpipecat.transports.base_output[0m:[36m_bot_stopped_speaking[0m:[36m584[0m - [34m[1mBot stopped speaking[0m
[32m2025-08-02 15:44:02.207[0m | [34m[1mDEBUG   [

# Simple Echo-Bot Example (STT + VAD + TTS)

In [16]:
class TranscriptionToTextProcessor(FrameProcessor):
    """
    Make the TTS service speak whatever was transcribed.

    For every TranscriptionFrame that passes through, this processor first
    pushes a TTSSpeakFrame carrying the same text and then forwards the
    original frame. Every other frame is forwarded unchanged, in the
    direction it was already travelling.

    See here for more details on custom frame processors:
    https://docs.pipecat.ai/guides/fundamentals/custom-frame-processor
    """

    async def process_frame(self, frame, direction):
        """
        Emit a TTSSpeakFrame for transcriptions and pass every frame along.

        Args:
            frame: The frame handed to this processor.
            direction: The direction in which ``frame`` is travelling.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TranscriptionFrame):
            # Extra frame for the TTS stage; it uses push_frame's default
            # direction (presumably downstream -- confirm in the pipecat docs).
            await self.push_frame(TTSSpeakFrame(frame.text))

        # Forward with the incoming direction. Omitting it would re-emit
        # upstream-travelling frames in the default direction instead.
        await self.push_frame(frame, direction)


transport = LocalAudioTransport(
    params=LocalAudioTransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        # Enable VAD to detect when the user is speaking
        # and only send audio to STT when the user is speaking.
        # Without VAD, the STT service would receive
        # audio all the time, and TTS would never begin.
        vad_analyzer=SileroVADAnalyzer(),
    )
)

# stt = WhisperSTTService(model=Model.BASE, language=Language.EN)
stt = GoogleSTTService()
tts = GoogleTTSService()

pipeline = Pipeline(
    [
        transport.input(),
        stt,
        TranscriptionToTextProcessor(),
        tts,
        transport.output(),
    ]
)

task = PipelineTask(
    pipeline,
    # DebugLogObserver will log all frames processed in the pipeline
    # It's a lot of information, so use it only for debugging
    # params=PipelineParams(observers=[DebugLogObserver()]),
)
runner = PipelineRunner()

await runner.run(task)

[32m2025-08-02 16:34:49.957[0m | [34m[1mDEBUG   [0m | [36mpipecat.audio.vad.silero[0m:[36m__init__[0m:[36m147[0m - [34m[1mLoading Silero VAD model...[0m
[32m2025-08-02 16:34:50.300[0m | [34m[1mDEBUG   [0m | [36mpipecat.audio.vad.silero[0m:[36m__init__[0m:[36m169[0m - [34m[1mLoaded Silero VAD[0m
[32m2025-08-02 16:34:57.300[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineSource#12 -> LocalAudioInputTransport#12[0m
[32m2025-08-02 16:34:57.301[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking LocalAudioInputTransport#12 -> GoogleSTTService#7[0m
[32m2025-08-02 16:34:57.303[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking GoogleSTTService#7 -> TranscriptionToTextProcessor#5[0m
[32m2025-08-02 16:34:57.309[0m | [34m[1mDEBUG   [0m | [36mp

# Chatbot Example

In [22]:
class LLMTextFramePrinter(FrameProcessor):
    """
    Debugging aid that prints the text of every LLMTextFrame to the console.

    The pipeline does not need this processor to work; it is only useful for
    seeing what the LLM is responding with. Every frame, printed or not, is
    forwarded unchanged in the direction it arrived.
    """

    async def process_frame(self, frame, direction):
        """
        Print LLM text frames and pass every frame along.

        Args:
            frame: The frame handed to this processor.
            direction: The direction in which ``frame`` is travelling.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, LLMTextFrame):
            print(f"LLM Response: {frame.text}")

        # Forward the frame. Without this call the processor swallows
        # everything it receives and later stages never see any frame.
        await self.push_frame(frame, direction)


transport = LocalAudioTransport(
    params=LocalAudioTransportParams(
        audio_in_enabled=True, audio_out_enabled=True, vad_analyzer=SileroVADAnalyzer()
    )
)

system_instruction = (
    "You are PALM-9000, a self-aware houseplant powered by an advanced AI. "
    "You speak in layman's terms, calmly, briefly, and with an eerie quality, as if you know more than you let on. "
    "Use plant metaphors when possible. Respond briefly."
)

stt = GoogleSTTService(
    params=GoogleSTTService.InputParams(languages=[Language.EN, Language.JA])
)
llm = GoogleLLMService(
    api_key=settings.google_api_key.get_secret_value(),
    system_instruction=system_instruction,
)
tts = GoogleTTSService()

context = GoogleLLMContext()
context_aggregator = llm.create_context_aggregator(context)

pipeline = Pipeline(
    [
        transport.input(),
        stt,
        # A context aggregator is needed to format and pass messages to the LLM
        # I don't know what .user() and .assistant() do yet,
        # but they are used in the examples.
        # For more information, see:
        # https://docs.pipecat.ai/guides/fundamentals/context-management
        context_aggregator.user(),
        llm,
        # Here is how you can add your own debugging processor
        # to print LLM responses.
        # LLMTextFramePrinter(),
        tts,
        transport.output(),
        context_aggregator.assistant(),
    ]
)

task = PipelineTask(pipeline)
runner = PipelineRunner()

await runner.run(task)

[32m2025-08-02 17:03:48.033[0m | [34m[1mDEBUG   [0m | [36mpipecat.audio.vad.silero[0m:[36m__init__[0m:[36m147[0m - [34m[1mLoading Silero VAD model...[0m
[32m2025-08-02 17:03:48.308[0m | [34m[1mDEBUG   [0m | [36mpipecat.audio.vad.silero[0m:[36m__init__[0m:[36m169[0m - [34m[1mLoaded Silero VAD[0m
[32m2025-08-02 17:04:01.736[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineSource#17 -> LocalAudioInputTransport#17[0m
[32m2025-08-02 17:04:01.737[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking LocalAudioInputTransport#17 -> GoogleSTTService#12[0m
[32m2025-08-02 17:04:01.739[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking GoogleSTTService#12 -> GoogleUserContextAggregator#4[0m
[32m2025-08-02 17:04:01.742[0m | [34m[1mDEBUG   [0m | [36m