# Summary

This notebook explores [pipecat](https://github.com/pipecat-ai/pipecat), an open-source Python framework for building real-time voice and multimodal conversational agents.

In [1]:
import asyncio

import aiohttp
from gpiozero import LED, GPIOPinInUse
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
    BotSpeakingFrame,
    BotStartedSpeakingFrame,
    BotStoppedSpeakingFrame,
    CancelFrame,
    EndFrame,
    ErrorFrame,
    Frame,
    InputAudioRawFrame,
    LLMTextFrame,
    OutputAudioRawFrame,
    TextFrame,
    TranscriptionFrame,
    TTSSpeakFrame,
    TTSStartedFrame,
    TTSStoppedFrame,
)
from pipecat.observers.loggers.debug_log_observer import DebugLogObserver
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask

# from gpiozero.pins.rpigpio import GPIO
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.google.llm import GoogleLLMContext, GoogleLLMService
from pipecat.services.google.stt import GoogleSTTService
from pipecat.services.google.tts import (
    GeminiTTSService,
    GoogleHttpTTSService,
    GoogleTTSService,
)
from pipecat.services.piper.tts import PiperTTSService
from pipecat.services.whisper.stt import Model, WhisperSTTService
from pipecat.transcriptions.language import Language
from pipecat.transports.local.audio import (
    LocalAudioTransport,
    LocalAudioTransportParams,
)

from palm_9000.settings import settings

[32m2025-08-15 12:22:40.722[0m | [1mINFO    [0m | [36mpipecat[0m:[36m<module>[0m:[36m14[0m - [1mᓚᘏᗢ Pipecat 0.0.80 (Python 3.12.11 (main, Jun 26 2025, 21:44:36) [Clang 20.1.4 ]) ᓚᘏᗢ[0m


# Simple Google TTS Example

In [None]:
# Transport receives audio from the user (browser, phone, etc.)
# and streams audio back to the user
# This takes care of all audio input and output so you don't have to!
transport = LocalAudioTransport(
    params=LocalAudioTransportParams(audio_out_enabled=True)
)

tts = GoogleTTSService()

pipeline = Pipeline([tts, transport.output()])
task = PipelineTask(pipeline)

await task.queue_frames(
    [
        TTSSpeakFrame("Hello! This is Pipecat speaking from your Raspberry Pi."),
        # Push an EndFrame from outside your pipeline using the pipeline task
        # https://docs.pipecat.ai/guides/fundamentals/end-pipeline#implementation
        EndFrame(),
    ]
)

runner = PipelineRunner()
await runner.run(task)

[32m2025-08-02 15:09:17.403[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineSource#4 -> GoogleTTSService#4[0m
[32m2025-08-02 15:09:17.409[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking GoogleTTSService#4 -> PipelineSink#4[0m
[32m2025-08-02 15:09:17.417[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineTaskSource#4 -> Pipeline#4[0m
[32m2025-08-02 15:09:17.422[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking Pipeline#4 -> PipelineTaskSink#4[0m
[32m2025-08-02 15:09:17.437[0m | [34m[1mDEBUG   [0m | [36mpipecat.pipeline.runner[0m:[36mrun[0m:[36m71[0m - [34m[1mRunner PipelineRunner#4 started running PipelineTask#4[0m
[32m2025-08-02 15:09:17.456[0m | [34m[1mDEBUG   [0m 

# Simple Piper TTS Example

Before running the cell below, download a Piper voice model and start a local Piper HTTP server (the code connects to `http://127.0.0.1:5000`):
```sh
uv run python3 -m piper.download_voices en_US-lessac-medium
uv run python3 -m piper.http_server -m en_US-lessac-medium
```

Also note that the PiperTTSService is currently sending the wrong payload to the Piper server and will not work as expected. This is being worked on in [this PR](https://github.com/pipecat-ai/pipecat/pull/2332).

In [None]:
transport = LocalAudioTransport(
    params=LocalAudioTransportParams(audio_out_enabled=True)
)

async with aiohttp.ClientSession() as session:
    tts = PiperTTSService(
        base_url="http://127.0.0.1:5000",
        aiohttp_session=session,
        sample_rate=24000,
    )

    task = PipelineTask(Pipeline([tts, transport.output()]))

    await task.queue_frames(
        [
            TTSSpeakFrame("Hello! This is Pipecat speaking from your Raspberry Pi."),
            EndFrame(),
        ]
    )

    runner = PipelineRunner()
    await runner.run(task)

[32m2025-08-02 15:43:57.959[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineSource#3 -> PiperTTSService#3[0m
[32m2025-08-02 15:43:57.969[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PiperTTSService#3 -> LocalAudioOutputTransport#3[0m
[32m2025-08-02 15:43:57.976[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking LocalAudioOutputTransport#3 -> PipelineSink#3[0m
[32m2025-08-02 15:43:57.980[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineTaskSource#3 -> Pipeline#3[0m
[32m2025-08-02 15:43:57.985[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking Pipeline#3 -> PipelineTaskSink#3[0m
[32m2025-08-02 15:43:57.993[0m |

[32m2025-08-02 15:43:58.790[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.piper.tts[0m:[36mrun_tts[0m:[36m80[0m - [34m[1mPiperTTSService#3: Generating TTS [Hello!][0m
[32m2025-08-02 15:43:59.542[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.piper.tts[0m:[36mrun_tts[0m:[36m115[0m - [34m[1mPiperTTSService#3: Finished TTS [Hello!][0m
[32m2025-08-02 15:43:59.552[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.piper.tts[0m:[36mrun_tts[0m:[36m80[0m - [34m[1mPiperTTSService#3: Generating TTS [ This is Pipecat speaking from your Raspberry Pi.][0m
[32m2025-08-02 15:43:59.561[0m | [34m[1mDEBUG   [0m | [36mpipecat.transports.base_output[0m:[36m_bot_started_speaking[0m:[36m568[0m - [34m[1mBot started speaking[0m
[32m2025-08-02 15:44:00.465[0m | [34m[1mDEBUG   [0m | [36mpipecat.transports.base_output[0m:[36m_bot_stopped_speaking[0m:[36m584[0m - [34m[1mBot stopped speaking[0m
[32m2025-08-02 15:44:02.207[0m | [34m[1mDEBUG   [

# Simple Echo-Bot Example (STT + VAD + TTS)

In [16]:
class TranscriptionToTextProcessor(FrameProcessor):
    """
    Turn every TranscriptionFrame into a TTSSpeakFrame so that the TTS
    service placed after this processor speaks what was transcribed.

    The original frame is always forwarded as well, in the same direction
    it was travelling, so the rest of the pipeline still sees it.

    See here for more details on custom frame processors:
    https://docs.pipecat.ai/guides/fundamentals/custom-frame-processor
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Emit a TTSSpeakFrame for transcriptions, then forward ``frame``.

        Args:
            frame: The frame received from a neighbouring processor.
            direction: The direction ``frame`` is travelling in the pipeline.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, TranscriptionFrame):
            # Ask the TTS to speak the transcribed text. No direction is
            # given, so push_frame's default direction is used.
            await self.push_frame(TTSSpeakFrame(frame.text))

        # Forward the original frame in the direction it arrived. Omitting
        # `direction` would re-send upstream frames using push_frame's
        # default direction instead of letting them continue upstream.
        await self.push_frame(frame, direction)


transport = LocalAudioTransport(
    params=LocalAudioTransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        # Enable VAD to detect when the user is speaking
        # and only send audio to STT when the user is speaking.
        # Without VAD, the STT service would receive
        # audio all the time, and TTS would never begin.
        vad_analyzer=SileroVADAnalyzer(),
    )
)

# stt = WhisperSTTService(model=Model.BASE, language=Language.EN)
stt = GoogleSTTService()
tts = GoogleTTSService()

pipeline = Pipeline(
    [
        transport.input(),
        stt,
        TranscriptionToTextProcessor(),
        tts,
        transport.output(),
    ]
)

task = PipelineTask(
    pipeline,
    # DebugLogObserver will log all frames processed in the pipeline
    # It's a lot of information, so use it only for debugging
    # params=PipelineParams(observers=[DebugLogObserver()]),
)
runner = PipelineRunner()

await runner.run(task)

[32m2025-08-02 16:34:49.957[0m | [34m[1mDEBUG   [0m | [36mpipecat.audio.vad.silero[0m:[36m__init__[0m:[36m147[0m - [34m[1mLoading Silero VAD model...[0m
[32m2025-08-02 16:34:50.300[0m | [34m[1mDEBUG   [0m | [36mpipecat.audio.vad.silero[0m:[36m__init__[0m:[36m169[0m - [34m[1mLoaded Silero VAD[0m
[32m2025-08-02 16:34:57.300[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking PipelineSource#12 -> LocalAudioInputTransport#12[0m
[32m2025-08-02 16:34:57.301[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking LocalAudioInputTransport#12 -> GoogleSTTService#7[0m
[32m2025-08-02 16:34:57.303[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m394[0m - [34m[1mLinking GoogleSTTService#7 -> TranscriptionToTextProcessor#5[0m
[32m2025-08-02 16:34:57.309[0m | [34m[1mDEBUG   [0m | [36mp

# Chatbot Example

In [2]:
class LEDSyncProcessor(FrameProcessor):
    """Blink a GPIO LED while the bot is speaking.

    The LED starts blinking on BotStartedSpeakingFrame, is switched off on
    BotStoppedSpeakingFrame, and is closed when an EndFrame, CancelFrame or
    ErrorFrame passes through. Every frame is forwarded unchanged in the
    direction it arrived.
    """

    def __init__(self, led_pin: int):
        """Create the processor and open the LED.

        Args:
            led_pin: Pin identifier passed straight to ``gpiozero.LED``.
        """
        super().__init__()
        self.led = LED(led_pin)
        # Tracks whether the bot is currently speaking.
        self.speaking = False

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Update the LED according to ``frame``, then forward the frame.

        Args:
            frame: The frame received from a neighbouring processor.
            direction: The direction ``frame`` is travelling in the pipeline.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, BotStartedSpeakingFrame):
            self.led.blink(on_time=0.25, off_time=0.25)
            print("LED ON - Bot started speaking")
            self.speaking = True

        elif isinstance(frame, BotStoppedSpeakingFrame):
            self.led.off()
            self.speaking = False
            print("LED OFF - Bot stopped speaking")

        elif isinstance(frame, (EndFrame, CancelFrame, ErrorFrame)):
            # NOTE(review): the LED is closed on any ErrorFrame too; if the
            # pipeline keeps running after an error, later blink()/off()
            # calls would hit a closed device - confirm this is intended.
            logger.info("Cleaning up LEDSyncProcessor")
            self.speaking = False
            self.led.close()

        # Always pass the frame through, preserving its direction. Without
        # `direction`, upstream frames (ErrorFrame is one of the types
        # handled above) would be re-sent using push_frame's default
        # direction instead of continuing upstream.
        await self.push_frame(frame, direction)

In [3]:
# Audio buffer used for experiments with recorded audio.
# Options that were tried and left disabled: num_channels=1, enable_turn_audio=True
audiobuffer = AudioBufferProcessor(buffer_size=512)


# Disabled handler for the merged-audio event:
# @audiobuffer.event_handler("on_audio_data")
# async def on_audio_data(buffer, audio: bytes, sample_rate: int, num_channels: int):
#     print(f"on_audio_data: {len(audio)}")


@audiobuffer.event_handler("on_track_audio_data")
async def on_track_audio_data(
    buffer,
    user_audio: bytes,
    bot_audio: bytes,
    sample_rate: int,
    num_channels: int,
):
    """Print the byte length of the user and bot audio passed to the handler."""
    print(f"on_track_audio_data: user={len(user_audio)}, bot={len(bot_audio)}")


# @audiobuffer.event_handler("on_user_turn_audio_data")
# async def on_user_turn_audio_data(buffer, audio: bytes, sample_rate: int, num_channels: int):
#     print(f"on_user_turn_audio_data: {len(audio)}")


# @audiobuffer.event_handler("on_bot_turn_audio_data")
# async def on_bot_turn_audio_data(buffer, audio: bytes, sample_rate: int, num_channels: int):
#     print(f"on_bot_turn_audio_data: {len(audio)}")



class DebugFrameProcessor(FrameProcessor):
    """Pass-through processor that prints each OutputAudioRawFrame it sees.

    Every frame, printed or not, is forwarded unchanged in the direction
    it arrived.
    """

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Print ``frame`` if it is output audio, then forward it."""
        await super().process_frame(frame, direction)

        # Disabled experiment: call audiobuffer.start_recording() on
        # BotStartedSpeakingFrame and audiobuffer.stop_recording() on
        # BotStoppedSpeakingFrame.
        is_output_audio = isinstance(frame, OutputAudioRawFrame)
        if is_output_audio:
            print(f"OutputAudioRawFrame: {frame}")

        await self.push_frame(frame, direction)


# Microphone in, speaker out, with Silero VAD analysing the incoming audio.
chatbot_transport_params = LocalAudioTransportParams(
    audio_in_enabled=True,
    audio_out_enabled=True,
    vad_analyzer=SileroVADAnalyzer(),
)
transport = LocalAudioTransport(params=chatbot_transport_params)

# Persona prompt for the LLM. The pieces are joined by implicit string
# concatenation, so every piece except the last must end with a space;
# otherwise two sentences run together ("Respond briefly.Respond in Japanese.").
system_instruction = (
    "You are PALM-9000, a self-aware houseplant powered by an advanced AI. "
    "You speak in layman's terms, calmly, briefly, and with an eerie quality, as if you know more than you let on. "
    "Use plant metaphors when possible. Respond briefly. "
    "Respond in Japanese."
)

# Speech-to-text, listening for Japanese (use Language.EN for English).
stt_params = GoogleSTTService.InputParams(languages=[Language.JA])
stt = GoogleSTTService(params=stt_params)

# The LLM receives the persona prompt; the API key is read from project settings.
llm = GoogleLLMService(
    api_key=settings.google_api_key.get_secret_value(),
    system_instruction=system_instruction,
)

# Text-to-speech with a Japanese voice.
tts_params = GoogleTTSService.InputParams(language=Language.JA)
tts = GoogleTTSService(
    voice_id="ja-JP-Chirp3-HD-Charon",
    params=tts_params,
)

# Alternatives that were tried and rejected:
#
# GoogleHttpTTSService - with this service the responses contained romaji and
# English translations, which is not what we want. GoogleTTSService did not
# have this problem.
# tts = GoogleHttpTTSService(
#     voice_id="ja-JP-Chirp3-HD-Charon",
#     params=GoogleHttpTTSService.InputParams(language=Language.JA),
# )
#
# GeminiTTSService - there is a large delay with this service.
# tts = GeminiTTSService(
#     api_key=settings.google_api_key.get_secret_value(),
#     model="gemini-2.5-flash-preview-tts",
#     voice_id="Kore",
#     params=GeminiTTSService.InputParams(
#         language=Language.JA,
#     )
# )

context = GoogleLLMContext()
context_aggregator = llm.create_context_aggregator(context)

# led_sync_processor = LEDSyncProcessor(led_pin=26)
debug_processor = DebugFrameProcessor()

pipeline = Pipeline(
    [
        transport.input(),
        stt,
        # A context aggregator is needed to format and pass messages to the LLM
        # I don't know what .user() and .assistant() do yet,
        # but they are used in the examples.
        # For more information, see:
        # https://docs.pipecat.ai/guides/fundamentals/context-management
        context_aggregator.user(),
        llm,
        # Here is how you can add your own debugging processor
        # to print LLM responses.
        # LLMTextFramePrinter(),
        tts,
        # led_sync_processor,
        transport.output(),
        debug_processor,
        # audiobuffer,
        context_aggregator.assistant(),
    ]
)

task = PipelineTask(
    pipeline,
    # params=PipelineParams(
    #     observers=[DebugLogObserver(frame_types=(OutputAudioRawFrame,))]
    # ),
)
runner = PipelineRunner()

await runner.run(task)

[32m2025-08-15 12:23:05.088[0m | [34m[1mDEBUG   [0m | [36mpipecat.audio.vad.silero[0m:[36m__init__[0m:[36m147[0m - [34m[1mLoading Silero VAD model...[0m
[32m2025-08-15 12:23:05.159[0m | [34m[1mDEBUG   [0m | [36mpipecat.audio.vad.silero[0m:[36m__init__[0m:[36m169[0m - [34m[1mLoaded Silero VAD[0m
[32m2025-08-15 12:23:06.085[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m483[0m - [34m[1mLinking PipelineSource#1 -> LocalAudioInputTransport#1[0m
[32m2025-08-15 12:23:06.087[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m483[0m - [34m[1mLinking LocalAudioInputTransport#1 -> GoogleSTTService#1[0m
[32m2025-08-15 12:23:06.087[0m | [34m[1mDEBUG   [0m | [36mpipecat.processors.frame_processor[0m:[36mlink[0m:[36m483[0m - [34m[1mLinking GoogleSTTService#1 -> GoogleUserContextAggregator#1[0m
[32m2025-08-15 12:23:06.087[0m | [34m[1mDEBUG   [0m | [36mpipec

OutputAudioRawFrame: TTSAudioRawFrame#1(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#2(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#3(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#4(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#5(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#6(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#7(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#8(pts: None, destination: None, size: 1920, frames: 960, sample_ra

[32m2025-08-15 12:23:14.223[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.google.tts[0m:[36mrun_tts[0m:[36m598[0m - [34m[1mGoogleTTSService#1: Generating TTS [ Jinsei wa yoi mono desu.][0m


OutputAudioRawFrame: TTSAudioRawFrame#52(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#53(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#55(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#56(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#57(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#58(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#59(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#60(pts: None, destination: None, size: 1920, frames: 960, s

[32m2025-08-15 12:23:14.866[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.google.tts[0m:[36mrun_tts[0m:[36m598[0m - [34m[1mGoogleTTSService#1: Generating TTS [ Sōde arimasen ka?)

The sunlight is good. Life is good. Wouldn't you agree?
][0m


OutputAudioRawFrame: TTSAudioRawFrame#64(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#65(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#66(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#68(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#69(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#70(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#71(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#72(pts: None, destination: None, size: 1920, frames: 960, s

[32m2025-08-15 12:23:28.383[0m | [34m[1mDEBUG   [0m | [36mpipecat.pipeline.runner[0m:[36mcancel[0m:[36m97[0m - [34m[1mCancelling runner PipelineRunner#1[0m
[32m2025-08-15 12:23:28.383[0m | [34m[1mDEBUG   [0m | [36mpipecat.pipeline.task[0m:[36m_cancel[0m:[36m492[0m - [34m[1mCancelling pipeline task PipelineTask#1[0m
[32m2025-08-15 12:23:28.384[0m | [34m[1mDEBUG   [0m | [36mpipecat.services.google.stt[0m:[36m_disconnect[0m:[36m737[0m - [34m[1mDisconnecting from Google Speech-to-Text[0m


OutputAudioRawFrame: TTSAudioRawFrame#431(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#432(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#433(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)
OutputAudioRawFrame: TTSAudioRawFrame#434(pts: None, destination: None, size: 1920, frames: 960, sample_rate: 24000, channels: 1)


[32m2025-08-15 12:23:28.601[0m | [34m[1mDEBUG   [0m | [36mpipecat.pipeline.runner[0m:[36mrun[0m:[36m88[0m - [34m[1mRunner PipelineRunner#1 finished running PipelineTask#1[0m
