In [18]:
import io
import wave

import pyaudio
from google import genai
from google.genai import types
from pydantic import BaseModel, SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict


class Google(BaseModel):
    """Credentials for the Google GenAI API."""

    # Optional so the settings object can still be built when no key is
    # configured; SecretStr requires an explicit get_secret_value() call to
    # read the raw value.
    api_key: SecretStr | None = None


class Settings(BaseSettings):
    """Notebook configuration resolved by pydantic-settings.

    Values are read from the process environment and from a local ``.env``
    file. Nested fields use ``__`` as the delimiter, so the Google API key is
    expected under a name such as ``GOOGLE__API_KEY``. Keys that do not map to
    a declared field are ignored (``extra="ignore"``).
    """

    # Defaults to an empty Google section (api_key=None) when nothing is set.
    google: Google = Google()
    model_config = SettingsConfigDict(
        env_file=".env", env_nested_delimiter="__", extra="ignore"
    )


# Resolve configuration once from the environment / .env file.
settings = Settings()

# `api_key` is optional on the model, so fail fast with an actionable message
# instead of an AttributeError from calling get_secret_value() on None.
if settings.google.api_key is None:
    raise RuntimeError(
        "Google API key is not configured; "
        "set GOOGLE__API_KEY in the environment or in the .env file."
    )

# The raw secret is unwrapped only at the point where the SDK needs it.
client = genai.Client(api_key=settings.google.api_key.get_secret_value())

In [19]:
def play_audio(
    audio_bytes: bytes,
    sample_rate: int = 24000,
    channels: int = 1,
    sample_width: int = 2,
) -> None:
    """Play raw PCM audio on the default output device via PyAudio.

    The samples are streamed straight to the output stream in fixed-size
    chunks; no intermediate WAV container is needed because the stream is
    configured directly from the arguments.

    Args:
        audio_bytes: Raw (header-less) PCM sample data.
        sample_rate: Sampling rate in Hz.
        channels: Number of interleaved channels (1 = mono).
        sample_width: Bytes per sample (2 = 16-bit PCM).

    Raises:
        Whatever PyAudio raises when the device cannot be opened or written
        to. The stream and the PyAudio instance are released in every case.
    """
    # Nothing to play: avoid opening an audio device for no reason.
    if not audio_bytes:
        return

    frames_per_chunk = 1024
    chunk_size = frames_per_chunk * channels * sample_width

    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            format=pa.get_format_from_width(sample_width),
            channels=channels,
            rate=sample_rate,
            output=True,
        )
        try:
            for offset in range(0, len(audio_bytes), chunk_size):
                stream.write(audio_bytes[offset:offset + chunk_size])
            stream.stop_stream()
        finally:
            # Always release the stream, even if a write failed midway.
            stream.close()
    finally:
        # Always release PortAudio, even if opening the stream failed.
        pa.terminate()

In [None]:
# Ask the TTS preview model for an audio-only response using the "Kore"
# prebuilt voice.
response = client.models.generate_content(
    model="models/gemini-2.5-flash-preview-tts",
    contents="Say quickly, with an ominous voice: 'こんにちは、ボブさん'",
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name="Kore",
                )
            )
        ),
    ),
)

# Any level of the response may be missing (e.g. an empty or blocked
# generation), so walk it defensively and raise a readable error instead of an
# opaque IndexError/AttributeError/TypeError from a long attribute chain.
first_candidate = response.candidates[0] if response.candidates else None
content = first_candidate.content if first_candidate else None
first_part = content.parts[0] if content and content.parts else None
audio_blob = first_part.inline_data if first_part else None
if audio_blob is None or not audio_blob.data:
    raise RuntimeError("TTS response contained no inline audio data")

# Raw PCM bytes returned by the model; consumed by the playback cell.
data = audio_blob.data

In [21]:
# Play the audio bytes extracted from the TTS response, using play_audio's
# default sample rate.
play_audio(data)