https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live

Install the required packages

In [1]:
!pip3 install -qU google-genai

Restart the kernel after installing the package so that the newly installed version is picked up.

In [2]:
from IPython import Application

# Ask the running kernel to shut down and restart, so that the package
# installed in the previous cell is importable in the cells that follow.
app = Application.instance()
app.kernel.do_shutdown(restart=True)

{'status': 'ok', 'restart': True}

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [2]:
import sys

# Only authenticate when the `google.colab` package is already loaded in this
# interpreter; in any other environment this cell does nothing.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    from google.colab import auth

    auth.authenticate_user()

Import dependencies

In [5]:
from google import genai
from google.genai.types import LiveConnectConfig

import os

from IPython.display import Audio, Markdown, display

import numpy as np

Set environment variables

In [6]:
# Google Cloud project used for all Vertex AI calls in this notebook.
PROJECT_ID = "cloud-llm-preview1"  # @param {type: "string"}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. Do not wrap the lookup in str(): a missing
    # variable would become the literal string "None" and be used as a project id.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
    if not PROJECT_ID:
        raise ValueError(
            "Set PROJECT_ID above or export the GOOGLE_CLOUD_PROJECT environment variable."
        )

# Region for the Vertex AI endpoint; defaults to us-central1 when the
# GOOGLE_CLOUD_REGION environment variable is not set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

Initialize Vertex

In [9]:
# Create the Gen AI SDK client in Vertex AI mode, bound to the project and
# region configured above. Every example below connects through this client.
client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=LOCATION,
)

### Load the Gemini 2.0 Flash model

The Multimodal Live API is a new capability introduced with the [Gemini 2.0 Flash model](https://cloud.google.com/vertex-ai/generative-ai/docs/gemini-v2).

In [10]:
# Model identifier passed to every `client.aio.live.connect(...)` call below.
MODEL_ID = "gemini-2.0-flash-exp"  # @param {type: "string"}

## **Example 1**: Text to text

You send one text prompt and receive a text response.

In [24]:
config = LiveConnectConfig(response_modalities=["TEXT"])

async with client.aio.live.connect(
                    model=MODEL_ID,
                    config=config,
                ) as session:
    text_input = "Hello? Gemini are you there?"
    display(Markdown(f"**Input:** {text_input}"))

    await session.send(input=text_input, end_of_turn=True)

    text_response = []

    async for message in session.receive():
        print(message)
        if message.server_content.model_turn.parts:
            print(message.text)
            text_response.append(message.text)

    display(Markdown(f"**Response >** {''.join(text_response)}"))

**Input:** Hello? Gemini are you there?

setup_complete=None server_content=LiveServerContent(model_turn=Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=None, text='Yes')], role='model'), turn_complete=None, interrupted=None) tool_call=None tool_call_cancellation=None
Yes
setup_complete=None server_content=LiveServerContent(model_turn=Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=None, text=", I'm here. What would you like to talk about?\n")], role='model'), turn_complete=None, interrupted=None) tool_call=None tool_call_cancellation=None
, I'm here. What would you like to talk about?

setup_complete=None server_content=LiveServerContent(model_turn=Content(parts=None, role='model'), turn_complete=True, interrupted=None) tool_call=None tool_call_cancellation=None


**Response >** Yes, I'm here. What would you like to talk about?


## **Example 2**: Text to audio

You send one text prompt and receive an audio response.

In [28]:
config = LiveConnectConfig(response_modalities=["AUDIO"])

async with client.aio.live.connect(
                    model=MODEL_ID,
                    config=config,
                ) as session:
    text_input = "Hello? Gemini are you there?"
    display(Markdown(f"**Input:** {text_input}"))

    await session.send(input=text_input, end_of_turn=True)

    audio_response = []

    async for message in session.receive():
        print(message)
        if message.server_content.model_turn:
          for part in message.server_content.model_turn.parts:
            audio_response.append(np.frombuffer(part.inline_data.data, dtype=np.int16))

    if audio_response:
        display(Audio(np.concatenate(audio_response), rate=24000, autoplay=True))

**Input:** Hello? Gemini are you there?

setup_complete=None server_content=LiveServerContent(model_turn=Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=Blob(data=b'\x01\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0

## **Example 3**: Text to audio in a chat

**Step 1**: You set up a chat with the API to answer your text prompts and return responses in audio.

In [29]:
# Ask the Live API to answer with audio only.
config = LiveConnectConfig(response_modalities=["AUDIO"])


async def main() -> None:
    """Run an interactive text-in / audio-out chat over one Live API session.

    Repeatedly reads a prompt from standard input, sends it to the model, and
    plays the audio reply. The loop ends when the user enters `q`, `quit`, or
    `exit` (case-insensitive); the session is closed on exit.
    """
    async with client.aio.live.connect(model=MODEL_ID, config=config) as session:

        async def send() -> bool:
            """Read one prompt and send it.

            Returns:
                False if the user asked to quit (nothing is sent), True otherwise.
            """
            text_input = input("Input > ")
            if text_input.lower() in ("q", "quit", "exit"):
                return False
            await session.send(input=text_input, end_of_turn=True)
            return True

        async def receive() -> None:
            """Collect the audio chunks of one model turn and play them."""
            audio_data = []

            async for message in session.receive():
                server_content = message.server_content
                if not server_content:
                    # Messages without server content carry no audio or turn state.
                    continue

                model_turn = server_content.model_turn
                # `parts` can be None even when `model_turn` is present.
                if model_turn and model_turn.parts:
                    for part in model_turn.parts:
                        if part.inline_data and part.inline_data.data:
                            audio_data.append(
                                np.frombuffer(part.inline_data.data, dtype=np.int16)
                            )

                if server_content.turn_complete:
                    display(Markdown("**Response >**"))
                    # np.concatenate raises ValueError on an empty list, so only
                    # play audio when the turn actually produced some.
                    if audio_data:
                        display(
                            Audio(np.concatenate(audio_data), rate=24000, autoplay=True)
                        )
                    break

        while True:
            if not await send():
                break
            await receive()

**Step 2**: Run the chat. Enter your prompts, or type `q`, `quit`, or `exit` to end the session.


In [31]:
# Start the interactive chat loop; it keeps prompting until the user enters
# `q`, `quit`, or `exit`.
await main()

Input > hi


**Response >**

Input > Christmas is almost here ....


**Response >**

Input > Nope. Can you help me make some?


**Response >**

Input > Hmm... are there any holiday markets?


**Response >**

Input > Nope... Are there any in San Francisco


**Response >**

Input > Yes


**Response >**

Input > Something that I can visit tomorrow


**Response >**

Input > Outdoors


**Response >**

Input > Sure


**Response >**

Input > Yes!


**Response >**

Input > quit
