In [1]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Getting started with Multimodal Live API

| | |
|-|-|
| Author(s) | [Laxmi Harikumar](https://github.com/laxmi-genai) |

# Overview

The [Multimodal Live API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live) enables low-latency, two-way interactions that use text, audio, and video input, with audio and text output. This facilitates natural, human-like voice conversations with the ability to interrupt the model at any time. The model's video understanding capability expands communication modalities, enabling you to share camera input or screencasts and ask questions about them.

This notebook demonstrates the building blocks of the Multimodal Live API, using the Google Gen AI SDK for Python (`google-genai`) with the Gemini 2.0 Flash model on Vertex AI.

For more information, see the [Generative AI on Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/overview) documentation.

### Objectives


This notebook is a beginner's guide (the first in its series) to getting started with the Multimodal Live API using the Google Gen AI SDK for Python. By the end of this notebook, you will be able to:

* Understand the basic concepts of Multimodal Live API
* Use the Google Gen AI SDK for Python to interact with the API on Vertex AI
* Send and receive text and audio data
* Integrate Google Search as a tool

This hands-on approach is intended to give you a deeper understanding of the underlying building blocks and concepts.



### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator) to generate a cost estimate based on your projected usage.

#### Install the required packages

In [None]:
!pip3 install -qU google-genai

#### Restart the kernel after installing the package

In [1]:
from IPython import Application

# Restart the current kernel so the freshly installed package is picked up.
# The call is the last expression in the cell, so its result is displayed.
Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

#### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [1]:
import sys

# The Colab runtime pre-loads the `google.colab` package; its presence in
# sys.modules is used here as the signal that this notebook runs on Colab.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

#### Import dependencies

In [19]:
from google import genai
from google.genai.types import LiveConnectConfig

import os

from IPython.display import Audio, Markdown, display

import numpy as np
from google.genai.types import Tool, GenerationConfig, GoogleSearch

#### Set environment variables

In [5]:
# Replace with your GCP Project ID
PROJECT_ID = ""  # @param {type: "string"}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. Do not wrap the lookup in str(): an unset
    # variable would become the literal string "None" and be sent as the
    # project ID. PROJECT_ID is left as None when nothing is configured.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT") or None

# Region used for the Vertex AI endpoint; defaults to us-central1.
PROJECT_LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

#### Initialize the Gen AI client for Vertex AI

In [6]:
# Shared client used by every example below; `vertexai=True` routes requests
# through Vertex AI using the project and location configured above.
vertexai_client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=PROJECT_LOCATION,
)

####  Load the Gemini 2.0 Flash model

Multimodal Live API is a new capability introduced with the [Gemini 2.0 Flash model](https://cloud.google.com/vertex-ai/generative-ai/docs/gemini-v2).

In [12]:
# Model ID passed to every Live API connection in this notebook.
MODEL = "gemini-2.0-flash-exp"  # @param {type: "string"}

Configure the output generation parameters.

In [20]:
# Generation parameters shared by all examples. Field names use snake_case to
# match the other SDK types constructed in this notebook.
generation_config = GenerationConfig(
    max_output_tokens=8192,
    temperature=1,
)

## **Example 1**: Text to text

You send one text prompt and receive a text response.

In [25]:
# Configure the response modality to "TEXT" and use the specified generation config.
config = LiveConnectConfig(response_modalities=["TEXT"],
                           generation_config=generation_config)

async def main() -> None:
  # Connect to the Multimodal Live API using the specified model and configuration.
  async with vertexai_client.aio.live.connect(
                          model=MODEL,
                          config=config) as session:
      """
      Sends a text prompt to the Gemini model and displays the text response.

      This example demonstrates a simple text-to-text interaction using the
      Multimodal Live API. It sends a text prompt to the specified Gemini model
      and asynchronously receives and displays the model's text response.
      """

      # Define the text prompt to send to the model.
      text_input = "Write a short story about a cat who goes on an adventure."
      # Display the model's text input.
      display(Markdown(f"**Input:** {text_input}"))

      # Send the text prompt to the model.
      await session.send(input=text_input, end_of_turn=True)

      text_response = []

      # Asynchronously receive the model's response.
      async for message in session.receive():
          if message.server_content.model_turn.parts:
              text_response.append(message.text)

      # Display the model's text response.
      display(Markdown(f"**Response >** {''.join(text_response)}"))

# Run the main function.
await main()

**Input:** Write a short story about a cat who goes on an adventure.

**Response >** Okay, I can do that for you. Whiskers, a tabby cat with emerald eyes, lived a predictable life of naps and kibble. One day, a robin landed on his windowsill, chirping tales of a world beyond the garden. Curiosity piqued, Whiskers leaped out the window, landing softly on the grass. He explored the woods, chased butterflies, and even helped a lost squirrel find its home. He returned home tired but full of amazing stories to tell, and his life was never the same. What do you think of the story?


## **Example 2**: Text to audio

You send one text prompt and receive an audio response.

In [26]:
# Configure the response modality to "AUDIO" and use the specified generation config.
config = LiveConnectConfig(response_modalities=["AUDIO"],
                           generation_config=generation_config)

async def main() -> None:
  # Connect to the Vertex AI Live API using the specified model and configuration.
  async with vertexai_client.aio.live.connect(
                        model=MODEL,
                        config=config,
                    ) as session:
      """
      Sends a text prompt to the Gemini model and plays the audio response.

      This example demonstrates a text-to-audio interaction using the
      Multimodal Live API. It sends a text prompt to the specified Gemini model
      and asynchronously receives and plays the model's audio response.
      """
      # Define the text prompt to send to the model.
      text_input = "Tell a short story about a cat who goes on an adventure."
      display(Markdown(f"**Input:** {text_input}"))

      # Send the text prompt to the model.
      await session.send(input=text_input, end_of_turn=True)

      audio_response = []

      # Asynchronously receive the model's audio response.
      async for message in session.receive():
          if message.server_content.model_turn:
            for part in message.server_content.model_turn.parts:
              # Extract the audio data from the response.
              audio_response.append(np.frombuffer(part.inline_data.data, dtype=np.int16))

      # If audio response is not empty, combine the audio chunks and play it.
      if audio_response:
          display(Audio(np.concatenate(audio_response), rate=24000, autoplay=True))

# Run the main function.
await main()

**Input:** Tell a short story about a cat who goes on an adventure.

## **Example 3**: Google Search as a tool in Text to Audio

This example demonstrates how to use Google Search as a tool within a text-to-audio interaction.

In [35]:
# Define the Google Search tool to enable web search capabilities.
google_search_tool = Tool(
    google_search = GoogleSearch()
)

# Configure the response modality to "AUDIO", enable the Google Search tool,
# and use the specified generation config.
config = LiveConnectConfig(response_modalities=["AUDIO"],
                           tools=[google_search_tool],
                           generation_config=generation_config)

async def main() -> None:
  # Connect to the Vertex AI Live API using the specified model and configuration.
  async with vertexai_client.aio.live.connect(
                        model=MODEL,
                        config=config,
                    ) as session:
      # Define the text prompt that requires a Google Search to answer.
      text_input = "What's the current price of Google stock?"
      display(Markdown(f"**Input:** {text_input}"))

      # Send the text prompt to the model.
      await session.send(input=text_input, end_of_turn=True)

      audio_response = []

      # Asynchronously receive the model's audio response.
      async for message in session.receive():
          if message.server_content.model_turn:
            for part in message.server_content.model_turn.parts:
              # Extract the audio data from the response.
              audio_response.append(np.frombuffer(part.inline_data.data, dtype=np.int16))

      # If audio response is not empty, combine the audio chunks and play it.
      if audio_response:
          display(Audio(np.concatenate(audio_response), rate=24000, autoplay=True))

# Run the main function.
await main()

**Input:** What's the current price of Google stock?

# Conclusion

This notebook provides a starting point for exploring the capabilities of the Multimodal Live API using the Google Gen AI SDK for Python.

You can adapt these examples to create custom solutions that meet your specific needs.