In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Using the Code Execution Tool in the Multimodal Live API

| | |
|-|-|
| Author(s) | [Laxmi Harikumar](https://github.com/laxmi-genai) |

## Overview
[Gemini 2.0 Flash](https://cloud.google.com/vertex-ai/generative-ai/docs/gemini-v2#2.0-flash) is Google's latest generally available model in the Gemini family. It is Google's workhorse model for everyday tasks, featuring enhanced performance and support for the real-time Live API. 2.0 Flash is an upgrade path for 1.5 Flash users who want a slightly slower model with significantly better quality, or for 1.5 Pro users who want slightly better quality and real-time latency at a lower cost.

The [Multimodal Live API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live) enables low-latency, two-way interactions that use text, audio, and video input, with audio and text output. This facilitates natural, human-like voice conversations with the ability to interrupt the model at any time. The model's video understanding capability expands communication modalities, enabling you to share camera input or screencasts and ask questions about them.

### Pricing
Information on the pricing for Gemini 2.0 Flash is available on our [Pricing page](https://cloud.google.com/vertex-ai/generative-ai/pricing).

## Objective
This tutorial demonstrates the following simple examples to help you get started with the Multimodal Live API using the Google Gen AI SDK in Vertex AI.

- Text-to-audio generation
- Code execution

See the [Multimodal Live API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/multimodal-live) page for more details.

TODO

## Getting Started

### Install Google Gen AI SDK for Python

In [291]:
!pip install --upgrade --quiet google-genai

### Restart runtime
To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [None]:
import IPython

# Look up the running IPython application and ask its kernel to shut down.
# True is passed as do_shutdown's argument; per the notebook text above, the
# effect is that the current kernel restarts so the freshly installed
# packages can be imported.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)
If you are running this notebook on Google Colab, run the cell below to authenticate your environment.

In [292]:
import sys

# Only authenticate when the google.colab module is already loaded (the
# notebook text treats this as "running on Google Colab"); in any other
# environment this cell does nothing.
if "google.colab" in sys.modules:
    # Imported here, inside the guard, so the import is never attempted
    # when the check above is false.
    from google.colab import auth

    auth.authenticate_user()



## Getting Started

### Import libraries

In [293]:
from google import genai
from google.genai.types import LiveConnectConfig

import os
import json

from IPython.display import Audio, Markdown, display

import numpy as np
from google.genai.types import (
    Tool,
    GenerateContentConfig,
    ToolCodeExecution,
    Part,
    LiveConnectConfig,
    PrebuiltVoiceConfig,
    SpeechConfig,
    VoiceConfig,
)

### Set Google Cloud project information and create client

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [294]:
PROJECT_ID = ""  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. Default to "" instead of wrapping the
    # lookup in str(): str(None) is the literal string "None", which would
    # then be passed on as if it were a real project ID.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    # Surface the misconfiguration here, next to the setting, rather than
    # as an opaque failure on the first API call.
    print(
        "WARNING: PROJECT_ID is empty. Set PROJECT_ID in this cell or export "
        "the GOOGLE_CLOUD_PROJECT environment variable."
    )

# Region used for the client; defaults to us-central1 when
# GOOGLE_CLOUD_REGION is not set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

In [295]:
# Build the Gen AI SDK client in Vertex AI mode, bound to the project and
# region configured in the previous cell.
client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=LOCATION,
)

## Use the Gemini 2.0 Flash model

In [296]:
# Model identifier handed to client.aio.live.connect(model=...) by the
# session cells further down.
MODEL_ID = "gemini-2.0-flash-001"  # @param {type: "string"}

## Configure model parameters

You can include parameter values in each call that you send to a model to control how the model generates a response. The model can generate different results for different parameter values. You can experiment with different model parameters to see how the results change.

- Learn more about [experimenting with parameter values](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/prompts/adjust-parameter-values).

- See a list of all [Gemini API parameters](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#parameters)

In [297]:
# Generation parameters; these are attached to the Live session configuration
# in the Code Execution section below.
generation_config = GenerateContentConfig(
    temperature=0.3,
    max_output_tokens=8192,
)

## Text-to-Audio generation

Send a text prompt and receive a model response in audio.

**Notes**
- Multimodal Live API supports the following voices:
  - Puck
  - Charon
  - Kore
  - Fenrir
  - Aoede

In [298]:
# Live session settings for the text-to-audio demo: audio replies spoken with
# the prebuilt "Kore" voice.
# NOTE: the name `config` is rebound in the Code Execution section, so re-run
# this cell before re-running the text-to-audio example after that section.
config = LiveConnectConfig(
    speech_config=SpeechConfig(
        voice_config=VoiceConfig(
            prebuilt_voice_config=PrebuiltVoiceConfig(voice_name="Kore")
        )
    ),
    response_modalities=["AUDIO"],
)

In [299]:
async def main() -> None:
    """Send one text prompt over the Live API and play back the audio reply.

    Uses the notebook-level ``client``, ``MODEL_ID`` and ``config`` to open a
    session, sends a fixed prompt as a complete turn, collects every audio
    chunk found in the received messages and, if any were received, displays
    them as a single audio player.
    """
    # Connect to the Vertex AI Live API using the specified model and configuration.
    async with client.aio.live.connect(model=MODEL_ID, config=config) as session:
        # Define the text prompt to send to the model.
        text_input = "Explain the encoder and decoder in a transformer in 5 to 6 lines."
        display(Markdown(f"**Input:** {text_input}"))

        # Send the text prompt to the model.
        await session.send(input=text_input, end_of_turn=True)

        audio_response = []

        # Asynchronously receive the model's audio response.
        async for message in session.receive():
            server_content = message.server_content
            # Some messages carry no server content or no model turn; there
            # is nothing to extract from those, so skip them instead of
            # failing on a None attribute access.
            if not server_content or not server_content.model_turn:
                continue

            for part in server_content.model_turn.parts or []:
                # Only parts that actually hold inline bytes contribute audio.
                if part.inline_data and part.inline_data.data:
                    # The raw bytes are interpreted as 16-bit integer samples.
                    audio_response.append(
                        np.frombuffer(part.inline_data.data, dtype=np.int16)
                    )

        # If audio response is not empty, combine the audio chunks and play it.
        if audio_response:
            display(Audio(np.concatenate(audio_response), rate=24000, autoplay=True))

# Run the main function. A bare top-level `await` is used here, which
# IPython/Jupyter cells accept.
await main()

**Input:** Explain the encoder and decoder in a transformer in 5 to 6 lines.

## Code Execution


### Set up Code Execution as a tool

The [Gemini API code execution](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/code-execution?hl=en) feature enables the model to generate and run Python code and learn iteratively from the results until it arrives at a final output. You can use this code execution capability to build applications that benefit from code-based reasoning and that produce text output. For example, you could use code execution in an application that solves equations or processes text.

The Gemini API provides code execution as a tool, similar to function calling. After you add code execution as a tool, the model decides when to use it.

In [300]:
# Initialize the code execution tool
code_execution_tool = Tool(code_execution=ToolCodeExecution())

### Configure the LiveConnect Session

In [302]:
# Live session settings for the code execution demo: text replies, the code
# execution tool enabled, and the generation parameters defined earlier.
# NOTE: this rebinds the `config` name used by the text-to-audio example.
config = LiveConnectConfig(
    generation_config=generation_config,
    tools=[code_execution_tool],
    response_modalities=["TEXT"],
)

In [303]:
async def main() -> None:
    """Run an interactive text chat over the Live API with code execution enabled.

    Uses the notebook-level ``client``, ``MODEL_ID`` and ``config`` to open a
    session, then alternates between reading a line of user input and
    displaying what the session returns, until the user types a quit command
    (``q``, ``quit`` or ``exit``).
    """
    # Connect to the Multimodal Live API using the specified model and configuration.
    async with client.aio.live.connect(model=MODEL_ID, config=config) as session:

        async def send() -> bool:
            """Read one line of user input and forward it to the session.

            Returns:
                False if the user entered a quit command (nothing is sent),
                True after the input has been sent as a complete turn.
            """
            text_input = input("Input > ")
            # Ignore surrounding whitespace and case when matching quit commands.
            if text_input.strip().lower() in ("q", "quit", "exit"):
                return False
            # Send user input to the session with end_of_turn flag
            await session.send(input=text_input, end_of_turn=True)
            return True

        async def receive() -> None:
            """Display everything ``session.receive()`` yields for this exchange.

            Generated code is shown as soon as its part arrives; text is
            accumulated and shown once after the receive loop finishes.
            """
            response = []
            async for message in session.receive():
                if message.text:
                    response.append(message.text)

                server_content = message.server_content
                # Some messages carry no server content or no model turn;
                # skip them instead of failing on a None attribute access.
                if not server_content or not server_content.model_turn:
                    continue

                for part in server_content.model_turn.parts or []:
                    if part.executable_code:
                        # Build the Markdown without source-level indentation:
                        # leading spaces would make the renderer treat the
                        # heading and fence as an indented literal block.
                        display(
                            Markdown(
                                "**Executable code:**\n\n"
                                f"```py\n{part.executable_code.code}\n```"
                            )
                        )

            if response:
                display(Markdown(f"**Response >** {''.join(response)}"))

        # Main interaction loop
        while True:
            if not await send():  # Get user input and check for exit
                break
            await receive()  # Receive and display response


In [304]:
# Start the interactive session; type q, quit or exit at the prompt to stop.
await main()

Input > Write code to calculate the 15th fibonacci number then find the nearest palindrome to it



                                      **Executable code:**
                                      ```py
                                      import math

def fibonacci(n):
    if n <= 0:
        return 0
    elif n == 1:
        return 1
    else:
        a, b = 0, 1
        for _ in range(2, n + 1):
            a, b = b, a + b
        return b

def is_palindrome(n):
    return str(n) == str(n)[::-1]

def nearest_palindrome(n):
    if is_palindrome(n):
        return n

    s = str(n)
    length = len(s)

    # Generate the first half of the palindrome
    first_half = s[:length // 2]

    # Generate the two candidate palindromes
    if length % 2 == 0:
        palindrome1 = first_half + first_half[::-1]
    else:
        palindrome1 = first_half + s[length // 2] + first_half[::-1]

    palindrome1 = int(palindrome1)

    # Generate the second palindrome by incrementing or decrementing the middle digits
    if length % 2 == 0:
        mid = int(first_half)
        lower_mid = str(mid - 1) if mid > 0 else '0'
        upper_mid = str(mid + 1)
        palindrome2_lower = int(lower_mid + lower_mid[::-1]) if mid > 0 else float('inf')
        palindrome2_upper = int(upper_mid + upper_mid[::-1])
        palindrome2 = min(palindrome2_lower, palindrome2_upper)

    else:
        mid = int(s[length // 2])
        lower_mid = mid - 1 if mid > 0 else 0
        upper_mid = mid + 1
        temp_half = first_half + str(lower_mid)
        palindrome2_lower = int(temp_half + temp_half[::-1][:-1]) if lower_mid >= 0 else float('inf')
        temp_half = first_half + str(upper_mid)
        palindrome2_upper = int(temp_half + temp_half[::-1][:-1])
        palindrome2 = min(palindrome2_lower, palindrome2_upper)

    # Return the closer palindrome
    if abs(n - palindrome1) <= abs(n - palindrome2):
        return palindrome1
    else:
        return palindrome2

# Calculate the 15th Fibonacci number
fib_15 = fibonacci(15)
print(f'{fib_15=}')

# Find the nearest palindrome
nearest_pal = nearest_palindrome(fib_15)
print(f'{nearest_pal=}')

                                      ```
                                      

**Response >** The 15th Fibonacci number is 610. The nearest palindrome to 610 is 616.


Input > q
