Source: [Multimodality with Gemini](https://partner.cloudskillsboost.google/paths/2294/course_templates/1397/labs/566670)

In [1]:
from vertexai.preview.generative_models import (
    GenerationConfig,
    GenerativeModel,
    Image,
    Part,
)

In [6]:
"""
Helper functions
"""

import http.client
import typing
import urllib.request

import IPython.display

# PIL is imported from the pillow package
from PIL import Image as PIL_Image
from PIL import ImageOps as PIL_ImageOps


def display_images(
    images: typing.Iterable[Image],
    max_width: int = 600,
    max_height: int = 350,
) -> None:
    """Render each image inline in the notebook, shrinking oversized ones.

    Every image is converted to RGB when it is in another mode, scaled down
    with ``PIL.ImageOps.contain`` when it exceeds either bound, and then
    passed to ``IPython.display.display``. Images that already fit within
    both bounds are shown at their original size.

    Args:
        images: Vertex AI ``Image`` objects to show, in iteration order.
        max_width: Widest an image may be, in pixels, before it is scaled
            down. Defaults to 600.
        max_height: Tallest an image may be, in pixels, before it is scaled
            down. Defaults to 350.

    Returns:
        None. The images are emitted as notebook display output.
    """
    bounding_box = (max_width, max_height)
    for vertex_image in images:
        # NOTE: reads the SDK's private ``_pil_image`` attribute to reach
        # the underlying PIL object.
        picture = typing.cast(PIL_Image.Image, vertex_image._pil_image)
        if picture.mode != "RGB":
            # RGB is supported by all Jupyter environments (e.g. RGBA is not yet)
            picture = picture.convert("RGB")
        width, height = picture.size
        exceeds_bounds = width > max_width or height > max_height
        if exceeds_bounds:
            # Shrink to fit inside the bounding box for a smaller notebook image.
            picture = PIL_ImageOps.contain(picture, bounding_box)
        IPython.display.display(picture)


def get_image_bytes_from_url(image_url: str) -> bytes:
    """Fetch the resource at ``image_url`` and return its raw body.

    The URL is opened with ``urllib.request.urlopen`` and the whole response
    body is read into memory. The response is closed when the ``with`` block
    exits.

    Args:
        image_url: Location of the image to download.

    Returns:
        The complete response body as bytes.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        http.client.HTTPException: If the HTTP exchange fails while the
            response is being read.
    """
    with urllib.request.urlopen(image_url) as raw_response:
        # The cast only informs type checkers; it has no runtime effect.
        http_response = typing.cast(http.client.HTTPResponse, raw_response)
        return http_response.read()


def load_image_from_url(image_url: str) -> Image:
    """Download an image and wrap it in a Vertex AI ``Image``.

    The bytes are fetched with ``get_image_bytes_from_url`` and handed to
    ``Image.from_bytes`` unchanged.

    Args:
        image_url: Location of the image to download.

    Returns:
        The ``Image`` built from the downloaded bytes.

    Raises:
        urllib.error.URLError: Propagated from the download step when the
            URL cannot be opened.
    """
    return Image.from_bytes(get_image_bytes_from_url(image_url))


def display_content_as_image(content: str | Image | Part) -> bool:
    """Show ``content`` in the notebook when it is a Vertex AI ``Image``.

    Anything that is not an ``Image`` instance is left untouched so the
    caller can try another renderer.

    Args:
        content: A prompt item: plain text, a Vertex AI ``Image``, or a
            ``Part``.

    Returns:
        True when ``content`` was an ``Image`` and has been displayed,
        False otherwise.
    """
    if isinstance(content, Image):
        # Reuse the batch helper so sizing and mode handling stay in one place.
        display_images([content])
        return True
    return False


def display_content_as_video(content: str | Image | Part) -> bool:
    """Show ``content`` as an embedded video when it is a URI-backed video Part.

    Only ``Part`` objects that reference a file by URI can be played. A
    ``Part`` with no ``file_uri`` (for example one holding text or inline
    data), or one whose declared MIME type is set and is not ``video/*``,
    is reported as "not a video" so the caller can fall back to printing
    it instead of rendering a broken player.

    Args:
        content: A prompt item: plain text, a Vertex AI ``Image``, or a
            ``Part``.

    Returns:
        True when a video player was displayed for ``content``, False
        otherwise.

    Note:
        ``gs://bucket/object`` URIs are rewritten to
        ``https://storage.googleapis.com/bucket/object``, which only plays
        if the object is publicly readable. URIs that are already
        ``http://`` or ``https://`` are passed to the player unchanged.
    """
    if not isinstance(content, Part):
        return False

    file_data = content.file_data
    file_uri = file_data.file_uri
    if not file_uri:
        # No URI means there is nothing to stream; previously this produced
        # a player pointing at the bare storage host.
        return False

    mime_type = file_data.mime_type
    if mime_type and not mime_type.startswith("video/"):
        # The Part explicitly declares a non-video payload (image, PDF, ...).
        return False

    if file_uri.startswith(("http://", "https://")):
        # Already a web URL; prefixing the storage host would corrupt it.
        video_url = file_uri
    else:
        file_path = file_uri.removeprefix("gs://")
        video_url = f"https://storage.googleapis.com/{file_path}"

    IPython.display.display(IPython.display.Video(video_url, width=600))
    return True


def print_multimodal_prompt(contents: list[str | Image | Part]) -> None:
    """Render a multimodal prompt in the notebook for easy inspection.

    Each item is offered first to ``display_content_as_image`` and then to
    ``display_content_as_video``. Items that neither renderer accepts are
    written with ``print``. Items are handled in list order.

    Args:
        contents: The prompt items that would be sent to the model: text
            strings, Vertex AI ``Image`` objects, or ``Part`` objects.

    Returns:
        None. Output goes to the notebook display and standard output.

    Example:
        >>> contents = ["Describe this image:", image_obj, "What do you see?"]
        >>> print_multimodal_prompt(contents)
        # Displays the image and prints the two text prompts.
    """
    for item in contents:
        # ``or`` short-circuits, so the video renderer is only tried when
        # the image renderer declined the item.
        was_rendered = display_content_as_image(item) or display_content_as_video(item)
        if not was_rendered:
            print(item)
