In [1]:
# Sample code from https://cloud.google.com/vertex-ai/docs/generative-ai/multimodal/send-multimodal-prompts
# REQUIRED:
# 1. Install gcloud CLI from https://cloud.google.com/sdk/docs/install
# 2. gcloud auth application-default login
# 3. gcloud config set project mipt-hack-01 

import http.client
import typing
import urllib.request
from vertexai.preview.generative_models import GenerativeModel, Image

# create helper function
def load_image_from_url(image_url: str, *, timeout: float = 30.0) -> Image:
    """Download an image and wrap its raw bytes in a Vertex AI ``Image``.

    Args:
        image_url: URL of the image to fetch; it is passed unchanged to
            ``urllib.request.urlopen``.
        timeout: Seconds allowed for blocking network operations. Without
            an explicit value ``urlopen`` uses the global socket default,
            which is normally "wait forever", so a stalled server would
            hang the caller indefinitely.

    Returns:
        The ``Image`` produced by ``Image.from_bytes`` from the full
        response body.

    Raises:
        urllib.error.URLError: If the URL cannot be opened (``HTTPError``,
            raised for HTTP error statuses, is a subclass).
    """
    with urllib.request.urlopen(image_url, timeout=timeout) as raw_response:
        # The cast only narrows the type for static checkers; it does
        # nothing at runtime.
        http_response = typing.cast(http.client.HTTPResponse, raw_response)
        image_bytes = http_response.read()
    return Image.from_bytes(image_bytes)

# Download the three sample landmark images (public HTTPS URLs under the
# cloud-samples-data path on storage.googleapis.com).
landmark1 = load_image_from_url(
    "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"
)
landmark2 = load_image_from_url(
    "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark2.png"
)
landmark3 = load_image_from_url(
    "https://storage.googleapis.com/cloud-samples-data/vertex-ai/llm/prompts/landmark3.png"
)

# Send a few-shot multimodal prompt: the first two images are each followed
# by a caption, and the third image is left uncaptioned.
model = GenerativeModel("gemini-pro-vision")
few_shot_prompt = [
    landmark1,
    "city: Rome, Landmark: the Colosseum",
    landmark2,
    "city: Beijing, Landmark: Forbidden City",
    landmark3,
]
response = model.generate_content(few_shot_prompt)
print(response)

candidates {
  content {
    role: "model"
    parts {
      text: " city: Rio de Janeiro, Landmark: Christ the Redeemer"
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_DANGEROUS_CONTENT
    probability: NEGLIGIBLE
  }
}
usage_metadata {
  prompt_token_count: 791
  candidates_token_count: 11
  total_token_count: 802
}

