# Vision APIs

Several Google Cloud (GCP) APIs support image analysis:

- Cloud Vision API
- Gemini Pro Vision
- Imagen 1 / 2


## Cloud Vision API

In [None]:
# Uncomment to install the Cloud Vision client library into the kernel's environment:
# %pip install google-cloud-vision

In [None]:
from google.cloud import storage
import os

# Name of the GCS bucket that receives temporary uploads; this is None when the
# BUCKET_UPLOAD_TEMP environment variable is not set.
bucket_name = os.getenv("BUCKET_UPLOAD_TEMP")

def upload_file_to_temp_bucket(file_name):
    """Upload a local file to the temporary bucket and return its ``gs://`` URI.

    The object is stored under the same name as the local path.

    Args:
        file_name: Path of the local file to upload; also used as the object name.

    Returns:
        The ``gs://<bucket>/<object>`` URI of the uploaded object.

    Raises:
        ValueError: If ``bucket_name`` is empty or None (no bucket configured).
    """
    # Fail early with a clear message instead of an obscure client error later.
    if not bucket_name:
        raise ValueError(
            "No upload bucket configured: set the BUCKET_UPLOAD_TEMP environment variable."
        )

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    blob = bucket.blob(file_name)
    blob.upload_from_filename(file_name)

    # Build the URI from the raw names. blob.public_url percent-encodes the
    # object name, so rewriting its prefix produces a wrong URI for names that
    # contain spaces or other characters that need escaping.
    return f"gs://{bucket.name}/{blob.name}"


In [None]:
from google.cloud import vision

# Client shared by the Vision API test cells below.
client = vision.ImageAnnotatorClient()

# Upload the sample screenshot; the helper returns the URI that is passed to the API.
file_uri = upload_file_to_temp_bucket("resources/wooribank_login.png")

# Reference the uploaded image by URI rather than embedding its bytes.
image = vision.Image(source=vision.ImageSource(image_uri=file_uri))


### Test - object detection

In [None]:
def test_object_detection():
    """Run object localization on the shared ``image`` and print the results.

    Prints the number of objects found, then each object's name, confidence
    score and normalized bounding-polygon vertices.

    Raises:
        RuntimeError: If the API response carries an error message.
    """
    response = client.object_localization(image=image)

    # Check for an API-level error first; otherwise a failed request would be
    # reported as "0 objects found".
    if response.error.message:
        raise RuntimeError(
            f"{response.error.message}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors"
        )

    objects = response.localized_object_annotations
    print(f"Number of objects found: {len(objects)}")
    for object_ in objects:
        print(f"\n{object_.name} (confidence: {object_.score})")
        print("Normalized bounding polygon vertices: ")
        for vertex in object_.bounding_poly.normalized_vertices:
            print(f" - ({vertex.x}, {vertex.y})")

test_object_detection()


### Test - label detection

In [None]:
def test_label_detection():
    """Run label detection on the shared ``image`` and print every label annotation.

    Raises:
        RuntimeError: If the API response carries an error message.
    """
    response = client.label_detection(image=image)

    # Check for an API-level error before printing anything, so a failed
    # request does not first show an empty "Labels:" list.
    if response.error.message:
        raise RuntimeError(
            f"{response.error.message}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors"
        )

    print("Labels:")
    for label in response.label_annotations:
        print(label)

test_label_detection()

## Gemini Pro Vision 

### Test - Image analysis with prompt 1


In [None]:

def get_multimodal_model(model_name="gemini-pro-vision"):
    """Create the Vertex AI generative model used for multimodal prompts.

    Args:
        model_name: Name of the model to load. Defaults to "gemini-pro-vision".

    Returns:
        A ``GenerativeModel`` instance for ``model_name``.
    """
    # Only GenerativeModel is needed here; it is imported locally so the cell
    # does not depend on an earlier import cell.
    from vertexai.preview.generative_models import GenerativeModel

    # NOTE(review): vertexai.init(...) is only called in the Imagen section
    # further down this notebook, so on a fresh kernel this presumably relies
    # on the SDK's default project/location. Confirm that is intended.
    return GenerativeModel(model_name)

multimodal_model = get_multimodal_model()

In [None]:
def test_multimodal_generation():
    """Ask the multimodal model to describe the input fields of the uploaded screenshot.

    Sends a text prompt together with the image at ``file_uri`` to
    ``multimodal_model`` and prints the response.
    """
    from vertexai.preview.generative_models import (
        GenerationConfig,
        HarmBlockThreshold,
        HarmCategory,
        Part,
    )

    prompt = "Please provide me with the input fields and their locations in the provided screenshot."

    generation_config = GenerationConfig(
        temperature=0.1,
        max_output_tokens=2048,
    )

    # Every listed harm category is set to BLOCK_NONE.
    safety_config = {
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }

    # Named image_part so it does not shadow the notebook-level Vision `image`.
    image_part = Part.from_uri(
        uri=file_uri,
        mime_type="image/png",
    )

    # With stream=False the call returns a single response object rather than
    # an iterable of chunks, so it is printed directly instead of looped over.
    response = multimodal_model.generate_content(
        [prompt, image_part],
        generation_config=generation_config,
        safety_settings=safety_config,
        stream=False,
    )
    print(response)

test_multimodal_generation()

## Imagen 1 / 2

### Test - Visual question answering

In [1]:
import os

# Vertex AI settings: the region is fixed, and the project is read from the
# environment (None if PROJECT_ID is unset).
LOCATION = "us-central1"
PROJECT_ID = os.getenv("PROJECT_ID")

In [3]:
import vertexai
from vertexai.vision_models import ImageTextModel

# Point the Vertex AI SDK at the configured project and region before loading models.
vertexai.init(
    location=LOCATION,
    project=PROJECT_ID,
)

# Pretrained image-text model used by the question-answering test below.
model = ImageTextModel.from_pretrained("imagetext@001")

In [5]:

def test_ask_question_with_imagen():
    """Ask the image-text model about the sample screenshot and print each answer."""
    # Local import: the name is only needed inside this function.
    from vertexai.vision_models import Image

    question_text = "Please provide me with the input fields and their locations in the provided screenshot."
    screenshot = Image.load_from_file(location='resources/wooribank_login.png')

    # ask_question yields the answers; print them one per line.
    for reply in model.ask_question(image=screenshot, question=question_text):
        print(reply)

test_ask_question_with_imagen()


yeskey
