In [36]:
import json

# JSON Schema (kept as a string) for the structured response requested from the
# model: a top-level "objects" array whose items each carry a name, a
# description and the four bounding-box corner coordinates.
schema = """{
  "type": "object",
  "properties": {
    "objects": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": {
            "type": "string",
            "description": "A preliminary name assigned to the object."
          },
          "description": {
            "type": "string",
            "description": "A detailed textual description of the object."
          },
          "x1": {
            "type": "number",
            "description": "The x-coordinate of the top-left corner of the object's bounding box."
          },
          "y1": {
            "type": "number",
            "description": "The y-coordinate of the top-left corner of the object's bounding box."
          },
          "x2": {
            "type": "number",
            "description": "The x-coordinate of the bottom-right corner of the object's bounding box."
          },
          "y2": {
            "type": "number",
            "description": "The y-coordinate of the bottom-right corner of the object's bounding box."
          }
        },
        "required": ["name", "description", "x1", "y1", "x2", "y2"]
      }
    }
  },
  "required": ["objects"]
}
"""

# Role and task framing. The literals are joined by implicit concatenation with
# explicit trailing spaces so that consecutive sentences never run together
# (a backslash continuation inside a string drops the line break without
# inserting any whitespace).
VAR_INTRODUCTION = (
    "You are an image analyst expert in object detection.\n"
    " You are tasked with detecting all distinct objects present within the given image. "
    "Identify and list every object, logo, brand, text, even if they are small, "
    "partially visible or inside other objects."
)

# Detailed instructions plus the expected output format.
# `schema` is already JSON text, so it must not be handed to json.dumps() as-is:
# that would embed a quoted, backslash-escaped string literal in the prompt.
# Round-tripping through json.loads() validates the schema (a malformed schema
# raises json.JSONDecodeError here) and embeds it as compact JSON.
VAR_INSTRUCTIONS = f"""For each object provide its x, y coordinates for the bounding box (top-left and bottom-right corners).
Classify the object and provide its name as a text description.
Assign a preliminary name for the object based on its context in the image.
Ensure that:
- Objects inside other objects are treated as separate entities with their own coordinates and descriptions.
- All objects are captured and avoid omitting any items.\n
The JSON output is structured clearly with the following JSON format.<JSONSchema>{json.dumps(json.loads(schema))}</JSONSchema>
"""

# Full prompt text; the newline keeps the introduction's last sentence from
# being glued to the first instruction.
description_prompt = "\n".join((VAR_INTRODUCTION, VAR_INSTRUCTIONS))
 

In [37]:
import base64
import time
import typing
import math
import numpy as np

from google.cloud import aiplatform
from google.protobuf import struct_pb2

#libraries to generate image summaries
from vertexai.vision_models import Video
from vertexai.vision_models import VideoSegmentConfig
from vertexai.vision_models import MultiModalEmbeddingModel
from vertexai.language_models import TextEmbeddingModel
from vertexai.vision_models import Image as vision_model_Image
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part as GenerativeModelPart,
    HarmBlockThreshold,
    HarmCategory,
)
from typing import Any, Dict, List, Literal, Optional, Union
 

# One user turn made of two parts: the image, referenced by its gs:// URI,
# followed by the text prompt built earlier in the notebook.
contents = [
    {
        "role": "user",
        "parts": [
            {
                "file_data": {
                    "mime_type": "image/jpeg",
                    "file_uri": "gs://nineshowcaseassets/IMAGES/009ab9c0acee5c85d3551b01a82fef7d8425f6d1.jpeg",
                }
            },
            {"text": description_prompt},
        ],
    }
]

generative_multimodal_model = GenerativeModel("gemini-1.5-pro-002")

# Sampling settings; the response MIME type asks for a JSON payload.
generation_config = GenerationConfig(
    temperature=0.01,
    top_k=40,
    top_p=0.95,
    max_output_tokens=8192,
    response_mime_type='application/json',
)

# Every listed harm category is mapped to the same BLOCK_NONE threshold.
safety_settings = dict.fromkeys(
    (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    ),
    HarmBlockThreshold.BLOCK_NONE,
)

# Send the request; the response object is consumed by the parsing cells below.
model_response = generative_multimodal_model.generate_content(
    contents,
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [39]:

import json
# Take the text of the first part of the first candidate and decode it from
# JSON into a Python dictionary.
response_text = model_response.candidates[0].content.parts[0].text
parsed_json = json.loads(response_text)

# Print how many objects were returned, then show the list as the cell output.
detected_objects = parsed_json['objects']
print(len(detected_objects))
detected_objects

10


[{'name': 'woman',
  'description': 'A woman wearing a white sleeveless top with floral embroidery.',
  'x1': 268,
  'y1': 20,
  'x2': 795,
  'y2': 829},
 {'name': 'food',
  'description': 'A plate of food with rice, vegetables, and a piece of meat.',
  'x1': 250,
  'y1': 759,
  'x2': 457,
  'y2': 982},
 {'name': 'plate',
  'description': 'A white plate',
  'x1': 782,
  'y1': 802,
  'x2': 985,
  'y2': 999},
 {'name': 'food',
  'description': 'A plate with crispy fried food, possibly pork skin.',
  'x1': 487,
  'y1': 815,
  'x2': 982,
  'y2': 999},
 {'name': 'refrigerator',
  'description': 'A white refrigerator with glass doors.',
  'x1': 498,
  'y1': 0,
  'x2': 750,
  'y2': 824},
 {'name': 'bottles',
  'description': 'Bottles inside the refrigerator.',
  'x1': 535,
  'y1': 60,
  'x2': 709,
  'y2': 447},
 {'name': 'bottles',
  'description': 'Bottles on a shelf.',
  'x1': 280,
  'y1': 317,
  'x2': 402,
  'y2': 460},
 {'name': 'tiktok_logo',
  'description': 'The TikTok logo.',
  'x1': 

In [12]:

import json
# Decode the JSON text held in the first part of the first candidate.
# NOTE(review): this cell repeats the parsing cell above, and its saved output
# uses 'bbox'/'category' keys that do not match the schema defined in the
# prompt cell -- presumably left over from an earlier run (its execution count
# is lower than the preceding cells'); confirm, then re-run or remove it.
parsed_json = json.loads(
    model_response.candidates[0].content.parts[0].text
)

# Print the number of returned objects, then display them.
object_count = len(parsed_json['objects'])
print(object_count)
parsed_json['objects']

12


[{'bbox': {'x1': 0, 'y1': 0, 'x2': 428, 'y2': 254},
  'category': 'Packaged Goods',
  'name': 'product box'},
 {'bbox': {'x1': 1, 'y1': 248, 'x2': 303, 'y2': 902},
  'category': 'Signage',
  'name': 'price tag'},
 {'bbox': {'x1': 294, 'y1': 303, 'x2': 556, 'y2': 699},
  'category': 'Signage',
  'name': 'price tag'},
 {'bbox': {'x1': 536, 'y1': 0, 'x2': 722, 'y2': 343},
  'category': 'Packaged Goods',
  'name': 'product box'},
 {'bbox': {'x1': 544, 'y1': 380, 'x2': 692, 'y2': 618},
  'category': 'Signage',
  'name': 'price tag'},
 {'bbox': {'x1': 554, 'y1': 539, 'x2': 813, 'y2': 999},
  'category': 'Packaged Goods',
  'name': 'product box'},
 {'bbox': {'x1': 681, 'y1': 0, 'x2': 870, 'y2': 371},
  'category': 'Packaged Goods',
  'name': 'product box'},
 {'bbox': {'x1': 780, 'y1': 600, 'x2': 972, 'y2': 999},
  'category': 'Packaged Goods',
  'name': 'product box'},
 {'bbox': {'x1': 861, 'y1': 0, 'x2': 999, 'y2': 414},
  'category': 'Packaged Goods',
  'name': 'product box'},
 {'bbox': {'x

12