In [11]:
from google import genai
from google.genai import types
from dotenv import load_dotenv
import os
import cv2
import json
from pathlib import Path

In [13]:
# Pull settings from the environment (python-dotenv reads a local .env file
# if one is present) and build the Gemini client from GEMINI_API_KEY.
load_dotenv()
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
MODEL_ID = "gemini-robotics-er-1.5-preview"

image_folder_path = Path("/home/ubuntu/pi-sim-evals/test_imgs/")
image_path = image_folder_path / "exterior_img.png"

# Raw encoded bytes of the image file; raises FileNotFoundError if it is missing.
image_bytes = image_path.read_bytes()

# Decoded pixel array. cv2.imread signals failure by returning None rather than
# raising, so fail loudly here instead of with an AttributeError on `image.shape`
# further down the notebook.
image = cv2.imread(str(image_path))
if image is None:
    raise ValueError(f"OpenCV could not decode image: {image_path}")

In [15]:
def parse_json(json_output):
  """Extract the payload of the first ```json fenced block from ``json_output``.

  The opening fence is recognised even when its line carries surrounding
  whitespace or differs in case (e.g. "```json " or "  ```JSON").

  Args:
    json_output: Text that may wrap its JSON in a markdown ```json fence.

  Returns:
    The text between the opening ```json line and the next "```". If no
    opening fence line is found, the input is returned unchanged.
  """
  lines = json_output.splitlines()
  for i, line in enumerate(lines):
    # Normalise before comparing so stray spaces or capitalisation on the
    # fence line do not cause the fence to be missed.
    if line.strip().lower() == "```json":
      # Keep only what follows the opening fence ...
      remainder = "\n".join(lines[i + 1 :])
      # ... and cut at the closing fence (or keep everything if there is none).
      return remainder.split("```")[0]
  return json_output

In [16]:
# Instruction sent alongside the image: asks for up to 10 labelled points,
# returned as JSON with [y, x] coordinates normalized to 0-1000.
PROMPT = """
          Point to no more than 10 items in the image. The label returned
          should be an identifying name for the object detected.
          The answer should follow the json format: [{"point": <point>,
          "label": <label1>}, ...]. The points are in [y, x] format
          normalized to 0-1000.
        """

# The PNG bytes wrapped as a request part.
image_part = types.Part.from_bytes(data=image_bytes, mime_type='image/png')

# Sampling temperature 0.5 and a thinking budget of 0.
generation_config = types.GenerateContentConfig(
    temperature=0.5,
    thinking_config=types.ThinkingConfig(thinking_budget=0),
)

# Single request: image first, then the text prompt.
image_response = client.models.generate_content(
    model=MODEL_ID,
    contents=[image_part, PROMPT],
    config=generation_config,
)

In [26]:
# Decode the model reply (markdown fence stripped by parse_json) into Python objects.
json_output = json.loads(parse_json(image_response.text))

# Factors that map the 0-1000 normalized coordinates onto this image's pixel grid
# (shape[0] is the row count, shape[1] the column count).
img_rows, img_cols = image.shape[0], image.shape[1]
y_scale = img_rows / 1000
x_scale = img_cols / 1000

# Replace each normalized [y, x] point with an integer (y, x) pixel tuple.
for item in json_output:
    norm_y = item['point'][0]
    norm_x = item['point'][1]
    item['point'] = (int(norm_y * y_scale), int(norm_x * x_scale))

json_output

[{'point': (122, 130), 'label': 'red circle'},
 {'point': (114, 138), 'label': 'blue circle'},
 {'point': (131, 107), 'label': 'green block'},
 {'point': (144, 118), 'label': 'blue block'},
 {'point': (113, 158), 'label': 'yellow block'},
 {'point': (124, 139), 'label': 'red plate'},
 {'point': (131, 125), 'label': 'wooden table'},
 {'point': (86, 168), 'label': 'counter'},
 {'point': (84, 67), 'label': 'brick wall'},
 {'point': (65, 116), 'label': 'chair'}]

In [27]:
# Draw on a copy so the decoded source image stays untouched.
annotated_image = image.copy()
for item in json_output:
    # Points are stored as (y, x); OpenCV drawing calls take (x, y).
    center = (item['point'][1], item['point'][0])
    cv2.circle(annotated_image, center, 10, (0, 0, 255), 2)
    cv2.putText(annotated_image, item['label'], center, cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)

# Pass the filename as str (as done for cv2.imread earlier): OpenCV builds
# without path-like support reject pathlib.Path here. The call's boolean
# result is left as the cell output so a failed write is visible.
cv2.imwrite(str(image_folder_path / "annotated_image.png"), annotated_image)

True