In [10]:
from google import genai
from google.genai import types
from dotenv import load_dotenv
import os
import cv2
import json
from pathlib import Path

In [11]:
load_dotenv()
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
MODEL_ID = "gemini-robotics-er-1.5-preview"

image_folder_path = Path("/home/ubuntu/pi-sim-evals/test_imgs/")
image_path = image_folder_path / "right_image_1.png"

image = cv2.imread(str(image_path))
with open(str(image_path), 'rb') as f:
    image_bytes = f.read()

In [12]:
def parse_json(json_output):
  # Parsing out the markdown fencing
  lines = json_output.splitlines()
  for i, line in enumerate(lines):
    if line == "```json":
      # Remove everything before "```json"
      json_output = "\n".join(lines[i + 1 :])
      # Remove everything after the closing "```"
      json_output = json_output.split("```")[0]
      break  # Exit the loop once "```json" is found
  return json_output

In [None]:
def query_gemini(image_bytes):
    PROMPT = """
            Return bounding boxes as a JSON array with labels. Never return masks orcode fencing.
            Find all the objects on the table.
            The label returned should be an identifying name for the object detected.
            If an object is present multiple times, name each according to their UNIQUE CHARACTERISTIC
            (colors, size, position, etc.)
            The format should be as follows:
            [{"box_2d": [ymin, xmin, ymax, xmax], "label": <label for the object>}]
            normalized to 0-1000. The values in box_2d must only be integers.
            """

    image_response = client.models.generate_content(
        model=MODEL_ID,
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                mime_type='image/png',
            ),
            PROMPT
        ],
        config = types.GenerateContentConfig(
            temperature=0.5,
            thinking_config=types.ThinkingConfig(thinking_budget=0)
        )
    )

    return json.loads(parse_json(image_response.text))

json_output = query_gemini(image_bytes)

In [60]:
# scale the points to the original image size
y_scale = image.shape[0] / 1000
x_scale = image.shape[1] / 1000

# scale the points to the original image size
for item in json_output:
    box_2d = [int(item['box_2d'][0] * y_scale), int(item['box_2d'][1] * x_scale), int(item['box_2d'][2] * y_scale), int(item['box_2d'][3] * x_scale)]
    item['box_2d'] = box_2d

json_output

[{'box_2d': [344, 631, 415, 666], 'label': 'can'},
 {'box_2d': [342, 508, 410, 550], 'label': 'can'},
 {'box_2d': [402, 742, 504, 862], 'label': 'Domino sugar box'},
 {'box_2d': [308, 752, 364, 806], 'label': 'red mug'},
 {'box_2d': [272, 663, 348, 707], 'label': 'yellow bottle'},
 {'box_2d': [449, 538, 550, 663], 'label': 'red bowl'},
 {'box_2d': [357, 839, 416, 910], 'label': "Rubik's cube"},
 {'box_2d': [267, 716, 300, 788], 'label': 'banana'}]

In [71]:
def plot_bounding_boxes(image, json_output):
    annotated_image = image.copy()

    colors = [
        (0, 0, 255),      # red
        (0, 255, 0),      # green
        (255, 0, 0),      # blue
        (0, 255, 255),    # yellow
        (0, 165, 255),    # orange
        (255, 192, 203),  # pink
        (128, 0, 128),    # purple
        (42, 42, 165),    # brown
        (128, 128, 128),  # gray
        (255, 255, 0),    # cyan
    ]

    for i, item in enumerate(json_output):
        color = colors[i % len(colors)]
        y1, x1, y2, x2 = item['box_2d']
        label = item['label']

        cv2.rectangle(annotated_image, (x1, y1), (x2, y2), color, 2) # draw rectangle
        # draw label with background
        text_width, text_height = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)[0]
        cv2.rectangle(annotated_image, (x1, y2 - text_height - 10), (x1 + text_width, y2), color, -1)
        cv2.putText(annotated_image, label, (x1, y2 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 2)

    return annotated_image

annotated_image = plot_bounding_boxes(image, json_output)
cv2.imwrite(image_folder_path / "annotated_image.png", annotated_image)

True