## GroundingDINO Inference

In [1]:
# Input and output directories.
# Every location lives under the scraper's auctions folder; the GroundingDINO
# artefacts are grouped in its "gdino" subfolder.
_AUCTIONS_ROOT = "../zm_scraper/auctions"
_GDINO_ROOT = _AUCTIONS_ROOT + "/gdino"

# Per-item folders of images to run inference on.
INPUT_DIR = _AUCTIONS_ROOT + "/preprocessed"
# Annotated images, per-image COCO JSON labels, and visualization grids.
OUTPUT_DIR = _GDINO_ROOT + "/output"
OUTPUT_LABELS_DIR = _GDINO_ROOT + "/output_labels"
OUTPUT_DISPLAY_DIR = _GDINO_ROOT + "/output_display"

In [2]:
# Configurable thresholds and prompt
# CSV read with pandas; its "id" and "prompt" columns map each item folder
# to the text prompt used for that folder's images.
TEXT_PROMPT_CSV = "../zm_scraper/items-prompt.csv"
# Forwarded to GroundingDINO's predict() as box_threshold / text_threshold.
# NOTE(review): presumably minimum box / phrase confidences - confirm against
# the installed GroundingDINO version.
BOX_THRESHOLD = 0.1
TEXT_THRESHOLD = 0.25
# Mild post-processing during inference. Despite the name, this is not a
# containment test: when True, detections are visited in descending score
# order and one is dropped if its IoU with an already-kept detection is
# >= IOU_NMS_THRESHOLD.
REMOVE_NESTING = True
# IoU at or above which the lower-scoring detection is dropped.
IOU_NMS_THRESHOLD = 0.9

In [8]:
import os
import cv2
import torch
import pandas as pd
from pathlib import Path
import numpy as np
import json
import math
from groundingdino.util.inference import load_model, load_image, predict, annotate


def iou_xyxy(a, b):
    """Return the intersection-over-union of two corner-format boxes.

    Args:
        a: Box given as a 4-item sequence (x1, y1, x2, y2).
        b: Second box in the same format.

    Returns:
        The overlap area divided by the union area, or 0.0 when the boxes
        do not overlap or the union area is not positive.
    """
    a_left, a_top, a_right, a_bottom = a
    b_left, b_top, b_right, b_bottom = b

    # Extent of the overlap rectangle; clamped so disjoint boxes give 0.
    overlap_w = max(0.0, min(a_right, b_right) - max(a_left, b_left))
    overlap_h = max(0.0, min(a_bottom, b_bottom) - max(a_top, b_top))
    overlap = overlap_w * overlap_h
    if overlap <= 0:
        return 0.0

    size_a = max(0.0, a_right - a_left) * max(0.0, a_bottom - a_top)
    size_b = max(0.0, b_right - b_left) * max(0.0, b_bottom - b_top)
    combined = size_a + size_b - overlap
    if combined > 0:
        return overlap / combined
    return 0.0

# -----------------------------------
# Model files
# -----------------------------------
MODEL_CONFIG_PATH = "./groundingdino/config/GroundingDINO_SwinB_cfg.py"
MODEL_WEIGHTS_PATH = "./weights/checkpoint_best_regular.pth"

# -----------------------------------
# Build the model once; it is reused for every image below
# -----------------------------------
model = load_model(MODEL_CONFIG_PATH, MODEL_WEIGHTS_PATH)

# -----------------------------------
# Make sure both output roots exist before per-item folders are created
# -----------------------------------
for _output_root in (OUTPUT_DIR, OUTPUT_LABELS_DIR):
    os.makedirs(_output_root, exist_ok=True)

# -----------------------------------
# Prompt table: item id (as a string, so it can be used as a folder name)
# mapped to the text prompt for that item
# -----------------------------------
df_prompts = pd.read_csv(TEXT_PROMPT_CSV)
id_to_prompt = {
    item_key: prompt_text
    for item_key, prompt_text in zip(
        df_prompts["id"].astype(str), df_prompts["prompt"]
    )
}

# -----------------------------------
# Per-image helpers
# -----------------------------------
# File name suffixes (compared case-insensitively) treated as images.
_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")


def _build_detections(boxes, logits, phrases, wid, hgt):
    """Pair every predicted box with its score and phrase.

    Each box is unpacked as (cx, cy, w, h) and scaled by the image width and
    height, i.e. it is treated as normalized to the image size.
    NOTE(review): confirm this matches the installed GroundingDINO predict().

    Returns:
        A list of dicts with the normalized box ("box_norm"), the pixel-space
        corner box ("box_abs" as [x_min, y_min, x_max, y_max]), the raw
        "logit" and the "phrase" ("NIL" when the phrase is empty).
    """
    detections = []
    for box, logit, phrase in zip(boxes, logits, phrases):
        cx_norm, cy_norm, w_norm, h_norm = box.tolist()

        cx_abs = cx_norm * wid
        cy_abs = cy_norm * hgt
        w_abs = w_norm * wid
        h_abs = h_norm * hgt

        detections.append({
            "box_norm": [cx_norm, cy_norm, w_norm, h_norm],
            "box_abs": [
                cx_abs - w_abs / 2,
                cy_abs - h_abs / 2,
                cx_abs + w_abs / 2,
                cy_abs + h_abs / 2,
            ],
            "logit": logit,
            "phrase": phrase if phrase else "NIL",
        })
    return detections


def _suppress_overlaps(detections, iou_threshold):
    """Drop detections that overlap a higher-scoring one too strongly.

    Detections are visited in descending score order; one is kept only if
    its IoU with every already-kept detection is below ``iou_threshold``.
    """
    ranked = sorted(detections, key=lambda d: float(d["logit"]), reverse=True)
    kept = []
    for det in ranked:
        if all(
            iou_xyxy(det["box_abs"], other["box_abs"]) < iou_threshold
            for other in kept
        ):
            kept.append(det)
    return kept


def _draw_detection_ids(frame, detections, wid):
    """Write the 1-based detection index next to each box, in place.

    The label normally sits just above the box's top-left corner. For boxes
    touching the top/left/right border that spot is outside the canvas, so
    the origin is pulled back inside to keep the ID readable.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.6
    thickness = 2
    for idx, det in enumerate(detections, start=1):
        label_id = str(idx)  # ID starting at 1 for this image
        (text_w, text_h), _ = cv2.getTextSize(label_id, font, font_scale, thickness)
        x_min, y_min = det["box_abs"][:2]
        org_x = min(max(int(x_min), 0), max(wid - text_w, 0))
        org_y = max(int(y_min) - 5, text_h + 2)
        cv2.putText(
            frame,
            label_id,
            (org_x, org_y),
            font,
            font_scale,
            (0, 255, 0),  # green text
            thickness,
            cv2.LINE_AA,
        )


def _to_coco(filename, wid, hgt, detections):
    """Build a single-image COCO-style dict for ``detections``.

    Category ids are assigned per image, starting at 1, in order of first
    appearance of each phrase. Boxes are clipped to the image rectangle and
    written as [x, y, width, height] with 3-decimal precision; "area" is the
    (integer-truncated) area of the clipped box so it agrees with "bbox".
    """
    category_ids = {}
    annotations = []
    for annotation_id, det in enumerate(detections, start=1):
        phrase = det["phrase"]
        category_id = category_ids.setdefault(phrase, len(category_ids) + 1)

        x_min, y_min, x_max, y_max = det["box_abs"]
        # Predicted boxes may stick out of the image; clip before export.
        x_min = min(max(x_min, 0.0), float(wid))
        x_max = min(max(x_max, 0.0), float(wid))
        y_min = min(max(y_min, 0.0), float(hgt))
        y_max = min(max(y_max, 0.0), float(hgt))
        box_w = x_max - x_min
        box_h = y_max - y_min

        annotations.append({
            "id": annotation_id,
            "image_id": 1,
            "category_id": category_id,
            "bbox": [round(v, 3) for v in (x_min, y_min, box_w, box_h)],
            "area": int(box_w * box_h),
            "score": round(float(det["logit"]), 3),
            "iscrowd": 0,
        })

    return {
        "images": [{
            "id": 1,  # You can increment across dataset if aggregating
            "file_name": filename,
            "width": wid,
            "height": hgt,
        }],
        "categories": [
            {"id": category_id, "name": name}
            for name, category_id in category_ids.items()
        ],
        "annotations": annotations,
    }


# -----------------------------------
# Process images per ID
# -----------------------------------
for item_id, text_prompt in id_to_prompt.items():
    input_subdir = Path(INPUT_DIR) / item_id
    if not input_subdir.exists():
        print(f"Skipping {item_id}: folder {input_subdir} does not exist.")
        continue

    output_subdir = Path(OUTPUT_DIR) / item_id
    labels_subdir = Path(OUTPUT_LABELS_DIR) / item_id
    output_subdir.mkdir(parents=True, exist_ok=True)
    labels_subdir.mkdir(parents=True, exist_ok=True)

    # Sorted so the processing (and log) order is reproducible across runs.
    for filename in sorted(os.listdir(input_subdir)):
        # Skip non-image files
        if not filename.lower().endswith(_IMAGE_SUFFIXES):
            continue

        input_path = input_subdir / filename
        base_filename = os.path.splitext(filename)[0]

        # Load image
        image_source, image = load_image(str(input_path))
        hgt, wid = image_source.shape[:2]

        # Predict
        boxes, logits, phrases = predict(
            model=model,
            image=image,
            caption=text_prompt,
            box_threshold=BOX_THRESHOLD,
            text_threshold=TEXT_THRESHOLD,
        )

        detections = _build_detections(boxes, logits, phrases, wid, hgt)
        if REMOVE_NESTING:
            detections = _suppress_overlaps(detections, IOU_NMS_THRESHOLD)

        # Reconstruct the inputs annotate() expects from the kept detections
        final_boxes = [d["box_norm"] for d in detections]
        if final_boxes:
            final_boxes_tensor = torch.tensor(final_boxes)
        else:
            final_boxes_tensor = torch.empty((0, 4))

        annotated_frame = annotate(
            image_source=image_source,
            boxes=final_boxes_tensor,
            logits=[d["logit"] for d in detections],
            phrases=[d["phrase"] for d in detections],
        )

        # Draw annotation IDs on the image (same numbering as the JSON ids)
        _draw_detection_ids(annotated_frame, detections, wid)

        # Save annotated image; imwrite reports failure via its return value
        output_image_path = output_subdir / filename
        if not cv2.imwrite(str(output_image_path), annotated_frame):
            print(f"[{item_id}] WARNING: failed to write {output_image_path}")

        # Save labels in COCO JSON format
        output_json_path = labels_subdir / f"{base_filename}.json"
        with open(output_json_path, "w", encoding="utf-8") as f:
            json.dump(_to_coco(filename, wid, hgt, detections), f, indent=2)

        print(f"[{item_id}] Processed {filename} -> {output_image_path}")

print("All images processed.")


final text_encoder_type: bert-base-uncased




[1] Processed c1190881000.png -> ../zm_scraper/listing/gdino/output/1/c1190881000.png
[1] Processed b1187681880.png -> ../zm_scraper/listing/gdino/output/1/b1187681880.png
[1] Processed x1189677434.png -> ../zm_scraper/listing/gdino/output/1/x1189677434.png
[1] Processed m1168646860.png -> ../zm_scraper/listing/gdino/output/1/m1168646860.png
[1] Processed r1186986297.png -> ../zm_scraper/listing/gdino/output/1/r1186986297.png
[1] Processed c1191323690.png -> ../zm_scraper/listing/gdino/output/1/c1191323690.png
[1] Processed o1191069257.png -> ../zm_scraper/listing/gdino/output/1/o1191069257.png
[1] Processed q1190999908.png -> ../zm_scraper/listing/gdino/output/1/q1190999908.png
[1] Processed h1191606615.png -> ../zm_scraper/listing/gdino/output/1/h1191606615.png
[1] Processed x1191598905.png -> ../zm_scraper/listing/gdino/output/1/x1191598905.png
[1] Processed m1191510476.png -> ../zm_scraper/listing/gdino/output/1/m1191510476.png
[1] Processed l1182133390.png -> ../zm_scraper/listing

[1] Processed k1191395334.png -> ../zm_scraper/listing/gdino/output/1/k1191395334.png
[1] Processed j1187062556.png -> ../zm_scraper/listing/gdino/output/1/j1187062556.png
[1] Processed x1121603563.png -> ../zm_scraper/listing/gdino/output/1/x1121603563.png
[1] Processed w1181337501.png -> ../zm_scraper/listing/gdino/output/1/w1181337501.png
[1] Processed n1191590384.png -> ../zm_scraper/listing/gdino/output/1/n1191590384.png
[1] Processed n1191690494.png -> ../zm_scraper/listing/gdino/output/1/n1191690494.png
[1] Processed l1180181252.png -> ../zm_scraper/listing/gdino/output/1/l1180181252.png
[1] Processed p1187246330.png -> ../zm_scraper/listing/gdino/output/1/p1187246330.png
[1] Processed w1191415731.png -> ../zm_scraper/listing/gdino/output/1/w1191415731.png
[1] Processed u1191585367.png -> ../zm_scraper/listing/gdino/output/1/u1191585367.png
[1] Processed s1178157340.png -> ../zm_scraper/listing/gdino/output/1/s1178157340.png
[1] Processed b1190374058.png -> ../zm_scraper/listing

[1] Processed j1155396060.png -> ../zm_scraper/listing/gdino/output/1/j1155396060.png
[1] Processed q1183324759.png -> ../zm_scraper/listing/gdino/output/1/q1183324759.png
[1] Processed j1191431672.png -> ../zm_scraper/listing/gdino/output/1/j1191431672.png
[1] Processed u1187785966.png -> ../zm_scraper/listing/gdino/output/1/u1187785966.png
[1] Processed e1183317765.png -> ../zm_scraper/listing/gdino/output/1/e1183317765.png
[1] Processed v1120503911.png -> ../zm_scraper/listing/gdino/output/1/v1120503911.png
[1] Processed p1188910687.png -> ../zm_scraper/listing/gdino/output/1/p1188910687.png
[1] Processed b1183312199.png -> ../zm_scraper/listing/gdino/output/1/b1183312199.png
[2] Processed x1185026795.png -> ../zm_scraper/listing/gdino/output/2/x1185026795.png
[2] Processed 1183422386.png -> ../zm_scraper/listing/gdino/output/2/1183422386.png
[2] Processed e1191032005.png -> ../zm_scraper/listing/gdino/output/2/e1191032005.png
[2] Processed w1183158454.png -> ../zm_scraper/listing/g

[3] Processed o1191829396.png -> ../zm_scraper/listing/gdino/output/3/o1191829396.png
[3] Processed q1191566819.png -> ../zm_scraper/listing/gdino/output/3/q1191566819.png
[3] Processed w1191615331.png -> ../zm_scraper/listing/gdino/output/3/w1191615331.png
[3] Processed o1191824345.png -> ../zm_scraper/listing/gdino/output/3/o1191824345.png
[3] Processed e1190825724.png -> ../zm_scraper/listing/gdino/output/3/e1190825724.png
[3] Processed q1175201682.png -> ../zm_scraper/listing/gdino/output/3/q1175201682.png
[3] Processed e1191338151.png -> ../zm_scraper/listing/gdino/output/3/e1191338151.png
[3] Processed b1187277326.png -> ../zm_scraper/listing/gdino/output/3/b1187277326.png
[3] Processed w1191003804.png -> ../zm_scraper/listing/gdino/output/3/w1191003804.png
[3] Processed 1182698598.png -> ../zm_scraper/listing/gdino/output/3/1182698598.png
[3] Processed e1179009805.png -> ../zm_scraper/listing/gdino/output/3/e1179009805.png
[3] Processed f1154734978.png -> ../zm_scraper/listing/g

[3] Processed l1191353837.png -> ../zm_scraper/listing/gdino/output/3/l1191353837.png
[3] Processed t1191527313.png -> ../zm_scraper/listing/gdino/output/3/t1191527313.png
[3] Processed b1187441783.png -> ../zm_scraper/listing/gdino/output/3/b1187441783.png
[3] Processed j1185687129.png -> ../zm_scraper/listing/gdino/output/3/j1185687129.png
[3] Processed w1184158234.png -> ../zm_scraper/listing/gdino/output/3/w1184158234.png
[3] Processed m1191425689.png -> ../zm_scraper/listing/gdino/output/3/m1191425689.png
[3] Processed o1190944251.png -> ../zm_scraper/listing/gdino/output/3/o1190944251.png
[3] Processed p1171340896.png -> ../zm_scraper/listing/gdino/output/3/p1171340896.png
[3] Processed g1190570632.png -> ../zm_scraper/listing/gdino/output/3/g1190570632.png
[3] Processed j1190929952.png -> ../zm_scraper/listing/gdino/output/3/j1190929952.png
[3] Processed x1179305942.png -> ../zm_scraper/listing/gdino/output/3/x1179305942.png
[3] Processed r1182700824.png -> ../zm_scraper/listing

[4] Processed j1191559623.png -> ../zm_scraper/listing/gdino/output/4/j1191559623.png
[4] Processed c1189277412.png -> ../zm_scraper/listing/gdino/output/4/c1189277412.png
[4] Processed 1126237052.png -> ../zm_scraper/listing/gdino/output/4/1126237052.png
[4] Processed p1191768461.png -> ../zm_scraper/listing/gdino/output/4/p1191768461.png
[4] Processed c1191622454.png -> ../zm_scraper/listing/gdino/output/4/c1191622454.png
[4] Processed 1190767842.png -> ../zm_scraper/listing/gdino/output/4/1190767842.png
[4] Processed p1191618639.png -> ../zm_scraper/listing/gdino/output/4/p1191618639.png
[4] Processed u1191631296.png -> ../zm_scraper/listing/gdino/output/4/u1191631296.png
[4] Processed r1191222266.png -> ../zm_scraper/listing/gdino/output/4/r1191222266.png
[4] Processed n1191549926.png -> ../zm_scraper/listing/gdino/output/4/n1191549926.png
[4] Processed h1191600287.png -> ../zm_scraper/listing/gdino/output/4/h1191600287.png
[4] Processed p1191568309.png -> ../zm_scraper/listing/gdi

[5] Processed f1174041028.png -> ../zm_scraper/listing/gdino/output/5/f1174041028.png
[5] Processed e1178664397.png -> ../zm_scraper/listing/gdino/output/5/e1178664397.png
[5] Processed p1187267693.png -> ../zm_scraper/listing/gdino/output/5/p1187267693.png
[5] Processed e1187205501.png -> ../zm_scraper/listing/gdino/output/5/e1187205501.png
[5] Processed j1191728283.png -> ../zm_scraper/listing/gdino/output/5/j1191728283.png
[5] Processed m1191719765.png -> ../zm_scraper/listing/gdino/output/5/m1191719765.png
[5] Processed u1127527151.png -> ../zm_scraper/listing/gdino/output/5/u1127527151.png
[5] Processed n1181024942.png -> ../zm_scraper/listing/gdino/output/5/n1181024942.png
[5] Processed b1191057471.png -> ../zm_scraper/listing/gdino/output/5/b1191057471.png
[5] Processed p1190346016.png -> ../zm_scraper/listing/gdino/output/5/p1190346016.png
[5] Processed u1185937530.png -> ../zm_scraper/listing/gdino/output/5/u1185937530.png
[5] Processed k1191729356.png -> ../zm_scraper/listing

[5] Processed b1094751541.png -> ../zm_scraper/listing/gdino/output/5/b1094751541.png
[5] Processed b1191737718.png -> ../zm_scraper/listing/gdino/output/5/b1191737718.png
[5] Processed b1160855625.png -> ../zm_scraper/listing/gdino/output/5/b1160855625.png
[5] Processed g1191629025.png -> ../zm_scraper/listing/gdino/output/5/g1191629025.png
[5] Processed r1191422542.png -> ../zm_scraper/listing/gdino/output/5/r1191422542.png
[5] Processed q1031915603.png -> ../zm_scraper/listing/gdino/output/5/q1031915603.png
[5] Processed l1178403345.png -> ../zm_scraper/listing/gdino/output/5/l1178403345.png
[5] Processed f1175804046.png -> ../zm_scraper/listing/gdino/output/5/f1175804046.png
[5] Processed s1191121020.png -> ../zm_scraper/listing/gdino/output/5/s1191121020.png
[5] Processed w1191580281.png -> ../zm_scraper/listing/gdino/output/5/w1191580281.png
[5] Processed x1162587148.png -> ../zm_scraper/listing/gdino/output/5/x1162587148.png
[5] Processed l1191182887.png -> ../zm_scraper/listing

## Visualize images

In [9]:
import matplotlib.pyplot as plt
import os
from math import ceil
import pandas as pd
import numpy as np


# File extensions (leading dot included) accepted as images when
# batch_visualize_items scans an item folder; compared against the
# lower-cased extension of each file name.
valid_image_ext = ['.jpg', '.jpeg', '.png']

def visualize_and_save_images_for_item(
    image_paths,
    output_display_path,
    item_id,
    part_idx,
    prompt="",
    box_thresh=0.0,
    text_thresh=0.0,
    num_cols=5
):
    """Render a grid of images and save it as ``{item_id}_{part_idx}.png``.

    Args:
        image_paths: Paths of the images to show, one per grid cell, filled
            row by row. Unused trailing cells are left blank.
        output_display_path: Directory the grid PNG is written to; created
            if it does not exist.
        item_id: Identifier used in the output file name and log messages.
        part_idx: Part number used in the output file name and log messages.
        prompt: Text shown in each image's caption box.
        box_thresh: Box threshold value shown in each caption box.
        text_thresh: Text threshold value shown in each caption box.
        num_cols: Number of grid columns.

    Returns:
        None. Nothing is saved when ``image_paths`` is empty.
    """
    num_images = len(image_paths)
    if num_images == 0:
        # A grid with zero rows cannot be created; skip instead of crashing.
        print(f"[{item_id}] No images for visualization part {part_idx}. Skipping.")
        return

    num_rows = int(ceil(float(num_images) / float(num_cols)))
    os.makedirs(output_display_path, exist_ok=True)

    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(
        num_rows,
        num_cols,
        figsize=(num_cols * 6, num_rows * 6),
        squeeze=False,
    )
    try:
        fig.tight_layout(pad=3.0)

        for idx, ax in enumerate(axes.ravel()):
            ax.axis('off')
            if idx >= num_images:
                continue

            img_path = image_paths[idx]
            img = plt.imread(img_path)

            overlay_text = (
                f"{os.path.basename(img_path)}\n"
                f"Prompt: '{prompt}'\n"
                f"Box Thresh: {box_thresh} | Text Thresh: {text_thresh}"
            )

            ax.imshow(img)
            # Caption anchored to the bottom-right corner of the cell.
            ax.text(
                1.0,
                0.0,
                overlay_text,
                ha="right",
                va="bottom",
                fontsize=9,
                color="white",
                wrap=True,
                transform=ax.transAxes,
                bbox=dict(
                    facecolor="black",
                    alpha=0.5,
                    boxstyle="round,pad=0.3"
                ),
            )

        output_file = os.path.join(output_display_path, f"{item_id}_{part_idx}.png")
        fig.savefig(output_file)
    finally:
        # Always release the figure, even if reading or saving an image fails.
        plt.close(fig)

    print(f"[{item_id}] Saved visualization part {part_idx} to {output_file}")


# -------------------------------------------------------------
# Main function: iterate over every item ID listed in the prompts CSV
# -------------------------------------------------------------
def batch_visualize_items(
    items_list_csv,
    output_base_dir,
    output_display_dir,
    num_cols=5,
    box_thresh=0.0,
    text_thresh=0.0,
    max_images_per_grid=25
):
    """Save image grids for every item listed in a CSV.

    Args:
        items_list_csv: CSV file with "id" and "prompt" columns.
        output_base_dir: Directory holding one subfolder of images per id.
        output_display_dir: Directory for the grid images; created if missing.
        num_cols: Grid columns, forwarded to the per-grid renderer.
        box_thresh: Box threshold shown in the captions.
        text_thresh: Text threshold shown in the captions.
        max_images_per_grid: Maximum number of images placed on one grid;
            an item with more images is split into several numbered parts.
    """
    os.makedirs(output_display_dir, exist_ok=True)

    table = pd.read_csv(items_list_csv, usecols=["id", "prompt"])
    rows = table[["id", "prompt"]].astype(str).values.tolist()

    for item_id, prompt in rows:
        item_folder = os.path.join(output_base_dir, item_id)
        if not os.path.exists(item_folder):
            print(f"[{item_id}] Folder {item_folder} does not exist. Skipping.")
            continue

        # Collect the item's images in name order.
        found = []
        for entry in sorted(os.listdir(item_folder)):
            _, extension = os.path.splitext(entry)
            if extension.lower() in valid_image_ext:
                found.append(os.path.join(item_folder, entry))

        if not found:
            print(f"[{item_id}] No images found in {item_folder}. Skipping.")
            continue

        # Walk the images in slices of max_images_per_grid; parts count from 1.
        part_idx = 0
        for start in range(0, len(found), max_images_per_grid):
            part_idx += 1
            visualize_and_save_images_for_item(
                image_paths=found[start:start + max_images_per_grid],
                output_display_path=output_display_dir,
                item_id=item_id,
                part_idx=part_idx,
                prompt=prompt,
                box_thresh=box_thresh,
                text_thresh=text_thresh,
                num_cols=num_cols
            )

# -------------------------------------------------------------
# Example usage
# -------------------------------------------------------------

# Build the grids from the annotated images written by the inference cell,
# captioning each image with the thresholds that were used for inference.
batch_visualize_items(
    TEXT_PROMPT_CSV,
    OUTPUT_DIR,
    OUTPUT_DISPLAY_DIR,
    num_cols=5,
    box_thresh=BOX_THRESHOLD,
    text_thresh=TEXT_THRESHOLD,
    max_images_per_grid=25,
)


[1] Saved visualization part 1 to ../zm_scraper/listing/gdino/output_display/1_1.png
[1] Saved visualization part 2 to ../zm_scraper/listing/gdino/output_display/1_2.png
[1] Saved visualization part 3 to ../zm_scraper/listing/gdino/output_display/1_3.png
[1] Saved visualization part 4 to ../zm_scraper/listing/gdino/output_display/1_4.png
[1] Saved visualization part 5 to ../zm_scraper/listing/gdino/output_display/1_5.png
[1] Saved visualization part 6 to ../zm_scraper/listing/gdino/output_display/1_6.png
[1] Saved visualization part 7 to ../zm_scraper/listing/gdino/output_display/1_7.png
[1] Saved visualization part 8 to ../zm_scraper/listing/gdino/output_display/1_8.png
[2] Saved visualization part 1 to ../zm_scraper/listing/gdino/output_display/2_1.png
[2] Saved visualization part 2 to ../zm_scraper/listing/gdino/output_display/2_2.png
[2] Saved visualization part 3 to ../zm_scraper/listing/gdino/output_display/2_3.png
[2] Saved visualization part 4 to ../zm_scraper/listing/gdino/out