## Load the libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import json
from PIL import Image
import pycocotools.mask as mask_util
import cv2
import glob
from IPython.display import clear_output
import random

## The drawing functions

In [2]:
def show_box(boxes, colors, ax=None):
    """
    Draw bounding boxes as unfilled rectangles on a matplotlib axes.

    Args:
        boxes: iterable of boxes, each indexable as ``[x0, y0, w, h]``; the first
            two entries are passed to ``plt.Rectangle`` as the anchor point and the
            last two as width and height.
        colors: iterable of matplotlib colors paired with ``boxes`` by position;
            surplus entries on either side are ignored (``zip`` semantics).
        ax: axes to draw on. Defaults to the current axes (``plt.gca()``).
    """
    box_color_pairs = list(zip(boxes, colors))
    # Nothing to draw: return before touching pyplot so no axes is created as a side effect.
    if not box_color_pairs:
        return

    # Resolve the target axes once instead of on every iteration.
    if ax is None:
        ax = plt.gca()

    for box, color in box_color_pairs:
        x0, y0 = box[0], box[1]
        w, h = box[2], box[3]
        # Fully transparent face so only the coloured outline is visible.
        ax.add_patch(
            plt.Rectangle((x0, y0), w, h, edgecolor=color, facecolor=(0, 0, 0, 0), lw=2)
        )

def show_anns(masks, colors, borders=True, ax=None) -> None:
    """
    Overlay segmentation masks on an axes as semi-transparent coloured regions.

    Args:
        masks: sequence of 2-D arrays of identical shape (H, W); non-zero entries
            mark the object. Boolean and integer (e.g. 0/1 ``uint8``) masks are
            both accepted.
        colors: sequence of RGB triples paired with ``masks`` by position.
        borders: when True, extract each mask's contours with OpenCV and draw them
            in a near-black, fully opaque colour.
        ax: axes to draw on. Defaults to the current axes (``plt.gca()``).
    """
    # return if no masks
    if len(masks) == 0:
        return

    # Coerce to boolean. With an integer mask, ``canvas[mask]`` would be fancy
    # *index* selection (whole rows 0 and 1) rather than a per-pixel selection.
    bool_masks = [np.asarray(mask, dtype=bool) for mask in masks]

    # Largest mask first, so smaller masks are painted later and stay visible on top.
    sorted_annot_and_color = sorted(
        zip(bool_masks, colors), key=(lambda x: x[0].sum()), reverse=True
    )
    H, W = sorted_annot_and_color[0][0].shape[0], sorted_annot_and_color[0][0].shape[1]

    # RGBA canvas: white, with alpha 0 so pixels not covered by a mask stay transparent.
    canvas = np.ones((H, W, 4))
    canvas[:, :, 3] = 0  # set the alpha channel
    # Contour width is 1% of the shorter image side, clamped to the range [1, 5] pixels.
    contour_thickness = max(1, int(min(5, 0.01 * min(H, W))))
    for mask, color in sorted_annot_and_color:
        # Fill the masked pixels with the RGB colour at alpha 0.55.
        canvas[mask] = np.concatenate([color, [0.55]])
        if borders:
            contours, _ = cv2.findContours(
                np.array(mask, dtype=np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE
            )
            cv2.drawContours(
                canvas, contours, -1, (0.05, 0.05, 0.05, 1), thickness=contour_thickness
            )

    if ax is None:
        ax = plt.gca()
    ax.imshow(canvas)

## Set the path
Set the path to the folder containing the video frames and the path to the JSON annotation file (YT-VIS format).

In [3]:
# Root folder of the video frames; the per-frame file names stored in the annotation
# file are appended to this path when frames are opened.
# NOTE(review): absolute, environment-specific path — adjust before running elsewhere.
video_dir = "/fsx-onevision/shared/data/ta_data_center/definedAI_release/unified/train/JPEGImages_blurred_v0"
# JSON annotation file (YT-VIS format) that is parsed into `data`.
json_annotation = "/fsx-onevision-auto-sync/ythu/sam3/video_grounding/release_20250131/ytvis/balanced/eval_set_v1_20250131_200_pairs_ytvis_format.json"

Load the annotation

In [5]:
# Parse the annotation file inside a context manager so the file handle is closed
# as soon as the JSON has been read (a bare `open()` leaves it open).
with open(json_annotation) as annotation_file:
    data = json.load(annotation_file)

The data is in YT-VIS format

In [None]:
# Show the top-level keys of the loaded annotation dict.
data.keys()

In [None]:
# Show the entry at index 0 of data['videos'] as an example video record.
data['videos'][0]

In [None]:
# Show the entry at index 0 of data['categories'] as an example category record.
data['categories'][0]

Visualize the data

In [None]:
# Keep only the entries of data['video_np_pairs'] whose `num_tracklets_dedup`
# value is greater than 5.
pairs = [
    candidate
    for candidate in data['video_np_pairs']
    if candidate['num_tracklets_dedup'] > 5
]

In [None]:
# Pick one (video, noun phrase) pair at random, then look up its video record and
# every annotation that belongs to that video and noun phrase.
pair = random.choice(pairs)
video = next(vid for vid in data['videos'] if vid['id'] == pair['video_id'])
annotations = [
    annot
    for annot in data['annotations']
    if annot['video_id'] == pair["video_id"] and annot["noun_phrase"] == pair['noun_phrase']
]

# One random RGB colour per annotation, reused on every frame so each tracklet
# keeps the same colour throughout the playback.
colors = np.random.random((len(annotations), 3))

for fid in range(video['length']):
    # Collect only the tracklets that have a segmentation in this frame. Drawing
    # placeholders for absent ones would add a zero-size box at the image origin
    # and run contour extraction on empty masks.
    masks = []
    bboxes = []
    frame_colors = []
    for tracklet, color in zip(annotations, colors):
        rle = tracklet['segmentations'][fid]
        if rle is None:
            continue
        masks.append(mask_util.decode(rle) > 0)
        bboxes.append(tracklet["bboxes"][fid])
        frame_colors.append(color)

    clear_output(wait=True)

    # Start each frame from an empty figure: on backends that keep the figure open
    # between pauses, the previous frame's image, masks and boxes would otherwise
    # accumulate on the same axes.
    plt.clf()
    # imshow converts the PIL image to an array right away, so the file can be
    # closed immediately instead of leaking one handle per frame.
    with Image.open(f"{video_dir}/{video['file_names'][fid]}") as img:
        plt.imshow(img)
    show_anns(masks, frame_colors)
    show_box(bboxes, frame_colors)
    plt.title(f"frame={video['file_names'][fid]}, np={pair['noun_phrase']}")
    plt.pause(0.1)  # Adjust the pause duration as needed