# Open-Vocabulary Object Detection Demo

This notebook renders side-by-side videos (and, optionally, GIF animations) comparing our method in generalized zero-shot mode against a supervised baseline trained on seen classes.

Let's start with a few standard imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Expose only GPU index 7 to this kernel (overrides any value inherited from the shell).
os.environ.update(CUDA_VISIBLE_DEVICES='7')

In [3]:
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab

import imageio
import requests
from io import BytesIO
from PIL import Image
import numpy as np

In [4]:
import json
import cv2
from copy import deepcopy

In [5]:
from maskrcnn_benchmark.config import cfg
from predictor import COCODemo

In [6]:
# Fetch the FreeImage library for imageio; the 'GIF-FI' format used when saving
# GIFs further down depends on this plugin.
imageio.plugins.freeimage.download()

In [7]:
# Four-character codec code ('XVID') handed to every cv2.VideoWriter created below.
fourcc = cv2.VideoWriter_fourcc('X', 'V', 'I', 'D')

In [8]:
# Snapshot the still-unmodified config: `cfg` receives our model's overrides in
# the next cell, and the baseline model gets its own overrides applied to this copy.
cfg2 = deepcopy(cfg)

In [9]:
# Overrides for our embedding-based model, kept as key -> value pairs and
# flattened into the alternating [key, value, key, value, ...] list that
# merge_from_list receives.
ours_overrides = {
    "MODEL.WEIGHT": "/home/alireza/workspace/ovo/runs-new/maskrcnn/130-rep/model_0120000.pth",
    "MODEL.CLS_AGNOSTIC_BBOX_REG": True,
    "MODEL.ROI_BOX_HEAD.EMB_DIM": 768,
    "MODEL.ROI_BOX_HEAD.EMBEDDING_BASED": True,
}
cfg.merge_from_list(
    [entry for key_value in ours_overrides.items() for entry in key_value]
)

Now we create the `COCODemo` object. It contains a few extra options for convenience, such as the confidence threshold for detections to be shown.

In [10]:
# Demo wrapper around our model; min_image_size and confidence_threshold are
# the same values used for the baseline wrapper later on.
coco_demo = COCODemo(cfg, min_image_size=800, confidence_threshold=0.7)

Loading the class names and their embeddings

In [11]:
# Annotation file whose 'categories' entries supply the class names and their
# embeddings read in the next cell.
ann_file = '../datasets/coco/zero-shot/instances_val2017_all_2.json'

In [12]:
with open(ann_file, 'r') as fin:
    ann_data = json.load(fin)

# Index 0 is a synthetic background class: an all-zero 768-d vector paired with
# the '__background' label. Every category from the file follows in file order,
# so row i of class_embeddings belongs to class_names[i].
class_names = ['__background'] + [
    category['name'] for category in ann_data['categories']
]
class_embeddings = np.stack(
    [np.zeros((768,), dtype=np.float32)] + [
        np.asarray(category['embedding']['BertEmb'], dtype=np.float32)
        for category in ann_data['categories']
    ]
)

In [13]:
# Label list for the baseline: '__background' followed by the category names
# found in the seen-classes annotation file, in file order.
ann_file2 = '../datasets/coco/zero-shot/instances_val2017_seen.json'
with open(ann_file2, 'r') as fin:
    ann_data2 = json.load(fin)
class_names2 = ['__background'] + [
    category['name'] for category in ann_data2['categories']
]


In [14]:
# Positions in class_names whose label does not occur in the baseline's
# (seen-only) label list.
unseen_indices = [
    index
    for index, name in enumerate(class_names)
    if name not in class_names2
]

In [15]:
# Sanity check: show which positions of class_names were classified as unseen.
print(unseen_indices)

[5, 6, 12, 13, 16, 17, 22, 24, 28, 30, 33, 35, 46, 48, 55, 59, 64]


In [16]:
# Give the demo wrapper the full label list and the unseen-class positions,
# then hand the per-class embedding matrix to the box head's predictor.
coco_demo.CATEGORIES = class_names
coco_demo.UNSEEN_CAT_INDICES = unseen_indices
box_predictor = coco_demo.model.roi_heads['box'].predictor
box_predictor.set_class_embeddings(class_embeddings)

Repeating the same steps to set up the baseline model

In [17]:
# Overrides for the supervised baseline, applied to the config copy taken
# before `cfg` was modified. Kept as key -> value pairs and flattened into the
# alternating [key, value, ...] list that merge_from_list receives.
# NOTE(review): NUM_CLASSES is 81 while the labels assigned below come from the
# seen-classes file - confirm the predicted label indices line up with class_names2.
baseline_overrides = {
    "MODEL.WEIGHT": "/home/alireza/workspace/ovo/runs-extra-ckpt/maskrcnn/004/model_0180000.pth",
    "MODEL.CLS_AGNOSTIC_BBOX_REG": True,
    "MODEL.ROI_BOX_HEAD.NUM_CLASSES": 81,
    "MODEL.ROI_BOX_HEAD.EMBEDDING_BASED": False,
}
cfg2.merge_from_list(
    [entry for key_value in baseline_overrides.items() for entry in key_value]
)

# Same image size and confidence threshold as the wrapper for our model.
coco_demo_2 = COCODemo(cfg2, min_image_size=800, confidence_threshold=0.7)
coco_demo_2.CATEGORIES = class_names2

Loading each frame of each video, processing it through two models, visualizing each, stitching side by side, and saving to a new video file.

In [18]:
# Directory holding the input clips, and the directory the rendered results are written to.
input_path = '../datasets/videos/input/02/'
output_path = '../../outputs/rcnn_ov/videos/02.01/'
# Every entry of the input directory, in whatever order os.listdir returns them.
file_list = os.listdir(input_path)
# Stop here if the output directory already exists (presumably so that the
# results of an earlier run are not overwritten); otherwise create it.
assert not os.path.isdir(output_path)
os.makedirs(output_path)

In [19]:
# Display everything that was found in input_path.
file_list

['uUUWejExcLk.mp4',
 '9HF9a9P-lLw.mp4',
 '4vGaWRRs1cc.mp4',
 'bhWhuUhAbqk.mp4',
 'ufpqxRukWaI.mp4',
 'mGVHNXcwcKk.mp4',
 'JSvwWs7PIsw.mp4',
 'LYBKNaGad7o.mp4',
 '0xBxtyxw488.mp4',
 'cF0316u8OG4.mp4',
 'vOO_fDMyRiM.mp4',
 'Ehbg7RUmk.mp4',
 'EiLpNm8C0TM.mp4',
 'IxGPyoml9C4.mp4',
 'mqFXobS9KTw.mp4']

In [20]:
# Replace the full directory listing with a hand-picked subset; the videos are
# processed in this order.
file_list = ['9HF9a9P-lLw.mp4', 'cF0316u8OG4.mp4', 'IxGPyoml9C4.mp4', 'vOO_fDMyRiM.mp4']

In [21]:
# Which outputs to produce for every input video.
solo = False           # our model's visualisation on its own
side_by_side = True    # baseline (left half) next to our model (right half)
generate_video = True  # write video files through cv2.VideoWriter
generate_gif = False   # write GIF files through imageio's 'GIF-FI' format

# Frame rate stamped on every output (videos and GIFs). The frame rate of the
# input file is not consulted.
OUTPUT_FPS = 25


def _open_writer(path, frame_size):
    """Open a video writer at `path` for frames of `frame_size` (width, height).

    Raises:
        Exception: if OpenCV could not open the output file.
    """
    writer = cv2.VideoWriter(path, fourcc, float(OUTPUT_FPS), frame_size)
    if not writer.isOpened():
        writer.release()
        raise Exception("Cannot open output file.")
    return writer


def _save_gif(frames, path):
    """Write the collected frames to `path` as an endlessly looping GIF."""
    # Reverse the channel axis before handing the arrays to PIL
    # (presumably BGR -> RGB, since the frames come from OpenCV).
    images = [Image.fromarray(item[:, :, ::-1]) for item in frames]
    imageio.mimsave(
        path,
        images,
        'GIF-FI',
        fps=OUTPUT_FPS,
        quantizer='nq', # The options are 'nq' and 'wu'
        loop=0,
    )


def _process_video(fname):
    """Run both detectors on every frame of `fname` and write the selected outputs.

    Reads `input_path/fname` frame by frame. Depending on the flags above it
    writes 'solo_<fname>' and/or 'side_by_side_<fname>' videos and the
    corresponding '.gif' files into `output_path`.

    Raises:
        Exception: if the input cannot be opened or yields no frame, or if an
            output video cannot be opened.
    """
    cap = cv2.VideoCapture(os.path.join(input_path, fname))
    out_sol = None
    out_sbs = None
    # GIF frames are accumulated in memory and written once the video has ended.
    gif_sol = []
    gif_sbs = []
    try:
        # Fail with a meaningful message right away instead of falling through
        # to the clean-up code with writers that were never created.
        if not cap.isOpened():
            raise Exception("Cannot open input file.")

        got_frame = False
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                if not got_frame:
                    raise Exception("Cannot open input file.")
                break
            got_frame = True

            # Writers are created lazily because the frame size is only known
            # once the first frame has been decoded.
            height, width = frame.shape[0], frame.shape[1]
            if generate_video and solo and out_sol is None:
                out_sol = _open_writer(
                    os.path.join(output_path, 'solo_' + fname), (width, height))
            if generate_video and side_by_side and out_sbs is None:
                out_sbs = _open_writer(
                    os.path.join(output_path, 'side_by_side_' + fname), (width * 2, height))

            _, _, vis1 = coco_demo.run_on_opencv_image(frame)
            if solo:
                if generate_gif:
                    gif_sol.append(vis1)
                if generate_video:
                    out_sol.write(vis1)
            if side_by_side:
                _, _, vis2 = coco_demo_2.run_on_opencv_image(frame)
                # Baseline on the left, our model on the right.
                vis_sbs = np.concatenate([vis2, vis1], axis=1)
                if generate_gif:
                    gif_sbs.append(vis_sbs)
                if generate_video:
                    out_sbs.write(vis_sbs)
    finally:
        # Always release the capture and any writer that was opened, including
        # when a frame failed half-way through the video.
        cap.release()
        for writer in (out_sol, out_sbs):
            if writer is not None:
                writer.release()

    # Swap the extension properly rather than assuming it is three characters long.
    stem = os.path.splitext(fname)[0]
    if generate_gif and solo:
        _save_gif(gif_sol, os.path.join(output_path, 'solo_' + stem + '.gif'))
    if generate_gif and side_by_side:
        _save_gif(gif_sbs, os.path.join(output_path, 'side_by_side_' + stem + '.gif'))


for fname in file_list:
    try:
        _process_video(fname)
    except Exception as ex:
        # Best effort: report the failure and carry on with the next video.
        print(ex)
        print(fname)
