In [1]:
from architectures.baseline.yoloclip import YOLOClip
import torch
from PIL import Image
from io import BytesIO
from dataset import RefCocoBatch, RefCocoConfig, RefCocoDataset
from pathlib import Path
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm

import clip
import torch.optim as optim
import matplotlib.patches as patches
import numpy as np


In [2]:
# Dataset configuration: the data root is the sibling ``../refcocog`` directory.
cfg = RefCocoConfig({"path": "../refcocog"})

# Build the train and test splits (in that order) from the same configuration.
train_dataset, test_dataset = (
    RefCocoDataset(config=cfg, phase=split) for split in ("train", "test")
)

In [3]:
# Number of samples per mini-batch.
batch_size = 16

# Shuffled loader over the training split; RefCocoDataset.batchify is used as
# the collate function that assembles each batch.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=RefCocoDataset.batchify,
)

### Model

In [4]:
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = YOLOClip(device=device)
#model, preprocessor = clip.load("ViT-B/32", device=device)
#optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)

YOLOv5  2023-3-27 Python-3.8.13 torch-1.12.1 CUDA:0 (NVIDIA GeForce RTX 3070 Laptop GPU, 8192MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [6]:
def get_iou(bb1, bb2):
    """
    Calculate the Intersection over Union (IoU) of two axis-aligned bounding boxes.

    Parameters
    ----------
    bb1 : dict
        Keys: {'x1', 'x2', 'y1', 'y2'}
        The (x1, y1) position is at the top left corner,
        the (x2, y2) position is at the bottom right corner.
        Each value may be any scalar accepted by ``float()`` (Python number,
        NumPy scalar, 0-d tensor, ...).
    bb2 : dict
        Keys: {'x1', 'x2', 'y1', 'y2'}
        The (x1, y1) position is at the top left corner,
        the (x2, y2) position is at the bottom right corner.

    Returns
    -------
    float
        A built-in ``float`` in [0, 1]; 0.0 when the boxes do not overlap.

    Raises
    ------
    AssertionError
        If either box has a non-positive width or height.
    """
    # Coerce every coordinate to a plain float first so that the result type
    # does not depend on the scalar type stored in the dictionaries.
    a_x1, a_y1, a_x2, a_y2 = (float(bb1[key]) for key in ('x1', 'y1', 'x2', 'y2'))
    b_x1, b_y1, b_x2, b_y2 = (float(bb2[key]) for key in ('x1', 'y1', 'x2', 'y2'))

    assert a_x1 < a_x2, "bb1 must satisfy x1 < x2"
    assert a_y1 < a_y2, "bb1 must satisfy y1 < y2"
    assert b_x1 < b_x2, "bb2 must satisfy x1 < x2"
    assert b_y1 < b_y2, "bb2 must satisfy y1 < y2"

    # determine the coordinates of the intersection rectangle
    x_left = max(a_x1, b_x1)
    y_top = max(a_y1, b_y1)
    x_right = min(a_x2, b_x2)
    y_bottom = min(a_y2, b_y2)

    # Disjoint boxes: no intersection rectangle exists.
    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The intersection of two axis-aligned bounding boxes is always an
    # axis-aligned bounding box
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    # compute the area of both AABBs
    bb1_area = (a_x2 - a_x1) * (a_y2 - a_y1)
    bb2_area = (b_x2 - b_x1) * (b_y2 - b_y1)

    # IoU = intersection / union, where union = area1 + area2 - intersection.
    # The asserts above guarantee both areas are positive, so the union is > 0.
    iou = intersection_area / (bb1_area + bb2_area - intersection_area)
    assert iou >= 0.0
    assert iou <= 1.0
    return iou

In [18]:
def _draw_box(ax, xmin, ymin, xmax, ymax, color):
    """Outline the box spanning (xmin, ymin)-(xmax, ymax) on ``ax`` in ``color``."""
    # Rectangle is anchored at a corner and extends by +width / +height from it,
    # so the anchor has to be (xmin, ymin) rather than the box centre.
    ax.add_patch(patches.Rectangle(xy=(xmin, ymin), width=xmax - xmin, height=ymax - ymin,
                                   fill=False, color=color, linewidth=3))


# IoU between the top prediction and the ground truth for every inspected sample.
model_iou = []
for batch in tqdm(train_dataloader):
    bboxes = model(batch.images, batch.sentences)

    for i, detections in enumerate(bboxes):
        # Skip samples for which the model produced no candidate box.
        if detections is None or len(detections) == 0:
            continue

        fig, ax = plt.subplots()
        # Images are indexed channel-first here; imshow expects channel-last.
        ax.imshow(batch.images[i].permute(1, 2, 0))

        # Prediction: first detection, first four values are xmin, ymin, xmax, ymax.
        pred_xmin, pred_ymin, pred_xmax, pred_ymax = detections[0][:4].cpu().tolist()
        pred_box = {"x1": pred_xmin, "x2": pred_xmax, "y1": pred_ymin, "y2": pred_ymax}
        _draw_box(ax, pred_xmin, pred_ymin, pred_xmax, pred_ymax, color="green")

        # Ground truth is stored as x, y, w, h; convert it to corner form with
        # plain floats so IoU does not mix scalar types.
        gt = batch.bboxes[i]
        gt_x, gt_y, gt_w, gt_h = (float(gt[k]) for k in range(4))
        true_box = {"x1": gt_x, "x2": gt_x + gt_w, "y1": gt_y, "y2": gt_y + gt_h}
        _draw_box(ax, true_box["x1"], true_box["y1"], true_box["x2"], true_box["y2"], color="blue")

        # Remaining (non-selected) detections.
        for other in detections[1:]:
            _draw_box(ax, *other[:4].cpu().tolist(), color="red")

        # Score & plot
        iou = get_iou(pred_box, true_box)
        model_iou.append(iou)
        # iou is a fraction in [0, 1]; the "%" format spec scales it to a percentage.
        ax.set_title(f"{batch.sentences[i][0]}\n({iou:.1%})")

        fig.savefig("test.jpg")
        plt.close(fig)
        # Debug aid: only the first sample with a detection is rendered ...
        break
    # ... and only the first batch is inspected.
    break

  0%|          | 0/1613 [00:00<?, ?it/s]


In [None]:
# Mean IoU over the collected samples; NaN (without a NumPy warning) when no
# sample produced a detection.
np.mean(model_iou) if model_iou else float("nan")