# DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection

https://github.com/IDEA-Research/DINO

[Papers With Code Link](https://paperswithcode.com/paper/dino-detr-with-improved-denoising-anchor-1)

## Build and install [DINO](https://github.com/IDEA-Research/DINO) Model.

The DINO model requires building CUDA ops. After this step, we need to ***restart the runtime***.

In [1]:
!git -C DINO pull || git clone https://github.com/IDEACVR/DINO
%cd DINO
!pip install --quiet -r requirements.txt \
  && cd models/dino/ops \
  && python setup.py -q build install


Already up to date.
/content/DINO
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.
db-dtypes 1.0.4 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.[0m
zip_safe flag not set; analyzing archive contents...
__pycache__.MultiScaleDeformableAttention.cpython-38: module references __file__


In [2]:
!pip install --quiet -U pylance duckdb torch torchvision transforms numpy pyarrow pandas

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scipy 1.7.3 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.23.5 which is incompatible.
pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.
db-dtypes 1.0.4 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.[0m


In [15]:
# See https://github.com/IDEA-Research/DINO/blob/main/inference_and_visualization.ipynb
# for instruction to load model
from util.slconfig import SLConfig
from main import build_model_main
import torch

model_config_path = "config/DINO/DINO_4scale.py"

args = SLConfig.fromfile(model_config_path) 
args.device = 'cuda' 
model, criterion, postprocessors = build_model_main(args)

# Download model weights.
#
! [[ -f /tmp/model.pt ]] || gsutil cp gs://eto-public/models/dino/checkpoint0033_4scale.pth /tmp/model.pt

model_checkpoint_path = "/tmp/model.pt"
checkpoint = torch.load(model_checkpoint_path)
model.load_state_dict(checkpoint['model'])
_ = model.cuda().eval()


## Prepare COCO validation dataset

In [5]:
! gsutil cp gs://eto-public/datasets/coco/coco_val.lance.tar.gz /tmp/
! tar -C /tmp -xzf /tmp/coco_val.lance.tar.gz && rm /tmp/coco_val.lance.tar.gz

Copying gs://eto-public/datasets/coco/coco_val.lance.tar.gz...
\ [1 files][771.6 MiB/771.6 MiB]   65.2 MiB/s                                   
Operation completed over 1 objects/771.6 MiB.                                    


In [18]:
from lance.pytorch import Dataset
import torchvision.transforms as T
import pandas as pd

# Tunables for the inference pass.
THRESHOLD = 0.5    # detections with a score at or below this are dropped
BATCH_SIZE = 8     # number of images requested per batch
MAX_RESULTS = 100  # stop after the first batch that pushes len(results) past this

# Per-image preprocessing: resize, convert to a tensor, then normalize each
# channel with the given mean / std.
transform = T.Compose([
    T.Resize(400),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

dataset = Dataset(
  "/tmp/coco_val.lance",
  columns=["image", "image_id"],
  mode="batch",
  batch_size=BATCH_SIZE
)

# One record per image: {"image_id": ..., "dino": {"boxes", "labels", "scores"}}.
results = []
with torch.no_grad():
  for batch in dataset:
    # batch[0] holds the images, batch[1] the matching image ids
    # (same order as the `columns` requested above).
    image_ids = batch[1].cpu()
    imgs = [transform(img).cuda() for img in batch[0]]
    output = model(imgs)
    # A (1.0, 1.0) target size is passed for every image.
    # NOTE(review): presumably this keeps boxes in normalized coordinates —
    # the displayed values all fall in [0, 1]; confirm against the post-processor.
    target_sizes = torch.ones(len(imgs), 2, device="cuda")
    output = postprocessors['bbox'](output, target_sizes)
    for image_id, out in zip(image_ids, output):
      keep = out["scores"] > THRESHOLD
      results.append({
          "image_id": image_id.item(),
          "dino": {
              field: out[field][keep].cpu().tolist()
              for field in ("boxes", "labels", "scores")
          },
      })
    # Drop the references to this batch's GPU tensors before the next iteration.
    del imgs, output
    # The limit is checked once per batch, so the final count can overshoot it.
    if len(results) > MAX_RESULTS:
      break

df = pd.DataFrame(data=results)
df

Unnamed: 0,image_id,dino
0,397133,"{'boxes': [[0.049877457320690155, 0.8039787411..."
1,37777,"{'boxes': [[0.8468257784843445, 0.278243213891..."
2,252219,"{'boxes': [[0.790980875492096, 0.4010539650917..."
3,87038,"{'boxes': [[0.39683473110198975, 0.46433973312..."
4,174482,"{'boxes': [[0.3026779592037201, 0.028677880764..."
...,...,...
99,544519,"{'boxes': [[0.2790755033493042, 0.584424138069..."
100,96493,"{'boxes': [[0.2622336447238922, 0.001933038234..."
101,23899,"{'boxes': [[0.0013131499290466309, 0.073289811..."
102,340175,"{'boxes': [[0.6675552129745483, 0.446382582187..."


# We can now add the DINO inference results to the dataset for later reference

In [24]:
# Add the DINO inference results to the dataset for later reference,
# matching rows on `image_id`.

import pyarrow as pa

# Arrow type of the per-image prediction struct stored in the `dino` column.
dino_type = pa.struct([
    pa.field("boxes", pa.list_(pa.list_(pa.float32(), 4))),
    pa.field("labels", pa.list_(pa.int8())),
    pa.field("scores", pa.list_(pa.float32())),
])
prediction_schema = pa.schema([
    pa.field("image_id", pa.int64()),
    pa.field("dino", dino_type),
])

# Convert the pandas results into an Arrow table with the explicit schema.
table = pa.Table.from_pandas(df, schema=prediction_schema)

# TODO: expose merge via PyTorch dataset?
dataset._dataset.merge(table, left_on="image_id", right_on="image_id")

<lance.lib.FileSystemDataset at 0x7efd6256ce70>

In [29]:
import lance
import duckdb

# Re-open the dataset from disk through the plain lance API and report the
# version it is now at.
lance_uri = "/tmp/coco_val.lance"
dataset = lance.dataset(lance_uri)
print("Dataset version: ", dataset.version)

# Display the schema as the cell's output.
dataset.schema

Dataset version:  {'version': 2, 'timestamp': datetime.datetime(2022, 12, 7, 19, 36, 25)}


license: int64
file_name: string
coco_url: extension<image[uri]<ImageUriType>>
height: int16
width: int16
date_captured: timestamp[ns]
flickr_url: extension<image[uri]<ImageUriType>>
image_id: int64
split: dictionary<values=string, indices=int8, ordered=0>
image_uri: extension<image[uri]<ImageUriType>>
annotations: struct<segmentation: list<item: struct<counts: list<item: int32>, polygon: list<item: list<item: float>>, size: list<item: int32>>>, area: list<item: double>, iscrowd: list<item: bool>, bbox: list<item: fixed_size_list<item: float>[4]>, category_id: list<item: int16>, id: list<item: int64>, supercategory: list<item: string>, name: list<item: string>>
  child 0, segmentation: list<item: struct<counts: list<item: int32>, polygon: list<item: list<item: float>>, size: list<item: int32>>>
      child 0, item: struct<counts: list<item: int32>, polygon: list<item: list<item: float>>, size: list<item: int32>>
          child 0, counts: list<item: int32>
              child 0, item: 