# DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection

<a target="_blank" href="https://colab.research.google.com/github/eto-ai/lance/blob/main/python/notebooks/dino_coco.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Code: https://github.com/IDEA-Research/DINO

Paper: [Papers With Code Link](https://paperswithcode.com/paper/dino-detr-with-improved-denoising-anchor-1)

## Build and install the [DINO](https://github.com/IDEA-Research/DINO) model

The DINO model requires building CUDA ops. After this step, we need to ***restart the runtime***.

In [9]:
!git -C DINO pull || git clone https://github.com/IDEACVR/DINO
%cd DINO
!pip install --quiet -r requirements.txt \
  && cd models/dino/ops \
  && python setup.py -q build install


Already up to date.
/content/DINO/DINO
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.
db-dtypes 1.0.4 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.[0m
In file included from [01m[K/content/DINO/DINO/models/dino/ops/src/vision.cpp:11:0[m[K:
[01m[K/content/DINO/DINO/models/dino/ops/src/ms_deform_attn.h:[m[K In function ‘[01m[Kat::Tensor ms_deform_attn_forward(const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, const at::Tensor&, int)[m[K’:
     if (value.type([01;35m[K)[m[K.is_cuda())
                    [01;35m[K^[m[K
In file included from [01m[K/usr/local/lib/python3.8/dist-packages/torch/include/ATen/core/Tensor.h:3:0[m[K,
                 from [01m[K/usr/local/lib/

In [10]:
!pip install --quiet -U pylance duckdb torch torchvision transforms numpy pyarrow pandas

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scipy 1.7.3 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.23.5 which is incompatible.
pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.
db-dtypes 1.0.4 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 10.0.1 which is incompatible.[0m


In [11]:
# See https://github.com/IDEA-Research/DINO/blob/main/inference_and_visualization.ipynb
# for instruction to load model
from util.slconfig import SLConfig
from main import build_model_main
import torch

model_config_path = "config/DINO/DINO_4scale.py"

args = SLConfig.fromfile(model_config_path) 
args.device = 'cuda' 
model, criterion, postprocessors = build_model_main(args)

# Download model weights.
#
! [[ -f /tmp/model.pt ]] || gsutil cp gs://eto-public/models/dino/checkpoint0033_4scale.pth /tmp/model.pt

model_checkpoint_path = "/tmp/model.pt"
checkpoint = torch.load(model_checkpoint_path)
model.load_state_dict(checkpoint['model'])
_ = model.cuda().eval()




## Prepare COCO validation dataset

In [12]:
! gsutil cp gs://eto-public/datasets/coco/coco_val.lance.tar.gz /tmp/
! tar -C /tmp -xzf /tmp/coco_val.lance.tar.gz && rm /tmp/coco_val.lance.tar.gz

Copying gs://eto-public/datasets/coco/coco_val.lance.tar.gz...
- [1 files][771.6 MiB/771.6 MiB]   58.1 MiB/s                                   
Operation completed over 1 objects/771.6 MiB.                                    


In [18]:
from lance.pytorch import Dataset
import torchvision.transforms as T
import pandas as pd

# Preprocessing applied to every image before it is passed to the model:
# resize so the shorter side is 400 px, convert to a float tensor, then
# normalize with the standard ImageNet per-channel mean / std.
transform = T.Compose([
    T.Resize(400),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Only detections whose score is strictly above this value are kept.
THRESHOLD = 0.4

# Stream just the two columns needed for inference, eight rows at a time.
dataset = Dataset(
  "/tmp/coco_val.lance",
  columns=["image", "image_id"],
  mode="batch",
  batch_size=8
)

# One record per image: {"image_id": ..., "dino": {"boxes", "labels", "scores"}}.
results = []
with torch.no_grad():
  for batch in dataset:
    # Columns arrive in the order requested above: images first, ids second.
    image_ids = batch[1].cpu()
    imgs = [transform(img).cuda() for img in batch[0]]
    output = model(imgs)
    # A target size of (1, 1) per image is handed to the post-processor;
    # presumably this is why the recorded boxes lie in [0, 1] - TODO confirm.
    target_sizes = torch.ones(len(imgs), 2, device="cuda")
    output = postprocessors['bbox'](output, target_sizes)
    for image_id, out in zip(image_ids, output):
      mask = out["scores"] > THRESHOLD
      results.append({
          "image_id": image_id.item(),
          "dino": {
            key: out[key][mask].cpu().tolist()
            for key in ("boxes", "labels", "scores")
          },
      })
    # Drop the references to this batch's GPU tensors before the next iteration.
    del imgs, output

df = pd.DataFrame(data=results)
df

Unnamed: 0,image_id,dino
0,397133,"{'boxes': [[0.049877457320690155, 0.8039787411..."
1,37777,"{'boxes': [[0.8468257784843445, 0.278243213891..."
2,252219,"{'boxes': [[0.790980875492096, 0.4010539650917..."
3,87038,"{'boxes': [[0.39683473110198975, 0.46433973312..."
4,174482,"{'boxes': [[0.3026779592037201, 0.028677880764..."
...,...,...
4947,512403,"{'boxes': [[0.665542721748352, 0.5590986013412..."
4948,168974,"{'boxes': [[0.002011597156524658, 0.1653629839..."
4949,552775,"{'boxes': [[0.6701170206069946, 0.356898427009..."
4950,394940,"{'boxes': [[0.00048539042472839355, 0.11307901..."


## We can now add the DINO inference results into the dataset for later reference

In [14]:
# We can now add the DINO inference results into the dataset for later reference

import pyarrow as pa

# Arrow type of the per-image prediction struct: a list of 4-float boxes plus
# one label and one score per kept detection.
dino_type = pa.struct([
    pa.field("boxes", pa.list_(pa.list_(pa.float32(), 4))),
    pa.field("labels", pa.list_(pa.int8())),
    pa.field("scores", pa.list_(pa.float32())),
])

# Explicit schema for the table built from the predictions DataFrame.
prediction_schema = pa.schema([
    pa.field("image_id", pa.int64()),
    pa.field("dino", dino_type),
])

table = pa.Table.from_pandas(df, schema=prediction_schema)

# TODO: expose merge via PyTorch dataset?
# Until then, reach into the wrapped dataset and join on `image_id`.
dataset._dataset.merge(table, left_on="image_id", right_on="image_id")

<lance.lib.FileSystemDataset at 0x7f71e6ce0430>

In [21]:
import duckdb
import lance

# Re-open the dataset from disk and report which version is being read.
# The variable must stay named `dataset`: the SQL query in the next cell
# refers to it by that name.
dataset = lance.dataset("/tmp/coco_val.lance")
print("Dataset version: ", dataset.version)

# Last expression of the cell: display the dataset schema.
dataset.schema

Dataset version:  {'version': 2, 'timestamp': datetime.datetime(2022, 12, 7, 20, 38, 33)}


license: int64
file_name: string
coco_url: extension<image[uri]<ImageUriType>>
height: int16
width: int16
date_captured: timestamp[ns]
flickr_url: extension<image[uri]<ImageUriType>>
image_id: int64
split: dictionary<values=string, indices=int8, ordered=0>
image_uri: extension<image[uri]<ImageUriType>>
annotations: struct<segmentation: list<item: struct<counts: list<item: int32>, polygon: list<item: list<item: float>>, size: list<item: int32>>>, area: list<item: double>, iscrowd: list<item: bool>, bbox: list<item: fixed_size_list<item: float>[4]>, category_id: list<item: int16>, id: list<item: int64>, supercategory: list<item: string>, name: list<item: string>>
  child 0, segmentation: list<item: struct<counts: list<item: int32>, polygon: list<item: list<item: float>>, size: list<item: int32>>>
      child 0, item: struct<counts: list<item: int32>, polygon: list<item: list<item: float>>, size: list<item: int32>>
          child 0, counts: list<item: int32>
              child 0, item: 

In [22]:
# Preview ground-truth annotations next to the DINO predictions for the first
# ten rows that have predictions. `dataset` in the FROM clause relies on DuckDB
# resolving the in-scope Python variable of that name.
PREDICTIONS_PREVIEW_SQL = """
  SELECT
    image_id,
    height,
    width,
    annotations.bbox,
    annotations.category_id,
    dino.boxes as dino_boxes,
    dino.labels as dino_labels
  FROM dataset 
  WHERE dino IS NOT NULL
  LIMIT 10
"""

duckdb.query(PREDICTIONS_PREVIEW_SQL).df()

Unnamed: 0,image_id,height,width,bbox,category_id,dino_boxes,dino_labels
0,397133,427,640,"[[217.6199951171875, 240.5399932861328, 256.60...","[44, 67, 1, 1, 47, 47, 49, 50, 51, 51, 51, 51,...","[[0.049877457320690155, 0.8039787411689758, 0....","[51, 1, 51, 51, 67, 79, 1, 47, 47, 50]"
1,37777,230,352,"[[102.48999786376953, 118.47000122070312, 110....","[64, 62, 62, 62, 67, 82, 52, 55, 55, 55, 55, 5...","[[0.8468257784843445, 0.27824321389198303, 1.0...","[82, 79, 55, 55, 55, 55, 55, 52, 67]"
2,252219,428,640,"[[326.2799987792969, 174.55999755859375, 397.5...","[1, 1, 1, 28, 10, 47, 31]","[[0.790980875492096, 0.4010539650917053, 0.989...","[1, 1, 1, 28]"
3,87038,480,640,"[[253.2100067138672, 271.07000732421875, 312.7...","[2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.39683473110198975, 0.4643397331237793, 0.4...","[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1]"
4,174482,388,640,"[[187.74000549316406, 5.840000152587891, 498.1...","[2, 3, 3, 3, 3, 3, 8, 8, 8, 10, 10, 10]","[[0.3026779592037201, 0.02867788076400757, 0.7...","[2, 3, 3, 3, 10, 3, 3, 8, 10, 3]"
5,403385,511,640,"[[411.1000061035156, 237.6999969482422, 504.10...","[70, 81]","[[0.6417204737663269, 0.46296626329421997, 0.7...","[70, 81]"
6,6818,640,427,"[[186.97000122070312, 471.8299865722656, 287.6...",[70],,
7,480985,500,375,"[[14.609999656677246, 68.54000091552734, 343.8...","[4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1]","[[0.14848709106445312, 0.1327759325504303, 0.8...","[4, 1, 1, 1, 1, 1]"
8,458054,426,640,"[[33.27000045776367, 0.0, 336.44000244140625, ...","[70, 70, 70, 70, 70, 70, 70, 70, 70, 70]","[[0.6517613530158997, 0.2983947694301605, 0.87...","[70, 70, 70, 70, 70, 70]"
9,331352,500,351,"[[28.030000686645508, 252.91000366210938, 321....","[70, 81]","[[0.07897031307220459, 0.11680880188941956, 0....",[70]
