# DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection

https://github.com/IDEA-Research/DINO

[Paper (arXiv:2203.03605)](https://arxiv.org/abs/2203.03605)

In [1]:
!pip install --quiet pylance duckdb torch torchvision transforms

## Build and install the [DINO](https://github.com/IDEA-Research/DINO) model

In [2]:
!git -C DINO pull || git clone https://github.com/IDEACVR/DINO
%cd DINO

!pip install --quiet -r requirements.txt \
  && cd models/dino/ops \
  && python setup.py -q build install

Already up to date.
/content/DINO
zip_safe flag not set; analyzing archive contents...
__pycache__.MultiScaleDeformableAttention.cpython-38: module references __file__


In [3]:
# Build the DINO-4scale model from the config file shipped with the repository.
# Loading instructions are taken from the upstream notebook:
# https://github.com/IDEA-Research/DINO/blob/main/inference_and_visualization.ipynb
from main import build_model_main
from util.slconfig import SLConfig

model_config_path = "config/DINO/DINO_4scale.py"

# Parse the config, then override the device before the model is constructed.
args = SLConfig.fromfile(model_config_path)
args.device = "cuda"

# build_model_main returns the model together with its criterion and postprocessors.
model, criterion, postprocessors = build_model_main(args)



In [4]:
# Downloads weights

# Download DINO-4scale weights
! [[ -f /tmp/model.pt ]] || gsutil cp gs://eto-public/models/dino/checkpoint0033_4scale.pth /tmp/model.pt
import torch
model_checkpoint_path = "/tmp/model.pt"
checkpoint = torch.load(model_checkpoint_path)
model.load_state_dict(checkpoint['model'])
_ = model.cuda().eval()

## Prepare the COCO validation dataset in [Lance](https://github.com/eto-ai/lance) format

In [31]:
! [[ -f annotations/instances_val2017.json ]] || ( \
  wget -O /tmp/annotations.zip http://images.cocodataset.org/annotations/annotations_trainval2017.zip && \
  unzip -o -qq /tmp/annotations.zip && rm annotations.zip \
)
! [[ -d val2017/ ]] || ( \
  wget -O /tmp/val2017.zip http://images.cocodataset.org/zips/val2017.zip && \
  unzip -o -qq /tmp/val2017.zip && \
  rm val2017.zip )

import pandas as pd
import json
with open("annotations/instances_val2017.json") as fobj:
  data = json.load(fobj)

print(data.keys())
images_df = (
    pd
    .DataFrame(data=data["images"])
    .rename(columns={"id": "image_id"}) 
)

print(images_df)
annos_df = (pd.DataFrame(data=data["annotations"]))
annos_df

dict_keys(['info', 'licenses', 'images', 'annotations', 'categories'])
      license         file_name  \
0           4  000000397133.jpg   
1           1  000000037777.jpg   
2           4  000000252219.jpg   
3           1  000000087038.jpg   
4           6  000000174482.jpg   
...       ...               ...   
4995        3  000000512403.jpg   
4996        4  000000168974.jpg   
4997        1  000000552775.jpg   
4998        3  000000394940.jpg   
4999        2  000000015335.jpg   

                                               coco_url  height  width  \
0     http://images.cocodataset.org/val2017/00000039...     427    640   
1     http://images.cocodataset.org/val2017/00000003...     230    352   
2     http://images.cocodataset.org/val2017/00000025...     428    640   
3     http://images.cocodataset.org/val2017/00000008...     480    640   
4     http://images.cocodataset.org/val2017/00000017...     388    640   
...                                                 ...     ... 

Unnamed: 0,segmentation,area,iscrowd,image_id,bbox,category_id,id
0,"[[510.66, 423.01, 511.72, 420.03, 510.45, 416....",702.10575,0,289343,"[473.07, 395.93, 38.65, 28.67]",18,1768
1,"[[289.74, 443.39, 302.29, 445.32, 308.09, 427....",27718.47630,0,61471,"[272.1, 200.23, 151.97, 279.77]",18,1773
2,"[[147.76, 396.11, 158.48, 355.91, 153.12, 347....",78969.31690,0,472375,"[124.71, 196.18, 372.85, 356.81]",18,2551
3,"[[260.4, 231.26, 215.06, 274.01, 194.33, 307.6...",108316.66515,0,520301,"[112.71, 154.82, 367.29, 479.35]",18,3186
4,"[[200.61, 253.97, 273.19, 318.49, 302.43, 336....",75864.53530,0,579321,"[200.61, 89.65, 400.22, 251.02]",18,3419
...,...,...,...,...,...,...,...
36776,"{'counts': [94823, 6, 473, 8, 471, 10, 469, 11...",3773.00000,1,15517,"[197, 248, 264, 45]",6,900600015517
36777,"{'counts': [277, 2, 361, 9, 1, 17, 3, 17, 3, 8...",112181.00000,1,439994,"[0, 0, 427, 458]",1,900100439994
36778,"{'counts': [2770, 6, 418, 8, 416, 10, 86, 6, 3...",47024.00000,1,117719,"[6, 75, 474, 263]",44,904400117719
36779,"{'counts': [3912, 10, 363, 18, 356, 23, 301, 1...",27277.00000,1,50149,"[10, 41, 403, 152]",52,905200050149


In [14]:
from lance.pytorch import Dataset
import torchvision.transforms as T
import pandas as pd

# Preprocessing applied to every image: resize, convert to a tensor, then
# normalise per channel (the values match the standard ImageNet mean/std).
transform = T.Compose([
    T.Resize(400),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Only detections with a score strictly above this value are kept.
threshold = 0.5

dataset = Dataset(
    "s3://eto-public/datasets/coco/coco.lance",
    columns=["image", "split", "image_id"],
    # mode="batch",
    batch_size=8)

# The size tensor passed to the bbox postprocessor is the same for every image,
# so it is created and moved to the GPU once instead of on every iteration.
# Presumably a (1.0, 1.0) size leaves the boxes in normalised coordinates --
# TODO confirm against the postprocessor.
target_sizes = torch.Tensor([[1.0, 1.0]]).cuda()

results = []
with torch.no_grad():  # inference only: no autograd bookkeeping
  for batch in dataset:
    # NOTE(review): `.item()` requires a single-element tensor, so this assumes
    # the dataset yields one record at a time despite batch_size=8 -- confirm.
    image_id = batch[2].cpu().item()
    imgs = [transform(batch[0]).cuda()]
    output = model(imgs)
    output = postprocessors['bbox'](output, target_sizes)[0]
    mask = output["scores"] > threshold
    pred = {
        "image_id": image_id,
        "boxes": output["boxes"][mask].cpu().numpy(),
        "labels": output["labels"][mask].cpu().numpy(),
        "scores": output["scores"][mask].cpu().numpy(),
    }
    # Drop the reference to the GPU-side outputs before the next forward pass.
    del output
    results.append(pred)

# One row per processed image.
df = pd.DataFrame(data=results)
df

  dim_tx = self.temperatureW ** (2 * (dim_tx // 2) / self.num_pos_feats)
  dim_ty = self.temperatureH ** (2 * (dim_ty // 2) / self.num_pos_feats)
  dim_t = 10000 ** (2 * (dim_t // 2) / 128)
  topk_boxes = topk_indexes // out_logits.shape[2]


KeyboardInterrupt: ignored