<img align="right" src="../img/auvisus.svg" width="100" >


# DEtection TRansformer Network

---

## 1. Import Modules

In [None]:
%load_ext autoreload
%autoreload 2

import os
import tensorflow as tf

from detr_models.detr.model import DETR
from detr_models.data_feeder.pvoc_feeder import PVOCFeeder
from detr_models.data_feeder.coco_feeder import COCOFeeder
from detr_models.data_feeder.uuid_iterator import UUIDIterator
from detr_models.detr.config import DefaultDETRConfig

from detr_models.backbone.backbone import Backbone
from detr_models.transformer.transformer import Transformer
from detr_models.detr.segmentation import SegmentationHead
from detr_models.transformer.attention import MultiHeadAttentionMap

from detr_models.detr.utils import create_positional_encodings, get_image_information



---

## 2. Initialize Model

In [None]:
# Ask for the dataset location. The prompt is passed positionally because the
# built-in `input` rejects keyword arguments outside of a Jupyter kernel, and
# surrounding whitespace (e.g. from copy/paste) is stripped from the answer.
storage_path = input('Please specify the storage path:\n').strip()
batch_size = 1

# Additional information used for initialization
config = DefaultDETRConfig()
training_config = {
    "storage_path": storage_path,
    "batch_size": batch_size,
}

# Query the dataset at `storage_path`; the two returned values are bound to
# `input_shape` and `count_images`.
input_shape, count_images = get_image_information(storage_path, config.data_type)


---

In [None]:
# Collect the feeder arguments from the training and model configuration,
# then build the COCO data feeder from them.
feeder_kwargs = dict(
    storage_path=training_config["storage_path"],
    batch_size=training_config["batch_size"],
    num_queries=config.num_queries,
    num_classes=config.num_classes,
    image_width=config.image_width,
    image_height=config.image_height,
)
data_feeder = COCOFeeder(**feeder_kwargs)

In [None]:
# Draw a single batch from the feeder (called with the positional flag False)
input_data = next(data_feeder(False))

# The batch is read positionally: entries 0..4 in this order
batch_images, batch_cls, batch_bboxs, obj_indices, batch_masks = (
    input_data[position] for position in range(5)
)

# Report the shape of every displayed batch component
for message, batch_part in (
    ("Batch Images: {}", batch_images),
    ("Batch Target Class Labels: {}", batch_cls),
    ("Batch Target Bounding Boxes: {}", batch_bboxs),
    ("Batch Target Masks: {}", batch_masks),
):
    print(message.format(batch_part.shape))

---


## 3. Backbones

**Backbone Input:**  
$3 \times H_0 \times W_0$ ( $H_0,W_0$: Height and Width of Image)


**Backbone Output:**  
$C \times H \times W$ ( $H,W$: Height and Width of Feature Map)


### 3.1. ResNet50

In [None]:
# Backbone configuration:
# name of the architecture, keyword config for Keras, and the flag that
# requests intermediate outputs (used for segmentation)
backbone_name = "ResNet50"
return_intermediate = True
backbone_config = dict(
    input_shape=input_shape,
    include_top=False,
    weights="imagenet",
)

# Build the backbone wrapper and show the summary of the wrapped model
resnet_wrapper = Backbone(backbone_name, backbone_config, return_intermediate)
backbone = resnet_wrapper.model
backbone.summary()

---

### 3.2. MobileNetV2

In [None]:
# Backbone configuration:
# name of the architecture, keyword config for Keras, and the flag that
# requests intermediate outputs (used for segmentation; disabled here)
backbone_name, return_intermediate = "MobileNetV2", False
backbone_config = dict(
    input_shape=input_shape,
    include_top=False,
    weights="imagenet",
)

# Build the backbone wrapper and show the summary of the wrapped model
mobilenet_wrapper = Backbone(backbone_name, backbone_config, return_intermediate)
backbone = mobilenet_wrapper.model
backbone.summary()

----

### 3.3. Backbone Outputs (`return_intermediate`)

In [None]:
# Without intermediate feature maps
# (`backbone_config` is reused as defined by the preceding backbone cell)
backbone_name, return_intermediate = "ResNet50", False

single_output_wrapper = Backbone(backbone_name, backbone_config, return_intermediate)
backbone = single_output_wrapper.model

# Run the example batch through the backbone and report the output shape
feature_map = backbone(batch_images)
feature_map_shape = feature_map.shape
print("Feature Map Shape: {}".format(feature_map_shape))

In [None]:
# With intermediate feature maps
# (`backbone_config` is reused as defined by the preceding backbone cell)
backbone_name, return_intermediate = "ResNet50", True

backbone = Backbone(backbone_name, backbone_config, return_intermediate).model

# The backbone output is iterated level by level and its shapes are listed
backbone_outputs = backbone(batch_images)
for idx, output in enumerate(backbone_outputs):
    print("Level {} - Feature Map Shape: {}".format(idx, output.shape))

# All but the last output are kept as `fpn_maps`; the last one is the
# feature map used by the following cells
fpn_maps = backbone_outputs[:-1]
feature_map = backbone_outputs[-1]

---

## 4. Create Positional Encodings

In [None]:
# Shape of the feature map without its leading axis (presumably the batch axis)
fm_shape = feature_map.shape[1:]

# Use the notebook-wide `batch_size` rather than a hard-coded 1 so that the
# positional encodings stay consistent with the query embeddings, which are
# also repeated `batch_size` times.
positional_encodings = create_positional_encodings(
    fm_shape, config.dim_transformer // 2, batch_size=batch_size
)
print("Positional Encodings Shape: {}".format(positional_encodings.shape))


---

## 5. Create Query Embeddings

In [None]:
# One distinct integer index per object query (0 .. num_queries - 1).
# Feeding a vector of ones instead would make every query look up the same
# embedding row, so all query embeddings would be identical.
query_pos = tf.range(config.num_queries, dtype=tf.int32)

# Add a leading axis and repeat the indices once per batch element
query_pos = tf.repeat(
    tf.expand_dims(query_pos, axis=0), repeats=batch_size, axis=0
)

# Look up one embedding vector of size `dim_transformer` per query index
query_embedding = tf.keras.layers.Embedding(
    input_dim=config.num_queries, output_dim=config.dim_transformer
)(query_pos)

print("Query Embeddings Shape: {}".format(query_embedding.shape))


---

## 6. Prepare Input Data For Transformer

**Backbone Output:**  
$C \times H \times W$ ( $H,W$: Height and Width of Feature Map)


**Transformer Input:**
1. Reduce to smaller channel size  
$d \times H \times W$ ( $d$: Dimension of Transformer, $H,W$: Height and Width of Feature Map)    
  
  
2. Collapse Height and Width (inside Transformer)  
$d \times HW$ ( $d$: Dimension of Transformer, $H,W$: Height and Width of Feature Map)

**Transformer Output:**  
$N \times d$ ($N$: Number of Queries, $d$: Dimension of Transformer)   

In [None]:
# Convolution with kernel size 1 whose number of filters equals the
# transformer dimension, applied to the backbone feature map
channel_projection = tf.keras.layers.Conv2D(config.dim_transformer, kernel_size=1)
transformer_input = channel_projection(feature_map)
print("Transformer Input Shape: {}".format(transformer_input.shape))

---

## 7. Transformer

In [None]:
# Instantiate the transformer from the model configuration
transformer = Transformer(
    config.num_transformer_layer,
    config.dim_transformer,
    config.num_heads,
    config.dim_feedforward,
)

# The transformer takes the projected feature map, the positional encodings
# and the query embeddings as one list and returns two values
transformer_inputs = [transformer_input, positional_encodings, query_embedding]
transformer_output, memory = transformer(transformer_inputs)

print("Transformer Output Shape: {}".format(transformer_output.shape))
print("Transformer Memory Shape: {}".format(memory.shape))

In [None]:
# Show the summary of the transformer built in the previous cell
# (assumes a Keras-style `summary()` — TODO confirm)
transformer.summary()

---

## 8. Classification and Bounding Box Heads

In [None]:
# Classification head: dense layer with `num_classes + 1` softmax units
cls_head = tf.keras.layers.Dense(
    units=config.num_classes + 1, activation="softmax"
)
cls_pred = cls_head(transformer_output)

# Bounding box head: dense layer with 4 sigmoid units
bbox_head = tf.keras.layers.Dense(units=4, activation="sigmoid")
bbox_pred = bbox_head(transformer_output)

print("Classification Output Shape: {}".format(cls_pred.shape))
print("Bounding Box Output Shape: {}".format(bbox_pred.shape))

----

## 9. Segmentation Head

**Input:**
1. Box Embeddings:  
$d \times N$  ($d$: Dimension of Transformer, $N$: Number of queries)



2. Encoded Image:  
$d \times H \times W$ ($d$: Dimension of Transformer, $H,W$: Height and Width of Feature Map)

### 9.1. Attention Boxes

In [None]:
# Multi-head attention map layer, configured without dropout
attention_map = MultiHeadAttentionMap(
    dim_transformer=config.dim_transformer,
    num_heads=config.num_heads,
    dropout=0.0,
)

# The layer is called with the transformer memory and the transformer output
attention_inputs = [memory, transformer_output]
attention_hmaps = attention_map(attention_inputs)
print("BBOX Mask Shape: {}".format(attention_hmaps.shape))

In [None]:
# Show the summary of the attention map layer created in the previous cell
# (assumes a Keras-style `summary()` — TODO confirm)
attention_map.summary()

### 9.2. Segmentation Head

In [None]:
# Instantiate the segmentation head with the head count and transformer
# dimension taken from the model configuration
segmentation_head_kwargs = {
    "num_heads": config.num_heads,
    "dim_transformer": config.dim_transformer,
}
segmentation_head = SegmentationHead(**segmentation_head_kwargs)

In [None]:
# List the shapes of the tensors passed to the segmentation head
print("Segmentation Head Input:")
print("Transformer Input: {}".format(transformer_input.shape))
print("Bounding Box Mask: {}".format(attention_hmaps.shape))

# The head receives the projected feature map, the attention maps and the
# intermediate backbone outputs as one list
segmentation_inputs = [transformer_input, attention_hmaps, fpn_maps]
mask_pred = segmentation_head(segmentation_inputs)

print("Segmentation Head Output Shape: {}".format(mask_pred.shape))

In [None]:
# Show the summary of the segmentation head after it has been called once
# (assumes a Keras-style `summary()` — TODO confirm)
segmentation_head.summary()

---

## 10. Complete DETR

### 10.1. Without Segmentation Head

In [None]:
# Gather all constructor arguments for the complete DETR model.
# `backbone_config` is reused as defined by the backbone cells above.
detr_kwargs = dict(
    input_shape=input_shape,
    num_queries=config.num_queries,
    num_classes=config.num_classes,
    num_heads=config.num_heads,
    dim_transformer=config.dim_transformer,
    dim_feedforward=config.dim_feedforward,
    num_transformer_layer=config.num_transformer_layer,
    backbone_name=config.backbone_name,
    backbone_config=backbone_config,
    train_backbone=config.train_backbone,
)
detr = DETR(**detr_kwargs)

# Build with the default arguments of `build_model`, then show the summary
detr.build_model()
detr.model.summary()

---

### 10.2. Including Segmentation Head

In [None]:
# Gather all constructor arguments for the complete DETR model.
# `backbone_config` is reused as defined by the backbone cells above.
detr_kwargs = dict(
    input_shape=input_shape,
    num_queries=config.num_queries,
    num_classes=config.num_classes,
    num_heads=config.num_heads,
    dim_transformer=config.dim_transformer,
    dim_feedforward=config.dim_feedforward,
    num_transformer_layer=config.num_transformer_layer,
    backbone_name=config.backbone_name,
    backbone_config=backbone_config,
    train_backbone=config.train_backbone,
)
detr = DETR(**detr_kwargs)

# NOTE(review): the two positional flags presumably enable the segmentation
# head (second flag True) — confirm against the `build_model` signature
detr.build_model(False, True)
detr.model.summary()

---

---

---

*This Notebook was created by: [auvisus GmbH](https://www.auvisus.com/)*