In [1]:
import torch
import torchvision

In [2]:
from torchinfo import summary
# Load torchvision's VGG-16; `pretrained=True` requests the published pretrained weights.
# NOTE(review): torchvision >= 0.13 deprecates `pretrained=` in favour of `weights=` —
# confirm against the installed version before changing this call.
model = torchvision.models.vgg16(pretrained=True)

In [3]:
# Layer-by-layer summary of VGG-16 for one 3-channel 224x224 image (batch dim first).
# The summary call stays last so the notebook renders its return value.
print("Based on 3*224*224 Input")
summary(model, input_size=(1, 3, 224, 224))

Based on 3*224*224 Input


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Layer (type:depth-idx)                   Output Shape              Param #
VGG                                      --                        --
├─Sequential: 1-1                        [1, 512, 7, 7]            --
│    └─Conv2d: 2-1                       [1, 64, 224, 224]         1,792
│    └─ReLU: 2-2                         [1, 64, 224, 224]         --
│    └─Conv2d: 2-3                       [1, 64, 224, 224]         36,928
│    └─ReLU: 2-4                         [1, 64, 224, 224]         --
│    └─MaxPool2d: 2-5                    [1, 64, 112, 112]         --
│    └─Conv2d: 2-6                       [1, 128, 112, 112]        73,856
│    └─ReLU: 2-7                         [1, 128, 112, 112]        --
│    └─Conv2d: 2-8                       [1, 128, 112, 112]        147,584
│    └─ReLU: 2-9                         [1, 128, 112, 112]        --
│    └─MaxPool2d: 2-10                   [1, 128, 56, 56]          --
│    └─Conv2d: 2-11                      [1, 256, 56, 56]          29

In [4]:
# Same VGG-16 summary, now for one 3-channel 300x300 image (batch dim first).
# The summary call stays last so the notebook renders its return value.
print("Based on 3x300x300 input")
summary(model, input_size=(1, 3, 300, 300))

Based on 3x300x300 input


Layer (type:depth-idx)                   Output Shape              Param #
VGG                                      --                        --
├─Sequential: 1-1                        [1, 512, 9, 9]            --
│    └─Conv2d: 2-1                       [1, 64, 300, 300]         1,792
│    └─ReLU: 2-2                         [1, 64, 300, 300]         --
│    └─Conv2d: 2-3                       [1, 64, 300, 300]         36,928
│    └─ReLU: 2-4                         [1, 64, 300, 300]         --
│    └─MaxPool2d: 2-5                    [1, 64, 150, 150]         --
│    └─Conv2d: 2-6                       [1, 128, 150, 150]        73,856
│    └─ReLU: 2-7                         [1, 128, 150, 150]        --
│    └─Conv2d: 2-8                       [1, 128, 150, 150]        147,584
│    └─ReLU: 2-9                         [1, 128, 150, 150]        --
│    └─MaxPool2d: 2-10                   [1, 128, 75, 75]          --
│    └─Conv2d: 2-11                      [1, 256, 75, 75]          29

In [5]:
# Build torchvision's reference SSD300 detector on a VGG-16 backbone and
# summarise it for a single 3x300x300 input.
ssd300 = torchvision.models.detection.ssd300_vgg16(
    pretrained=True,
    progress=True,
    num_classes=91,
    pretrained_backbone=True,
)
print("Based on VGG-16 SSD-300")
summary(ssd300, input_size=(1, 3, 300, 300))

Based on VGG-16 SSD-300


Layer (type:depth-idx)                   Output Shape              Param #
SSD                                      --                        --
├─SSDFeatureExtractorVGG: 1              --                        --
│    └─ModuleList: 2-1                   --                        --
├─SSDHead: 1                             --                        --
│    └─SSDClassificationHead: 2          --                        --
│    │    └─ModuleList: 3-1              --                        12,163,242
│    └─SSDRegressionHead: 2              --                        --
│    │    └─ModuleList: 3-2              --                        534,648
├─GeneralizedRCNNTransform: 1-1          --                        --
├─SSDFeatureExtractorVGG: 1-2            [1, 256, 1, 1]            --
│    └─Sequential: 2-2                   [1, 512, 38, 38]          --
│    │    └─Conv2d: 3-3                  [1, 64, 300, 300]         (1,792)
│    │    └─ReLU: 3-4                    [1, 64, 300, 300]         

In [6]:
# Implementing own-SSD: input features
#
# Random stand-ins shaped like the two backbone feature maps fed to the
# hand-written detector; only their shapes matter here.
# NOTE(review): 1024 channels at 19x19 looks like SSD's converted fc7/conv7
# output rather than VGG conv5_3 (the VGG feature stack summarised earlier
# ends at 512 channels) — confirm the variable name.

vgg_conv4_3_feature_out = torch.randn((1, 512, 38, 38))
vgg_conv5_3_feature_out = torch.randn((1, 1024, 19, 19))

# The two maps have different shapes (author's note: the second one is taken
# after a MaxPool2d).
print(vgg_conv4_3_feature_out.shape, vgg_conv5_3_feature_out.shape, sep="\n")

torch.Size([1, 512, 38, 38])
torch.Size([1, 1024, 19, 19])


In [7]:
# Implementing own-SSD: body

from torch import nn


class SSDObjectDetector(nn.Module):
    """SSD-style stack of extra feature stages with one prediction head per scale.

    The module takes a 1024-channel feature map, pushes it through six
    convolutional stages and returns a list of six prediction maps. Each head
    is a single 3x3 convolution producing ``boxes * (num_classes + 4)``
    channels, with ``boxes`` being 4 or 6 depending on the scale.

    Args:
        num_classes: number of per-box class channels; stored as
            ``self.num_classes`` and used to size every head.
    """

    def __init__(self, num_classes):
        super().__init__()

        self.num_classes = num_classes

        def conv_relu(in_channels, out_channels, **conv_kwargs):
            # A convolution followed by an in-place ReLU, as a flat layer list.
            return [
                nn.Conv2d(in_channels, out_channels, **conv_kwargs),
                nn.ReLU(inplace=True),
            ]

        def bottleneck(in_channels, mid_channels, out_channels, **conv3x3_kwargs):
            # 1x1 convolution to mid_channels, then a 3x3 convolution to out_channels.
            return nn.Sequential(
                *conv_relu(in_channels, mid_channels, kernel_size=(1, 1)),
                *conv_relu(mid_channels, out_channels, kernel_size=(3, 3), **conv3x3_kwargs),
            )

        # Stages are created in index order so parameter initialisation order
        # (and therefore seeded weights) is the same as writing them out one by one.
        stages = [
            nn.Sequential(*conv_relu(1024, 1024, kernel_size=(3, 3), padding=1)),
            nn.Sequential(*conv_relu(1024, 1024, kernel_size=(1, 1))),
            bottleneck(1024, 256, 512, stride=2, padding=1),
            bottleneck(512, 128, 256, stride=2, padding=1),
            bottleneck(256, 128, 256),
            bottleneck(256, 128, 256),
        ]
        # Register each stage as ``detector<i>`` so its parameters belong to
        # the module; ``self.detectors`` is only a plain list used for iteration.
        for index, stage in enumerate(stages):
            setattr(self, "detector%d" % index, stage)
        self.detectors = stages

        # (input channels, boxes per location) for each prediction head.
        head_specs = (
            (1024, 4),
            (1024, 6),
            (512, 6),
            (256, 6),
            (256, 4),
            (256, 4),
        )
        self.classifiers = []
        for index, (in_channels, boxes) in enumerate(head_specs):
            head = nn.Sequential(
                nn.Conv2d(
                    in_channels,
                    boxes * (self.num_classes + 4),
                    kernel_size=(3, 3),
                    padding=1,
                )
            )
            # Registered as ``classifier<i>``; the list is again just for lookup.
            setattr(self, "classifier%d" % index, head)
            self.classifiers.append(head)

    def forward(self, x):
        """Run every stage and collect the head outputs.

        Args:
            x: feature map with 1024 channels (required by the first stage
                and the first head).

        Returns:
            A list of six tensors, one per head, in the order they are computed.
        """
        # heads_before_stage[i] is the head applied to the INPUT of stage i;
        # None means that stage's input gets no prediction.
        heads_before_stage = (0, None, 1, 2, 3, 4)
        # The output of the last stage has no following stage, so its head
        # is applied after the loop.
        last_head = 5

        outputs = []
        for stage, head_index in zip(self.detectors, heads_before_stage):
            if head_index is not None:
                outputs.append(self.classifiers[head_index](x))
            x = stage(x)

        outputs.append(self.classifiers[last_head](x))
        return outputs
        
detector = SSDObjectDetector(num_classes=3)

# torchinfo's summary() is used here as a smoke test of the forward pass.
# Because it is not the last expression in the cell, its return value is not
# rendered by the notebook.
summary(detector, (1, 1024, 19, 19))

# NOTE(review): presumably needed because summary() may have moved the model
# to another device — confirm.
detector.to(torch.device('cpu'))
detections = detector(torch.randn(1, 1024, 19, 19))

# Every head emits boxes * (num_classes + 4) channels, so take the divisor
# from the model rather than hard-coding 4 + 3; the printed box counts then
# stay correct if num_classes above is changed.
channels_per_box = detector.num_classes + 4

for i, item in enumerate(detections):
    print("Detection %d:" % (i + 1))
    print("+ Feature size: ", item.shape[2:], "Detected bboxes (grouped by xywhc):", item.shape[1] // channels_per_box)
    

Detection 1:
+ Feature size:  torch.Size([19, 19]) Detected bboxes (grouped by xywhc): 4
Detection 2:
+ Feature size:  torch.Size([19, 19]) Detected bboxes (grouped by xywhc): 6
Detection 3:
+ Feature size:  torch.Size([10, 10]) Detected bboxes (grouped by xywhc): 6
Detection 4:
+ Feature size:  torch.Size([5, 5]) Detected bboxes (grouped by xywhc): 6
Detection 5:
+ Feature size:  torch.Size([3, 3]) Detected bboxes (grouped by xywhc): 4
Detection 6:
+ Feature size:  torch.Size([1, 1]) Detected bboxes (grouped by xywhc): 4


# 정리
- 7/27 - 구현은 하였으나 마지막 단계에서 어떻게 해야 할지 감이 안 잡혀서 다시 기존 GitHub 구현체로 돌아간다.
- 7/28 - 다시 한번 default bbox와 Loss 부분에 대해서 공부를 시작해 보기로 했다 (OneNote 참조)