In [1]:
'''
    Detector attachment
    - Using the MobileNetV3 code from GitHub (shaoshengsong/MobileNetV3-SSD) as a reference, take only the
      SSD (Single Shot Detector) part and attach it to the existing MobileNetV3 model.
    - Reuses the code from the existing mobilenetv3-variation-impl.
'''

import torch
from torch import nn

In [2]:
import sys, os
sys.path.append(os.path.join(os.getcwd(), 'models', 'mb3_ssd'))

from models.mb3_ssd.vision.ssd.mobilenet_v3_ssd_lite import create_mobilenetv3_ssd_lite, create_mobilenetv3_ssd_lite_predictor
from models.mb3_ssd.vision.nn.mobilenet_v3 import MobileNetV3

from torchinfo import summary

In [3]:
# Summarize the reference backbone imported from models.mb3_ssd.vision.nn.mobilenet_v3.
# NOTE: a class with the same name (MobileNetV3) is defined further down in this notebook;
# once that cell has run, re-executing this cell would use the notebook class instead.
model = MobileNetV3().features
summary(model, (1, 3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
Sequential                               --                        --
├─Conv2d: 1-1                            [1, 16, 112, 112]         448
├─BatchNorm2d: 1-2                       [1, 16, 112, 112]         32
├─h_swish: 1-3                           [1, 16, 112, 112]         --
├─MobileBlock: 1-4                       [1, 16, 56, 56]           --
│    └─Sequential: 2-1                   [1, 16, 112, 112]         --
│    │    └─Conv2d: 3-1                  [1, 16, 112, 112]         256
│    │    └─BatchNorm2d: 3-2             [1, 16, 112, 112]         32
│    │    └─ReLU: 3-3                    [1, 16, 112, 112]         --
│    └─Sequential: 2-2                   [1, 16, 56, 56]           --
│    │    └─Conv2d: 3-4                  [1, 16, 56, 56]           160
│    │    └─BatchNorm2d: 3-5             [1, 16, 56, 56]           32
│    └─SqueezeBlock: 2-3                 [1, 16, 56, 56]           --
│    │    └─

In [4]:
import torch
from torch import nn

In [5]:
class SqueezeExciteModule(nn.Module):
    """Squeeze-and-excite gate that produces one scale factor per channel.

    The input feature map is globally average-pooled to a vector, passed
    through two same-width fully connected layers (ReLU, then hard-sigmoid),
    and returned with two trailing singleton dimensions so it can be
    broadcast-multiplied against the original feature map.

    Args:
        expand_size: number of channels of the incoming feature map; both
            linear layers map ``expand_size -> expand_size`` (no reduction).
    """

    def __init__(self, expand_size):
        super().__init__()

        # Squeeze: (N, C, H, W) -> (N, C)
        self.se_0_0 = nn.AdaptiveAvgPool2d(output_size=1)
        self.se_0_1 = nn.Flatten()

        # Excite, stage 1: FC + ReLU
        self.se_1_0 = nn.Linear(in_features=expand_size, out_features=expand_size)
        self.se_1_1 = nn.ReLU(inplace=True)

        # Excite, stage 2: FC + hard-sigmoid
        self.se_2_0 = nn.Linear(in_features=expand_size, out_features=expand_size)
        self.se_2_1 = nn.Hardsigmoid(inplace=True)

    def forward(self, x):
        """Return the gate for ``x`` with two trailing singleton dimensions."""
        pooled = self.se_0_1(self.se_0_0(x))
        hidden = self.se_1_1(self.se_1_0(pooled))
        gate = self.se_2_1(self.se_2_0(hidden))
        # Append two singleton axes so the gate broadcasts over H and W.
        return gate.unsqueeze(-1).unsqueeze(-1)


class Bottleneck(nn.Module):
    """Inverted-residual style block: 1x1 expand -> depthwise -> optional SE -> 1x1 project.

    The block output is the sum of the projected main path and a learned
    1x1 strided convolution of the block input (``bottleneck_final_2``);
    this shortcut is always applied, whatever the stride or channel counts.

    NOTE(review): the section comment below reads "Dwise + NL", but no
    activation follows the depthwise BatchNorm in this implementation.
    Behavior is kept as-is; confirm whether that is intended.

    Args:
        in_channels: channels of the block input.
        out_channels: channels of the block output.
        dw_kernel_size: depthwise kernel size, either an int or an
            ``(kh, kw)`` sequence. "Same"-style padding of ``k // 2`` is
            applied per dimension.
        expand_size: channel width of the expanded (depthwise) stage.
        squeeze_excite: when truthy, the depthwise output is rescaled by a
            ``SqueezeExciteModule`` gate; otherwise it is passed through.
        nonlinearity: ``'hardswish'`` or ``'relu'`` (used after the expand conv).
        stride: stride of the depthwise conv and of the shortcut conv.
        bias: whether the convolutions carry a bias term.

    Raises:
        RuntimeError: if ``nonlinearity`` is not a supported name.
    """

    def __init__(self, in_channels, out_channels, dw_kernel_size, expand_size, squeeze_excite,
                 nonlinearity, stride, bias = True):
        super(Bottleneck, self).__init__()

        # Accept both ints and (kh, kw) sequences, like nn.Conv2d itself does.
        if isinstance(dw_kernel_size, int):
            dw_kernel_size = (dw_kernel_size, dw_kernel_size)
        else:
            dw_kernel_size = tuple(dw_kernel_size)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.expand_size = expand_size
        self.squeeze_excite = squeeze_excite
        self.stride = stride
        self.dw_kernel_size = dw_kernel_size
        self.bias = bias

        # Attribute name (including its spelling) is kept for backward compatibility.
        if nonlinearity == 'hardswish':
            self.Nonliearity = nn.Hardswish
        elif nonlinearity == 'relu':
            self.Nonliearity = nn.ReLU
        else:
            raise RuntimeError("No such nonlinearity!")

        # 1x1 Conv2d + NL
        self.bottleneck_0_0 = nn.Conv2d(in_channels=in_channels, out_channels=expand_size, kernel_size=(1, 1),
                                        bias=self.bias)
        self.bottleneck_0_1 = nn.BatchNorm2d(num_features=expand_size)
        self.bottleneck_0_2 = self.Nonliearity(inplace=True)

        # Dwise + NL
        # Pad each spatial dimension by its own k // 2 so that non-square
        # kernels keep the main path aligned with the 1x1 shortcut.
        dw_padding = tuple(k // 2 for k in self.dw_kernel_size)
        self.bottleneck_1_0 = nn.Conv2d(in_channels=expand_size, out_channels=expand_size,
                                        kernel_size=self.dw_kernel_size,
                                        stride=self.stride, padding=dw_padding, groups=expand_size,
                                        bias=self.bias)
        self.bottleneck_1_1 = nn.BatchNorm2d(num_features=expand_size)

        # Squeeze-Excite
        if self.squeeze_excite:
            self.squeeze_excite_0 = SqueezeExciteModule(
                expand_size=expand_size
            )
        else:
            self.squeeze_excite_0 = nn.Identity()

        # Final 1x1 Conv2d
        self.bottleneck_final_0 = nn.Conv2d(in_channels=expand_size, out_channels=out_channels, kernel_size=(1, 1),
                                            bias=self.bias)
        self.bottleneck_final_1 = nn.BatchNorm2d(num_features=out_channels)

        # Downsampling first layer
        self.bottleneck_final_2 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                            kernel_size=(1, 1), stride=self.stride, bias=self.bias)

    def forward(self, x):
        """Run the block and return main path + projected shortcut."""
        x_0 = self.bottleneck_0_0(x)
        x_0 = self.bottleneck_0_1(x_0)
        x_0 = self.bottleneck_0_2(x_0)

        x_0 = self.bottleneck_1_0(x_0)
        x_0 = self.bottleneck_1_1(x_0)

        if self.squeeze_excite:
            # Channel-wise rescale by the SE gate (broadcast over H, W).
            x_0 = x_0 * self.squeeze_excite_0(x_0)
        else:
            # squeeze_excite_0 is nn.Identity here; multiplying by its output
            # would square the features, so just pass them through.
            x_0 = self.squeeze_excite_0(x_0)

        x_0 = self.bottleneck_final_0(x_0)
        x_0 = self.bottleneck_final_1(x_0)
        x_b = self.bottleneck_final_2(x)
        return x_0.add(x_b)

In [6]:
class MobileNetV3(nn.Module):
    """MobileNetV3-style backbone ("large" or "small") built from ``Bottleneck`` blocks.

    Layout of ``self.features``: a 3x3 stride-2 stem (Conv, BatchNorm,
    Hardswish), the variant's Bottleneck stack, and a final 1x1 conv stage.
    With ``classifier=False`` an extra 1x1 conv to ``int(1280 * width_mult)``
    channels plus Hardswish is appended to ``features``.

    NOTE(review): in the 'small' variant the last conv stage is ordered
    Conv -> Hardswish -> BatchNorm, whereas 'large' uses
    Conv -> BatchNorm -> Hardswish. The order is preserved here because
    swapping it would move parameters to different ``state_dict`` keys;
    confirm whether the difference is intended.

    Args:
        size: ``'large'`` or ``'small'`` (case-insensitive).
        width_mult: multiplier applied as ``int(c * width_mult)`` to the stem,
            block output, last-conv and head widths. Bottleneck expansion
            sizes are not scaled.
        classifier: if True, ``self.classifiers`` is a pooling + two-layer
            linear head; otherwise it is ``nn.Identity``.
        classifier_out_features: output width of the classification head.
        bias: whether convolutions carry a bias term.
        dropout: dropout probability between the two linear layers.

    Raises:
        ValueError: if ``size`` is not a supported variant. (Previously the
            constructor succeeded and ``forward`` failed with AttributeError.)
    """

    # One row per Bottleneck, in network order:
    # (base out_channels, depthwise kernel, expand_size, squeeze_excite, nonlinearity, stride)
    _BNECK_SPECS = {
        'large': (
            (16, 3, 16, False, 'relu', 1),
            (24, 3, 64, False, 'relu', 2),
            (24, 3, 72, False, 'relu', 1),
            (40, 5, 72, True, 'relu', 2),
            (40, 5, 120, True, 'relu', 1),
            (40, 5, 120, True, 'relu', 1),
            (80, 3, 240, False, 'hardswish', 2),
            (80, 3, 200, False, 'hardswish', 1),
            (80, 3, 184, False, 'hardswish', 1),
            (80, 3, 184, False, 'hardswish', 1),
            (112, 3, 480, True, 'hardswish', 1),
            (112, 3, 672, True, 'hardswish', 1),
            (160, 5, 672, True, 'hardswish', 2),
            (160, 5, 960, True, 'hardswish', 1),
            (160, 5, 960, True, 'hardswish', 1),
        ),
        'small': (
            (16, 3, 16, True, 'relu', 2),
            (24, 3, 72, False, 'relu', 2),
            (24, 3, 88, False, 'relu', 1),
            (40, 5, 96, True, 'hardswish', 2),
            (40, 5, 240, True, 'hardswish', 1),
            (40, 5, 240, True, 'hardswish', 1),
            (48, 5, 120, True, 'hardswish', 1),
            (48, 5, 144, True, 'hardswish', 1),
            (96, 5, 288, True, 'hardswish', 2),
            (96, 5, 576, True, 'hardswish', 1),
            (96, 5, 576, True, 'hardswish', 1),
        ),
    }

    # (base width of the last 1x1 conv, base width of the hidden classifier layer)
    _HEAD_WIDTHS = {'large': (960, 1280), 'small': (576, 1024)}

    def __init__(self, size='large', width_mult=1.0, classifier=True, classifier_out_features=1000, bias=True, dropout=0.2):
        super(MobileNetV3, self).__init__()

        self.bias = bias
        self.dropout = dropout

        variant = size.lower()
        if variant not in self._BNECK_SPECS:
            raise ValueError(
                "Unsupported MobileNetV3 size {!r}; expected one of {}".format(
                    size, sorted(self._BNECK_SPECS)))
        is_large = variant == 'large'

        def scaled(channels):
            # Same rounding rule as the original hand-written layers.
            return int(channels * width_mult)

        # Stem. The 'large' variant uses an in-place Hardswish, 'small' does not.
        stem_width = scaled(16)
        layers = [
            nn.Conv2d(in_channels=3, out_channels=stem_width,
                      kernel_size=(3, 3), stride=2, padding=3 // 2, bias=self.bias),
            nn.BatchNorm2d(num_features=stem_width),
            nn.Hardswish(inplace=True) if is_large else nn.Hardswish(),
        ]

        # Bottleneck stack: each block consumes the previous block's output width.
        in_width = stem_width
        for out_base, kernel, expand_size, use_se, nonlinearity, stride in self._BNECK_SPECS[variant]:
            out_width = scaled(out_base)
            layers.append(Bottleneck(in_channels=in_width, out_channels=out_width,
                                     dw_kernel_size=(kernel, kernel), expand_size=expand_size,
                                     squeeze_excite=use_se, nonlinearity=nonlinearity,
                                     stride=stride, bias=self.bias))
            in_width = out_width

        # Last 1x1 conv stage (ordering differs per variant, see class docstring).
        last_base, hidden_base = self._HEAD_WIDTHS[variant]
        last_width = scaled(last_base)
        last_conv = nn.Conv2d(in_channels=in_width, out_channels=last_width,
                              kernel_size=(1, 1), stride=1, bias=self.bias)
        last_norm = nn.BatchNorm2d(num_features=last_width)
        if is_large:
            layers += [last_conv, last_norm, nn.Hardswish(inplace=True)]
        else:
            layers += [last_conv, nn.Hardswish(), last_norm]

        if classifier:
            self.features = nn.Sequential(*layers)
            hidden_width = scaled(hidden_base)
            self.classifiers = nn.Sequential(
                nn.AdaptiveAvgPool2d(output_size=1),
                nn.Flatten(start_dim=1),
                nn.Linear(last_width, hidden_width),
                nn.Dropout(p=self.dropout),
                nn.Linear(hidden_width, classifier_out_features)
            )
        else:
            # Headless mode: widen to 1280 (scaled) channels and keep the spatial map.
            layers += [
                nn.Conv2d(in_channels=last_width, out_channels=scaled(1280),
                          kernel_size=(1, 1), bias=self.bias),
                nn.Hardswish(),
            ]
            self.features = nn.Sequential(*layers)
            self.classifiers = nn.Identity()

    def forward(self, x):
        """Apply ``features`` followed by ``classifiers`` (identity when headless)."""
        x = self.features(x)
        x = self.classifiers(x)
        return x

In [7]:
# Test MobileNetV3-Small-1.0-224
# classifier=False: the classification head is nn.Identity and `features` ends with the
# extra 1x1 conv to 1280 channels, so the summary shows the headless backbone on a 1x3x224x224 input.
summary(MobileNetV3(size='small', width_mult=1.0, classifier=False), input_size=(1, 3, 224, 224))

Layer (type:depth-idx)                        Output Shape              Param #
MobileNetV3                                   --                        --
├─Sequential: 1-1                             [1, 1280, 7, 7]           --
│    └─Conv2d: 2-1                            [1, 16, 112, 112]         448
│    └─BatchNorm2d: 2-2                       [1, 16, 112, 112]         32
│    └─Hardswish: 2-3                         [1, 16, 112, 112]         --
│    └─Bottleneck: 2-4                        [1, 16, 56, 56]           --
│    │    └─Conv2d: 3-1                       [1, 16, 112, 112]         272
│    │    └─BatchNorm2d: 3-2                  [1, 16, 112, 112]         32
│    │    └─ReLU: 3-3                         [1, 16, 112, 112]         --
│    │    └─Conv2d: 3-4                       [1, 16, 56, 56]           160
│    │    └─BatchNorm2d: 3-5                  [1, 16, 56, 56]           32
│    │    └─SqueezeExciteModule: 3-6          [1, 16, 1, 1]             544
│    │    └─Conv

In [8]:
# Test MobileNetV3-Large-1.0-300
# Same headless configuration (classifier=False), this time for the large variant on a 1x3x300x300 input.
summary(MobileNetV3(size='large', width_mult=1.0, classifier=False), input_size=(1, 3, 300, 300))

Layer (type:depth-idx)                        Output Shape              Param #
MobileNetV3                                   --                        --
├─Sequential: 1-1                             [1, 1280, 10, 10]         --
│    └─Conv2d: 2-1                            [1, 16, 150, 150]         448
│    └─BatchNorm2d: 2-2                       [1, 16, 150, 150]         32
│    └─Hardswish: 2-3                         [1, 16, 150, 150]         --
│    └─Bottleneck: 2-4                        [1, 16, 150, 150]         --
│    │    └─Conv2d: 3-1                       [1, 16, 150, 150]         272
│    │    └─BatchNorm2d: 3-2                  [1, 16, 150, 150]         32
│    │    └─ReLU: 3-3                         [1, 16, 150, 150]         --
│    │    └─Conv2d: 3-4                       [1, 16, 150, 150]         160
│    │    └─BatchNorm2d: 3-5                  [1, 16, 150, 150]         32
│    │    └─Identity: 3-6                     [1, 16, 150, 150]         --
│    │    └─Conv2

In [10]:
# Build the headless small backbone whose layers will be regrouped below.
base_model = MobileNetV3(size='small', width_mult=1.0, classifier=False).features

# Flatten the module tree for SSD300 feature linking.
# modules() yields the container itself first, then every descendant in order.
all_module_list = [*base_model.modules()]

# Stem layers sit right after the container entry at index 0.
first_conv_modules = all_module_list[1:4]
# Every Bottleneck block, in network order.
bneck_modules = [m for m in all_module_list if isinstance(m, Bottleneck)]
# The five trailing leaf modules (last conv stage + extra 1x1 conv and activation).
last_conv_modules = all_module_list[-5:]
print(last_conv_modules)

[Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1)), Hardswish(), BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True), Conv2d(576, 1280, kernel_size=(1, 1), stride=(1, 1)), Hardswish()]


In [None]:
# Make SSDLite detector!
class DwConv2d(nn.Module):
    """Depthwise-separable convolution: depthwise KxK conv followed by a 1x1 pointwise conv.

    Neither convolution has a bias, and no normalization or activation is applied.

    Args:
        in_channels: channels of the input (also the depthwise group count).
        out_channels: channels produced by the pointwise conv.
        kernel_size: depthwise kernel size, an int or a sequence of one or two
            ints. Each spatial dimension is padded by ``k // 2``.
        stride: stride of the depthwise conv.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
        super().__init__()

        # Normalize to a (kh, kw) pair. isinstance-based handling also covers
        # lists and tuple subclasses, and keeps both dimensions of non-square
        # kernels instead of silently discarding the second one.
        if isinstance(kernel_size, int):
            kernel_dims = (kernel_size, kernel_size)
        else:
            kernel_dims = tuple(kernel_size)
            if len(kernel_dims) == 1:
                kernel_dims = kernel_dims * 2
        padding = tuple(k // 2 for k in kernel_dims)

        self.depthwise = nn.Conv2d(
            in_channels=in_channels, out_channels=in_channels, groups=in_channels,
            kernel_size=kernel_dims, padding=padding, bias=False, stride=stride
        )
        self.pointwise = nn.Conv2d(
            in_channels=in_channels, out_channels=out_channels,
            kernel_size=1, bias=False
        )

    def forward(self, x):
        """Apply the depthwise conv and then the pointwise conv."""
        x = self.depthwise(x)
        x = self.pointwise(x)
        return x
    
class MobileNetV3SSD(nn.Module):
    """MobileNetV3 backbone followed by six extra stages, each with a prediction head.

    ``forward`` runs the headless backbone, then feeds its output through
    ``extra0`` .. ``extra5`` in sequence; after each stage the matching
    ``classifierN`` conv is applied to that stage's output.

    Each head outputs ``k * (num_classes + 4)`` channels, with ``k`` being
    4, 6, 6, 6, 4, 4 for heads 0..5 (presumably ``k`` default boxes per
    location, each with class scores plus 4 box offsets — TODO confirm).

    Args:
        size: backbone variant forwarded to ``MobileNetV3``.
        width_mult: backbone width multiplier forwarded to ``MobileNetV3``.
        input_size: stored on the instance; not otherwise used by this class.
        num_classes: number of classes used to size the prediction heads.
    """

    def __init__(self, size='small', width_mult=1.0, input_size=300, num_classes=3):
        super().__init__()

        self.features = MobileNetV3(size=size, width_mult=width_mult, classifier=False).features

        self.num_classes = num_classes
        self.input_size = input_size

        # The headless backbone ends with a conv to int(1280 * width_mult) channels,
        # so derive the first extra stage's input width instead of hard-coding 1280.
        backbone_channels = int(1280 * width_mult)

        # Original note: attach to the output of Bottleneck 2-13 (the second-to-last bneck
        # in Table 1 of the paper).
        # NOTE(review): that tap is not implemented; the extras below consume the final
        # backbone output only.

        # extra0 input size: Identity: 1-2 [1280, 7, 7]
        self.extra0 = nn.Sequential(
            DwConv2d(backbone_channels, 1024, kernel_size=(3, 3)),
            nn.ReLU(inplace=True)
        )

        self.extra1 = nn.Sequential(
            DwConv2d(1024, 1024, kernel_size=(1, 1)),
            nn.ReLU(inplace=True)
        )
        self.extra2 = nn.Sequential(
            DwConv2d(1024, 256, kernel_size=(1, 1)),
            nn.ReLU(inplace=True),
            DwConv2d(256, 512, kernel_size=(3, 3), stride=2),
            nn.ReLU(inplace=True)
        )
        self.extra3 = nn.Sequential(
            DwConv2d(512, 128, kernel_size=(1, 1)),
            nn.ReLU(inplace=True),
            DwConv2d(128, 256, kernel_size=(3, 3), stride=2),
            nn.ReLU(inplace=True)
        )
        self.extra4 = nn.Sequential(
            DwConv2d(256, 128, kernel_size=(1, 1)),
            nn.ReLU(inplace=True),
            DwConv2d(128, 256, kernel_size=(3, 3)),
            nn.ReLU(inplace=True)
        )
        self.extra5 = nn.Sequential(
            DwConv2d(256, 128, kernel_size=(1, 1)),
            nn.ReLU(inplace=True),
            DwConv2d(128, 256, kernel_size=(3, 3)),
            nn.ReLU(inplace=True)
        )

        # Plain list for ordered iteration in forward(); the stages are already
        # registered as submodules through the extraN attributes above.
        self.extras = [
            self.extra0,
            self.extra1,
            self.extra2,
            self.extra3,
            self.extra4,
            self.extra5
        ]

        # Classifier network will not change
        # (because output feature C size is same)
        self.classifier0 = nn.Sequential(
            nn.Conv2d(1024, (4 * (self.num_classes + 4)), kernel_size=(3, 3), padding=3//2)
        )
        self.classifier1 = nn.Sequential(
            nn.Conv2d(1024, (6 * (self.num_classes + 4)), kernel_size=(3, 3), padding=3//2)
        )
        self.classifier2 = nn.Sequential(
            nn.Conv2d(512, (6 * (self.num_classes + 4)), kernel_size=(3, 3), padding=3//2)
        )
        self.classifier3 = nn.Sequential(
            nn.Conv2d(256, (6 * (self.num_classes + 4)), kernel_size=(3, 3), padding=3//2)
        )
        self.classifier4 = nn.Sequential(
            nn.Conv2d(256, (4 * (self.num_classes + 4)), kernel_size=(3, 3), padding=3//2)
        )
        self.classifier5 = nn.Sequential(
            nn.Conv2d(256, (4 * (self.num_classes + 4)), kernel_size=(3, 3), padding=3//2)
        )

        # Same idea as self.extras: an ordered view over already-registered heads.
        self.classifiers = [
            self.classifier0,
            self.classifier1,
            self.classifier2,
            self.classifier3,
            self.classifier4,
            self.classifier5
        ]

    def forward(self, x):
        """Return the list of six raw head outputs, one per extra stage.

        Element ``i`` is ``classifier_i(extra_i(...extra_0(features(x))))``
        with shape ``(N, k_i * (num_classes + 4), H_i, W_i)``.
        """
        x = self.features(x)
        detections = []

        # Each extra stage feeds both its own head and the next stage.
        for extra, classifier in zip(self.extras, self.classifiers):
            x = extra(x)
            detections.append(classifier(x))

        return detections
    
    
# Test MobileNetV3-SSD-Small-1.0-300 (the summary below is run on a 1x3x300x300 input)
summary(MobileNetV3SSD(size='small', width_mult=1.0, input_size=300), input_size=(1, 3, 300, 300))

In [None]:
'''
    Summarize SSD module from ssd.py
'''

import torch
from torch.nn import Conv2d, Sequential, ModuleList, BatchNorm2d
from torch import nn
from models.mb3_ssd.vision.nn.mobilenet_v2 import InvertedResidual
from models.mb3_ssd.vision.nn.mobilenet_v3 import MobileNetV3 as MobileNetV3Base
from models.mb3_ssd.vision.ssd.ssd import SSD, GraphPath
from models.mb3_ssd.vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors
import numpy as np

class MobileNetV3Config:
    """Bundle of SSD hyper-parameters plus the prior boxes generated from them.

    All values are fixed at construction time; ``priors`` is computed by
    ``generate_ssd_priors`` from ``specs`` and ``image_size``.
    """

    def __init__(self):
        # Input preprocessing constants.
        self.image_size = 300
        self.image_mean = np.array([127, 127, 127])  # RGB layout
        self.image_std = 128.0

        # Matching / box-decoding constants.
        self.iou_threshold = 0.45
        self.center_variance = 0.1
        self.size_variance = 0.2

        # One row per SSDSpec: the first two SSDSpec arguments followed by the
        # two SSDBoxSizes arguments. Every spec uses the same [2, 3] list value.
        # (Field meanings are defined in box_utils — verify there.)
        spec_rows = (
            (19, 16, 60, 105),
            (10, 32, 105, 150),
            (5, 64, 150, 195),
            (3, 100, 195, 240),
            (2, 150, 240, 285),
            (1, 300, 285, 330),
        )
        self.specs = [
            SSDSpec(first, second, SSDBoxSizes(box_lo, box_hi), [2, 3])
            for first, second, box_lo, box_hi in spec_rows
        ]

        self.priors = generate_ssd_priors(self.specs, self.image_size)
        
# Instantiate the SSD configuration (this also generates the prior boxes).
config = MobileNetV3Config()

# testing parameter
# base_net = MobileNetV3(size='small', width_mult=1.0, classifier=False).features
# The backbone used here is the reference implementation imported as MobileNetV3Base,
# not the MobileNetV3 class defined in this notebook (see the commented-out line above).
base_net = MobileNetV3Base().features
width_mult = 1.0
num_classes = 3
# Passed through to SSD(...) below as is_test.
is_test = False

def SeperableConv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, onnx_compatible=False):
    """Build a depthwise-separable stand-in for a regular Conv2d.

    The returned ``Sequential`` holds, in order: a depthwise conv
    (``groups=in_channels``), BatchNorm, an activation, and a 1x1 pointwise
    conv to ``out_channels``. The activation is ``nn.ReLU`` when
    ``onnx_compatible`` is truthy and ``nn.ReLU6`` otherwise.
    """
    activation_cls = nn.ReLU6
    if onnx_compatible:
        activation_cls = nn.ReLU

    depthwise = Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size,
                       groups=in_channels, stride=stride, padding=padding)
    norm = BatchNorm2d(in_channels)
    pointwise = Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1)

    return Sequential(depthwise, norm, activation_cls(), pointwise)

# Backbone tap points handed to SSD(...). GraphPath comes from the reference ssd module;
# presumably these select which backbone outputs feed the first two headers — verify in ssd.py.
source_layer_indexes = [GraphPath(11, 'conv'),20,]

# Four stride-2 InvertedResidual stages appended after the backbone:
# 1280 -> 512 -> 256 -> 256 -> 64 channels.
extras = ModuleList([
    InvertedResidual(1280, 512, stride=2, expand_ratio=0.2),
    InvertedResidual(512, 256, stride=2, expand_ratio=0.25),
    InvertedResidual(256, 256, stride=2, expand_ratio=0.5),
    InvertedResidual(256, 64, stride=2, expand_ratio=0.25)
])

# Box-regression headers: six entries, each producing 6 * 4 output channels.
# Input widths for entries 2..5 (512, 256, 256, 64) match the extras' output widths above;
# the first two (round(288 * width_mult), 1280) presumably match the two backbone taps — TODO confirm.
regression_headers = ModuleList([
    SeperableConv2d(in_channels=round(288 * width_mult), out_channels=6 * 4,
                    kernel_size=3, padding=1, onnx_compatible=False),
    SeperableConv2d(in_channels=1280, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False),
    SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False),
    SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False),
    SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False),
    Conv2d(in_channels=64, out_channels=6 * 4, kernel_size=1),
])

# Classification headers: same input widths as the regression headers,
# each producing 6 * num_classes output channels.
classification_headers = ModuleList([
    SeperableConv2d(in_channels=round(288 * width_mult), out_channels=6 * num_classes, kernel_size=3, padding=1),
    SeperableConv2d(in_channels=1280, out_channels=6 * num_classes, kernel_size=3, padding=1),
    SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1),
    SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1),
    SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1),
    Conv2d(in_channels=64, out_channels=6 * num_classes, kernel_size=1),
])

# Assemble the reference SSD from the pieces above.
ssd_model = SSD(num_classes, base_net, source_layer_indexes,
           extras, classification_headers, regression_headers, is_test=is_test, config=config)

# Switch to evaluation mode before inspecting the model.
ssd_model.eval()
# Disabled smoke test of the forward pass (kept for reference):
# if is_test:
#     confidences, boxes = ssd_model(torch.zeros(1, 3, 224, 224))
#     print('confidences:', confidences.shape)
#     print('boxes:', boxes.shape)
# else:
#     confidences, locations = ssd_model(torch.zeros(1, 3, 224, 224))
#     print('confidences:', confidences.shape)
#     print('locations:', locations.shape)

# NOTE(review): the summary uses a 224x224 input while config.image_size is 300 — confirm which is intended.
summary(base_net, (1, 3, 224, 224))

In [None]:
# Layer-by-layer summary of the assembled reference SSD on a 1x3x224x224 input.
summary(ssd_model, (1, 3, 224, 224))