In [6]:
import torch
import torch.nn as nn
import torch.functional as F
import numpy as np
import torch.optim as optim

In [7]:
# EfficientNet uses the Swish activation function.
class Swish(nn.Module):
    """Swish activation: multiplies the input element-wise by its own sigmoid."""

    def __init__(self):
        super(Swish, self).__init__()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x * sigmoid(x)``; the output has the same shape as ``x``."""
        gate = self.sigmoid(x)
        return gate * x

# If the model is only checked after it has been fully assembled, it is impossible to tell
# exactly where an error came from, so it is better to verify each piece along the way.

# Smoke test: Swish is element-wise, so the output size must equal the input size.
x = torch.randn(3, 3, 224, 224)
model = Swish()
output = model(x)
output.size()

torch.Size([3, 3, 224, 224])

In [14]:
class SEBlock(nn.Module):
    """Squeeze-and-excitation gate.

    Global-average-pools the input to one value per channel, passes that vector
    through a two-layer bottleneck (Linear -> Swish -> Linear -> Sigmoid) and
    returns the resulting per-channel gate. The caller multiplies the gate with
    the feature map; this module does not do the multiplication itself.

    Args:
        in_channels: number of channels of the incoming feature map.
        r: reduction ratio of the bottleneck; the hidden width is
            ``max(1, in_channels // r)``.
    """

    def __init__(self, in_channels, r = 4):
        super(SEBlock, self).__init__()

        # Keep at least one hidden unit. With in_channels < r the plain floor
        # division gives 0, and a zero-width bottleneck produces a gate that
        # depends only on the last layer's bias, i.e. ignores the input.
        hidden_channels = max(1, in_channels // r)

        self.squeeze = nn.AdaptiveAvgPool2d(1)
        self.excitation = nn.Sequential(
            nn.Linear(in_channels, hidden_channels),
            Swish(),
            nn.Linear(hidden_channels, in_channels),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return a gate of shape (N, C, 1, 1) for an input of shape (N, C, H, W)."""
        x = self.squeeze(x)                          # (N, C, 1, 1)
        x = x.view(x.size(0), -1)                    # (N, C)
        x = self.excitation(x)                       # (N, C), values in (0, 1)
        x = x.view(x.size(0), x.size(1), 1, 1)       # (N, C, 1, 1) so it broadcasts over H, W

        return x

In [15]:
# MBConv block used by EfficientNet.
class MBConv(nn.Module):
    """Inverted-bottleneck block: 1x1 expand -> depthwise conv -> SE gate -> 1x1 project.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        kernel_size: kernel size of the depthwise convolution (padding is
            ``kernel_size // 2``).
        stride: spatial stride of the block, applied in the depthwise convolution.
        se_scale: reduction ratio handed to ``SEBlock``.
        p: probability passed to ``torch.bernoulli`` during training; when the
            draw is 0 the block is skipped and the input is returned unchanged.
            Forced to 1 when ``in_channels != out_channels``.

    The identity shortcut is added only when ``stride == 1`` and
    ``in_channels == out_channels``.
    """

    expand = 6

    def __init__(self, in_channels, out_channels, kernel_size, stride = 1, se_scale = 4, p = 0.5):
        super().__init__()
        self.p = torch.tensor(p).float() if (in_channels == out_channels) else torch.tensor(1).float()

        expanded_channels = in_channels * MBConv.expand

        # BatchNorm momentum: PyTorch weights the NEW batch statistic by `momentum`,
        # whereas the 0.99 found in TensorFlow-style configs weights the OLD running
        # value. 0.01 here is therefore the equivalent of a 0.99 running-average decay.
        self.residual = nn.Sequential(
            # Expansion stays at full resolution; striding a 1x1 conv would simply
            # drop pixels before any spatial filtering has looked at them.
            nn.Conv2d(in_channels, expanded_channels, 1, stride = 1, padding = 0, bias = False),
            nn.BatchNorm2d(expanded_channels, momentum = 0.01, eps = 1e-3),
            Swish(),
            # Depthwise conv (groups == channels) carries the block's stride.
            nn.Conv2d(expanded_channels, expanded_channels, kernel_size = kernel_size,
                      stride = stride, padding = kernel_size // 2, bias = False, groups = expanded_channels),
            nn.BatchNorm2d(expanded_channels, momentum = 0.01, eps = 1e-3),
            Swish()
        )

        self.se = SEBlock(expanded_channels, se_scale)

        self.project = nn.Sequential(
            nn.Conv2d(expanded_channels, out_channels, kernel_size = 1, stride = 1, padding = 0, bias = False),
            nn.BatchNorm2d(out_channels, momentum = 0.01, eps = 1e-3)
        )

        self.shortcut = (stride == 1) and (in_channels == out_channels)

    def forward (self, x):
        """Apply the block; in training mode the whole block may be skipped."""
        # Stochastic depth: skipping is only valid when an identity path exists,
        # otherwise the returned tensor would have the wrong channels/resolution.
        if self.training and self.shortcut:
            if not torch.bernoulli(self.p):
                return x

        x_shortcut = x
        x_residual = self.residual(x)
        x_se = self.se(x_residual)      # per-channel gate, broadcast over H and W

        x = x_se * x_residual
        x = self.project(x)

        if self.shortcut:
            x = x_shortcut + x

        return x


torch.Size([3, 3, 224, 224])

In [16]:
class SepConv(nn.Module):
    """Depthwise-separable block: depthwise conv -> SE gate -> 1x1 project (no expansion).

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        kernel_size: kernel size of the depthwise convolution (padding is
            ``kernel_size // 2``).
        stride: spatial stride of the block, applied in the depthwise convolution.
        se_scale: reduction ratio handed to ``SEBlock``.
        p: probability passed to ``torch.bernoulli`` during training; when the
            draw is 0 the block is skipped and the input is returned unchanged.
            Forced to 1 when ``in_channels != out_channels``.

    The identity shortcut is added only when ``stride == 1`` and
    ``in_channels == out_channels``.
    """

    expand = 1

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, se_scale=4, p=0.5):
        super().__init__()
        # A block that changes the channel count is never skipped.
        self.p = torch.tensor(p).float() if (in_channels == out_channels) else torch.tensor(1).float()

        inner_channels = in_channels * SepConv.expand

        # BatchNorm momentum: PyTorch weights the NEW batch statistic by `momentum`,
        # whereas the 0.99 found in TensorFlow-style configs weights the OLD running
        # value. 0.01 here is therefore the equivalent of a 0.99 running-average decay.
        self.residual = nn.Sequential(
            # The stride argument used to be accepted but never applied (the conv was
            # hard-coded to stride=1), so stride > 1 silently did not downsample.
            nn.Conv2d(inner_channels, inner_channels, kernel_size=kernel_size,
                      stride=stride, padding=kernel_size//2, bias=False, groups=inner_channels),
            nn.BatchNorm2d(inner_channels, momentum=0.01, eps=1e-3),
            Swish()
        )

        self.se = SEBlock(inner_channels, se_scale)

        self.project = nn.Sequential(
            nn.Conv2d(inner_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(out_channels, momentum=0.01, eps=1e-3)
        )

        self.shortcut = (stride == 1) and (in_channels == out_channels)

    def forward(self, x):
        """Apply the block; in training mode the whole block may be skipped."""
        # Stochastic depth: skipping is only valid when an identity path exists,
        # otherwise the returned tensor would have the wrong channels/resolution.
        if self.training and self.shortcut:
            if not torch.bernoulli(self.p):
                return x

        x_shortcut = x
        x_residual = self.residual(x)
        x_se = self.se(x_residual)      # per-channel gate, broadcast over H and W

        x = x_se * x_residual
        x = self.project(x)

        if self.shortcut:
            x = x_shortcut + x

        return x

In [24]:
class EfficientNet(nn.Module):
    """EfficientNet classifier: stem conv, one SepConv stage, six MBConv stages, 1x1 head.

    Args:
        num_classes: size of the final linear layer's output.
        width_coef: multiplier applied to every stage's channel count (truncated to int).
        depth_coef: multiplier applied to every stage's block count (rounded up).
        scale: factor by which the input is bilinearly resized before the stem.
        dropout: dropout probability applied before the final linear layer.
        se_scale: squeeze-and-excitation reduction ratio passed to every block.
        stochastic_depth: when True, blocks receive a skip-probability schedule;
            when False every block receives 1 (never skipped).
        p: value the schedule reaches at the last block when ``stochastic_depth``
            is True; the schedule starts at 1 and decreases linearly.

    Raises:
        ValueError: if ``stochastic_depth`` is True and ``p`` is outside [0, 1].

    Note:
        ``self.p`` is a running cursor consumed by ``_make_Block`` while the
        stages are built; after construction it holds the value one step past
        the last block, not a meaningful hyper-parameter.
    """

    @staticmethod
    def _scale_repeats(repeats, depth_coef):
        """Scale per-stage block counts by ``depth_coef``, rounding up.

        Truncation (``int(x * depth)``) would leave stages with too few blocks,
        e.g. 1 * 1.1 -> 1 instead of 2. A coefficient of 1.0 is unaffected.
        """
        import math  # local import so the class does not depend on a file-level import

        return [int(math.ceil(count * depth_coef)) for count in repeats]

    @staticmethod
    def _survival_schedule(total_blocks, stochastic_depth, p):
        """Return ``(start, step)`` of the linearly decreasing per-block probability.

        The first block gets ``start``; each following block gets ``step`` less,
        so the last of ``total_blocks`` blocks gets ``p``.
        """
        if not stochastic_depth:
            return 1, 0
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"p must be in [0, 1], got {p}")
        if total_blocks <= 1:
            # A single block has nothing to interpolate towards; also avoids 1 / 0.
            return 1.0, 0.0
        return 1.0, (1.0 - p) / (total_blocks - 1)

    def __init__(self, num_classes = 10, width_coef = 1., depth_coef = 1., scale = 1., dropout = 0.2,
                 se_scale = 4, stochastic_depth = False, p = 0.5):
        super().__init__()
        channels = [32, 16, 24, 40, 80, 112, 192, 320, 1280]
        repeats = [1, 2, 2, 3, 3, 4, 1]
        strides = [1, 2, 2, 2, 1, 2, 1]
        kernel_size = [3, 3, 5, 3, 5, 5, 3]

        channels = [int(c * width_coef) for c in channels]
        repeats = self._scale_repeats(repeats, depth_coef)

        # The schedule previously started at p and ran down towards 0 (and could
        # undershoot it), so late blocks were almost always skipped. It now runs 1 -> p.
        self.p, self.step = self._survival_schedule(sum(repeats), stochastic_depth, p)

        # BatchNorm momentum: PyTorch weights the NEW batch statistic by `momentum`,
        # so 0.01 is the equivalent of a 0.99 running-average decay.
        bn_momentum = 0.01
        bn_eps = 1e-3

        self.upsample = nn.Upsample(scale_factor = scale, mode = 'bilinear', align_corners = False)
        self.stage1 = nn.Sequential(
            nn.Conv2d(3, channels[0], 3, stride = 2, padding = 1, bias = False),
            nn.BatchNorm2d(channels[0], momentum = bn_momentum, eps = bn_eps)
        )

        # Stages must be created in order: _make_Block advances self.p as it goes.
        self.stage2 = self._make_Block(SepConv, repeats[0], channels[0], channels[1], kernel_size[0], strides[0], se_scale)
        self.stage3 = self._make_Block(MBConv, repeats[1], channels[1], channels[2], kernel_size[1], strides[1], se_scale)
        self.stage4 = self._make_Block(MBConv, repeats[2], channels[2], channels[3], kernel_size[2], strides[2], se_scale)
        self.stage5 = self._make_Block(MBConv, repeats[3], channels[3], channels[4], kernel_size[3], strides[3], se_scale)
        self.stage6 = self._make_Block(MBConv, repeats[4], channels[4], channels[5], kernel_size[4], strides[4], se_scale)
        self.stage7 = self._make_Block(MBConv, repeats[5], channels[5], channels[6], kernel_size[5], strides[5], se_scale)
        self.stage8 = self._make_Block(MBConv, repeats[6], channels[6], channels[7], kernel_size[6], strides[6], se_scale)

        self.stage9 = nn.Sequential(
            nn.Conv2d(channels[7], channels[8], 1, stride = 1, bias = False),
            nn.BatchNorm2d(channels[8], momentum = bn_momentum, eps = bn_eps),
            Swish()
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(p = dropout)
        self.linear = nn.Linear(channels[8], num_classes)

    def forward(self, x):
        """Return class scores of shape (N, num_classes) for an (N, 3, H, W) input."""
        x = self.upsample(x)
        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.stage5(x)
        x = self.stage6(x)
        x = self.stage7(x)
        x = self.stage8(x)
        x = self.stage9(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)   # (N, C, 1, 1) -> (N, C)
        x = self.dropout(x)
        x = self.linear(x)
        return x

    def _make_Block(self, block, repeats, in_channels, out_channels, kernel_size, stride, se_scale):
        """Build one stage of ``repeats`` blocks; only the first block uses ``stride``.

        Side effect: decreases ``self.p`` by ``self.step`` once per block created.
        """
        strides = [stride] + [1] * (repeats - 1)
        layers = []
        for block_stride in strides:
            # Clamp so accumulated float error can never hand bernoulli a value outside [0, 1].
            survival = min(1.0, max(0.0, self.p))
            layers.append(block(in_channels, out_channels, kernel_size, block_stride, se_scale, survival))
            in_channels = out_channels   # every block after the first keeps the channel count
            self.p -= self.step

        return nn.Sequential(*layers)

In [25]:
def efficientnet_b0(num_classes=10):
    """Build EfficientNet-B0: width 1.0, depth 1.0, input scale 1.0, dropout 0.2."""
    config = dict(
        width_coef=1.0,
        depth_coef=1.0,
        scale=1.0,
        dropout=0.2,
        se_scale=4,
    )
    return EfficientNet(num_classes=num_classes, **config)

def efficientnet_b1(num_classes=10):
    """Build EfficientNet-B1: width 1.0, depth 1.1, input scale 240/224, dropout 0.2."""
    config = dict(
        width_coef=1.0,
        depth_coef=1.1,
        scale=240/224,
        dropout=0.2,
        se_scale=4,
    )
    return EfficientNet(num_classes=num_classes, **config)

def efficientnet_b2(num_classes=10):
    """Build EfficientNet-B2: width 1.1, depth 1.2, input scale 260/224, dropout 0.3."""
    config = dict(
        width_coef=1.1,
        depth_coef=1.2,
        scale=260/224.,
        dropout=0.3,
        se_scale=4,
    )
    return EfficientNet(num_classes=num_classes, **config)

def efficientnet_b3(num_classes=10):
    """Build EfficientNet-B3: width 1.2, depth 1.4, input scale 300/224, dropout 0.3."""
    config = dict(
        width_coef=1.2,
        depth_coef=1.4,
        scale=300/224,
        dropout=0.3,
        se_scale=4,
    )
    return EfficientNet(num_classes=num_classes, **config)

def efficientnet_b4(num_classes=10):
    """Build EfficientNet-B4: width 1.4, depth 1.8, input scale 380/224, dropout 0.4."""
    config = dict(
        width_coef=1.4,
        depth_coef=1.8,
        scale=380/224,
        dropout=0.4,
        se_scale=4,
    )
    return EfficientNet(num_classes=num_classes, **config)

def efficientnet_b5(num_classes=10):
    """Build EfficientNet-B5: width 1.6, depth 2.2, input scale 456/224, dropout 0.4."""
    config = dict(
        width_coef=1.6,
        depth_coef=2.2,
        scale=456/224,
        dropout=0.4,
        se_scale=4,
    )
    return EfficientNet(num_classes=num_classes, **config)

def efficientnet_b6(num_classes=10):
    """Build EfficientNet-B6: width 1.8, depth 2.6, input scale 528/224, dropout 0.5."""
    config = dict(
        width_coef=1.8,
        depth_coef=2.6,
        scale=528/224,
        dropout=0.5,
        se_scale=4,
    )
    return EfficientNet(num_classes=num_classes, **config)

def efficientnet_b7(num_classes=10):
    """Build EfficientNet-B7: width 2.0, depth 3.1, input scale 600/224, dropout 0.5."""
    config = dict(
        width_coef=2.0,
        depth_coef=3.1,
        scale=600/224,
        dropout=0.5,
        se_scale=4,
    )
    return EfficientNet(num_classes=num_classes, **config)

In [26]:
# Smoke test: push a random batch of three 3x224x224 inputs through EfficientNet-B0.
# The final layer is Linear(..., num_classes), so the result size is (batch, num_classes).
x = torch.randn(3, 3, 224, 224)
model = efficientnet_b0()
output = model(x)
output.size()

torch.Size([3, 10])

In [28]:
!pip install torchsummary
from torchsummary import summary
summary(model, (3, 224, 224))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
          Upsample-1          [-1, 3, 224, 224]               0
            Conv2d-2         [-1, 32, 112, 112]             864
       BatchNorm2d-3         [-1, 32, 112, 112]              64
            Conv2d-4         [-1, 32, 112, 112]             288
       BatchNorm2d-5         [-1, 32, 112, 112]              64
           Sigmoid-6         [-1, 32, 112, 112]               0
             Swish-7         [-1, 32, 112, 112]               0
 AdaptiveAvgPool2d-8             [-1, 32, 1, 1]               0
            Linear-9                    [-1, 8]             264
          Sigmoid-10                    [-1, 8]               0
            Swish-11                    [-1, 8]               0
           Linear-12                   [-1, 32]             288
    