# FCOS: anchor free

## Backbone 
<img src="imgs/backbone.png" width="700" height="400" align="bottom">

### ResNet50

<img src="imgs/stage1.jpg" width="260" height="800" align="left"/>
<img src="imgs/stage2.jpg" width="255" height="750" align="left"/>
<img src="imgs/stage3.jpg" width="180" height="750" align="left"/>
<img src="imgs/stage4.jpg" width="280" height="750" align="left"/>
<img src="imgs/shortcut.png" width="400" height="600" align=""/>
<img src="imgs/resnet50.png" width="800" height="400" align="bottom"/>

In [1]:
import torch
import torch.nn as nn
import torch.utils.model_zoo as model_zoo
import torch.nn.functional as F
import math

# Checkpoint URLs keyed by architecture name; resnet50(pretrained=True) looks
# up the 'resnet50' entry and downloads it through model_zoo.load_url.
model_urls = {
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
}


def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride (default 1).
    """
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 reduce -> 3x3 -> 1x1 expand.

    The stride is applied by the 3x3 convolution. The block outputs
    ``planes * 4`` channels. When ``downsample`` is given it is applied to the
    block input before the residual addition; otherwise the raw input is added.

    Args:
        inplanes: number of input channels.
        planes: channel width of the two inner convolutions.
        stride: stride of the 3x3 convolution.
        downsample: optional module that projects the input for the shortcut.
    """

    # Output channels divided by ``planes``; read by ResNet._make_layer.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * 4

        # 1x1 convolution that reduces the channel count.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        # 3x3 convolution; the only place the stride is applied.
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        # 1x1 convolution that expands the channel count again.
        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Run the block on ``x`` and return ReLU(main path + shortcut)."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """ResNet built from a configurable residual ``block``.

    Args:
        block: residual block class; must expose an ``expansion`` attribute and
            accept ``(inplanes, planes, stride, downsample)``.
        layers: number of blocks in each of the four stages.
        num_classes: output size of the classifier (used only with the top).
        if_include_top: when True the network ends with average pooling and a
            fully connected layer and returns class scores; when False it
            returns the outputs of stages 2-4 as a tuple.
    """

    def __init__(self, block, layers, num_classes=1000, if_include_top=True):
        super(ResNet, self).__init__()
        # Channel count entering the next stage; _make_layer reads and updates it.
        self.inplanes = 64

        # Stem: 7x7 stride-2 convolution followed by a 3x3 stride-2 max-pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # Classification top: average pooling, plus the optional linear layer.
        self.avgpool = nn.AvgPool2d(7, stride=1)
        if if_include_top:
            self.fc = nn.Linear(512 * block.expansion, num_classes)
        self.if_include_top = if_include_top

        # Weight initialisation: N(0, sqrt(2 / fan_out)) for convolutions,
        # weight 1 and bias 0 for batch norm.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan_out = kernel_h * kernel_w * module.out_channels
                nn.init.normal_(module.weight, 0, math.sqrt(2. / fan_out))
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage made of ``blocks`` consecutive ``block`` instances.

        Only the first block receives ``stride``. It also receives a
        1x1-conv + batch-norm projection for the shortcut whenever the stride
        or the channel count changes.
        """
        stage_channels = planes * block.expansion
        needs_projection = stride != 1 or self.inplanes != stage_channels

        downsample = None
        if needs_projection:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, stage_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(stage_channels),
            )

        stage = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = stage_channels
        for _ in range(1, blocks):
            stage.append(block(self.inplanes, planes))

        return nn.Sequential(*stage)

    def forward(self, x):
        """Return class scores (with top) or the tuple ``(out3, out4, out5)``."""
        # Stem: the convolution gives 1/2 resolution, the max-pool 1/4.
        stem = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        stage1 = self.layer1(stem)   # 1/4
        out3 = self.layer2(stage1)   # 1/8
        out4 = self.layer3(out3)     # 1/16
        out5 = self.layer4(out4)     # 1/32

        if not self.if_include_top:
            return (out3, out4, out5)

        # Classification top: pool, flatten, classify.
        pooled = self.avgpool(out5)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

    def freeze_bn(self):
        """Put every BatchNorm2d layer of the network into eval mode."""
        batch_norms = (m for m in self.modules() if isinstance(m, nn.BatchNorm2d))
        for bn in batch_norms:
            bn.eval()

    def freeze_stages(self, stage):
        """Freeze the stem (when ``stage >= 0``) and stages ``1..stage``.

        Frozen modules are switched to eval mode (only ``bn1`` for the stem)
        and their parameters get ``requires_grad = False``.
        """
        if stage >= 0:
            self.bn1.eval()
            for stem_module in (self.conv1, self.bn1):
                for weight in stem_module.parameters():
                    weight.requires_grad = False

        for index in range(1, stage + 1):
            frozen_stage = getattr(self, 'layer{}'.format(index))
            frozen_stage.eval()
            for weight in frozen_stage.parameters():
                weight.requires_grad = False


def resnet50(pretrained=False, **kwargs):
    """Construct a ResNet-50 backbone without the classification layer.

    The network is always built with ``if_include_top=False``, so its forward
    pass returns the three feature maps of stages 2-4.

    Args:
        pretrained (bool): If True, load the weights published at
            ``model_urls['resnet50']`` (a model pre-trained on ImageNet).
        **kwargs: forwarded to ``ResNet``.

    Returns:
        The constructed ``ResNet`` instance.
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], if_include_top=False, **kwargs)
    if pretrained:
        state_dict = model_zoo.load_url(model_urls['resnet50'])
        # The model has no `fc` layer, so the checkpoint's classifier weights
        # would be reported as unexpected keys and abort the (strict) load.
        # Drop them and keep strict checking for everything else.
        backbone_state = {
            key: value
            for key, value in state_dict.items()
            if not key.startswith('fc.')
        }
        model.load_state_dict(backbone_state)
    return model

In [2]:
# ResNet-50 backbone with freshly initialised weights (pretrained defaults to False).
res_net = resnet50()

In [3]:
# Push a random batch through the backbone and print the shape of each
# returned feature map.
x = torch.randn(2, 3, 320, 480)
y_res = res_net(x)
for feature_map in y_res:
    print(feature_map.shape)

torch.Size([2, 512, 40, 60])
torch.Size([2, 1024, 20, 30])
torch.Size([2, 2048, 10, 15])


### DarkNet19
对网络推断速度进行优化时，骨干网络可以考虑替换为更轻便的darknet19
<img src="imgs/darknet19.png" width="500" height="800" align="center"/>

In [4]:
# Layer specifications consumed by make_layers: an int is the output-channel
# count of a convolution, 'M' inserts a 2x2 max-pool with stride 2.
# Darknet19 builds block1, block2 and block3 from cfg1, cfg2 and cfg3.
cfg1 = [32, 'M', 64, 'M', 128, 64, 128, 'M', 256, 128, 256]
cfg2 = ['M', 512, 256, 512, 256, 512]
cfg3 = ['M', 1024, 512, 1024, 512, 1024]

def make_layers(cfg, in_channels=3, batch_norm=True, flag=True):
    """Build an ``nn.Sequential`` from a layer specification.

    Each ``'M'`` entry adds a 2x2 stride-2 max-pool. Every other entry is the
    output-channel count of a bias-free convolution, followed by an optional
    batch norm and a LeakyReLU(0.1).

    :param cfg: iterable of ``'M'`` markers and output-channel counts
    :param in_channels: channels of the input fed to the first convolution
    :param batch_norm: whether to insert BatchNorm2d after each convolution
    :param flag: selects the kernel of the next convolution (True: 3x3 with
        padding 1, False: 1x1 without padding); it is inverted after every
        entry of ``cfg``, max-pool markers included
    :return: the assembled ``nn.Sequential``
    """
    # Indexed by ``flag``: (kernel_size, padding).
    conv_geometry = ((1, 0), (3, 1))

    modules = []
    channels = in_channels
    for entry in cfg:
        if entry == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        else:
            kernel, pad = conv_geometry[flag]
            modules.append(
                nn.Conv2d(in_channels=channels,
                          out_channels=entry,
                          kernel_size=kernel,
                          stride=1,
                          padding=pad,
                          bias=False)
            )
            if batch_norm:
                modules.append(nn.BatchNorm2d(entry))
            channels = entry
            modules.append(nn.LeakyReLU(negative_slope=0.1, inplace=True))

        flag = not flag

    return nn.Sequential(*modules)

class Darknet19(nn.Module):
    """Darknet-19 backbone returning three feature maps.

    The network is assembled from the module-level specifications ``cfg1``,
    ``cfg2`` and ``cfg3`` as three consecutive blocks. ``forward`` returns the
    output of each block.
    """

    def __init__(self, in_channels=3, batch_norm=True, pretrained=False):
        """Build the three blocks, then load or initialise the weights.

        :param in_channels: number of channels of the input image
        :param batch_norm:  whether to use batch normalisation (True by default)
        :param pretrained:  load the pretrained weights through ``load_weight``
                            instead of initialising randomly
        """
        super(Darknet19, self).__init__()
        # Build the network with make_layers. block2 and block3 start on a
        # max-pool, so they are given flag=False: the flag is inverted by the
        # pool entry and their first convolution is therefore 3x3.
        self.block1 = make_layers(cfg1, in_channels=in_channels, batch_norm=batch_norm, flag=True)
        self.block2 = make_layers(cfg2, in_channels=cfg1[-1], batch_norm=batch_norm, flag=False)
        self.block3 = make_layers(cfg3, in_channels=cfg2[-1], batch_norm=batch_norm, flag=False)
        # Load the pretrained model or initialise from scratch.
        if pretrained:
            self.load_weight()
        else:
            self._initialize_weights()

    def forward(self, x):
        """Return ``[feature1, feature2, feature3]``, the outputs of the three blocks."""
        feature1 = self.block1(x)
        feature2 = self.block2(feature1)
        feature3 = self.block3(feature2)
        return [feature1, feature2, feature3]

    def _initialize_weights(self):
        """Kaiming-normal init for convolutions; weight 1 / bias 0 for batch norm."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def load_weight(self, weight_file='../weight/darknet19-deepBakSu-e1b3ec1e.pth'):
        """Load pretrained weights from ``weight_file``.

        Checkpoint tensors are matched to this model's state-dict entries by
        position, not by name. Checkpoint entries beyond the length of the
        model's state dict are ignored.

        :param weight_file: path of the checkpoint; defaults to the location
                            this code has always used
        """
        # NOTE: torch.load unpickles the file, so only load trusted checkpoints.
        # map_location='cpu' lets a checkpoint saved from a GPU be read on a
        # CPU-only host; load_state_dict copies the values into the existing
        # parameters afterwards.
        pretrained_state = torch.load(weight_file, map_location='cpu')
        remapped = dict(zip(self.state_dict().keys(), pretrained_state.values()))
        self.load_state_dict(remapped)

In [5]:
# Darknet-19 backbone with default arguments (pretrained=False, so weights are initialised randomly).
darknet = Darknet19()

In [6]:
# Push a random batch through Darknet-19 and print the shape of each of the
# three returned feature maps.
x = torch.randn(2, 3, 320, 480)
y = darknet(x)
for feature_map in y:
    print(feature_map.shape)

torch.Size([2, 256, 40, 60])
torch.Size([2, 512, 20, 30])
torch.Size([2, 1024, 10, 15])


## FPN
<img src="imgs/fpn.png" width="700" height="400" align="bottom">

In [11]:
class FPN(nn.Module):
    """Feature pyramid turning (C3, C4, C5) into [P3, P4, P5, P6, P7].

    Only the 'resnet50' and 'darknet19' backbones are supported; any other
    value raises ``ValueError``.

    Args:
        features: channel count of every pyramid level.
        use_p5: if True, P6 is computed from P5; otherwise P6 is computed from
            the raw C5 map and C5 is returned in place of P5.
        backbone: name of the backbone, which fixes the C3/C4/C5 channel counts.
    """

    def __init__(self, features=256, use_p5=True, backbone="resnet50"):
        super(FPN, self).__init__()
        # Channel counts of the backbone outputs.
        if backbone == "resnet50":
            print("resnet50 backbone")
            c3_channels, c4_channels, c5_channels = 512, 1024, 2048
        elif backbone == "darknet19":
            print("darnet19 backbone")
            c3_channels, c4_channels, c5_channels = 256, 512, 1024
        else:
            raise ValueError("arg 'backbone' only support 'resnet50' or 'darknet19'")

        # Lateral 1x1 projections (spatial size unchanged).
        self.prj_5 = nn.Conv2d(c5_channels, features, kernel_size=1)
        self.prj_4 = nn.Conv2d(c4_channels, features, kernel_size=1)
        self.prj_3 = nn.Conv2d(c3_channels, features, kernel_size=1)

        # 3x3 convolutions applied to each level (spatial size unchanged).
        self.conv_5 = nn.Conv2d(features, features, kernel_size=3, padding=1)
        self.conv_4 = nn.Conv2d(features, features, kernel_size=3, padding=1)
        self.conv_3 = nn.Conv2d(features, features, kernel_size=3, padding=1)

        # Stride-2 convolutions halving the resolution for P6 and P7. The
        # input of conv_out6 is P5 or the raw C5, depending on use_p5.
        p6_in_channels = features if use_p5 else c5_channels
        self.conv_out6 = nn.Conv2d(p6_in_channels, features, kernel_size=3, padding=1, stride=2)
        self.conv_out7 = nn.Conv2d(features, features, kernel_size=3, padding=1, stride=2)

        self.use_p5 = use_p5
        # Re-initialise every convolution of the FPN.
        self.apply(self.init_conv_kaiming)

    def upsamplelike(self, inputs):
        """Resize ``src`` to the height and width of ``target`` (nearest neighbour).

        ``inputs`` is the pair ``[src, target]``.
        """
        src, target = inputs
        height, width = target.shape[2], target.shape[3]
        return F.interpolate(src, size=(height, width), mode='nearest')

    def init_conv_kaiming(self, module):
        """Kaiming-uniform (a=1) weights and zero bias for Conv2d modules."""
        if not isinstance(module, nn.Conv2d):
            return
        nn.init.kaiming_uniform_(module.weight, a=1)
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return ``[P3, P4, P5, P6, P7]`` for the backbone features ``x = (C3, C4, C5)``."""
        C3, C4, C5 = x
        print('输入特征层维度：', C3.shape, C4.shape, C5.shape, '\n')

        # 1. Lateral connections from the backbone into the pyramid.
        P5 = self.prj_5(C5)
        P4 = self.prj_4(C4)
        P3 = self.prj_3(C3)

        # 2. Top-down fusion: upsample the coarser level and add element-wise.
        P4 = P4 + self.upsamplelike([P5, P4])
        P3 = P3 + self.upsamplelike([P4, P3])

        # Smooth each level with a 3x3 convolution after fusion.
        P3 = self.conv_3(P3)
        P4 = self.conv_4(P4)
        P5 = self.conv_5(P5)

        # 3. Extra coarse levels by strided convolution.
        if not self.use_p5:
            P5 = C5
        P6 = self.conv_out6(P5)
        P7 = self.conv_out7(F.relu(P6))
        return [P3, P4, P5, P6, P7]

In [12]:
# Build the default (resnet50) FPN, feed it the ResNet feature maps computed
# earlier and print the shape of every pyramid level.
fpn = FPN()
out_fpn = fpn(y_res)
for level in out_fpn:
    print(level.shape)

resnet50 backbone
输入特征层维度： torch.Size([2, 512, 40, 60]) torch.Size([2, 1024, 20, 30]) torch.Size([2, 2048, 10, 15]) 

torch.Size([2, 256, 40, 60])
torch.Size([2, 256, 20, 30])
torch.Size([2, 256, 10, 15])
torch.Size([2, 256, 5, 8])
torch.Size([2, 256, 3, 4])


## Head
<img src="imgs/head.png" width="700" height="400" align="bottom">

**[nn.Parameter](https://www.jianshu.com/p/d8b77cc02410)**

可以把 `nn.Parameter` 理解为类型转换函数：它将一个不可训练的 Tensor 转换成可以训练的 Parameter，并将这个 Parameter 绑定到所在的 module 里面

In [28]:
class ScaleExp(nn.Module):
    """Exponential with a learnable scale: ``forward(x) = exp(x * scale)``.

    Args:
        init_value: initial value of the single-element ``scale`` parameter.
    """

    def __init__(self, init_value=1.0):
        super().__init__()
        initial_scale = torch.tensor([init_value], dtype=torch.float32)
        # Registered as a Parameter so the scale is trained with the network.
        self.scale = nn.Parameter(initial_scale)

    def forward(self, x):
        """Multiply ``x`` by the learnable scale, then exponentiate."""
        scaled = x * self.scale
        return torch.exp(scaled)

In [52]:
def prt(*x):
    """Debug helper: forward all positional arguments to ``print``."""
    print(*x)

In [53]:
class ClsCntRegHead(nn.Module):
    """Detection head with classification, centre-ness and regression outputs.

    The same towers and output convolutions are applied to every pyramid
    level, so their parameters are shared across levels. Only the exponential
    scale of the regression output is per level.

    Args:
        in_channel: channel count of the incoming feature maps.
        class_num: number of classes predicted by the classification output.
        GN: insert GroupNorm(32, in_channel) after each tower convolution.
        cnt_on_reg: compute centre-ness from the regression tower (True) or
            from the classification tower (False).
        prior: value used to derive the initial classification bias.
    """

    def __init__(self, in_channel, class_num, GN=True, cnt_on_reg=True, prior=0.01):
        super(ClsCntRegHead, self).__init__()

        self.prior = prior
        self.class_num = class_num
        self.cnt_on_reg = cnt_on_reg

        # 1. Towers: four conv -> (GroupNorm) -> ReLU units each; the 3x3
        #    convolutions keep the spatial size.
        def tower_unit():
            unit = [nn.Conv2d(in_channel, in_channel, kernel_size=3, padding=1, bias=True)]
            if GN:
                unit.append(nn.GroupNorm(32, in_channel))
            unit.append(nn.ReLU(True))
            return unit

        cls_layers, reg_layers = [], []
        for _ in range(4):
            cls_layers.extend(tower_unit())
            reg_layers.extend(tower_unit())

        # 1.1 classification tower
        self.cls_conv = nn.Sequential(*cls_layers)
        # 1.2 regression tower
        self.reg_conv = nn.Sequential(*reg_layers)
        prt('Bypasses of the detection head:')
        prt(self.cls_conv, '\n')
        prt(self.reg_conv)

        # 2. Output convolutions.
        # 2.1 per-class scores
        self.cls_logits = nn.Conv2d(in_channel, class_num, kernel_size=3, padding=1)
        # 2.2 four regression values per location
        self.reg_pred = nn.Conv2d(in_channel, 4, kernel_size=3, padding=1)
        # 2.3 one centre-ness value per location
        self.cnt_logits = nn.Conv2d(in_channel, 1, kernel_size=3, padding=1)

        # 3. Parameter initialisation; the classification bias is then set to
        #    -log((1 - prior) / prior).
        self.apply(self.init_conv_RandomNormal)
        cls_bias = - math.log((1 - prior) / prior)
        nn.init.constant_(self.cls_logits.bias, cls_bias)

        # 4. Five learnable exponential scales, indexed by pyramid level.
        self.scale_exp = nn.ModuleList([ScaleExp(1.0) for _ in range(5)])

    def init_conv_RandomNormal(self, module, std=0.01):
        """Initialise Conv2d weights from N(0, std) and set their bias to 0."""
        if not isinstance(module, nn.Conv2d):
            return
        nn.init.normal_(module.weight, std=std)
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)

    def forward(self, inputs):
        """Apply the head to each pyramid level in ``inputs`` ([P3~P7]).

        Returns three lists ``(cls_logits, cnt_logits, reg_preds)`` with one
        entry per input level.
        """
        cls_logits, cnt_logits, reg_preds = [], [], []
        for level, feature in enumerate(inputs):
            # Both towers are shared by every level.
            cls_feat = self.cls_conv(feature)
            reg_feat = self.reg_conv(feature)

            cls_logits.append(self.cls_logits(cls_feat))

            # Centre-ness is attached to the regression tower by default.
            cnt_source = reg_feat if self.cnt_on_reg else cls_feat
            cnt_logits.append(self.cnt_logits(cnt_source))

            level_scale = self.scale_exp[level]
            reg_preds.append(level_scale(self.reg_pred(reg_feat)))
        return cls_logits, cnt_logits, reg_preds

In [58]:
# Detection head for 256-channel feature maps and 20 classes.
head = ClsCntRegHead(256, 20)

Bypasses of the detection head:
Sequential(
  (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): GroupNorm(32, 256, eps=1e-05, affine=True)
  (2): ReLU(inplace)
  (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): GroupNorm(32, 256, eps=1e-05, affine=True)
  (5): ReLU(inplace)
  (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (7): GroupNorm(32, 256, eps=1e-05, affine=True)
  (8): ReLU(inplace)
  (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (10): GroupNorm(32, 256, eps=1e-05, affine=True)
  (11): ReLU(inplace)
) 

Sequential(
  (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): GroupNorm(32, 256, eps=1e-05, affine=True)
  (2): ReLU(inplace)
  (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): GroupNorm(32, 256, eps=1e-05, affine=True)
  (5): ReLU(inplace)
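  (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (7): GroupNorm(32, 256, eps=1e-05, affine=True)
  (8): ReLU(inplace)
  (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (10): GroupNorm(32, 256, eps=1e-05, affine=True)
  (11): ReLU(inplace)
)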

In [41]:
# Run the head on the FPN outputs and print every tensor shape: first all
# classification maps, then centre-ness, then regression.
out_head = head(out_fpn)
for branch_outputs in out_head:
    for level_output in branch_outputs:
        print(level_output.shape)

torch.Size([2, 20, 40, 60])
torch.Size([2, 20, 20, 30])
torch.Size([2, 20, 10, 15])
torch.Size([2, 20, 5, 8])
torch.Size([2, 20, 3, 4])
torch.Size([2, 1, 40, 60])
torch.Size([2, 1, 20, 30])
torch.Size([2, 1, 10, 15])
torch.Size([2, 1, 5, 8])
torch.Size([2, 1, 3, 4])
torch.Size([2, 4, 40, 60])
torch.Size([2, 4, 20, 30])
torch.Size([2, 4, 10, 15])
torch.Size([2, 4, 5, 8])
torch.Size([2, 4, 3, 4])


## FCOS model

<img src="imgs/backbone.png" width="700" height="400" align="bottom">

In [45]:
class FCOS(nn.Module):
    """FCOS network body: backbone -> FPN -> detection head.

    Args:
        config: object exposing the configuration attributes read below
            (``backbone``, ``pretrained``, ``fpn_out_channels``, ``use_p5``,
            ``class_num``, ``use_GN_head``, ``cnt_on_reg``, ``prior``,
            ``freeze_bn``, ``freeze_stage_1``). Defaults to ``DefaultConfig``.

    Raises:
        ValueError: if ``config.backbone`` is neither 'resnet50' nor 'darknet19'.
    """

    def __init__(self, config=None):
        super().__init__()
        if config is None:
            config = DefaultConfig
        if config.backbone == "resnet50":
            self.backbone = resnet50(pretrained=config.pretrained)
        elif config.backbone == "darknet19":
            self.backbone = Darknet19()
        else:
            # Fail here with the same error the FPN would raise, instead of
            # leaving the model without a backbone.
            raise ValueError("arg 'backbone' only support 'resnet50' or 'darknet19'")

        self.fpn = FPN(config.fpn_out_channels,
                       use_p5=config.use_p5,
                       backbone=config.backbone)

        self.head = ClsCntRegHead(config.fpn_out_channels,
                                  config.class_num,
                                  config.use_GN_head,
                                  config.cnt_on_reg,
                                  config.prior)
        self.config = config

    def train(self, mode=True):
        """Set the training mode and re-apply the configured freezing.

        ``mode`` is honoured, so ``eval()`` (which calls ``train(False)``)
        really switches the model to evaluation mode. Freezing is applied only
        when entering training mode.

        Returns:
            self, matching ``nn.Module.train``.
        """
        super().train(mode)
        if not mode:
            return self

        def freeze_bn(module):
            # Keep BatchNorm2d layers in eval mode even while training.
            if isinstance(module, nn.BatchNorm2d):
                module.eval()
            # Stop gradient updates for any module whose class name contains 'BatchNorm'.
            classname = module.__class__.__name__
            if classname.find('BatchNorm') != -1:
                for p in module.parameters():
                    p.requires_grad = False

        if self.config.freeze_bn:
            self.apply(freeze_bn)
            print("INFO===>success frozen BN")
        if self.config.freeze_stage_1:
            # Not every backbone implements freeze_stages (Darknet19 does
            # not); skip instead of raising AttributeError.
            freeze_stages = getattr(self.backbone, "freeze_stages", None)
            if callable(freeze_stages):
                freeze_stages(1)
                print("INFO===>success frozen backbone stage1")
            else:
                print("INFO===>backbone has no freeze_stages, stage1 not frozen")
        return self

    def forward(self, x):
        """
        Returns
        list [cls_logits,cnt_logits,reg_preds]
        cls_logits  list contains five [batch_size,class_num,h,w]
        cnt_logits  list contains five [batch_size,1,h,w]
        reg_preds   list contains five [batch_size,4,h,w]
        """
        C3, C4, C5 = self.backbone(x)
        all_P = self.fpn([C3, C4, C5])
        cls_logits, cnt_logits, reg_preds = self.head(all_P)
        return [cls_logits, cnt_logits, reg_preds]

In [49]:
class DefaultConfig:
    """Default FCOS settings, stored as plain class attributes."""

    # ---- backbone ----
    backbone = "darknet19"       # the other supported value is "resnet50"
    pretrained = False           # do not load pretrained backbone weights
    freeze_stage_1 = True
    freeze_bn = True

    # ---- fpn ----
    fpn_out_channels = 256
    use_p5 = True

    # ---- head ----
    class_num = 20
    use_GN_head = True
    prior = 0.01
    add_centerness = True
    cnt_on_reg = False

    # ---- training ----
    strides = [8, 16, 32, 64, 128]
    limit_range = [
        [-1, 64],
        [64, 128],
        [128, 256],
        [256, 512],
        [512, 999999],
    ]

    # ---- inference ----
    score_threshold = 0.3
    nms_iou_threshold = 0.2
    max_detection_boxes_num = 150

In [59]:
def prt(*x):
    """Silenced debug helper: accept any positional arguments and do nothing."""
    return None

In [60]:
# Build FCOS without an explicit config, so it falls back to DefaultConfig.
fcos = FCOS()

darnet19 backbone


In [61]:
# Run the full model on a random batch and print every output shape:
# classification maps first, then centre-ness, then regression.
x = torch.randn(2, 3, 320, 480)
y = fcos(x)
for branch_outputs in y:
    for level_output in branch_outputs:
        print(level_output.shape)

输入特征层维度： torch.Size([2, 256, 40, 60]) torch.Size([2, 512, 20, 30]) torch.Size([2, 1024, 10, 15]) 

torch.Size([2, 20, 40, 60])
torch.Size([2, 20, 20, 30])
torch.Size([2, 20, 10, 15])
torch.Size([2, 20, 5, 8])
torch.Size([2, 20, 3, 4])
torch.Size([2, 1, 40, 60])
torch.Size([2, 1, 20, 30])
torch.Size([2, 1, 10, 15])
torch.Size([2, 1, 5, 8])
torch.Size([2, 1, 3, 4])
torch.Size([2, 4, 40, 60])
torch.Size([2, 4, 20, 30])
torch.Size([2, 4, 10, 15])
torch.Size([2, 4, 5, 8])
torch.Size([2, 4, 3, 4])
