In [1]:
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import namedtuple
from torch.nn.init import xavier_uniform_, constant_, zeros_, normal_
import math

In [2]:
#常用函数

def weights_init(model):
    """ Initializes the weights of the CNN model using the Xavier
    initialization.
    """
    if isinstance(model, nn.Conv2d):
        xavier_uniform_(model.weight, gain=math.sqrt(2.0))
        constant_(model.bias, 0.1)
    elif isinstance(model, nn.BatchNorm2d):
        normal_(model.weight, 1.0, 0.02)
        zeros_(model.bias)

# AlexNet
- ALEXNET过程：input:3 channel
        input->Conv 11*11s4,96->BN->ReLU->MaxPool 3*3s2->  
        Conv 5*5s1,256->BN->ReLU->MaxPool 3*3s2->  
        Conv 3*3s1,384->Conv 3*3s1,384->Conv 3*3s1,256->MaxPool 3*3s2->  
        FC 4096->FC 4096->FC 1000
- 在本实验中，去掉上述过程中的FC层，只使用全卷积网络部分来提取特征
- 参考网络结构 https://github.com/rafellerc/Pytorch-SiamFC/blob/master/training/models.py

In [3]:
class BaselineEmbeddingNet(nn.Module):
    """ Definition of the embedding network used in the baseline experiment of
    Bertinetto et al in https://arxiv.org/pdf/1704.06036.pdf.
    It basically corresponds to the convolutional stage of AlexNet, with some
    of its hyperparameters changed.
    """
    def __init__(self):
        super(BaselineEmbeddingNet, self).__init__()
        self.fully_conv = nn.Sequential(nn.Conv2d(3, 96, kernel_size=11,
                                                  stride=2, bias=True),
                                        nn.BatchNorm2d(96),
                                        nn.ReLU(),
                                        nn.MaxPool2d(3, stride=2),

                                        nn.Conv2d(96, 256, kernel_size=5,
                                                  stride=1, groups=2,
                                                  bias=True),
                                        nn.BatchNorm2d(256),
                                        nn.ReLU(),
                                        nn.MaxPool2d(3, stride=1),
                                        nn.Conv2d(256, 384, kernel_size=3,
                                                  stride=1, groups=1,
                                                  bias=True),
                                        nn.BatchNorm2d(384),
                                        nn.ReLU(),
                                        nn.Conv2d(384, 384, kernel_size=3,
                                                  stride=1, groups=2,
                                                  bias=True),
                                        nn.BatchNorm2d(384),
                                        nn.ReLU(),
                                        nn.Conv2d(384, 32, kernel_size=3,
                                                  stride=1, groups=2,
                                                  bias=True))

    def forward(self, x):
        output = self.fully_conv(x)
        return output

    def get_embedding(self, x):
        return self.forward(x)


In [4]:
class SiameseNet(nn.Module):
    """ The basic siamese network joining network, that takes the outputs of
    two embedding branches and joins them applying a correlation operation.
    Should always be used with tensors of the form [B x C x H x W], i.e.
    you must always include the batch dimension.
    """

    def __init__(self, embedding_net, upscale=False, corr_map_size=33, stride=4):
        """
        Args:
            embedding_net: 指需要使用的全卷积网络(我使用ALEXNET的全卷积部分)
            corr_map_size：指最后的sore_map的尺寸
        
        """
        super(SiameseNet, self).__init__()
        self.embedding_net = embedding_net
        self.match_batchnorm = nn.BatchNorm2d(1)

        self.upscale = upscale
        # TODO calculate automatically the final size and stride from the
        # parameters of the branch
        self.corr_map_size = corr_map_size
        self.stride = stride
        # Calculates the upscale size based on the correlation map size and
        # the total stride of the network, so as to align the corners of the
        # original and the upscaled one, which also aligns the centers.
        self.upsc_size = (self.corr_map_size-1)*self.stride + 1
        # The upscale_factor is the correspondence between a movement in the output
        # feature map and the input images. So if a network has a total stride of 4
        # and no deconvolutional or upscaling layers, a single pixel displacement
        # in the output corresponds to a 4 pixels displacement in the input
        # image. The easiest way to compensate this effect is to do a bilinear
        # or bicubic upscaling.
        if upscale:
            self.upscale_factor = 1
        else:
            self.upscale_factor = self.stride

    def forward(self, x1, x2):
        """
        Args:
            x1 (torch.Tensor): The reference patch of dimensions [B, C, H, W].
                Usually the shape is [8, 3, 127, 127].
            x2 (torch.Tensor): The search region image of dimensions
                [B, C, H', W']. Usually the shape is [8, 3, 255, 255].
        Returns:
            match_map (torch.Tensor): The score map for the pair. For the usual
                input shapes, the output shape is [8, 1, 33, 33].
        """
        embedding_reference = self.embedding_net(x1)
        embedding_search = self.embedding_net(x2)
        match_map = self.match_corr(embedding_reference, embedding_search)
        return match_map

    def get_embedding(self, x):
        return self.embedding_net(x)

    def match_corr(self, embed_ref, embed_srch):
        """ Matches the two embeddings using the correlation layer. As per usual
        it expects input tensors of the form [B, C, H, W].
        Args:
            embed_ref: (torch.Tensor) The embedding of the reference image, or
                the template of reference (the average of many embeddings for
                example).
            embed_srch: (torch.Tensor) The embedding of the search image.
        Returns:
            match_map: (torch.Tensor) The correlation between
        """
        b, c, h, w = embed_srch.shape
        # 在这里，相关层是使用F.conv2d函数实现，为了处理batch维度，使用groups参数的trick。
        # 简单来说就是把Batch维度concat到channel维度中（使其成为[1 x（B.C）x H'x W']），并将组数设置为batch的大小。
        # 尽管不明显，但这种分组的卷积/相关性等效于两个图像之间的相关性。
        
        # F.conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) → Tensor
        # 效果是把weight作为卷积核对input进行卷积操作
        # 为什么要进行concat操作？
        # input的形状为(batch,in_channels,iH,iW) weight的形状为(out_channels,in_channels/groups,kH,kW) 
        # 等于有out_channels个数个in_channels/groups通道数的kH*kW卷积核对input进行卷积操作
        # 在这里embed_ref形状为(b,c,h/2,w/2),in_channels为b*c，in_channels/groups=c，刚好满足条件。
        # 因此函数得到的match_map的形状为(1,b,oH,oW)，这样通道数b就是批次数
        match_map = F.conv2d(embed_srch.view(1, b * c, h, w),
                             embed_ref, groups=b)
        # Here we reorder the dimensions to get back the batch dimension.
        # 利用permute函数对维度进行重新排序。(1,b,oH,oW)->(b,1,oH,oW)
        match_map = match_map.permute(1, 0, 2, 3)
        #正则化映射至[0,1],此处利用的就是BN
        match_map = self.match_batchnorm(match_map)
        if self.upscale:
            match_map = F.interpolate(match_map, self.upsc_size, mode='bilinear',
                                      align_corners=False)

        return match_map


In [5]:
# Test ： 查看结果的尺寸是否和希望的torch.Size([8, 1, 33, 33])一致
z=torch.randn(8,3,127,127)
x=torch.randn(8,3,255,255)
base=BaselineEmbeddingNet()
net=SiameseNet(base)
output=net(z,x)
print(output.shape)

torch.Size([8, 1, 33, 33])


# 重定义Traker类
- 再次重定义是为了使用GOT-10K工具，这样就不需要写dataset类了:)
- 同时在类内定义了使用的优化器、box
- 参考实现：https://github.com/got-10k/siamfc/blob/master/siamfc.py
- init()的理解参考：https://blog.csdn.net/laizi_laizi/article/details/104622760/

In [7]:
from got10k.trackers import Tracker

In [31]:
class TrackerSiamFC(Tracker):
    def __init__(self,pretrained_model=None,**kargs):
        super(TrackerSiamFC,self).__init__(name='SiamFC',is_deterministic=True)
        # parse_args是下面定义的一个函数，目的是自定义参数。
        self.cfg=self.parse_args(**kargs)
        
        # cuda
        self.device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # setup pretrained model
        # 实例化前面定义的网络
        self.base=BaselineEmbeddingNet()
        self.net=SiameseNet(base)
        if pretrained_model is not None:
            # map_location的作用主要是在不同设备上进行转换(原来的device->cpu)
            self.net.load_state_dict(torch.load(pretrained_model,map_location=lambda storage, loc: storage))
            # 再从cpu->device
        self.net=self.net.to(self.devcie)
        
        
        # setup optimizer
        self.optimizer=optim.SGD(
            self.net.parameters(),
            lr=self.cfg.initial_lr,
            lr=self.cfg.initial_lr,
            weight_decay=self.cfg.weight_decay,
            momentum=self.cfg.momentum
        )
        
        # setup lr scheduler
        # ExponentialLR是按照gamma指数衰减
        self.lr_scheduler=optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=self.cfg.lr_decay)
        
    def parse_args(self,**kargs):
        # 默认的cfg
        cfg = {
            # inference parameters
            'exemplar_sz': 127,
            'instance_sz': 255,
            'context': 0.5,
            'scale_num': 3,
            'scale_step': 1.0375,
            'scale_lr': 0.59,
            'scale_penalty': 0.9745,
            'window_influence': 0.176,
            'response_sz': 17,
            'response_up': 16,
            'total_stride': 8,
            'adjust_scale': 0.001,
            # train parameters
            'initial_lr': 0.01,
            'lr_decay': 0.8685113737513527,
            'weight_decay': 5e-4,
            'momentum': 0.9,
            'r_pos': 16,
            'r_neg': 0}
        
        for key, val in kargs.items():#取出可变字典的内容
            if key in cfg:
                cfg.update({key: val})#更新参数
        
        # return namedtuple的用法：
        # 例如x={'a':1,'b':2} k=namedtuple('GenericDict', x.keys())(**x)
        # 其结果k是GenericDict(a=1, b=2)
        # 此时可以用类似属性值的方式获得值
        # 比如k.a就可以得到1
        # 因此，此处namedtuple('GenericDict', cfg.keys())(**cfg)
        # 是将字典cfg也作为namedtuple，方便self.cfg.x的方式去使用
        return namedtuple('GenericDict', cfg.keys())(**cfg)
    
    def init(self,image,box):
        image=np.asarray(image)
        
        # 输入的是tlwh的box，使用的是center+hw的box，因此需要转化
        box = np.array([
            box[1] - 1 + (box[3] - 1) / 2,
            box[0] - 1 + (box[2] - 1) / 2,
            box[3], box[2]], dtype=np.float32)
        self.center,self.target_sz=box[:2],box[2:] # 取出点和hw
        
        
        
        
        

SyntaxError: keyword argument repeated (<ipython-input-31-2c96e0e8f8b3>, line 25)

# DATASET