In [6]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torchvision
from torchvision import models

from utils import ssd_helper as ssdhp

In [2]:
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions plus a shortcut branch.

    The shortcut is the identity (an empty ``nn.Sequential``) unless the block
    changes the spatial resolution or the channel count, in which case it is a
    strided 1x1 convolution followed by batch normalisation.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        """
        Args:
            in_planes: number of channels of the incoming feature map.
            planes: number of channels produced by both 3x3 convolutions.
            stride: stride of the first convolution (and of the shortcut).
        """
        super().__init__()
        out_planes = self.expansion * planes

        # Main branch: conv-bn-relu followed by conv-bn.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu2 = nn.ReLU(inplace=True)

        # Shortcut branch: project only when the shapes would not match.
        shapes_match = stride == 1 and in_planes == out_planes
        if shapes_match:
            self.downsample = nn.Sequential()
        else:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        residual = self.conv1(x)
        residual = self.bn1(residual)
        residual = self.relu1(residual)
        residual = self.conv2(residual)
        residual = self.bn2(residual)
        residual += self.downsample(x)
        return self.relu2(residual)


class ResNet(nn.Module):
    """ResNet classifier: a 7x7 stem, four residual stages, pooling and a linear head.

    Sub-modules are registered in the order conv1, bn1, relu1, maxpool,
    layer1..layer4, avgpool, fc; code that slices ``children()`` relies on it.
    """

    def __init__(self, block, num_blocks, num_classes=1000):
        """
        Args:
            block: residual block class; must expose an ``expansion`` attribute
                and accept ``(in_planes, planes, stride)``.
            num_blocks: sequence with the number of blocks for each of the
                four stages.
            num_classes: size of the final linear layer's output.
        """
        super().__init__()
        # Running channel count, updated by _make_layer as stages are built.
        self.in_planes = 64

        # Stem.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Residual stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)

        # Classification head.
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest use stride 1."""
        block_strides = [stride, *([1] * (num_blocks - 1))]
        stage = []
        for block_stride in block_strides:
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the full network and return the ``fc`` output for each sample."""
        features = self.maxpool(self.relu1(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features)
        pooled = self.avgpool(features)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


def ResNet18():
    """Build a ResNet with BasicBlock and two blocks in each of the four stages."""
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(BasicBlock, blocks_per_stage)

In [8]:
class L2Norm(nn.Module):
    """Channel-wise L2 normalisation with a learnable per-channel scale.

    Every spatial position of the input is divided by the L2 norm taken over
    the channel dimension (dim 1) and then multiplied by ``weight``, which is
    initialised to the constant ``scale``.
    """

    def __init__(self, n_channels, scale):
        """
        Args:
            n_channels: number of channels of the inputs this layer will see.
            scale: initial value for every entry of ``weight``. Zero is a
                valid value.

        Raises:
            TypeError: if ``scale`` is None.
        """
        super(L2Norm, self).__init__()
        if scale is None:
            raise TypeError('scale must be a number, got None')
        self.n_channels = n_channels
        # Keep the value as given; the previous `scale or None` turned a
        # legitimate scale of 0 into None and crashed in reset_parameters.
        self.gamma = scale
        # Added to the norm so an all-zero position does not divide by zero.
        self.eps = 1e-10
        self.weight = nn.Parameter(torch.empty(self.n_channels))
        self.reset_parameters()

    def reset_parameters(self):
        """Fill ``weight`` with the constant ``gamma``."""
        init.constant_(self.weight, self.gamma)

    def forward(self, x):
        """Normalise ``x`` over dim 1 and rescale each channel by ``weight``.

        ``x`` is indexed as a 4-D tensor with channels in dim 1; the weight is
        reshaped to (1, n_channels, 1, 1) so it broadcasts over the other dims.
        """
        norm = torch.sqrt(x.pow(2).sum(dim=1, keepdim=True)) + self.eps
        normalised = torch.div(x, norm)
        return self.weight.view(1, -1, 1, 1) * normalised
    
def extra():
    """Create the eight extra convolutions appended after the backbone.

    The layers alternate between a 1x1 channel-reducing convolution and a 3x3
    convolution (conv8_1, conv8_2, ..., conv11_2 in the original naming).

    Returns:
        list of ``nn.Conv2d`` in application order.
    """
    # (in_channels, out_channels, kernel_size, stride, padding)
    layer_specs = (
        (1024, 256, 1, 1, 0),  # conv8_1
        (256, 512, 3, 1, 1),   # conv8_2
        (512, 128, 1, 1, 0),   # conv9_1
        (128, 256, 3, 2, 1),   # conv9_2
        (256, 128, 1, 1, 0),   # conv10_1
        (128, 256, 3, 1, 0),   # conv10_2
        (256, 128, 1, 1, 0),   # conv11_1
        (128, 256, 3, 1, 0),   # conv11_2
    )
    return [
        nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=stride, padding=pad)
        for c_in, c_out, kernel, stride, pad in layer_specs
    ]


def feature_extractor(ver, extral, bboxes, num_classes):
    """Build the localisation and classification heads, one pair per source map.

    Args:
        ver: model variant; only 'RES18_SSD' adds the two backbone heads
            (128 and 256 input channels).
        extral: list of extra layers; every second layer (indices 1, 3, ...)
            feeds a head, using that layer's ``out_channels``.
        bboxes: boxes per location for each source map. Entries 0 and 1 are
            used for the backbone heads; the extra heads start at index 2.
        num_classes: number of class scores predicted per box.

    Returns:
        ``(loc_layers, conf_layers)``: two lists of 3x3, padding-1 convolutions
        with ``boxes * 4`` and ``boxes * num_classes`` output channels.
    """

    def make_head(in_channels, boxes, values_per_box):
        return nn.Conv2d(in_channels, boxes * values_per_box, kernel_size=3, padding=1)

    loc_layers, conf_layers = [], []

    if ver == 'RES18_SSD':
        backbone_channels = (128, 256)
        for idx, channels in enumerate(backbone_channels):
            loc_layers.append(make_head(channels, bboxes[idx], 4))
        for idx, channels in enumerate(backbone_channels):
            conf_layers.append(make_head(channels, bboxes[idx], num_classes))

    for offset, layer in enumerate(extral[1::2]):
        box_idx = offset + 2
        loc_layers.append(make_head(layer.out_channels, bboxes[box_idx], 4))
        conf_layers.append(make_head(layer.out_channels, bboxes[box_idx], num_classes))

    return loc_layers, conf_layers


class RES18_SSD(nn.Module):
    """SSD detector on a ResNet-18 backbone.

    Six feature maps feed the detection heads: the outputs of backbone stages
    ``layer2`` (L2-normalised) and ``layer3``, plus the output of every second
    extra convolution.
    """

    # Checkpoint used when `pretrained` is truthy but not a path string.
    DEFAULT_WEIGHTS = './weights/newresnet.pth'
    # Positions inside `self.res` (conv1, bn1, relu1, maxpool, layer1, layer2,
    # layer3, ...) whose outputs are used as detection sources.
    L2NORM_SOURCE_INDEX = 5   # layer2, 128 channels
    PLAIN_SOURCE_INDEX = 6    # layer3, 256 channels

    def __init__(self, num_classes, bboxes, pretrained=None):
        """
        Args:
            num_classes: number of class scores predicted per box.
            bboxes: boxes per location for each of the six source maps.
            pretrained: falsy to keep the random initialisation; a string to
                load ResNet-18 weights from that path; any other truthy value
                to load them from ``DEFAULT_WEIGHTS``.
        """
        super(RES18_SSD, self).__init__()

        self.ver = 'RES18_SSD'
        self.num_classes = num_classes
        self.bboxes = bboxes
        self.extra_list = extra()
        self.loc_layers_list, self.conf_layers_list = feature_extractor(
            self.ver, self.extra_list, self.bboxes, self.num_classes)
        self.L2Norm = L2Norm(128, 20)

        resnet = ResNet18()
        if pretrained:
            weights_path = pretrained if isinstance(pretrained, str) else self.DEFAULT_WEIGHTS
            # map_location='cpu' lets a checkpoint saved on a GPU load on a
            # CPU-only machine; load_state_dict copies into the model's tensors.
            state_dict = torch.load(weights_path, map_location='cpu')
            print('resnet18 pretrain_model loading...')
            resnet.load_state_dict(state_dict)

        # Backbone without its avgpool/fc head, followed by a stride-1 pool,
        # a dilated 3x3 convolution and a 1x1 convolution up to 1024 channels.
        self.res = nn.Sequential(
            *list(resnet.children())[:-2],
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6),
            nn.ReLU(inplace=True),
            nn.Conv2d(1024, 1024, kernel_size=1),
            nn.ReLU(inplace=True)
        )
        self.extras = nn.ModuleList(self.extra_list)
        self.loc = nn.ModuleList(self.loc_layers_list)
        self.conf = nn.ModuleList(self.conf_layers_list)
        # The extra layers and heads keep PyTorch's default initialisation.

    def forward(self, x):
        """Predict box offsets and class scores for a batch of images.

        Returns:
            ``(loc, conf)`` where ``loc`` is reshaped to (batch, -1, 4) and
            ``conf`` to (batch, -1, num_classes).
        """
        sources = []

        # Backbone: collect the two intermediate maps while running it through.
        for idx, layer in enumerate(self.res):
            x = layer(x)
            if idx == self.L2NORM_SOURCE_INDEX:
                sources.append(self.L2Norm(x))
            elif idx == self.PLAIN_SOURCE_INDEX:
                sources.append(x)

        # Extra layers: the output of every second layer is a source map.
        for idx, layer in enumerate(self.extras):
            x = F.relu(layer(x), inplace=True)
            if idx % 2 == 1:
                sources.append(x)

        # Heads: move channels last so each location's predictions are contiguous.
        loc_maps = []
        conf_maps = []
        for source, loc_head, conf_head in zip(sources, self.loc, self.conf):
            loc_maps.append(loc_head(source).permute(0, 2, 3, 1).contiguous())
            conf_maps.append(conf_head(source).permute(0, 2, 3, 1).contiguous())

        loc = torch.cat([m.view(m.size(0), -1) for m in loc_maps], 1)
        conf = torch.cat([m.view(m.size(0), -1) for m in conf_maps], 1)

        loc = loc.view(loc.size(0), -1, 4)
        conf = conf.view(conf.size(0), -1, self.num_classes)
        return loc, conf

In [1]:
class MultiBoxEncoder(object):
    """Generate default boxes and convert between boxes and regression targets.

    Default boxes are stored as rows of (cx, cy, w, h), clipped to [0, 1].
    """

    def __init__(self, opt):
        """Build the default boxes for every feature-map grid.

        Args:
            opt: configuration object providing ``variance``, ``grids``,
                ``steps``, ``sizes`` (one more entry than ``grids``) and
                ``aspect_ratios``.
        """
        self.variance = opt.variance
        default_boxes = []

        for k in range(len(opt.grids)):
            grid = opt.grids[k]
            step = opt.steps[k]
            size = opt.sizes[k]
            # Second square box: geometric mean of this size and the next one.
            size_between = np.sqrt(opt.sizes[k] * opt.sizes[k + 1])

            # Plain nested loops, rows outermost; this replaces
            # itertools.product, which the notebook never imported.
            for row in range(grid):
                for col in range(grid):
                    cx = (col + 0.5) * step
                    cy = (row + 0.5) * step

                    default_boxes.append((cx, cy, size, size))
                    default_boxes.append((cx, cy, size_between, size_between))

                    # Each aspect ratio contributes a wide and a tall box.
                    for ar in opt.aspect_ratios[k]:
                        default_boxes.append(
                            (cx, cy, size * np.sqrt(ar), size / np.sqrt(ar)))
                        default_boxes.append(
                            (cx, cy, size / np.sqrt(ar), size * np.sqrt(ar)))

        self.default_boxes = np.clip(np.array(default_boxes), a_min=0, a_max=1)

    def encode(self, boxes, labels, threshold=0.5):
        """Match ground-truth boxes to the default boxes and build targets.

        Args:
            boxes: ground-truth boxes; the arithmetic below treats columns 0-1
                as one corner and columns 2-3 as the opposite corner.
            labels: one label per ground-truth box.
            threshold: default boxes whose best IoU is below this value get
                class 0.

        Returns:
            ``(loc, conf)``: float32 regression targets with one row per
            default box, and int32 class targets equal to ``label + 1`` for
            matched boxes and 0 otherwise.
        """
        if len(boxes) == 0:
            return (
                np.zeros(self.default_boxes.shape, dtype=np.float32),
                np.zeros(self.default_boxes.shape[:1], dtype=np.int32))

        # NOTE(review): bbox_iou and point_form are not defined or imported in
        # the visible notebook (possibly meant to come from utils.ssd_helper);
        # confirm they are in scope before calling encode with boxes.
        iou = bbox_iou(point_form(self.default_boxes), boxes)

        # For every default box keep the ground-truth box it overlaps most.
        gt_idx = iou.argmax(axis=1)
        iou = iou.max(axis=1)
        boxes = boxes[gt_idx]
        labels = labels[gt_idx]

        # Centre offsets scaled by variance[0]; log size ratios by variance[1].
        loc = np.hstack((
            ((boxes[:, :2] + boxes[:, 2:]) / 2 - self.default_boxes[:, :2]) /
            (self.variance[0] * self.default_boxes[:, 2:]),
            np.log((boxes[:, 2:] - boxes[:, :2]) / self.default_boxes[:, 2:]) /
            self.variance[1]))

        # Shift labels by one so that 0 can mean "no match".
        conf = 1 + labels
        conf[iou < threshold] = 0

        return loc.astype(np.float32), conf.astype(np.int32)

    def decode(self, loc):
        """Turn predicted offsets back into corner-form boxes.

        Args:
            loc: array with one row of four offsets per default box.

        Returns:
            Array of boxes as (min corner, max corner), one row per default box.
        """
        boxes = np.hstack((
            self.default_boxes[:, :2] +
            loc[:, :2] * self.variance[0] * self.default_boxes[:, 2:],
            self.default_boxes[:, 2:] * np.exp(loc[:, 2:] * self.variance[1])))
        # (centre, size) -> (min corner, max corner)
        boxes[:, :2] -= boxes[:, 2:] / 2
        boxes[:, 2:] += boxes[:, :2]

        return boxes

In [4]:
def hard_negtives(logits, labels, pos, neg_radio):
    """Select the highest-loss non-positive anchors of each image.

    Args:
        logits: class scores of shape (batch, anchors, classes).
        labels: target class per anchor, viewable as (batch * anchors,).
        pos: boolean mask of shape (batch, anchors) marking positive anchors.
        neg_radio: number of negatives to keep per positive.

    Returns:
        Boolean mask of shape (batch, anchors) that is True for the anchors
        ranked among the ``neg_radio * num_pos`` largest losses of their image
        (positives have their loss zeroed before ranking).
    """
    batch_size, anchors_per_image, class_count = logits.shape

    per_anchor_loss = F.cross_entropy(
        logits.view(-1, class_count), labels.view(-1), reduction='none'
    ).view(batch_size, anchors_per_image)

    # Positives must not compete for the negative slots.
    per_anchor_loss[pos] = 0

    # Sorting the sort order gives each anchor's rank (0 = largest loss).
    rank = per_anchor_loss.argsort(1, descending=True).argsort(1)

    positives_per_image = pos.long().sum(1, keepdim=True)
    negatives_per_image = torch.clamp(
        neg_radio * positives_per_image, max=pos.shape[1] - 1)  # (batch, 1)

    return rank < negatives_per_image
    
class MultiBoxLoss(nn.Module):
    """SSD loss: smooth-L1 on positive boxes plus cross-entropy with hard negatives."""

    def __init__(self, num_classes=10, neg_radio=3):
        """
        Args:
            num_classes: number of class scores per anchor.
            neg_radio: negatives kept per positive during hard-negative mining.
        """
        super(MultiBoxLoss, self).__init__()
        self.num_classes = num_classes
        self.neg_radio = neg_radio

    def forward(self, pred_loc, pred_label, gt_loc, gt_label):
        """Compute the localisation and classification losses.

        Args:
            pred_loc: predicted offsets, (batch, anchors, 4).
            pred_label: predicted class scores, (batch, anchors, num_classes).
            gt_loc: target offsets, same shape as ``pred_loc``.
            gt_label: target class per anchor, (batch, anchors); values > 0
                mark positive anchors.

        Returns:
            ``(loc_loss, cls_loss)``, both summed over the selected anchors
            and divided by the number of positive anchors. With no positives
            the divisor is 1, so both losses are 0 instead of NaN.
        """
        pos_idx = gt_label > 0

        # Localisation loss uses positive anchors only.
        pos_loc_idx = pos_idx.unsqueeze(2).expand_as(pred_loc)
        pred_loc_pos = pred_loc[pos_loc_idx].view(-1, 4)
        gt_loc_pos = gt_loc[pos_loc_idx].view(-1, 4)
        loc_loss = F.smooth_l1_loss(pred_loc_pos, gt_loc_pos, reduction='sum')

        # Mining works on detached copies so it does not affect gradients.
        neg_idx = hard_negtives(
            pred_label.detach(), gt_label.detach(), pos_idx, self.neg_radio)  # (batch, anchors)

        # Classification loss uses the positives plus the mined negatives.
        selected = pos_idx | neg_idx
        conf_p = pred_label[selected].view(-1, self.num_classes)
        target = gt_label[selected]
        cls_loss = F.cross_entropy(conf_p, target, reduction='sum')

        # Clamp so a batch without any positive anchor does not divide by zero.
        num_pos = pos_idx.long().sum().clamp(min=1)

        return loc_loss / num_pos, cls_loss / num_pos

In [8]:
class Config:
    """Hyper-parameters and default-box settings shared by the notebook."""

    # --- dataset -----------------------------------------------------------
    # VOC_ROOT = '/SSD_ResNet_Pytorch'
    # Number of object classes plus one.
    num_classes = 4
    # Input image size.
    min_size = 300
    # Per-channel mean on the 0-255 scale (original note: "255 * R, G, B";
    # TODO confirm the channel order against the data pipeline).
    mean = (104, 117, 123)

    # --- optimisation ------------------------------------------------------
    lr = 0.001
    # The SSD paper uses a batch size of 32.
    batch_size = 32
    momentum = 0.9
    weight_decay = 0.0005
    # Original note: 40k + 10k iterations = 116 epochs.
    epoch = 116
    # Negatives kept per positive during hard-negative mining.
    neg_ratio = 3

    # --- logging / checkpoints ---------------------------------------------
    # Checkpoint folder. (Original note: the ResNet pre-trained model is in
    # lib.res-model...)
    save_folder = './weights/'
    # basenet = 'vgg16_reducedfc.pth'
    # Print the running losses every `log_fn` batches.
    log_fn = 10

    # --- default boxes -----------------------------------------------------
    # Side length of each square feature-map grid.
    grids = (38, 19, 10, 5, 3, 1)
    # Number of default boxes per grid cell, one entry per grid.
    anchor_num = [4, 6, 6, 6, 4, 4]
    aspect_ratios = ((2,), (2, 3), (2, 3), (2, 3), (2,), (2,))
    # Cell steps and box sizes, given in pixels and divided by 300.
    steps = [px / 300 for px in (8, 16, 32, 64, 100, 300)]
    sizes = [px / 300 for px in (30, 60, 111, 162, 213, 264, 315)]
    variance = (0.1, 0.2)


opt = Config()

In [7]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

def _scale_learning_rate(optimizer, factor):
    """Set every parameter group's learning rate to ``opt.lr * factor``."""
    lr = opt.lr * factor
    print('change learning rate, now learning rate is :', lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def adjust_learning_rate1(optimizer):
    """First decay step: set the learning rate to 0.1 x the base ``opt.lr``."""
    _scale_learning_rate(optimizer, 0.1)


def adjust_learning_rate2(optimizer):
    """Second decay step: set the learning rate to 0.01 x the base ``opt.lr``."""
    _scale_learning_rate(optimizer, 0.01)

In [None]:
# Build the detector in training mode on the selected device.
model = RES18_SSD(opt.num_classes, opt.anchor_num, pretrained=False).to(device)
model.train()

# Encoder that turns ground-truth boxes into per-default-box targets.
mb = MultiBoxEncoder(opt)

# NOTE(review): `dataset` and `detection_collate` are not defined anywhere in
# the visible notebook; the commented lines below are the only hint of how the
# dataset was built. Define both before running this cell.
# NOTE(review): the loader does not shuffle; confirm that is intended.
# image_sets = [['2007', 'trainval'], ['2012', 'trainval']]
# dataset = VOCDetection(opt, image_sets=image_sets, is_train=True)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=opt.batch_size, collate_fn=detection_collate, num_workers=4)

# Config names this setting `neg_ratio`; the previous `opt.neg_radio` lookup
# raised AttributeError.
criterion = MultiBoxLoss(opt.num_classes, opt.neg_ratio).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=opt.momentum, weight_decay=opt.weight_decay)

In [None]:
def train():
    """Train ``model`` for ``opt.epoch`` epochs using the notebook-level objects.

    Reads the module-level ``opt``, ``dataloader``, ``mb``, ``model``,
    ``criterion``, ``optimizer`` and ``device``. The learning rate is lowered
    at epochs 77 and 96, running averages are printed every ``opt.log_fn``
    batches, and a checkpoint is saved after every epoch past 100.
    """
    # Make sure the checkpoint folder exists before any epoch is spent.
    os.makedirs(opt.save_folder, exist_ok=True)

    for e in range(opt.epoch):
        if e == 77:
            adjust_learning_rate1(optimizer)
        elif e == 96:
            adjust_learning_rate2(optimizer)

        total_loc_loss = 0
        total_cls_loss = 0
        total_loss = 0

        for i, (img, boxes) in enumerate(dataloader):
            img = img.to(device)

            # Encode every image's annotations against the default boxes.
            # Each annotation row is read as four box values followed by the
            # label in column 4.
            gt_boxes = []
            gt_labels = []
            for box in boxes:
                labels = box[:, 4]
                box = box[:, :-1]
                match_loc, match_label = mb.encode(box, labels)

                gt_boxes.append(match_loc)
                gt_labels.append(match_label)

            # Stack once in NumPy; building a tensor from a list of arrays
            # converts element by element and is much slower.
            gt_boxes = torch.as_tensor(np.stack(gt_boxes), dtype=torch.float32, device=device)
            gt_labels = torch.as_tensor(np.stack(gt_labels), dtype=torch.long, device=device)

            p_loc, p_label = model(img)

            loc_loss, cls_loss = criterion(p_loc, p_label, gt_boxes, gt_labels)
            loss = loc_loss + cls_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loc_loss += loc_loss.item()
            total_cls_loss += cls_loss.item()
            total_loss += loss.item()

            if i % opt.log_fn == 0:
                avg_loc = total_loc_loss / (i+1)
                avg_cls = total_cls_loss / (i+1)
                avg_loss = total_loss / (i+1)
                print('epoch[{}] | batch_idx[{}] | loc_loss [{:.2f}] | cls_loss [{:.2f}] | total_loss [{:.2f}]'.format(e, i, avg_loc, avg_cls, avg_loss))

        if e > 100:
            # The checkpoint is named after the epoch's summed loss.
            torch.save(model.state_dict(), os.path.join(opt.save_folder, 'loss-{:.2f}.pth'.format(total_loss)))