# 1.Prepare section

In [45]:
import os
import math
from collections import OrderedDict
from itertools import product

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split
import torchvision.transforms as transforms
import cv2
import tqdm
import numpy as np
import matplotlib.pyplot as plt

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [46]:
import xml.etree.ElementTree as ET

# 2.Transform section

In [91]:
class ComposeTransform():
    """
    複数のTransformをまとめあげる
    """
    def __init__(self, transforms=None):
        """
        Parameters
        --------------
        transforms: list
            transformのインスタンスをリストにして渡す
        """
        self.transforms = transforms

    def __call__(self, x):
        if self.transforms:
            for transform in self.transforms:
                x = transform(x)
        return x


class BaseTransform():
    """
    自作Transformの基底クラス
    """
    def __init__(self, debug=False):
        self.debug = debug
    
    def __call__(self):
        raise NotImplementedError()


class SimpleTransform(BaseTransform):
    """
    とりあえずのクラス
    よく使うものを入れておく
    扱う関数が増えてきたらテーマごとに分離する
    """
    def __init__(self):
        super().__init__()
        self.applied_transforms = [
            self.pil2cv,
            self.resize
        ]
    def __call__(self, x):
        if self.debug:
            for transform in self.applied_transforms:
                x = transform(x)
                print(str(transform))
                print(x)
                print('-------------------')
        else:
            for transform in self.applied_transforms:
                x = transform(x)
        return x


    def pil2cv(self, image):
        ''' PIL型 -> OpenCV型 '''
        new_image = np.array(image, dtype=np.uint8)
        if new_image.ndim == 2:  # モノクロ
            pass
        elif new_image.shape[2] == 3:  # カラー
            new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
        elif new_image.shape[2] == 4:  # 透過
            new_image = cv2.cvtColor(new_image, cv2.COLOR_RGBA2BGRA)
        return new_image
    
    def resize(self, x):
        return cv2.resize(x, (300, 300))

class SimpleTargetTransform(BaseTransform):
    """
    とりあえずのクラス
    ターゲットに対して行う前処理を記述するクラス
    よく使うものを入れておく
    扱う関数が増えてきたらテーマごとに分離する
    """
    def __init__(self):
        super().__init__()
        self.applied_transforms = [
            self.xml_parser
        ]
    def __call__(self, y):
        if self.debug:
            for transform in self.applied_transforms:
                y = transform(y)
                print(str(transform))
                print(y)
                print('-------------------')
        else:
            for transform in self.applied_transforms:
                y = transform(y)
        return y
    
    def xml_parser(self, y):
        class_index = ['aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair',
               'cow', 'diningtable', 'dog', 'horse',
               'motorbike', 'person', 'pottedplant',
               'sheep', 'sofa', 'train', 'tvmonitor']
        
        annotation = y['annotation']
        size = annotation['size']
        w = float(size['width'])
        h = float(size['height'])
        object_ = annotation['object']

        result = []
        for obj in object_:
            class_ = obj['name']
            target = class_index.index(class_)
            xmin = float(obj['bndbox']['xmin'])/w
            ymin = float(obj['bndbox']['ymin'])/h
            xmax = float(obj['bndbox']['xmax'])/w
            ymax = float(obj['bndbox']['ymax'])/h
            result.append([xmin, ymin, xmax, ymax, target])

        return result


# 3.Dataset section

I will use given dataset in this notebook, so this section is no in use

# 4.Model section

In [48]:
def vgg_layers():
    layers = [
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2, ceil_mode= True),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size= 3, stride= 1, padding=1),
            nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6),
            nn.ReLU(inplace=True),
            nn.Conv2d(1024, 1024, kernel_size=1),
            nn.ReLU(inplace=True),
        ]
    #nn.ModuleListに入れないとうまく層として動かない。例えばto()
    return nn.ModuleList(layers)

In [49]:
def extras_layers():
    layers= [
        nn.Conv2d(1024, 256, kernel_size=(1)),
        nn.Conv2d(256, 512, kernel_size=(3), stride=2, padding=1),
        nn.Conv2d(512, 128, kernel_size=(1)),
        nn.Conv2d(128, 256, kernel_size=(1), stride=2, padding=1),
        nn.Conv2d(256, 128, kernel_size=(1)),
        nn.Conv2d(128, 256, kernel_size=(1)),
        nn.Conv2d(256, 128, kernel_size=(1)),
        nn.Conv2d(128, 256, kernel_size=(1)),
    ]
    return nn.ModuleList(layers)

In [50]:
def loc_layers():
    # 位置の予測のための層
    layers = [
       # out1 に対する処理
        nn.Conv2d(512, 4*4, kernel_size=3, padding=1),
       # out2 に対する処理
        nn.Conv2d(1024, 6*4, kernel_size=3, padding=1),
       # out3 に対する処理
        nn.Conv2d(512, 6*4, kernel_size=3, padding=1),
       # out4 に対する処理
        nn.Conv2d(256, 6*4, kernel_size=3, padding=1),
       # out5 に対する処理
        nn.Conv2d(256, 4*4, kernel_size=3, padding=1),
       # out6 に対する処理
        nn.Conv2d(256, 4*4, kernel_size=3, padding=1),
    ]
    return nn.ModuleList(layers)

In [51]:
def conf_layers(num_classes=10):
    # 信頼度のための予測
    layers = [
       # out1 に対する処理
        nn.Conv2d(512, 4*num_classes, kernel_size=3, padding=1),
       # out2 に対する処理
        nn.Conv2d(1024, 6*num_classes, kernel_size=3, padding=1),
       # out3 に対する処理
        nn.Conv2d(512, 6*num_classes, kernel_size=3, padding=1),
       # out4 に対する処理
        nn.Conv2d(256, 6*num_classes, kernel_size=3, padding=1),
       # out5 に対する処理
        nn.Conv2d(256, 4*num_classes, kernel_size=3, padding=1),
       # out6 に対する処理
       nn.Conv2d(256, 4*num_classes, kernel_size=3, padding=1)
    ]
    return nn.ModuleList(layers)

In [52]:
class L2Norm(nn.Module):
    def __init__(self,n_channels=512, scale=20):
        super(L2Norm,self).__init__()
        self.n_channels = n_channels
        self.gamma = scale
        self.eps = 1e-10
        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
        self.reset_parameters()
        
    def reset_parameters(self):
        nn.init.constant_(self.weight,self.gamma)
    def forward(self, x):
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()+self.eps
        x = torch.div(x,norm)
        out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x
        return out

In [53]:
class PriorBox(object):
    def __init__(self):
        super(PriorBox, self).__init__()
        self.image_size = 300
        self.feature_maps = [38, 19, 10, 5, 3, 1]
        self.steps = [8, 16, 32, 64, 100, 300]
        self.min_sizes = [30, 60, 111, 162, 213, 264]
        self.max_sizes = [60, 111, 162, 213, 264, 315]
        self.aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]

    def forward(self):
        mean = []
        for k, f in enumerate(self.feature_maps):
            for i, j in product(range(f), repeat=2):
                f_k = self.image_size / self.steps[k]
                cx = (j + 0.5) / f_k
                cy = (i + 0.5) / f_k
                s_k = self.min_sizes[k]/self.image_size
                mean += [cx, cy, s_k, s_k]
                s_k_prime = math.sqrt(s_k * (self.max_sizes[k]/self.image_size))
                mean += [cx, cy, s_k_prime, s_k_prime]
                for ar in self.aspect_ratios[k]:
                    mean += [cx, cy, s_k*math.sqrt(ar), s_k/math.sqrt(ar)]
                    mean += [cx, cy, s_k/math.sqrt(ar), s_k*math.sqrt(ar)]
        output = torch.Tensor(mean).view(-1, 4)
        output.clamp_(max=1, min=0)
        return output

In [54]:
class SSD(nn.Module):

    def __init__(self, phase='train',num_classes=21):
        super(SSD,self).__init__()
        self.phase = phase
        self.num_classes = num_classes
        self.vgg_layers= vgg_layers()
        self.extras_layers= extras_layers()
        self.L2Norm = L2Norm()
        self.loc = loc_layers()
        self.conf = conf_layers(num_classes)
        dbox = PriorBox()
        self.priors = dbox.forward()
        if phase == 'test':
            self.detect = Detect() 
    def forward(self, x):
        bs = len(x)
        out, lout, cout = [], [], []
        for layer in self.vgg_layers[0:23]:
            x = layer(x)
        x1 = x
        out.append(self.L2Norm(x1))
        for layer in self.vgg_layers[23:]:
            x = layer(x)
        out.append(x)
        for i in range(0,8,2):
            x = F.relu(self.extras_layers[i](x), inplace=True)
            x = F.relu(self.extras_layers[i+1](x), inplace=True)
            out.append(x)
        # outに層を適用
        for i in range(6):
            lx = self.loc[i](out[i]).permute(0,2,3,1).reshape(bs,-1,4)
            cx = self.conf[i](out[i]).permute(0,2,3,1).reshape(bs,-1,self.num_classes)
            lout.append(lx)
            cout.append(cx)
        lout = torch.cat(lout, 1)
        cout = torch.cat(cout, 1)

        output = (lout, cout, self.priors)
        if self.phase == 'test':
            return self.detect.apply(output,self.num_classes)
        else:
            return output
        return x
 

## 損失関数

In [80]:
def point_form(boxes):
    """ Convert prior_boxes to (xmin, ymin, xmax, ymax)
    representation for comparison to point form ground truth data.
    Args:
        boxes: (tensor) center-size default boxes from priorbox layers.
    Return:
        boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes.
    """
    return torch.cat((boxes[:, :2] - boxes[:, 2:]/2,     # xmin, ymin
                     boxes[:, :2] + boxes[:, 2:]/2), 1)  # xmax, ymax


def center_size(boxes):
    """ Convert prior_boxes to (cx, cy, w, h)
    representation for comparison to center-size form ground truth data.
    Args:
        boxes: (tensor) point_form boxes
    Return:
        boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes.
    """
    return torch.cat((boxes[:, 2:] + boxes[:, :2])/2,  # cx, cy
                     boxes[:, 2:] - boxes[:, :2], 1)  # w, h


def intersect(box_a, box_b):
    """ We resize both tensors to [A,B,2] without new malloc:
    [A,2] -> [A,1,2] -> [A,B,2]
    [B,2] -> [1,B,2] -> [A,B,2]
    Then we compute the area of intersect between box_a and box_b.
    Args:
      box_a: (tensor) bounding boxes, Shape: [A,4].
      box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    A = box_a.size(0)
    B = box_b.size(0)
    max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
                       box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
    min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
                       box_b[:, :2].unsqueeze(0).expand(A, B, 2))
    inter = torch.clamp((max_xy - min_xy), min=0)
    return inter[:, :, 0] * inter[:, :, 1]


def jaccard(box_a, box_b):
    """Compute the jaccard overlap of two sets of boxes.  The jaccard overlap
    is simply the intersection over union of two boxes.  Here we operate on
    ground truth boxes and default boxes.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    inter = intersect(box_a, box_b)
    area_a = ((box_a[:, 2]-box_a[:, 0]) *
              (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
    area_b = ((box_b[:, 2]-box_b[:, 0]) *
              (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
    union = area_a + area_b - inter
    return inter / union  # [A,B]

## match に入力されるのは１つの教師データに対する情報
## 複数の BB と ラベルがある

def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx):
    """Match each prior box with the ground truth box of the highest jaccard
    overlap, encode the bounding boxes, then return the matched indices
    corresponding to both confidence and location preds.
    Args:
        threshold: (float) The overlap threshold used when mathing boxes.
        truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors].
            BB と ラベル １データに対するもの複数ある
        priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
            デフォルトボックス、8732 個、４次元で表現
        variances: (tensor) Variances corresponding to each prior coord,
            Shape: [num_priors, 4].
            decode するときの変数
        labels: (tensor) All the class labels for the image, Shape: [num_obj].
            ラベルの集合
        loc_t: (tensor) Tensor to be filled w/ endcoded location targets.
            返り値？？
        conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
            返り値？？
        idx: (int) current batch index
            入力されたデータは batch の何番目のデータか index
    Return:
        The matched indices corresponding to 1)location and 2)confidence preds.
        loc_t[idx] と conf_t[idx] で loc_t と conf_t の配列を作っている
        loc_t[idx] --> [8732,4] 複数 BB があるが１つだけが取り出されている
        conf_t[idx] --> [8732] 複数 BB があるが１つだけが取り出されている
    """
    # jaccard index
    overlaps = jaccard(truths,point_form(priors))
    # ここで返ってくるのは BB の個数分の 8732 の DB との IOU --> [3, 8732]
    # (Bipartite Matching)
    # [1,num_objects] best prior for each ground truth
    best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
    ## --> 各 BB に対する最大の IOU とその DB の index
    # [1,num_priors] best ground truth for each prior
    best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
    ## --> 各 BB に対する各 DB との IOU の一覧で最大値を取ったもの
    ## 何番目の BB だったかが best_truth_idx
    best_truth_idx.squeeze_(0)  ## [1,8732] --> [8732]
    best_truth_overlap.squeeze_(0)  ## [1,8732] --> [8732]
    best_prior_idx.squeeze_(1)  ## [3,1] --> [3]
    best_prior_overlap.squeeze_(1)  ## [3,1] --> [3]
    best_truth_overlap.index_fill_(0, best_prior_idx, 2)  # ensure best prior
    # TODO refactor: index  best_prior_idx with long tensor
    # ensure every gt matches with its prior of max overlap
    for j in range(best_prior_idx.size(0)):
        best_truth_idx[best_prior_idx[j]] = j
    matches = truths[best_truth_idx]          # Shape: [num_priors,4]
    conf = labels[best_truth_idx] + 1         # Shape: [num_priors]  各 DB のクラス
    conf[best_truth_overlap < threshold] = 0  # label as background  thr 以下は背景
    loc = encode(matches, priors, variances)  ## 各 DB に対してオフセットの値へ
    loc_t[idx] = loc    # [num_priors,4] encoded offsets to learn
    conf_t[idx] = conf  # [num_priors] top class label for each prior

## それぞれの DBox に対して最も合っている BB とのオフセットとラベルを返している

def encode(matched, priors, variances):
    """Encode the variances from the priorbox layers into the ground truth boxes
    we have matched (based on jaccard overlap) with the prior boxes.
    Args:
        matched: (tensor) Coords of ground truth for each prior in point-form
            Shape: [num_priors, 4].
        priors: (tensor) Prior boxes in center-offset form
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        encoded boxes (tensor), Shape: [num_priors, 4]
    """
    # dist b/t match center and prior's center
    g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
    # encode variance
    g_cxcy /= (variances[0] * priors[:, 2:])
    # match wh / prior wh
    g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
    g_wh = torch.log(g_wh) / variances[1]
    # return target for smooth_l1_loss
    return torch.cat([g_cxcy, g_wh], 1)  # [num_priors,4]


# Adapted from https://github.com/Hakuyume/chainer-ssd
def decode(loc, priors, variances):
    """Decode locations from predictions using priors to undo
    the encoding we did for offset regression at train time.
    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions
    """

    boxes = torch.cat((
        priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
        priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
    boxes[:, :2] -= boxes[:, 2:] / 2
    boxes[:, 2:] += boxes[:, :2]
    return boxes


def log_sum_exp(x):
    """Utility function for computing log_sum_exp while determining
    This will be used to determine unaveraged confidence loss across
    all examples in a batch.
    Args:
        x (Variable(tensor)): conf_preds from conf layers
    """
    x_max = x.data.max()
    return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max


# Original author: Francisco Massa:
# https://github.com/fmassa/object-detection.torch
# Ported to PyTorch by Max deGroot (02/01/2017)
#def nms(boxes, scores, overlap=0.5, top_k=200):
def nms(boxes, scores, overlap=0.45, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.
    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The Maximum number of box preds to consider.
    Return:
        The indices of the kept boxes with respect to num_priors.
    """

    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        return keep
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)

    v, idx = scores.sort(0)  # sort in ascending order
    # I = I[v >= 0.01]
    idx = idx[-top_k:]  # indices of the top-k largest vals
    xx1 = boxes.new()
    yy1 = boxes.new()
    xx2 = boxes.new()
    yy2 = boxes.new()
    w = boxes.new()
    h = boxes.new()

    # keep = torch.Tensor()
    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        # keep.append(i)
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view
        # load bboxes of next highest vals
        torch.index_select(x1, 0, idx, out=xx1)
        torch.index_select(y1, 0, idx, out=yy1)
        torch.index_select(x2, 0, idx, out=xx2)
        torch.index_select(y2, 0, idx, out=yy2)
        # store element-wise max with next highest score
        xx1 = torch.clamp(xx1, min=x1[i])
        yy1 = torch.clamp(yy1, min=y1[i])
        xx2 = torch.clamp(xx2, max=x2[i])
        yy2 = torch.clamp(yy2, max=y2[i])
        w.resize_as_(xx2)
        h.resize_as_(yy2)
        w = xx2 - xx1
        h = yy2 - yy1
        # check sizes of xx1 and xx2.. after each iteration
        w = torch.clamp(w, min=0.0)
        h = torch.clamp(h, min=0.0)
        inter = w*h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas)
        union = (rem_areas - inter) + area[i]
        IoU = inter/union  # store result in iou
        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count


In [81]:
class MultiBoxLoss(nn.Module):
    def __init__(self, num_classes=21, overlap_thresh=0.5, neg_pos=3, device='cpu'):
        super(MultiBoxLoss, self).__init__()
        self.num_classes = num_classes
        self.threshold = overlap_thresh
        self.negpos_ratio = neg_pos
        self.variance = [0.1, 0.2]
        self.device = device

    def forward(self, predictions, targets):
        loc_data, conf_data, priors = predictions
        num = loc_data.size(0)  # batch size
        priors = priors[:loc_data.size(1), :]  ## 8732*4 
        num_priors = (priors.size(0))  ## 8732  カッコ必要？
        num_classes = self.num_classes
        ## オフセットを入れる箱 
        loc_t = torch.Tensor(num, num_priors, 4).to(self.device) 
        ## conf_t : torch.Size([4, 8732]) どの DBox か
        conf_t = torch.LongTensor(num, num_priors).to(self.device)
        for idx in range(num):   ## num は bs、各教師 image データに対して
            ## BB の位置情報 4次元、複数個  
            print(targets)
            truths = targets[idx][:, :-1].to(self.device) 
            ## BB のラベル、複数個
            labels = targets[idx][:, -1].to(self.device)
            ## DBox を gpu へ
            defaults = priors.to(self.device)  
            match(self.threshold, truths, defaults,
                  self.variance, labels, loc_t, conf_t, idx)
        pos = conf_t > 0  ## 背景ではないもの
        # Localization Loss (Smooth L1)
        # Shape: [batch,num_priors,4]
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        # loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')
        # Compute max conf across batch for hard negative mining
        batch_conf = conf_data.view(-1, self.num_classes)
        # loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))
        loss_c = F.cross_entropy(batch_conf, conf_t.view(-1), reduction='none')
        # Hard Negative Mining
        num_pos = pos.long().sum(1, keepdim=True)
        loss_c = loss_c.view(num, -1)
        loss_c[pos] = 0  # filter out pos boxes for now
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
        neg = idx_rank < num_neg.expand_as(idx_rank)
        # Confidence Loss Including Positive and Negative Examples
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
        targets_weighted = conf_t[(pos+neg).gt(0)]
        # loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)
        loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')
        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
        N = num_pos.data.sum()
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c

# 5.Main function section

In [57]:
def train_net(net, train_loader, eval_loader, optim_cls=optim.SGD, loss_fn=nn.CrossEntropyLoss(), n_iter=20, device= 'cpu'):
    optimizer = optim_cls(net.parameters(), lr=0.1)
    train_losses = []
    val_losses = []
    train_acc = []
    val_acc = []
    n = 0
    n_acc = 0
    net = net.to(device)

    for epoch in range(n_iter):
        running_loss=0.0
        net.train()
        with tqdm.tqdm(train_loader) as pbar:
            for i, (x, label) in enumerate(pbar):
                x = x.to(device)
                label = label.to(device)
                h = net(x)
                loss = loss_fn(h, label)
                running_loss+=loss.item()
                n += len(label)
                _, y_pred = h.max(1)
                n_acc += (y_pred==label).float().sum().item()

                # 逆伝播によるパラメータ更新
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                pbar.set_postfix(OrderedDict(
                    epoch= epoch+1,
                    loss=running_loss/(i+1), 
                    ))
            train_losses.append(running_loss / len(train_loader))
            train_acc.append(n_acc / n)
            val_loss, val_acc_ = val_net(net, eval_loader, loss_fn, device=device)
            val_losses.append(val_loss)
            val_acc.append(val_acc_)

    return train_losses, val_losses, train_acc, val_acc

def val_net(net, val_loader, loss_fn, device= 'cpu'):
    net.eval()
    val_acc = 0
    val_loss = 0
    n = 0
    n_acc =0
    running_loss=0.0
    net = net.to(device)
    for i, (x, label) in enumerate(val_loader):
        x = x.to(device)
        label = label.to(device)
        h = net(x)
        loss = loss_fn(h, label)
        running_loss+=loss.item()
        n += len(label)
        _, y_pred = h.max(1)
        n_acc += (y_pred==label).float().sum().item()
    val_acc = n_acc / n
    val_loss = running_loss / len(val_loader)
    return val_loss, val_acc

def pred_net(net, test_loader, device= 'cpu'):
    y_preds = []
    net = net.to(device)
    for i, x in enumerate(test_loader):
        x = x.to(device)
        h = net(x)
        _, y_pred = h.max(1)
        y_preds.append(y_pred)
    return torch.cat(y_preds,dim=0)

# 6.Train section

In [98]:
dataset = torchvision.datasets.VOCDetection(root="./drive/MyDrive/data_root",year="2012",image_set="train", transform=transform, target_transform=target_transform ,download=True)

Using downloaded and verified file: ./drive/MyDrive/data_root/VOCtrainval_11-May-2012.tar
Extracting ./drive/MyDrive/data_root/VOCtrainval_11-May-2012.tar to ./drive/MyDrive/data_root


In [97]:
# Transform組み立て済み
transform = ComposeTransform([
    SimpleTransform(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
target_transform = ComposeTransform([
    SimpleTargetTransform()                  
])

# Dataset組み立て
#dataset = torchvision.datasets.VOCDetection(root="./drive/MyDrive/data_root",year="2012",image_set="train", transform= transform ,download=True)
print('Uuuuuuuuuuuuuuuu')
print(dataset[0][0])
print(dataset[0][1])
#x = transform(dataset[0][0])
#y = target_transform(dataset[0][1])



train_size = int(len(dataset)*0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Dataloader組み立て
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, 
                                           shuffle=True, num_workers=4)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, 
                                           shuffle=True, num_workers=4)

classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

# Model組み立て
net = SSD().to(device)
if device == 'cuda':
    net = nn.DataParallel(net)
    torch.backends.cudnn.benchmark = True
lo = train_loader.__iter__()
x, y = lo.next()
print(x.size)
te =  net(x)
print(te[1][8])
# MainFunction実行
train_losses, val_losses, train_acc, val_acc= train_net(net, train_loader, val_loader, loss_fn=MultiBoxLoss(device=device) ,device=device)

Uuuuuuuuuuuuuuuu
<bound method SimpleTransform.pil2cv of <__main__.SimpleTransform object at 0x7f716b3e9d10>>
[[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [248 205 202]
  [249 206 203]
  [250 207 204]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [249 206 203]
  [249 206 203]
  [248 205 202]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [252 209 206]
  [251 208 205]
  [251 208 205]]

 ...

 [[178 191 205]
  [167 180 194]
  [179 192 206]
  ...
  [119 128 137]
  [101 112 120]
  [101 113 119]]

 [[165 178 192]
  [166 179 193]
  [166 179 193]
  ...
  [100 106 111]
  [ 97 103 108]
  [110 119 122]]

 [[151 164 178]
  [153 166 180]
  [158 171 185]
  ...
  [ 73  77  82]
  [ 64  68  73]
  [ 70  77  80]]]
-------------------
<bound method SimpleTransform.resize of <__main__.SimpleTransform object at 0x7f716b3e9d10>>
[[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [247 205 202]
  [248 205 202]
  [249 206 203]]

 [[255 255 255]
  [255 255 255]
  [255 255 

  cpuset_checked))


<bound method SimpleTransform.pil2cv of <__main__.SimpleTransform object at 0x7f716b3e9d10>>
<bound method SimpleTransform.pil2cv of <__main__.SimpleTransform object at 0x7f716b3e9d10>>
<bound method SimpleTransform.pil2cv of <__main__.SimpleTransform object at 0x7f716b3e9d10>>
[[[213 220 217]
  [212 218 217]
  [214 216 216]
  ...
  [ 76 111 107]
  [ 87 127 122]
  [ 78 118 113]]

 [[216 223 220]
  [212 217 215]
  [218 221 219]
  ...
  [ 80 103  99]
  [ 75  92  89]
  [ 84  95  93]]

 [[204 206 206]
  [217 220 218]
  [213 214 212]
  ...
  [ 83  94  91]
  [ 91 101  95]
  [ 77  87  81]]

 ...

 [[204 233 242]
  [202 228 235]
  [190 215 219]
  ...
  [246 248 249]
  [244 248 249]
  [244 248 249]]

 [[255 253 255]
  [255 253 255]
  [255 252 253]
  ...
  [247 249 249]
  [247 249 249]
  [245 250 249]]

 [[245 251 246]
  [246 250 244]
  [246 248 242]
  ...
  [248 250 250]
  [247 249 249]
  [248 250 250]]]
[[[  2   7   5]
  [  5  10   8]
  [ 14  19  17]
  ...
  [139 140 138]
  [111 117 112]
  [16

RuntimeError: ignored

[[[186 184 191]
  [190 190 192]
  [184 188 186]
  ...
  [122 132 147]
  [122 134 139]
  [113 121 134]]

 [[184 185 189]
  [188 188 193]
  [192 188 189]
  ...
  [152 160 169]
  [117 126 127]
  [138 145 158]]

 [[191 187 188]
  [191 184 193]
  [187 189 190]
  ...
  [156 161 169]
  [131 145 136]
  [160 170 172]]

 ...

 [[105 111 123]
  [102 111 120]
  [103 108 121]
  ...
  [112 113 124]
  [107 108 122]
  [110 114 124]]

 [[101 109 125]
  [103 110 120]
  [100 106 119]
  ...
  [105 109 120]
  [103 108 117]
  [107 112 118]]

 [[102 107 125]
  [114 108 121]
  [108 112 124]
  ...
  [104 106 119]
  [106 108 118]
  [110 112 120]]]
-------------------

<bound method SimpleTargetTransform.xml_parser of <__main__.SimpleTargetTransform object at 0x7f716b3e9750>>
<bound method SimpleTransform.pil2cv of <__main__.SimpleTransform object at 0x7f716b3e9d10>>
[[0.032, 0.072, 0.96, 0.9413333333333334, 1]]
-------------------
-------------------
<bound method SimpleTransform.resize of <__main__.SimpleTrans

In [None]:
# Transform組み立て済み
transform = ComposeTransform([
    SimpleTransform(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Dataset組み立て
dataset = torchvision.datasets.VOCDetection(root='./datasets', year='2012', 
                                         image_set='train', download=True)
print(dataset[0][0].size())
train_size = int(len(dataset)*0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Dataloader組み立て
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, 
                                           shuffle=True, num_workers=4)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, 
                                           shuffle=True, num_workers=4)

classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')

# Model組み立て
net = SSD().to(device)
if device == 'cuda':
    net = nn.DataParallel(net)
    torch.backends.cudnn.benchmark = True
lo = train_loader.__iter__()
x, y = lo.next()
print(x.size)
te =  net(x)
print(te[2][8])
# MainFunction実行
train_losses, val_losses, train_acc, val_acc= train_net(net, train_loader, val_loader, loss_fn=MultiBoxLoss(device=device) ,device=device)

# 7.Validate section

In [None]:
plt.plot(train_losses)
plt.plot(val_losses)
plt.show()
print(train_acc)
print(val_acc)

# 8.Test section

In [None]:
# Transform組み立て済み

# Dataset組み立て
test_set = torchvision.datasets.CIFAR10('./datasets', train=False, 
                                        download=True, transform=transform)

# Dataloader組み立て
test_loader = torch.utils.data.DataLoader(test_set, batch_size=100, 
                                          shuffle=False, num_workers=4)

# Model組み立て済み

# MainFunction実行
y_preds = pred_net(net, test_loader)

# Postprocess
print(y_preds)