In [None]:
## 加载必备库文件
import contextlib
import datetime
import json
import os
import warnings
from io import StringIO

import numpy as np

import mxnet as mx
from mxnet import nd
from mxnet import gluon
from mxnet import autograd

from mxnet.gluon import nn
from mxnet.gluon import data as gdata
from mxnet.gluon.data import DataLoader
from mxnet.gluon.data.vision import transforms as gtransforms

from gluoncv import model_zoo

In [None]:
## 步骤
# 1. 读取样本和标注
# 1. transform
# 1. mini-batch 样本迭代器
# 1. 定义 loss 以及 metric
# 1. 定义网络结构并初始化权重
# 1. 确定最优化方法
# 1. train

In [None]:
## All hyper-parameters that are meant to be tuned
# Network input size. `img_wight` is a misspelling of "width"; the name is kept
# because the dataset cell further down passes it to ValTransform.
img_wight  = 416
img_height = 416

batch_size  = 128
num_workers = 0

# Learning-rate schedule
# Factor the learning rate is multiplied by at each decay epoch
lr_decay = 0.1
# NOTE(review): with epochs = 100 below, neither 200 nor 400 is reached if these
# values are compared against the epoch counter -- confirm against the training loop.
lr_decay_epochs_set = set([200, 400])

epochs = 100


# number of GPUs to use
num_gpus = 1
ctx = [mx.gpu(i) for i in range(num_gpus)]










In [None]:
# Update the learning rate when a designated epoch is reached
def update_learn_rate(trainer, epoch, lr_decay_epochs_set, lr_decay=0.1):
    """Scale the trainer's learning rate by ``lr_decay`` on scheduled epochs.

    Parameters
    ----------
    trainer : object
        Must expose a ``learning_rate`` attribute and a ``set_learning_rate`` method.
    epoch : int
        Epoch that is being checked against the schedule.
    lr_decay_epochs_set : container of int
        Epochs at which the decay is applied (only membership is tested).
    lr_decay : float, default 0.1
        Multiplicative factor applied to the current learning rate.

    Returns
    -------
    None
        The trainer is modified in place; nothing happens on unscheduled epochs.
    """
    if epoch not in lr_decay_epochs_set:
        return
    decayed_rate = trainer.learning_rate * lr_decay
    trainer.set_learning_rate(decayed_rate)

# Optimizer: Adam with a fixed initial learning rate of 1e-4.
# The remaining arguments (beta1, beta2, epsilon, lazy_update) are spelled out explicitly
# so they are visible next to the other tunables.
optimizer = mx.optimizer.Adam(learning_rate=0.0001,
                             beta1=0.9,
                             beta2=0.999,
                             epsilon=1e-08,
                             lazy_update=True)


# History recorder with two labelled series, 'train' and 'val'.
from gluoncv.utils import TrainingHistory
train_history = TrainingHistory(['train', 'val'])

1. obj_loss:    sum of objectness logistic loss
1. center_loss: sum of box center logistic regression loss
1. scale_loss:  sum of box scale l1 loss
1. cls_loss:    sum of per class logistic loss

In [None]:
## 损失函数
# l1_loss    = gluon.loss.L1Loss()
# sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
# # 第一个参数是 pred ，第二个参数是 label
# softmax_ce = gluon.loss.SoftmaxCrossEntropyLoss()

class YOLOV3Loss(gluon.loss.Loss):
    """Losses of YOLO v3.

    Holds one sigmoid binary cross-entropy loss and one L1 loss and combines them
    into the four loss terms returned by :meth:`hybrid_forward`.

    Parameters
    ----------
    batch_axis : int, default 0
        The axis that represents mini-batch.
    weight : float or None
        Global scalar weight for loss.

    """
    def __init__(self, batch_axis=0, weight=None, **kwargs):
        # `weight` and `batch_axis` are forwarded to the base Loss only; the two inner
        # losses below are created with their own default arguments.
        super(YOLOV3Loss, self).__init__(weight, batch_axis, **kwargs)
        self._sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
        self._l1_loss = gluon.loss.L1Loss()

    def hybrid_forward(self, F, objness, box_centers, box_scales, cls_preds,
                       objness_t, center_t, scale_t, weight_t, class_t, class_mask):
        """Compute YOLOv3 losses.

        NOTE(review): the cross-entropy loss is built with ``from_sigmoid=False``, which
        presumably means it applies the sigmoid itself; the "range (0, 1)" remarks below
        may therefore describe the values after the sigmoid rather than the tensors that
        are actually passed in -- confirm against the caller.

        NOTE(review): ``objness_t`` is multiplied into ``weight_t`` and ``class_mask``
        with ``broadcast_mul``, so it presumably carries a trailing axis of size 1
        rather than being strictly (B, N) -- confirm.

        Parameters
        ----------
        objness : mxnet.nd.NDArray
            Predicted objectness (B, N), range (0, 1).
        box_centers : mxnet.nd.NDArray
            Predicted box centers (x, y) (B, N, 2), range (0, 1).
        box_scales : mxnet.nd.NDArray
            Predicted box scales (width, height) (B, N, 2).
        cls_preds : mxnet.nd.NDArray
            Predicted class predictions (B, N, num_class), range (0, 1).
        objness_t : mxnet.nd.NDArray
            Objectness target, (B, N), 0 for negative 1 for positive, -1 for ignore.
        center_t : mxnet.nd.NDArray
            Center (x, y) targets (B, N, 2).
        scale_t : mxnet.nd.NDArray
            Scale (width, height) targets (B, N, 2).
        weight_t : mxnet.nd.NDArray
            Loss Multipliers for center and scale targets (B, N, 2).
        class_t : mxnet.nd.NDArray
            Class targets (B, N, num_class).
            It's relaxed one-hot vector, i.e., (1, 0, 1, 0, 0).
            It can contain more than one positive class.
        class_mask : mxnet.nd.NDArray
            0 or 1 mask array to mask out ignored samples (B, N, num_class).

        Returns
        -------
        tuple of NDArrays
            obj_loss: sum of objectness logistic loss
            center_loss: sum of box center logistic regression loss
            scale_loss: sum of box scale l1 loss
            cls_loss: sum of per class logistic loss

        """
        # Number of elements per sample in objness_t: the product of every dimension
        # after axis 0, cast to float32.
        # NOTE(review): the inner losses presumably return a mean over the non-batch
        # axes, so multiplying by this count would turn the mean back into the "sum"
        # promised in Returns -- confirm against the gluon.loss documentation.
        denorm = F.cast(
            F.shape_array(objness_t).slice_axis(axis=0, begin=1, end=None).prod(), 'float32')
        # Scale the center/scale multipliers by the objectness target.
        weight_t = F.broadcast_mul(weight_t, objness_t)
        # Target for the objectness loss: every positive value becomes exactly 1,
        # all other values (0 and negative) are passed through unchanged.
        hard_objness_t = F.where(objness_t > 0, F.ones_like(objness_t), objness_t)
        # Sample weight for the objectness loss: positives keep their own value,
        # zeros get weight 1 and negative (ignored) entries get weight 0.
        new_objness_mask = F.where(objness_t > 0, objness_t, objness_t >= 0)
        obj_loss = F.broadcast_mul(
            self._sigmoid_ce(objness, hard_objness_t, new_objness_mask), denorm)
        # `denorm * 2`: presumably accounts for the two values per box in the (x, y)
        # and (w, h) tensors -- TODO confirm.
        center_loss = F.broadcast_mul(self._sigmoid_ce(box_centers, center_t, weight_t), denorm * 2)
        scale_loss = F.broadcast_mul(self._l1_loss(box_scales, scale_t, weight_t), denorm * 2)
        # Same per-sample element count, this time taken from the class target tensor.
        denorm_class = F.cast(
            F.shape_array(class_t).slice_axis(axis=0, begin=1, end=None).prod(), 'float32')
        # Scale the class mask by the objectness target before using it as sample weight.
        class_mask = F.broadcast_mul(class_mask, objness_t)
        cls_loss = F.broadcast_mul(self._sigmoid_ce(cls_preds, class_t, class_mask), denorm_class)
        return obj_loss, center_loss, scale_loss, cls_loss

In [None]:
#from gluoncv.utils.metrics.coco_detection import COCODetectionMetric
class COCODetectionMetric(mx.metric.EvalMetric):
    """Detection metric for COCO bbox task.

    Predictions handed to :meth:`update` are buffered in memory. :meth:`get` dumps the
    buffer to a JSON file and scores it with ``COCOeval``.

    Parameters
    ----------
    dataset : instance of gluoncv.data.COCODetection
        The validation dataset. This class reads ``dataset.coco``, ``dataset.classes``
        and ``dataset.contiguous_id_to_json`` from it.
    save_prefix : str
        Prefix for the saved JSON results.
    use_time : bool
        Append unique datetime string to created JSON file name if ``True``.
    cleanup : bool
        Remove created JSON file if ``True``.
    score_thresh : float
        Detection results with confident scores smaller than ``score_thresh`` will
        be discarded before saving to results.
    data_shape : tuple of int, default is None
        If `data_shape` is provided as (height, width), we will rescale bounding boxes when
        saving the predictions.
        This is helpful when SSD/YOLO box predictions cannot be rescaled conveniently. Note that
        the data_shape must be fixed for all validation images.

    Raises
    ------
    ValueError
        If ``data_shape`` is neither falsy nor a tuple/list.
    RuntimeError
        If the JSON result file cannot be opened for writing.

    """
    def __init__(self, dataset, save_prefix, use_time=True, cleanup=False, score_thresh=0.05,
                 data_shape=None):
        super(COCODetectionMetric, self).__init__('COCOMeanAP')
        self.dataset = dataset
        # predictions are matched to images purely by position in this sorted id list
        self._img_ids = sorted(dataset.coco.getImgIds())
        self._current_id = 0
        self._cleanup = cleanup
        self._results = []
        self._score_thresh = score_thresh
        if isinstance(data_shape, (tuple, list)):
            assert len(data_shape) == 2, "Data shape must be (height, width)"
        elif not data_shape:
            data_shape = None
        else:
            raise ValueError("data_shape must be None or tuple of int as (height, width)")
        self._data_shape = data_shape

        if use_time:
            t = datetime.datetime.now().strftime('_%Y_%m_%d_%H_%M_%S')
        else:
            t = ''
        # `os.path` is used directly: the notebook never defined the `osp` alias.
        self._filename = os.path.abspath(os.path.expanduser(save_prefix) + t + '.json')
        # Fail early if the result file cannot be created.
        try:
            with open(self._filename, 'w'):
                pass
        except IOError as e:
            raise RuntimeError("Unable to open json file to dump. What(): {}".format(str(e)))

    def __del__(self):
        # __init__ may have raised before these attributes were assigned,
        # so they are read defensively.
        if not getattr(self, '_cleanup', False):
            return
        filename = getattr(self, '_filename', None)
        if filename is None:
            return
        try:
            os.remove(filename)
        except IOError as err:
            warnings.warn(str(err))

    def reset(self):
        """Forget all buffered predictions and restart from the first image."""
        self._current_id = 0
        self._results = []

    def _update(self):
        """Dump the buffered results to JSON and run the COCO evaluation on them.

        Returns
        -------
        COCOeval
            The evaluator after ``evaluate()`` and ``accumulate()`` have been called.
        """
        if self._current_id != len(self._img_ids):
            warnings.warn(
                'Recorded {} out of {} validation images, incompelete results'.format(
                    self._current_id, len(self._img_ids)))
        try:
            with open(self._filename, 'w') as f:
                json.dump(self._results, f)
        except IOError as e:
            raise RuntimeError("Unable to dump json file, ignored. What(): {}".format(str(e)))

        pred = self.dataset.coco.loadRes(self._filename)
        gt = self.dataset.coco
        # Lazy import so the class can be defined without pycocotools. The original code
        # called an undefined helper (`try_import_pycocotools`) here.
        try:
            from pycocotools.cocoeval import COCOeval
        except ImportError:
            # fall back to the vendored checkout that LoadCOCO._load_jsons imports COCO from
            from cocoapi.PythonAPI.pycocotools.cocoeval import COCOeval
        coco_eval = COCOeval(gt, pred, 'bbox')
        coco_eval.evaluate()
        coco_eval.accumulate()
        self._coco_eval = coco_eval
        return coco_eval

    def get(self):
        """Get evaluation metrics.

        Returns
        -------
        tuple of (list of str, list of str)
            Parallel lists of names and values: the captured COCO summary text, one
            AP entry per class, and the mean AP over IoU 0.50:0.95 last.
        """
        # Metric printing adapted from detectron/json_dataset_evaluator.
        def _get_thr_ind(coco_eval, thr):
            ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) &
                           (coco_eval.params.iouThrs < thr + 1e-5))[0][0]
            iou_thr = coco_eval.params.iouThrs[ind]
            assert np.isclose(iou_thr, thr)
            return ind

        # call real update
        coco_eval = self._update()

        IoU_lo_thresh = 0.5
        IoU_hi_thresh = 0.95
        ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh)
        ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh)
        # precision has dims (iou, recall, cls, area range, max dets)
        # area range index 0: all area ranges
        # max dets index 2: 100 per image
        precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2]
        ap_default = np.mean(precision[precision > -1])
        names, values = [], []
        names.append('~~~~ Summary metrics ~~~~\n')
        # Capture what summarize() prints instead of letting it reach the output.
        # redirect_stdout restores sys.stdout even if summarize() raises.
        summary_buffer = StringIO()
        with contextlib.redirect_stdout(summary_buffer):
            coco_eval.summarize()
        coco_summary = summary_buffer.getvalue()
        values.append(str(coco_summary).strip())
        for cls_ind, cls_name in enumerate(self.dataset.classes):
            precision = coco_eval.eval['precision'][
                ind_lo:(ind_hi + 1), :, cls_ind, 0, 2]
            ap = np.mean(precision[precision > -1])
            names.append(cls_name)
            values.append('{:.1f}'.format(100 * ap))
        # put mean AP at last, for comparing perf
        names.append('~~~~ MeanAP @ IoU=[{:.2f},{:.2f}] ~~~~\n'.format(
            IoU_lo_thresh, IoU_hi_thresh))
        values.append('{:.1f}'.format(100 * ap_default))
        return names, values

    # pylint: disable=arguments-differ, unused-argument
    def update(self, pred_bboxes, pred_labels, pred_scores, *args, **kwargs):
        """Update internal buffer with latest predictions.
        Note that the statistics are not available until you call self.get() to return
        the metrics.

        Each sample in the batch is assigned to the next image id in sorted order, so
        predictions must arrive in that order.

        Parameters
        ----------
        pred_bboxes : mxnet.NDArray or numpy.ndarray
            Prediction bounding boxes with shape `B, N, 4`.
            Where B is the size of mini-batch, N is the number of bboxes.
        pred_labels : mxnet.NDArray or numpy.ndarray
            Prediction bounding boxes labels with shape `B, N`.
        pred_scores : mxnet.NDArray or numpy.ndarray
            Prediction bounding boxes scores with shape `B, N`.

        """
        def as_numpy(a):
            """Convert a (list of) mx.NDArray into numpy.ndarray"""
            if isinstance(a, (list, tuple)):
                out = [x.asnumpy() if isinstance(x, mx.nd.NDArray) else x for x in a]
                return np.concatenate(out, axis=0)
            elif isinstance(a, mx.nd.NDArray):
                a = a.asnumpy()
            return a

        for pred_bbox, pred_label, pred_score in zip(
                *[as_numpy(x) for x in [pred_bboxes, pred_labels, pred_scores]]):
            # negative labels mark padded / invalid detections
            valid_pred = np.where(pred_label.flat >= 0)[0]
            # builtin `float` replaces the `np.float` alias that NumPy 1.24 removed
            pred_bbox = pred_bbox[valid_pred, :].astype(float)
            pred_label = pred_label.flat[valid_pred].astype(int)
            pred_score = pred_score.flat[valid_pred].astype(float)

            imgid = self._img_ids[self._current_id]
            self._current_id += 1
            if self._data_shape is not None:
                entry = self.dataset.coco.loadImgs(imgid)[0]
                orig_height = entry['height']
                orig_width = entry['width']
                height_scale = float(orig_height) / self._data_shape[0]
                width_scale = float(orig_width) / self._data_shape[1]
            else:
                height_scale, width_scale = (1., 1.)
            # for each bbox detection in each image
            for bbox, label, score in zip(pred_bbox, pred_label, pred_score):
                if label not in self.dataset.contiguous_id_to_json:
                    # ignore non-exist class
                    continue
                if score < self._score_thresh:
                    continue
                category_id = self.dataset.contiguous_id_to_json[label]
                # rescale bboxes
                bbox[[0, 2]] *= width_scale
                bbox[[1, 3]] *= height_scale
                # convert [xmin, ymin, xmax, ymax]  to [xmin, ymin, w, h]
                bbox[2:4] -= (bbox[:2] - 1)
                self._results.append({'image_id': imgid,
                                      'category_id': category_id,
                                      'bbox': bbox[:4].tolist(),
                                      'score': score})

In [None]:
## No extra augmentation for now: the transform only resizes (and normalizes) the image
class ValTransform(object):
    """Resize an image and its boxes to a fixed size, then normalize the image.

    Parameters
    ----------
    width : int
        Image width.
    height : int
        Image height.
    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation to be divided from image. Default is [0.229, 0.224, 0.225].

    """
    def __init__(self, width, height, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        self._width, self._height = width, height
        self._mean, self._std = mean, std

    def __call__(self, src, label):
        """Apply transform to validation image/label.

        Returns the normalized image tensor and the rescaled boxes cast to the
        image tensor's dtype.
        """
        src_h, src_w, _channels = src.shape
        target_size = (self._width, self._height)
        # Interpolation codes accepted by imresize, as noted by the notebook author:
        #   0: Nearest Neighbors Interpolation.
        #   1: Bilinear interpolation.
        #   2: Area-based (resampling using pixel area relation). It may be a
        #      preferred method for image decimation, as it gives moire-free
        #      results. But when the image is zoomed, it is similar to the Nearest
        #      Neighbors method. (used by default).
        #   3: Bicubic interpolation over 4x4 pixel neighborhood.
        #   4: Lanczos interpolation over 8x8 pixel neighborhood.
        #   9: Cubic for enlarge, area for shrink, bilinear for others
        #   10: Random select from interpolation method mentioned above.
        resized = mx.image.imresize(src, target_size[0], target_size[1], interp=2)
        boxes = self.bbox_resize(bbox=label, in_size=(src_w, src_h), out_size=target_size)

        tensor = mx.nd.image.to_tensor(resized)
        tensor = mx.nd.image.normalize(tensor, mean=self._mean, std=self._std)
        return tensor, boxes.astype(tensor.dtype)

    def bbox_resize(self, bbox, in_size, out_size):
        """Resize bounding boxes according to image resize operation.

        Parameters
        ----------
        bbox : array-like
            Rows of :math:`(x_{min}, y_{min}, x_{max}, y_{max}, ...)`; only the first
            four columns are rescaled. The input is copied, not modified.
        in_size : tuple
            Tuple of length 2: (width, height) for input.
        out_size : tuple
            Tuple of length 2: (width, height) for output.

        Returns
        -------
        mxnet NDArray
            Resized bounding boxes, built with ``nd.array`` from a copy of the input.
        """
        scaled = nd.array(bbox.copy())
        scale_x = out_size[0] / in_size[0]
        scale_y = out_size[1] / in_size[1]
        # y columns first, then x columns -- each column is rescaled independently
        for column, factor in ((1, scale_y), (3, scale_y), (0, scale_x), (2, scale_x)):
            scaled[:, column] = factor * scaled[:, column]
        return scaled


In [None]:
def bbox_clip_xyxy(xyxy, width, height):
    """Clip bounding box with format (xmin, ymin, xmax, ymax) to specified boundary.

    Every coordinate is clamped to the valid pixel index range, i.e. x values to
    ``[0, width - 1]`` and y values to ``[0, height - 1]``.

    Parameters
    ----------
    xyxy : list, tuple or numpy.ndarray
        The bbox in format (xmin, ymin, xmax, ymax).
        If numpy.ndarray is provided, we expect multiple bounding boxes with
        shape `(N, 4)`.
    width : int or float
        Boundary width.
    height : int or float
        Boundary height.

    Returns
    -------
    tuple or numpy.ndarray
        The clipped box as a 4-tuple for list/tuple input, or an ``(N, 4)`` array
        (one row per box) for numpy.ndarray input.

    Raises
    ------
    IndexError
        If a list/tuple does not have exactly 4 elements, or an array's size is not
        a multiple of 4.
    TypeError
        If ``xyxy`` is not a list, tuple or numpy.ndarray.

    """
    if isinstance(xyxy, (tuple, list)):
        if len(xyxy) != 4:
            raise IndexError(
                "Bounding boxes must have 4 elements, given {}".format(len(xyxy)))
        x1 = np.minimum(width - 1, np.maximum(0, xyxy[0]))
        y1 = np.minimum(height - 1, np.maximum(0, xyxy[1]))
        x2 = np.minimum(width - 1, np.maximum(0, xyxy[2]))
        y2 = np.minimum(height - 1, np.maximum(0, xyxy[3]))
        return (x1, y1, x2, y2)
    elif isinstance(xyxy, np.ndarray):
        if xyxy.size % 4 != 0:
            raise IndexError(
                "Bounding boxes must have n * 4 elements, given {}".format(xyxy.shape))
        x1 = np.minimum(width - 1, np.maximum(0, xyxy[:, 0]))
        y1 = np.minimum(height - 1, np.maximum(0, xyxy[:, 1]))
        x2 = np.minimum(width - 1, np.maximum(0, xyxy[:, 2]))
        y2 = np.minimum(height - 1, np.maximum(0, xyxy[:, 3]))
        # Each of x1..y2 is a 1-D column of length N. hstack would flatten them into a
        # (4N,) vector; column_stack keeps one row per box and yields (N, 4).
        return np.column_stack((x1, y1, x2, y2))
    else:
        raise TypeError(
            'Expect input xyxy a list, tuple or numpy.ndarray, given {}'.format(type(xyxy)))

def bbox_xywh_to_xyxy(xywh):
    """Convert bounding boxes from format (x, y, w, h) to (xmin, ymin, xmax, ymax)

    The far corner is computed as ``x + max(w - 1, 0)`` and ``y + max(h - 1, 0)``.

    Parameters
    ----------
    xywh : list, tuple or numpy.ndarray
        The bbox in format (x, y, w, h).
        If numpy.ndarray is provided, we expect multiple bounding boxes with
        shape `(N, 4)`.

    Returns
    -------
    tuple or numpy.ndarray
        The converted bboxes in format (xmin, ymin, xmax, ymax).
        If input is numpy.ndarray, return is numpy.ndarray correspondingly.

    Raises
    ------
    IndexError
        If a list/tuple does not have exactly 4 elements, or an array's size is not
        a multiple of 4.
    TypeError
        If ``xywh`` is not a list, tuple or numpy.ndarray.

    """
    if isinstance(xywh, np.ndarray):
        if xywh.size % 4 != 0:
            raise IndexError(
                "Bounding boxes must have n * 4 elements, given {}".format(xywh.shape))
        top_left = xywh[:, :2]
        extent = np.maximum(0, xywh[:, 2:4] - 1)
        return np.hstack((top_left, top_left + extent))

    if isinstance(xywh, (tuple, list)):
        if len(xywh) != 4:
            raise IndexError(
                "Bounding boxes must have 4 elements, given {}".format(len(xywh)))
        x_min, y_min = xywh[0], xywh[1]
        span_w = np.maximum(xywh[2] - 1, 0)
        span_h = np.maximum(xywh[3] - 1, 0)
        return (x_min, y_min, x_min + span_w, y_min + span_h)

    raise TypeError(
        'Expect input xywh a list, tuple or numpy.ndarray, given {}'.format(type(xywh)))


class LoadCOCO(object):
    """MS COCO detection dataset.

    Parses the annotation file(s) once, keeps every label in memory, and reads the
    image from disk in ``__getitem__``, which returns ``(img, label)``.
    ``img`` is what ``mx.image.imread`` returns; each label row has the format
    ``[xmin, ymin, xmax, ymax, contiguous_cid]``.

    Parameters
    ----------
    root : str, default '~/datasets/coco'
        Path to folder storing the dataset.
    splits : list of str, default ['instances_val2017']
        Json annotations name.
        Candidates can be: instances_val2017, instances_train2017.
    transform : callable, default None
        A function that takes data and label and transforms them.

        A transform function for object detection should take label into consideration,
        because any geometric modification will require label to be modified.
    min_object_area : float
        Minimum accepted ground-truth area
    skip_empty : bool, default is True
        Whether skip images with no valid object. This should be `True` in training, otherwise
        it will cause undefined behavior.
    use_crowd : bool, default is True
        Whether use boxes labeled as crowd instance.

    """
    CLASSES = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
               'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
               'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
               'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
               'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
               'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
               'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
               'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
               'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
               'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
               'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
               'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
               'scissors', 'teddy bear', 'hair drier', 'toothbrush']

    def __init__(self, root=os.path.join('~', 'datasets', 'coco'),
                 splits=('instances_val2017',), transform=None, min_object_area=0,
                 skip_empty=True, use_crowd=True):
        self._root = os.path.expanduser(root)
        self._transform = transform
        self._min_object_area = min_object_area
        self._skip_empty = skip_empty
        self._use_crowd = use_crowd
        self.num_class = len(type(self).CLASSES)
        # a single split name is accepted and wrapped into a list
        self._splits = [splits] if isinstance(splits, mx.base.string_types) else splits
        # to avoid trouble, we always use contiguous IDs except dealing with cocoapi
        self.index_map = dict(zip(type(self).CLASSES, range(self.num_class)))
        self.json_id_to_contiguous = None
        self.contiguous_id_to_json = None
        self._coco = []
        self._items, self._labels = self._load_jsons()

    def __str__(self):
        return '{}({})'.format(self.__class__.__name__, ','.join(map(str, self._splits)))

    @property
    def coco(self):
        """Return pycocotools object for evaluation purposes."""
        if not self._coco:
            raise ValueError("No coco objects found, dataset not initialized.")
        loaded = len(self._coco)
        if loaded > 1:
            raise NotImplementedError(
                "Currently we don't support evaluating {} JSON files".format(loaded))
        return self._coco[0]

    @property
    def classes(self):
        """Category names."""
        return type(self).CLASSES

    def __len__(self):
        return len(self._items)

    def __getitem__(self, idx):
        """Read image ``idx`` from disk and return it with its label.

        With a transform the stored label list is passed through unchanged; without
        one it is converted to a numpy array.
        """
        path, label = self._items[idx], self._labels[idx]
        image = mx.image.imread(path, 1)
        if self._transform is None:
            return image, np.array(label)
        return self._transform(image, label)

    def _load_jsons(self):
        """Load all image paths and labels from JSON annotation files into buffer."""
        # lazy import pycocotools
        from cocoapi.PythonAPI.pycocotools.coco import COCO
        image_paths, image_labels = [], []
        for split in self._splits:
            anno_file = os.path.join(self._root, 'annotations', split) + '.json'
            coco_api = COCO(anno_file)
            self._coco.append(coco_api)
            category_names = [cat['name'] for cat in coco_api.loadCats(coco_api.getCatIds())]
            if category_names != self.classes:
                raise ValueError("Incompatible category names with COCO: ")
            assert category_names == self.classes
            id_map = {json_id: contiguous
                      for contiguous, json_id in enumerate(coco_api.getCatIds())}
            if self.json_id_to_contiguous is not None:
                # every further split must agree with the first one
                assert self.json_id_to_contiguous == id_map
            else:
                self.json_id_to_contiguous = id_map
                self.contiguous_id_to_json = {
                    contiguous: json_id
                    for json_id, contiguous in self.json_id_to_contiguous.items()}

            # iterate through the annotations
            for entry in coco_api.loadImgs(sorted(coco_api.getImgIds())):
                # the last two components of the URL are the folder and the file name
                dirname, filename = entry['coco_url'].split('/')[-2:]
                abs_path = os.path.join(self._root, dirname, filename)
                if not os.path.exists(abs_path):
                    raise IOError('Image: {} not exists.'.format(abs_path))
                label = self._check_load_bbox(coco_api, entry)
                if label:
                    image_paths.append(abs_path)
                    image_labels.append(label)
        return image_paths, image_labels

    def _check_load_bbox(self, coco, entry):
        """Check and load ground-truth labels"""
        annotations = coco.loadAnns(coco.getAnnIds(imgIds=entry['id'], iscrowd=None))
        width, height = entry['width'], entry['height']
        kept = []
        for obj in annotations:
            rejected = (obj['area'] < self._min_object_area
                        or obj.get('ignore', 0) == 1
                        or (not self._use_crowd and obj.get('iscrowd', 0)))
            if rejected:
                continue
            # convert from (x, y, w, h) to (xmin, ymin, xmax, ymax) and clip bound
            xmin, ymin, xmax, ymax = bbox_clip_xyxy(bbox_xywh_to_xyxy(obj['bbox']), width, height)
            # require non-zero box area
            if not (obj['area'] > 0 and xmax > xmin and ymax > ymin):
                continue
            contiguous_cid = self.json_id_to_contiguous[obj['category_id']]
            kept.append([xmin, ymin, xmax, ymax, contiguous_cid])
        if not kept and not self._skip_empty:
            # dummy invalid labels if no valid objects are found
            kept.append([-1, -1, -1, -1, -1])
        return kept

In [None]:
# Memory is too small, so only the validation split can be used for fine-tuning
#train_dataset = LoadCOCO(root='~/data/coco', splits=['instances_train2017'], transform=None)
val_dataset = LoadCOCO(root='~/data/coco', splits=['instances_val2017'], transform=ValTransform(img_wight, img_height))
#val_dataset = LoadCOCO(root='~/data/coco', splits=['instances_val2017'], transform=None)

In [None]:
from gluoncv.data.batchify import Tuple, Stack, Pad
# behavior of batchify_fn: stack images, and pad labels
# Stacking arrays of different lengths directly raises an error, so the labels
# are padded with -1 instead.
batchify_fn = Tuple(Stack(), Pad(pad_val=-1))

## Data loading
# from mxnet.gluon.data import DataLoader
# The main purpose of DataLoader is to return `batch_size` samples at a time
# [ Loads data from a dataset and returns mini-batches of data. ]
# Reading the DataLoader code: it first builds a batch_sampler over all samples from its
# arguments (shuffled or sequential), then creates an iterator that returns `batch_size`
# samples per step and runs them through batchify_fn.
# It can also read with several workers in parallel, prefetch a number of samples, etc.
# This could be implemented by hand, but there seems to be no need to.
# Parameters
#    dataset : ndarray or numpy array. Should be the data after the transform
#    batch_size : int
#    shuffle : bool
#    sampler : Sampler
#    last_batch : {'keep', 'discard', 'rollover'}
#    batch_sampler : Sampler
#    batchify_fn : callable. User-defined way of assembling samples into a batch
#    num_workers : int, default 0. Number of workers used to read samples
#    pin_memory : boolean, default False. Copy batches into pinned memory to speed up the CPU -> GPU copy
#    prefetch : int, default is `num_workers * 2`. Number of batches to prefetch; uses more shared memory,
#               only takes effect when num_workers > 0

# NOTE: last_batch must be one of 'keep', 'discard' or 'rollover'. The previous value
# 'roolover' was a typo that makes the batch sampler raise ValueError as soon as it
# reaches a leftover partial batch.
loader_train = DataLoader(val_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          batchify_fn=batchify_fn,
                          last_batch='rollover',
                          num_workers=num_workers)

#loader_val   = DataLoader(val_dataset,
#                          batch_size=batch_size, 
#                          shuffle=False, 
#                          batchify_fn=batchify_fn,
#                          last_batch='discard', 
#                          num_workers=num_workers)




In [None]:
# default configurations
residual_block_num = [1, 2, 8, 8, 4] # number of residual blocks
darknet_channels = [32, [32, 64, 128, 256, 512]] # output channels of the 1x1 conv in the corresponding residual blocks; the 3x3 conv outputs twice as many
class_num_imagenet = 1000 # for imagenet

# detection channels used by each of the three outputs
det_channels = [512, 256, 128]

# both lists below are stored in reversed order
strides = [32, 16, 8]
anchors = [[116, 90, 156, 198, 373, 326], [30, 61, 62, 45, 59, 119], [10, 13, 16, 30, 33, 23]]    

# the 80 COCO classes
classes_name = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 
                'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 
                'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 
                'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 
                'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 
                'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 
                'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 
                'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 
                'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 
                'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 
                'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 
                'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 
                'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 
                'scissors', 'teddy bear', 'hair drier', 'toothbrush']



def cbl_gen(channels, kernel_size, strides, padding):
    """Build one conv -> BatchNorm -> LeakyReLU(0.1) unit.

    Parameters
    ----------
    channels : int
        Number of output channels of the convolution.
    kernel_size, strides, padding : tuple of int
        Forwarded unchanged to ``nn.Conv2D``.

    Returns
    -------
    nn.HybridSequential
        Container holding the three layers in order.
    """
    unit = nn.HybridSequential()
    # every convolution is followed by BatchNorm, so the conv bias is always disabled
    layers = [
        nn.Conv2D(channels, kernel_size=kernel_size, strides=strides, padding=padding,
                  groups=1, use_bias=False),
        nn.BatchNorm(),
        nn.LeakyReLU(0.1),
    ]
    unit.add(*layers)
    return unit

# A residual connection needs its own forward pass, so the layer has to be a custom block class
class DarknetBasicBlockV3(gluon.HybridBlock):
    """Darknet v3 basic residual block.

    The body is a 1x1 conv unit with ``channels`` outputs followed by a 3x3 conv unit
    with ``channels * 2`` outputs; the block output is the input plus the body output.
    Subclassing HybridBlock lets ``net.hybridize()`` optimise the computation.
    """

    def __init__(self, channels, **kwargs):
        super(DarknetBasicBlockV3, self).__init__(**kwargs)
        self.body = nn.HybridSequential()
        # 1x1 conv. Author's open question: yolov3.cfg in darknet seems to use padding 1
        # for the 1x1 conv as well, while gluoncv does not.
        conv1x1 = cbl_gen(channels, (1, 1), (1, 1), (0, 0))
        # 3x3 conv
        conv3x3 = cbl_gen(channels * 2, (3, 3), (1, 1), (1, 1))
        self.body.add(conv1x1)
        self.body.add(conv3x3)

    # hybrid_forward takes the extra argument F: MXNet has both the imperative NDArray
    # API and the symbolic Symbol API with largely the same functions, and it picks F
    # according to the input type.
    def hybrid_forward(self, F, x):
        residual = self.body(x)
        return x + residual

        
class Darknet53(gluon.HybridBlock):
    """Darknet-53 classification network.

    Parameters
    ----------
    residual_block_num : list of int
        Number of residual blocks in each stage.
    channels : list
        ``channels[0]`` is the width of the first convolution; ``channels[1]`` lists the
        per-stage width passed to the residual blocks (the stride-2 conv in front of each
        stage uses twice that width).
    class_num : int, default 1000
        Size of the final dense output.
    """

    def __init__(self, residual_block_num, channels, class_num=1000, **kwargs):
        super(Darknet53, self).__init__(**kwargs)
        self.features = nn.HybridSequential()

        # the network starts with a single convolution unit
        self.features.add(cbl_gen(channels[0], (3, 3), (1, 1), (1, 1)))

        # repeated residual stages
        for repeats, width in zip(residual_block_num, channels[1]):
            # each stage opens with a stride-2 convolution that does the down-sampling ...
            stage = [cbl_gen(width * 2, (3, 3), (2, 2), (1, 1))]
            # ... followed by `repeats` residual blocks
            stage.extend(DarknetBasicBlockV3(width) for _ in range(repeats))
            self.features.add(*stage)

        # global average pooling
        self.pooling = nn.GlobalAvgPool2D()
        # fully connected output layer
        self.output = nn.Dense(class_num)

    def hybrid_forward(self, F, x):
        pooled = self.pooling(self.features(x))
        return self.output(pooled)

    
class Concates(gluon.HybridBlock):
    """Prepare a feature map for concatenation with another stage.

    Before feature maps of different stages are concatenated, the input goes through
    a 1x1 conv unit and is then up-sampled by a factor of 2.

    Parameters
    ----------
    channels : int
        Number of output channels of the 1x1 conv unit.
    **kwargs
        Forwarded to ``gluon.HybridBlock`` (e.g. ``prefix``, ``params``).
    """
    def __init__(self, channels, **kwargs):
        # `**kwargs`, not `*kwargs`: unpacking the dict positionally would pass its *keys*
        # as the positional arguments of HybridBlock.__init__.
        super(Concates, self).__init__(**kwargs)
        self.concate = nn.HybridSequential(prefix='')
        self.concate.add(cbl_gen(channels, (1,1), (1,1), (0,0)))

    def upsample_rept(self, x, stride):
        '''Up-sample by repetition.

        Inputs of different detection layers must be up-sampled before they are
        stacked. The method is simple: every element is repeated `stride` times
        along the last two axes.
        '''
        # Symbol is accepted as well so the block keeps working after hybridize().
        assert isinstance(x, (mx.nd.NDArray, mx.sym.Symbol, np.ndarray))
        return x.repeat(axis=-1, repeats=stride).repeat(axis=-2, repeats=stride)

    def hybrid_forward(self, F, x):
        x = self.concate(x)
        x = self.upsample_rept(x, 2)
        return x

    
class Detection(gluon.HybridBlock):
    '''
    Detection branch. The three detection branches share the same structure
    and differ only in their filter counts.

    The branch is split into ``body`` and ``tip`` because the output of
    ``body`` (the second-to-last layer) also has to be routed out and merged
    with the features of an earlier stage, so the layers cannot live in one
    sequential container.

    The closer to the input layer, the larger the feature maps, so fewer
    channels are used there to keep the amount of computation down.
    '''
    def __init__(self, channels, classes_num=80, anchors_num=3, **kwargs):
        super(Detection, self).__init__(**kwargs)
        self.channels = channels
        self.anchors_num = anchors_num
        self.pred_num = 1 + 4 + classes_num
        self.body = nn.HybridSequential(prefix='')
        self.tip = nn.HybridSequential(prefix='')

        # (kernel, stride, padding) of the two kinds of block used below.
        pointwise = ((1,1), (1,1), (0,0))
        spatial = ((3,3), (1,1), (1,1))

        # body: two (1x1, 3x3) pairs followed by one more 1x1 block.
        for _ in range(2):
            self.body.add(cbl_gen(channels, *pointwise))
            self.body.add(cbl_gen(channels*2, *spatial))
        self.body.add(cbl_gen(channels, *pointwise))

        # tip: a single 3x3 block applied on top of the body output.
        self.tip.add(cbl_gen(channels*2, *spatial))

    def hybrid_forward(self, F, x):
        """Run ``body`` and then ``tip``."""
        return self.tip(self.body(x))


class Output(gluon.HybridBlock):
    """YOLOv3 output layer for one detection scale.

    Parameters
    ----------
    anchors : flat sequence of numbers
        Anchor sizes laid out as ``w0, h0, w1, h1, ...``; the number of
        anchors is ``len(anchors) // 2``.
    stride : int
        Factor the predicted box centers are multiplied by.
    classes_num : int, default 80
        Number of object classes.
    grid_size : int, default 52
        Side length of the pre-computed offset grid. The feature map fed to
        this block must not be larger than ``grid_size`` in height or width.

    Notes
    -----
    ``hybrid_forward`` calls ``mx.nd`` operators directly and reads
    ``x.context``, so this block only works imperatively (not hybridized).
    """
    def __init__(self, anchors, stride, classes_num=80, grid_size=52, **kwargs):
        super(Output, self).__init__(**kwargs)
        self.stride = stride
        self.anchors_num = len(anchors) // 2
        self.classes_num = classes_num
        # objectness (1) + box (4) + per-class scores
        self.pred_num = 1+4+classes_num
        anchors = nd.array(anchors).astype('float32')
        self.anchors = anchors.reshape(1, 1, -1, 2)

        self.output = nn.HybridSequential(prefix='')
        # Linear activation: nn.Conv2D defaults to activation=None.
        # Number of output channels is (4+1+classes)*anchors.
        self.output.add(nn.Conv2D(self.pred_num*self.anchors_num, (1,1), (1,1), (0,0), groups=1, use_bias=True))

        # offsets will be added to predictions
        grid_x = np.arange(grid_size)
        grid_y = np.arange(grid_size)
        grid_x, grid_y = np.meshgrid(grid_x, grid_y)
        # stack to (n, n, 2)
        offsets = np.concatenate((grid_x[:, :, np.newaxis], grid_y[:, :, np.newaxis]), axis=-1)
        # expand dims to (1, 1, n, n, 2) so it's easier for broadcasting
        offsets = np.expand_dims(np.expand_dims(offsets, axis=0), axis=0)
        # Pin the dtype so the grid can be added to float32 predictions no
        # matter how nd.array infers the dtype of an integer numpy array.
        self.offsets = nd.array(offsets, dtype='float32')

    def hybrid_forward(self, F, x):
        """Decode the raw convolution output into boxes and scores.

        Returns the training tuple ``(bbox, raw_box_centers, raw_box_scales,
        objness, class_pred, anchors, offsets)`` when
        ``autograd.is_training()`` is true, otherwise a detections array
        reshaped to ``(B, -1, 6)`` holding ``[class_id, score, box corners]``.
        """
        pred = self.output(x)

        # The constants are created on the default context; move them to the
        # device of the input so the imperative operators below do not fail
        # with a context mismatch (a no-op when already on that device).
        ctx = x.context
        anchors = self.anchors.as_in_context(ctx)
        grid = self.offsets.as_in_context(ctx)

        # prediction flat to (batch, pred per pixel, height * width)
        pred = pred.reshape((0, self.anchors_num * self.pred_num, -1))
        # transpose to (batch, height * width, num_anchor, num_pred)
        pred = pred.transpose(axes=(0, 2, 1)).reshape((0, -1, self.anchors_num, self.pred_num))
        # components
        raw_box_centers = pred.slice_axis(axis=-1, begin=0, end=2)
        raw_box_scales = pred.slice_axis(axis=-1, begin=2, end=4)
        objness = pred.slice_axis(axis=-1, begin=4, end=5)
        class_pred = pred.slice_axis(axis=-1, begin=5, end=None)

        # valid offsets, (1, 1, height, width, 2)
        offsets = nd.slice_like(grid, x * 0, axes=(2, 3))
        # reshape to (1, height*width, 1, 2)
        offsets = offsets.reshape((1, -1, 1, 2))

        box_centers = nd.broadcast_add(nd.sigmoid(raw_box_centers), offsets) * self.stride
        box_scales = nd.broadcast_mul(nd.exp(raw_box_scales), anchors)
        confidence = nd.sigmoid(objness)
        class_score = nd.broadcast_mul(nd.sigmoid(class_pred), confidence)
        wh = box_scales / 2.0
        # `corner`: [xmin, ymin, xmax, ymax]
        # `center`: [x, y, width, height]
        # center to corner
        bbox = nd.concat(box_centers - wh, box_centers + wh, dim=-1)

        if autograd.is_training():
            # during training, we don't need to convert whole bunch of info to detection results
            return (bbox.reshape((0, -1, 4)), raw_box_centers, raw_box_scales,
                    objness, class_pred, anchors, offsets)

        # prediction per class
        bboxes = nd.tile(bbox, reps=(self.classes_num, 1, 1, 1, 1))
        scores = nd.transpose(class_score, axes=(3, 0, 1, 2)).expand_dims(axis=-1)
        # Class ids are created on the same device as the scores.
        class_ids = nd.arange(0, self.classes_num, ctx=ctx).reshape((0, 1, 1, 1, 1))
        ids = nd.broadcast_add(scores * 0, class_ids)
        detections = nd.concat(ids, scores, bboxes, dim=-1)
        # reshape to (B, xx, 6)
        detections = nd.reshape(detections.transpose(axes=(1, 0, 2, 3, 4)), (0, -1, 6))
        return detections


class YOLOv3(gluon.HybridBlock):
    """Build the YOLOv3 network; only suitable for a Darknet53 backbone.

    The constructor reads the module-level names ``residual_block_num``,
    ``darknet_channels``, ``det_channels``, ``anchors`` and ``strides``.
    """
    def __init__(self, **kwargs):
        super(YOLOv3, self).__init__(**kwargs)

        # Backbone
        darknet53 = Darknet53(residual_block_num, darknet_channels)
        # residual_block_num = [1, 2, 8, 8, 4]; every stage starts with one
        # down-sampling layer, hence the (1 + n) terms below.
        feature1_layer = 1 + (1+1) + (1+2) + (1+8)
        feature2_layer = feature1_layer + (1+8)
        feature3_layer = feature2_layer + (1+4) # could simply run to the end

        self.features = nn.HybridSequential(prefix='')
        self.features.add(darknet53.features[:feature1_layer])
        self.features.add(darknet53.features[feature1_layer:feature2_layer])
        self.features.add(darknet53.features[feature2_layer:feature3_layer])

        # Detection branches attached to the backbone
        self.detection_net = nn.HybridSequential(prefix='')
        for det_channel in det_channels:
            self.detection_net.add(Detection(det_channel))

        # Transitions used when concatenating different stages
        self.concates = nn.HybridSequential(prefix='')
        for det_channel in det_channels[1:]:
            self.concates.add(Concates(det_channel))

        # Output layers
        self.output = nn.HybridSequential(prefix='')
        for anchor, stride in zip(anchors, strides):
            self.output.add(Output(anchor, stride))

    def hybrid_forward(self, F, x):
        """Return a list with one output per detection scale, deepest stage first."""
        # Compute the feature maps of every backbone stage first.
        featuremaps = []
        for net in self.features:
            x = net(x)
            featuremaps.append(x)

        output = []
        det = None
        # Walk the stages from the deepest to the shallowest.
        for i, feature in enumerate(reversed(featuremaps)):
            if i == 0:
                det = feature
            else:
                # Route the previous body output through its transition and
                # stack it with this stage's features along the channel axis.
                # `F.concat` (instead of `nd.concat`) follows the backend
                # Gluon passed in.
                det = self.concates[i-1](det)
                det = F.concat(det, feature, dim=1)

            det = self.detection_net[i].body(det)
            out = self.detection_net[i].tip(det)
            output.append(self.output[i](out))

        return output

In [None]:
# Load parameters: first build the network
net = YOLOv3()

def load_yolov3_param(net, param_dir=''):
    """Load the per-component parameter files into ``net`` and return it.

    Parameters
    ----------
    net : object
        Network exposing ``features``, ``detection_net``, ``concates`` and
        ``output`` as accessed below.
    param_dir : str, default ''
        Directory containing the ``*.params`` files. The default keeps the
        original behaviour of resolving the bare file names against the
        current working directory.

    Returns
    -------
    The same ``net`` object.
    """
    def _path(file_name):
        # os.path.join('', name) == name, so the default is unchanged.
        return os.path.join(param_dir, file_name)

    net.features.load_parameters(_path('features.params'))

    # The structure differs from gluoncv's network (here `tip` is wrapped in
    # two levels of HybridSequential), so the pieces are loaded separately.
    for i in range(3):
        net.detection_net[i].body.load_parameters(_path("body_%d.params" % (i)))
        net.detection_net[i].tip[0].load_parameters(_path("tip_%d.params" % (i)))

    for i in range(2):
        net.concates[i].concate[0].load_parameters(_path("concate_%d.params" % (i)))

    for i in range(3):
        net.output[i].output[0].load_parameters(_path("outputs_%d.params" % (i)))

    return net

# Fill the freshly built network with the saved parameters.
net = load_yolov3_param(net)

In [None]:
## 冻结原有层的权重
#resnet18_cifar10[0].collect_params().setattr('grad_req', 'null')
## 初始化自定义层的权重
#resnet18_cifar10[1].initialize(init=mx.init.Xavier(), ctx=ctx)

# 通过正则表达式选择层
#net.collect_params('.*dense')
# 通过 lr_mult 设置学习速率
#net.collect_params('.*dense').setattr('lr_mult',0.1)

In [None]:
## gluon.Trainer updates the parameters with the chosen optimization method
# NOTE(review): this uses `yolov3_model`, while the network built and loaded in the
# cells above is bound to `net` — confirm which model is meant to be trained.
trainer = gluon.Trainer(yolov3_model.collect_params(), optimizer)

In [None]:
## 网络训练
def train_net(net, data_train, data_val, trainer, epochs):
    """Train ``net`` for ``epochs`` epochs and log per-epoch statistics.

    Parameters
    ----------
    net : callable
        Network; called once per device slice of every batch.
    data_train : iterable
        Yields batches where ``batch[0]`` is the data and ``batch[1]`` the
        label. A label is (xmin, ymin, xmax, ymax, cid).
    data_val : iterable
        Validation batches, forwarded to ``metric_val``.
    trainer : gluon.Trainer
        Trainer that owns the parameters to update.
    epochs : int
        Number of passes over ``data_train``.

    Notes
    -----
    Reads these module-level names: ``ctx``, ``batch_size``, ``lr_decay``,
    ``lr_decay_epochs_set``, ``loss_fn``, ``metric``, ``metric_val``,
    ``train_history`` and ``update_learn_rate``.
    """
    # `time` is not imported in the notebook's import cell, so import it here.
    import time

    for epoch in range(epochs):
        #metric.reset()
        train_loss = 0
        # The hyper-parameter cell defines `lr_decay_epochs_set`
        # (there is no `lr_decay_epochs`).
        update_learn_rate(trainer, epoch, lr_decay_epochs_set, lr_decay)
        tic = time.time()

        # Iterate over the loader that was passed in. Separate local names are
        # used for the per-device slices so the `data_train` argument is not
        # overwritten inside the loop.
        for batch in data_train:
            # Split the batch and load each slice onto its device.
            # TODO: with n GPUs, should the DataLoader batch size be batch*n?
            batch_data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            batch_label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

            # record() makes MXNet record the computation for gradients.
            with autograd.record():
                output = [net(X) for X in batch_data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, batch_label)]

            # `loss` is a list (one entry per device); each one is back-propagated.
            for l in loss:
                l.backward()

            # Update the parameters; must be called after backward() and
            # outside of record().
            trainer.step(batch_size)

            for l in loss:
                train_loss += l.sum().asscalar() / batch_size

            # Update the training metric after every batch.
            metric.update(batch_label, output)

        _, acc = metric.get()
        _, val_acc = metric_val(net, data_val, ctx=ctx)

        # The history records error rates, not accuracies.
        train_history.update([1-acc, 1-val_acc])

        toc = time.time()

        print('[epoch %d] train_loss=%f, acc=%f, val_acc=%f, lr=%.9f, time: %fs' %
              (epoch, train_loss, acc, val_acc, trainer.learning_rate, toc-tic))

In [None]:
## gluoncv 提供的 transform 函数
#from gluoncv.data.transforms import presets
#presets.yolo.YOLO3DefaultTrainTransform
#presets.rcnn.FasterRCNNDefaultTrainTransform

## 数据预处理
## 牢记 mxnet 使用 BCHW 形式

# Dataset 类提供了两个转换函数： transform_first 和 transform ；
# transform_first 只变换 data ； transform 同时变换样本和标签（一个样本的所有数据）

# ImageNet dataset: per-channel mean and std passed to the normalize step below
mean = [0.485, 0.456, 0.406]
std  = [0.229, 0.224, 0.225]

# 写两个函数，一个是 train_transform 一个 test_transform
def train_transform(img, label, mean, std):
    """
    Sketch of the default YOLOv3 training pre-processing; both the image and
    the label need to be processed. Intended steps:

    1. random color jittering
    2. random expansion with prob 0.5
    3. random cropping
    4. resize with random interpolation
    5. random horizontal flip
    6. to tensor
    7. normalize

    NOTE(review): only "to tensor" and "normalize" are actually applied to
    `img`. The other calls below are made without arguments and their results
    are discarded, so they look like placeholders — presumably they raise when
    executed; confirm before using this function. `label` is returned
    unchanged.
    """
    
    # random color jittering
    # NOTE(review): called without arguments; result unused
    mx.nd.image.random_color_jitter()
    
    # random expansion with prob 0.5 (no code for this step)
    
    # random cropping
    # NOTE(review): called without arguments; results unused
    mx.image.fixed_crop()
    mx.image.random_crop()
    
    # resize with random interpolation
    mx.image.imresize() # interp would be taken from the arguments: (img, width, height, interp=interp)
    
    # random horizontal flip
    # NOTE(review): called without arguments; result unused
    mx.nd.image.random_flip_left_right()
    
    # to tensor (the original note reads "WHC -> CHW")
    img = mx.nd.image.to_tensor(img)
    
    # normalize with the given mean and std
    img = mx.nd.image.normalize(img, mean, std)
    
    return img, label

def test_transform(img, lable, mean, std, width=None, height=None):
    """Pre-process one validation/test sample.

    Parameters
    ----------
    img : NDArray
        Input image; converted with ``mx.nd.image.to_tensor``.
    lable : array
        Label of the sample. The (misspelled) parameter name is kept so that
        existing keyword callers keep working.
    mean, std : sequence of float
        Normalization statistics.
    width, height : int or None, default None
        Target size. When both are given, the image is resized and the first
        four label columns are rescaled to match. When either is None, no
        resize is done.

    Returns
    -------
    (img, lable) : the normalized tensor and the (possibly rescaled) label.
    """
    if width is not None and height is not None:
        # assumes img is laid out (height, width, channel) — TODO confirm
        src_h, src_w = img.shape[0], img.shape[1]
        img = mx.image.imresize(img, width, height)

        # assumes lable is 2-D with columns (xmin, ymin, xmax, ymax, ...) —
        # TODO confirm against the dataset in use
        x_scale = float(width) / src_w
        y_scale = float(height) / src_h
        lable = lable.copy()
        lable[:, 0] = lable[:, 0] * x_scale
        lable[:, 2] = lable[:, 2] * x_scale
        lable[:, 1] = lable[:, 1] * y_scale
        lable[:, 3] = lable[:, 3] * y_scale

    # to tensor
    img = mx.nd.image.to_tensor(img)

    # normalize
    img = mx.nd.image.normalize(img, mean, std)

    # Return the parameter itself; the original returned the undefined
    # name `label`.
    return img, lable


# gluoncv 代码实现
def transform(src, label, width, height, mean, std):
    """Apply the training augmentation pipeline to one image/label pair.

    Steps, in order: random color distortion, random expansion (when a
    uniform draw exceeds 0.5), constrained random crop, resize to
    ``(width, height)`` with a randomly drawn interpolation code, random
    horizontal flip, then tensor conversion and normalization.

    Returns the normalized image and the boxes cast to the image dtype.
    """
    # 1. color distortion
    img = experimental.image.random_color_distort(src)

    # 2. expansion; without it the boxes are the label as given
    bbox = label
    if np.random.uniform(0, 1) > 0.5:
        fill_value = [m * 255 for m in mean]
        img, expand = timage.random_expand(img, fill=fill_value)
        bbox = tbbox.translate(label, x_offset=expand[0], y_offset=expand[1])

    # 3. crop chosen together with the boxes
    full_h, full_w, _ = img.shape
    bbox, crop = experimental.bbox.random_crop_with_constraints(bbox, (full_w, full_h))
    crop_x, crop_y, crop_w, crop_h = crop
    img = mx.image.fixed_crop(img, crop_x, crop_y, crop_w, crop_h)

    # 4. resize; the boxes are rescaled from the cropped size to the target size
    cropped_h, cropped_w, _ = img.shape
    interp = np.random.randint(0, 5)
    img = timage.imresize(img, width, height, interp=interp)
    bbox = tbbox.resize(bbox, (cropped_w, cropped_h), (width, height))

    # 5. horizontal flip, mirrored onto the boxes
    resized_h, resized_w, _ = img.shape
    img, flips = timage.random_flip(img, px=0.5)
    bbox = tbbox.flip(bbox, (resized_w, resized_h), flip_x=flips[0])

    # 6. tensor conversion followed by normalization
    tensor = mx.nd.image.to_tensor(img)
    img = mx.nd.image.normalize(tensor, mean=mean, std=std)

    return img, bbox.astype(img.dtype)

In [None]:
## 性能度量函数
def metric_val(net, dataset_val, metric=None, ctx=mx.cpu(0)):
    """Evaluate ``net`` over ``dataset_val`` and return ``metric.get()``.

    Parameters
    ----------
    net : callable
        Network; called once per device slice of every batch.
    dataset_val : iterable
        Yields batches where ``batch[0]`` is the data and ``batch[1]`` the label.
    metric : metric object or None
        Metric to update; a fresh ``mx.metric.Accuracy`` when None.
    ctx : mx.Context or list of mx.Context, default mx.cpu(0)
        Device(s) the batches are loaded onto.
    """
    if metric is None:
        metric = mx.metric.Accuracy()

    # split_and_load takes a LIST of contexts; wrap a single Context (such as
    # the default) so it is accepted as well.
    ctx_list = [ctx] if isinstance(ctx, mx.Context) else list(ctx)

    for batch in dataset_val:
        # Split the batch and load each slice onto its device.
        data_val = gluon.utils.split_and_load(batch[0], ctx_list, batch_axis=0)
        label_val = gluon.utils.split_and_load(batch[1], ctx_list, batch_axis=0)

        yhat = [net(x) for x in data_val]

        # First argument is the label, second is the network output.
        metric.update(label_val, yhat)

    return metric.get()