In [1]:
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from data import VOCDetection
import sys
import os
import time
import numpy as np
import pickle

if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET


class VOCAPIEvaluator():
    """ VOC AP Evaluation class """
    def __init__(self, data_root, img_size, device, transform, labelmap, set_type='test', year='2007', display=False):
        self.data_root = data_root
        self.img_size = img_size
        self.device = device
        self.transform = transform
        self.labelmap = labelmap
        self.set_type = set_type
        self.year = year
        self.display = display

        # path
        self.devkit_path = data_root + 'VOC' + year
        self.annopath = os.path.join(data_root, 'VOC2007', 'Annotations', '%s.xml')
        self.imgpath = os.path.join(data_root, 'VOC2007', 'JPEGImages', '%s.jpg')
        self.imgsetpath = os.path.join(data_root, 'VOC2007', 'ImageSets', 'Main', set_type+'.txt')
        self.output_dir = self.get_output_dir('voc_eval/', self.set_type)

        # dataset
        self.dataset = VOCDetection(root=data_root, 
                                    image_sets=[('2007', set_type)],
                                    transform=transform
                                    )

    def evaluate(self, net):
        net.eval()
        num_images = len(self.dataset)
        # all detections are collected into:
        #    all_boxes[cls][image] = N x 5 array of detections in
        #    (x1, y1, x2, y2, score)
        self.all_boxes = [[[] for _ in range(num_images)] for _ in range(len(self.labelmap))]

        # timers
        det_file = os.path.join(self.output_dir, 'detections.pkl')

        for i in range(num_images):
            im, gt, h, w = self.dataset.pull_item(i)

            x = Variable(im.unsqueeze(0)).to(self.device)
            t0 = time.time()
            # forward
            bboxes, scores, cls_inds = net(x)
            detect_time = time.time() - t0
            scale = np.array([[w, h, w, h]])
            bboxes *= scale

            for j in range(len(self.labelmap)):
                inds = np.where(cls_inds == j)[0]
                if len(inds) == 0:
                    self.all_boxes[j][i] = np.empty([0, 5], dtype=np.float32)
                    continue
                c_bboxes = bboxes[inds]
                c_scores = scores[inds]
                c_dets = np.hstack((c_bboxes,
                                    c_scores[:, np.newaxis])).astype(np.float32,
                                                                    copy=False)
                self.all_boxes[j][i] = c_dets

            if i % 500 == 0:
                print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time))

        with open(det_file, 'wb') as f:
            pickle.dump(self.all_boxes, f, pickle.HIGHEST_PROTOCOL)

        print('Evaluating detections')
        self.evaluate_detections(self.all_boxes)

        print('Mean AP: ', self.map)
  

    def parse_rec(self, filename):
        """ Parse a PASCAL VOC xml file """
        tree = ET.parse(filename)
        objects = []
        for obj in tree.findall('object'):
            obj_struct = {}
            obj_struct['name'] = obj.find('name').text
            obj_struct['pose'] = obj.find('pose').text
            obj_struct['truncated'] = int(obj.find('truncated').text)
            obj_struct['difficult'] = int(obj.find('difficult').text)
            bbox = obj.find('bndbox')
            obj_struct['bbox'] = [int(bbox.find('xmin').text),
                                int(bbox.find('ymin').text),
                                int(bbox.find('xmax').text),
                                int(bbox.find('ymax').text)]
            objects.append(obj_struct)

        return objects


    def get_output_dir(self, name, phase):
        """Return the directory where experimental artifacts are placed.
        If the directory does not exist, it is created.
        A canonical path is built using the name from an imdb and a network
        (if not None).
        """
        filedir = os.path.join(name, phase)
        if not os.path.exists(filedir):
            os.makedirs(filedir)
        return filedir


    def get_voc_results_file_template(self, cls):
        # VOCdevkit/VOC2007/results/det_test_aeroplane.txt
        filename = 'det_' + self.set_type + '_%s.txt' % (cls)
        filedir = os.path.join(self.devkit_path, 'results')
        if not os.path.exists(filedir):
            os.makedirs(filedir)
        path = os.path.join(filedir, filename)
        return path


    def write_voc_results_file(self, all_boxes):
        for cls_ind, cls in enumerate(self.labelmap):
            if self.display:
                print('Writing {:s} VOC results file'.format(cls))
            filename = self.get_voc_results_file_template(cls)
            with open(filename, 'wt') as f:
                for im_ind, index in enumerate(self.dataset.ids):
                    dets = all_boxes[cls_ind][im_ind]
                    if dets == []:
                        continue
                    # the VOCdevkit expects 1-based indices
                    for k in range(dets.shape[0]):
                        f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
                                format(index[1], dets[k, -1],
                                    dets[k, 0] + 1, dets[k, 1] + 1,
                                    dets[k, 2] + 1, dets[k, 3] + 1))


    def do_python_eval(self, use_07=True):
        cachedir = os.path.join(self.devkit_path, 'annotations_cache')
        aps = []
        # The PASCAL VOC metric changed in 2010
        use_07_metric = use_07
        print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
        if not os.path.isdir(self.output_dir):
            os.mkdir(self.output_dir)
        for i, cls in enumerate(self.labelmap):
            filename = self.get_voc_results_file_template(cls)
            rec, prec, ap = self.voc_eval(detpath=filename, 
                                          classname=cls, 
                                          cachedir=cachedir, 
                                          ovthresh=0.5, 
                                          use_07_metric=use_07_metric
                                        )
            aps += [ap]
            print('AP for {} = {:.4f}'.format(cls, ap))
            with open(os.path.join(self.output_dir, cls + '_pr.pkl'), 'wb') as f:
                pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
        if self.display:
            self.map = np.mean(aps)
            print('Mean AP = {:.4f}'.format(np.mean(aps)))
            print('~~~~~~~~')
            print('Results:')
            for ap in aps:
                print('{:.3f}'.format(ap))
            print('{:.3f}'.format(np.mean(aps)))
            print('~~~~~~~~')
            print('')
            print('--------------------------------------------------------------')
            print('Results computed with the **unofficial** Python eval code.')
            print('Results should be very close to the official MATLAB eval code.')
            print('--------------------------------------------------------------')
        else:
            self.map = np.mean(aps)
            print('Mean AP = {:.4f}'.format(np.mean(aps)))


    def voc_ap(self, rec, prec, use_07_metric=True):
        """ ap = voc_ap(rec, prec, [use_07_metric])
        Compute VOC AP given precision and recall.
        If use_07_metric is true, uses the
        VOC 07 11 point method (default:True).
        """
        if use_07_metric:
            # 11 point metric
            ap = 0.
            for t in np.arange(0., 1.1, 0.1):
                if np.sum(rec >= t) == 0:
                    p = 0
                else:
                    p = np.max(prec[rec >= t])
                ap = ap + p / 11.
        else:
            # correct AP calculation
            # first append sentinel values at the end
            mrec = np.concatenate(([0.], rec, [1.]))
            mpre = np.concatenate(([0.], prec, [0.]))

            # compute the precision envelope
            for i in range(mpre.size - 1, 0, -1):
                mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

            # to calculate area under PR curve, look for points
            # where X axis (recall) changes value
            i = np.where(mrec[1:] != mrec[:-1])[0]

            # and sum (\Delta recall) * prec
            ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
        return ap


    def voc_eval(self, detpath, classname, cachedir, ovthresh=0.5, use_07_metric=True):
        if not os.path.isdir(cachedir):
            os.mkdir(cachedir)
        cachefile = os.path.join(cachedir, 'annots.pkl')
        # read list of images
        with open(self.imgsetpath, 'r') as f:
            lines = f.readlines()
        imagenames = [x.strip() for x in lines]
        if not os.path.isfile(cachefile):
            # load annots
            recs = {}
            for i, imagename in enumerate(imagenames):
                recs[imagename] = self.parse_rec(self.annopath % (imagename))
                if i % 100 == 0 and self.display:
                    print('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
            # save
            if self.display:
                print('Saving cached annotations to {:s}'.format(cachefile))
            with open(cachefile, 'wb') as f:
                pickle.dump(recs, f)
        else:
            # load
            with open(cachefile, 'rb') as f:
                recs = pickle.load(f)

        # extract gt objects for this class
        class_recs = {}
        npos = 0
        for imagename in imagenames:
            R = [obj for obj in recs[imagename] if obj['name'] == classname]
            bbox = np.array([x['bbox'] for x in R])
            difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
            det = [False] * len(R)
            npos = npos + sum(~difficult)
            class_recs[imagename] = {'bbox': bbox,
                                    'difficult': difficult,
                                    'det': det}

        # read dets
        detfile = detpath.format(classname)
        with open(detfile, 'r') as f:
            lines = f.readlines()
        if any(lines) == 1:

            splitlines = [x.strip().split(' ') for x in lines]
            image_ids = [x[0] for x in splitlines]
            confidence = np.array([float(x[1]) for x in splitlines])
            BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

            # sort by confidence
            sorted_ind = np.argsort(-confidence)
            sorted_scores = np.sort(-confidence)
            BB = BB[sorted_ind, :]
            image_ids = [image_ids[x] for x in sorted_ind]

            # go down dets and mark TPs and FPs
            nd = len(image_ids)
            tp = np.zeros(nd)
            fp = np.zeros(nd)
            for d in range(nd):
                R = class_recs[image_ids[d]]
                bb = BB[d, :].astype(float)
                ovmax = -np.inf
                BBGT = R['bbox'].astype(float)
                if BBGT.size > 0:
                    # compute overlaps
                    # intersection
                    ixmin = np.maximum(BBGT[:, 0], bb[0])
                    iymin = np.maximum(BBGT[:, 1], bb[1])
                    ixmax = np.minimum(BBGT[:, 2], bb[2])
                    iymax = np.minimum(BBGT[:, 3], bb[3])
                    iw = np.maximum(ixmax - ixmin, 0.)
                    ih = np.maximum(iymax - iymin, 0.)
                    inters = iw * ih
                    uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
                        (BBGT[:, 2] - BBGT[:, 0]) *
                        (BBGT[:, 3] - BBGT[:, 1]) - inters)
                    overlaps = inters / uni
                    ovmax = np.max(overlaps)
                    jmax = np.argmax(overlaps)

                if ovmax > ovthresh:
                    if not R['difficult'][jmax]:
                        if not R['det'][jmax]:
                            tp[d] = 1.
                            R['det'][jmax] = 1
                        else:
                            fp[d] = 1.
                else:
                    fp[d] = 1.

            # compute precision recall
            fp = np.cumsum(fp)
            tp = np.cumsum(tp)
            rec = tp / float(npos)
            # avoid divide by zero in case the first detection matches a difficult
            # ground truth
            prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
            ap = self.voc_ap(rec, prec, use_07_metric)
        else:
            rec = -1.
            prec = -1.
            ap = -1.

        return rec, prec, ap


    def evaluate_detections(self, box_list):
        self.write_voc_results_file(box_list)
        self.do_python_eval()


In [2]:
from data.voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT
from data.cocodataset import coco_class_index, coco_class_labels, COCODataset, coco_root
from data.config import *
import torch
import cv2
import numpy as np

def detection_collate(batch):
    """Custom collate fn for dealing with batches of images that have a different
    number of associated object annotations (bounding boxes).

    Arguments:
        batch: (tuple) A tuple of tensor images and lists of annotations

    Return:
        A tuple containing:
            1) (tensor) batch of images stacked on their 0 dim
            2) (list of tensors) annotations for a given image are stacked on
                                 0 dim
    """
    targets = []
    imgs = []
    # 这个batch是什么呢？batch=[dataset[0],dataset[1],...,dataset[batch_size-1]]
    # dataset[0]其实就是调用了__getitem__()方法取出一个img和一个target，组成的一个tuple
    # sample[0]对应img，sample[1]对应target或者说label，ground truth
    # 具体看voc0712.py里的im, gt, h, w = self.pull_item(index)，注意：这里不需要用的sample[2]和[3]
    for sample in batch:
        imgs.append(sample[0])
        targets.append(torch.FloatTensor(sample[1]))
    # torch.stack(imgs, 0)就是实现(batch_size, W, H)
    # targets这里的形状不需要再做改动了，已经处理好了，具体看voc0712.py
    # target的具体形状[xmin, ymin, xmax, ymax, label_ind]，是的，现在还没处理成(cx,cy,w,h)
    return torch.stack(imgs, 0), targets 


def base_transform(image, size, mean, std):
    x = cv2.resize(image, (size, size)).astype(np.float32)
    x /= 255.
    x -= mean
    x /= std
    return x


class BaseTransform:
    def __init__(self, size, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)):
        self.size = size
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)

    def __call__(self, image, boxes=None, labels=None):
        return base_transform(image, self.size, self.mean, self.std), boxes, labels


In [3]:
evaluator = VOCAPIEvaluator(data_root='/Users/lan/Downloads/VOCdevkit',
                                    img_size=416, # val_size是416（意思是图片大小416*416）
                                    device=torch.device("cpu"),
                                    transform=BaseTransform(416), # 预测时只需要做基本的数据增强（归一化），跟训练不一样
                                    labelmap=VOC_CLASSES
                                    )

In [8]:
evaluator.output_dir

'voc_eval/test'

In [7]:
evaluator.imgsetpath

'/Users/lan/Downloads/VOCdevkit/VOC2007/ImageSets/Main/test.txt'

In [6]:
evaluator.imgpath

'/Users/lan/Downloads/VOCdevkit/VOC2007/JPEGImages/%s.jpg'

In [5]:
evaluator.annopath

'/Users/lan/Downloads/VOCdevkit/VOC2007/Annotations/%s.xml'

In [4]:
evaluator.devkit_path

'/Users/lan/Downloads/VOCdevkitVOC2007'

In [11]:
evaluator.dataset.ids

[('/Users/lan/Downloads/VOCdevkit/VOC2007', '000001'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000002'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000003'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000004'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000006'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000008'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000010'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000011'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000013'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000014'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000015'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000018'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000022'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000025'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000027'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000028'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000029'),
 ('/Users/lan/Downloads/VOCdevkit/VOC2007', '000031'),
 ('/Users/

In [14]:
self = evaluator
num_images = len(self.dataset)
num_images

4952

In [15]:
len(self.labelmap)

20

In [21]:
a = list()


<function list.index(value, start=0, stop=9223372036854775807, /)>

In [22]:
self.all_boxes = [[[] for _ in range(num_images)] for _ in range(len(self.labelmap))]

In [23]:
import numpy as np
np.array(self.all_boxes) # 20 * 4952的空壳子

array([], shape=(20, 4952, 0), dtype=float64)

In [24]:
os.path.join(self.output_dir, 'detections.pkl')

'voc_eval/test/detections.pkl'

In [25]:
im, gt, h, w = self.dataset.pull_item(0)

In [27]:
im.shape

torch.Size([3, 416, 416])

In [29]:
im.unsqueeze(0).shape

torch.Size([1, 3, 416, 416])

In [None]:
x = Variable(im.unsqueeze(0)).to(self.device)

In [31]:
import torch
import torch.nn as nn
from utils import Conv, SPP
from backbone import resnet18
import numpy as np
import tools

class myYOLO(nn.Module):
    def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.01, nms_thresh=0.5):
        super(myYOLO, self).__init__()
        self.device = device                           # cuda或者是cpu
        self.num_classes = num_classes                 # 类别的数量
        self.trainable = trainable                     # 训练的标记
        self.conf_thresh = conf_thresh                 # 得分阈值
        self.nms_thresh = nms_thresh                   # NMS阈值
        self.stride = 32                               # 网络的最大步长
        self.grid_cell = self.create_grid(input_size)  # 网格坐标矩阵
        self.input_size = input_size                   # 输入图像大小
        
        # backbone: resnet18
        self.backbone = resnet18(pretrained=False)
        c5 = 512

        # neck: SPP
        self.neck = nn.Sequential(
            SPP(),
            Conv(c5*4, c5, k=1), # 通过1*1卷积降维到512个channel
        )

        # detection head
        self.convsets = nn.Sequential(
            # SPP output size: 13*13*512
            Conv(c5, 256, k=1), # size: 13*13, channel: 256
            Conv(256, 512, k=3, p=1), # size: (13-3+2*1)/1 + 1 = 13, channel: 512
            Conv(512, 256, k=1), # size: 13*13, channel: 256
            Conv(256, 512, k=3, p=1) # size: (13-3+2*1)/1 + 1 = 13, channel: 512
        )

        # pred
        self.pred = nn.Conv2d(512, 1 + self.num_classes + 4, 1) # size: 13*13*(1+C+4)
    

    def create_grid(self, input_size):
        """ 
            用于生成G矩阵，其中每个元素都是特征图上的像素坐标。
        """
        w, h = input_size, input_size
        # generate grid cells
        ws, hs = w // self.stride, h // self.stride
        grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)])
        grid_xy = torch.stack([grid_x, grid_y], dim=-1).float()
        grid_xy = grid_xy.view(1, hs*ws, 2).to(self.device)
        
        return grid_xy


    def set_grid(self, input_size):
        """
            用于重置G矩阵。
        """
        self.input_size = input_size
        self.grid_cell = self.create_grid(input_size)


    def decode_boxes(self, pred):
        """
            将txtytwth转换为常用的x1y1x2y2形式。
        """
        output = torch.zeros_like(pred)
        # 得到所有bbox 的中心点坐标和宽高
        pred[:, :, :2] = torch.sigmoid(pred[:, :, :2]) + self.grid_cell
        pred[:, :, 2:] = torch.exp(pred[:, :, 2:])

        # 将所有bbox的中心带你坐标和宽高换算成x1y1x2y2形式
        output[:, :, 0] = pred[:, :, 0] * self.stride - pred[:, :, 2] / 2
        output[:, :, 1] = pred[:, :, 1] * self.stride - pred[:, :, 3] / 2
        output[:, :, 2] = pred[:, :, 0] * self.stride + pred[:, :, 2] / 2
        output[:, :, 3] = pred[:, :, 1] * self.stride + pred[:, :, 3] / 2
        
        return output


    def nms(self, dets, scores):
        """"Pure Python NMS baseline."""
        x1 = dets[:, 0]  #xmin
        y1 = dets[:, 1]  #ymin
        x2 = dets[:, 2]  #xmax
        y2 = dets[:, 3]  #ymax

        areas = (x2 - x1) * (y2 - y1)
        order = scores.argsort()[::-1]
        

        keep = []                                             
        while order.size > 0:
            i = order[0]
            keep.append(i)
            # 计算交集的左上角点和右下角点的坐标
            xx1 = np.maximum(x1[i], x1[order[1:]])
            yy1 = np.maximum(y1[i], y1[order[1:]])
            xx2 = np.minimum(x2[i], x2[order[1:]])
            yy2 = np.minimum(y2[i], y2[order[1:]])
            # 计算交集的宽高
            w = np.maximum(1e-28, xx2 - xx1)
            h = np.maximum(1e-28, yy2 - yy1)
            # 计算交集的面积
            inter = w * h

            # 计算交并比
            ovr = inter / (areas[i] + areas[order[1:]] - inter)
            # 滤除超过nms阈值的检测框
            inds = np.where(ovr <= self.nms_thresh)[0]
            order = order[inds + 1]

        return keep


    def postprocess(self, bboxes, scores):
        """
        bboxes: (HxW, 4), bsize = 1
        scores: (HxW, num_classes), bsize = 1
        """

        cls_inds = np.argmax(scores, axis=1)
        scores = scores[(np.arange(scores.shape[0]), cls_inds)]
        
        # threshold
        keep = np.where(scores >= self.conf_thresh)
        bboxes = bboxes[keep]
        scores = scores[keep]
        cls_inds = cls_inds[keep]

        # NMS
        keep = np.zeros(len(bboxes), dtype=np.int)
        for i in range(self.num_classes):
            inds = np.where(cls_inds == i)[0]
            if len(inds) == 0:
                continue
            c_bboxes = bboxes[inds]
            c_scores = scores[inds]
            c_keep = self.nms(c_bboxes, c_scores)
            keep[inds[c_keep]] = 1

        keep = np.where(keep > 0)
        bboxes = bboxes[keep]
        scores = scores[keep]
        cls_inds = cls_inds[keep]

        return bboxes, scores, cls_inds


    def forward(self, x, target=None):
        # backbone主干网络
        c5 = self.backbone(x)

        # neck网络
        p5 = self.neck(c5)

        # detection head网络
        p5 = self.convsets(p5)

        # 预测层
        pred = self.pred(p5)
        # 最后的输出是：Batch_size*13*13*(1+C+4)

        # 对pred 的size做一些view调整，便于后续的处理
        # [B, C, H, W] -> [B, C, H*W] -> [B, H*W, C]
        pred = pred.view(p5.size(0), 1 + self.num_classes + 4, -1).permute(0, 2, 1)

        # 从pred中分离出objectness预测、类别class预测、bbox的txtytwth预测  
        # [B, H*W, 1]
        conf_pred = pred[:, :, :1]
        # [B, H*W, num_cls]
        cls_pred = pred[:, :, 1 : 1 + self.num_classes]
        # [B, H*W, 4]
        txtytwth_pred = pred[:, :, 1 + self.num_classes:]

        # train
        if self.trainable:
            conf_loss, cls_loss, bbox_loss, total_loss = tools.loss(pred_conf=conf_pred, 
                                                                    pred_cls=cls_pred,
                                                                    pred_txtytwth=txtytwth_pred,
                                                                    label=target
                                                                    )

            return conf_loss, cls_loss, bbox_loss, total_loss            
        # test
        else:
            with torch.no_grad():
                # batch size = 1
                # 测试时，笔者默认batch是1，因此，我们不需要用batch这个维度，用[0]将其取走。
                # [B, H*W, 1] -> [H*W, 1]
                conf_pred = torch.sigmoid(conf_pred)[0]
                # [B, H*W, 4] -> [H*W, 4], 并做归一化处理
                bboxes = torch.clamp((self.decode_boxes(txtytwth_pred) / self.input_size)[0], 0., 1.)
                # [B, H*W, 1] -> [H*W, num_class]，得分=<类别置信度>乘以<objectness置信度>
                scores = (torch.softmax(cls_pred[0, :, :], dim=1) * conf_pred)
                
                # 将预测放在cpu处理上，以便进行后处理
                scores = scores.to('cpu').numpy()
                bboxes = bboxes.to('cpu').numpy()
                
                # 后处理
                bboxes, scores, cls_inds = self.postprocess(bboxes, scores)

                return bboxes, scores, cls_inds


In [35]:
model = myYOLO(torch.device("cpu"), input_size=416, num_classes=20, trainable=False)
model.load_state_dict(torch.load("yolo_150.pth", map_location=torch.device('cpu')))

<All keys matched successfully>

In [38]:
Variable(im.unsqueeze(0))

tensor([[[[-2.1008, -2.1008, -2.1008,  ..., -2.0837, -2.0837, -2.1008],
          [-2.1008, -2.1008, -2.1008,  ..., -2.0665, -2.0837, -2.1008],
          [-2.1008, -2.1008, -2.1008,  ..., -2.0665, -2.1008, -2.0837],
          ...,
          [-1.4672, -1.5870, -1.5870,  ..., -1.6898, -1.6727, -1.6213],
          [-1.6898, -1.8268, -1.5185,  ..., -1.6042, -1.6042, -1.6213],
          [-2.0494, -1.8268, -1.4843,  ..., -1.5870, -1.6213, -1.6727]],

         [[-2.0182, -2.0182, -2.0182,  ..., -1.9657, -1.9657, -1.9832],
          [-2.0182, -2.0182, -2.0182,  ..., -1.9482, -1.9657, -1.9832],
          [-2.0182, -2.0182, -2.0182,  ..., -1.9482, -1.9832, -2.0007],
          ...,
          [-1.1779, -1.2654, -1.2479,  ..., -1.0728, -1.0903, -1.0903],
          [-1.4405, -1.5280, -1.1779,  ..., -1.0028, -1.0203, -1.0903],
          [-1.8081, -1.5105, -1.1429,  ..., -0.9853, -1.0378, -1.1429]],

         [[-1.8044, -1.8044, -1.8044,  ..., -1.7522, -1.7522, -1.7696],
          [-1.8044, -1.8044, -

In [40]:
x = Variable(im.unsqueeze(0)).to(self.device)

In [46]:
net=model
self=net
c5 = self.backbone(x)
p5 = self.neck(c5)
p5 = self.convsets(p5)
pred = self.pred(p5)
pred = pred.view(p5.size(0), 1 + self.num_classes + 4, -1).permute(0, 2, 1)
conf_pred = pred[:, :, :1]
cls_pred = pred[:, :, 1 : 1 + self.num_classes]
txtytwth_pred = pred[:, :, 1 + self.num_classes:]


In [47]:
conf_pred.shape

torch.Size([1, 169, 1])

In [55]:
conf_pred = torch.sigmoid(conf_pred)[0].detach()
conf_pred.shape


torch.Size([])

In [56]:
bboxes = torch.clamp((self.decode_boxes(txtytwth_pred) / self.input_size)[0], 0., 1.).detach()
bboxes.shape

torch.Size([169, 4])

In [75]:
scores = (torch.softmax(cls_pred[0, :, :], dim=1) * conf_pred).detach()
scores.shape

torch.Size([169, 20])

In [81]:
#scores = scores.to('cpu').numpy()
#bboxes = bboxes.to('cpu').numpy()
scores.shape, bboxes.shape

((169, 20), (169, 4))

In [82]:
cls_inds = np.argmax(scores, axis=1)
cls_inds.shape

(169,)

In [83]:
scores = scores[(np.arange(scores.shape[0]), cls_inds)]

In [84]:
scores.shape

(169,)

In [91]:
keep = np.where(scores >= self.conf_thresh)
keep[0].shape

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168]),)

In [93]:
bboxes = bboxes[keep]
bboxes.shape

(169, 4)

In [94]:
scores = scores[keep] 
scores.shape

(169,)

In [97]:
cls_inds = cls_inds[keep]
cls_inds.shape

(169,)

In [98]:
keep = np.zeros(len(bboxes), dtype=np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  keep = np.zeros(len(bboxes), dtype=np.int)


In [101]:
np.where(cls_inds == 2)

(array([  0,   1,   2,   3,   4,  14,  15,  16,  28,  39,  40, 139, 164,
        165]),)

In [102]:
inds = np.where(cls_inds == 2)

In [104]:
c_bboxes = bboxes[inds]
c_scores = scores[inds]
c_bboxes.shape, c_scores.shape

((14, 4), (14,))

In [105]:
dets = c_bboxes
scores = c_scores

In [107]:
x1 = dets[:, 0]  #xmin
y1 = dets[:, 1]  #ymin
x2 = dets[:, 2]  #xmax
y2 = dets[:, 3]  #ymax
x1.shape

(14,)

In [110]:
areas = (x2 - x1) * (y2 - y1)
areas.shape

(14,)

In [111]:
areas

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
      dtype=float32)

In [112]:
order = scores.argsort()[::-1]
order.shape

(14,)

In [114]:
scores

array([0.20706882, 0.2861662 , 0.3447466 , 0.33171812, 0.32655048,
       0.32176718, 0.38391808, 0.28923357, 0.38467342, 0.27817774,
       0.30056864, 0.22722596, 0.39856124, 0.33363044], dtype=float32)

In [113]:
order

array([12,  8,  6,  2, 13,  3,  4,  5, 10,  7,  1,  9, 11,  0])

In [115]:
keep = []                                             
while order.size > 0:
    i = order[0]
    keep.append(i)
    # 计算交集的左上角点和右下角点的坐标
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])
    # 计算交集的宽高
    w = np.maximum(1e-28, xx2 - xx1)
    h = np.maximum(1e-28, yy2 - yy1)
    # 计算交集的面积
    inter = w * h

    # 计算交并比
    ovr = inter / (areas[i] + areas[order[1:]] - inter)
    # 滤除超过nms阈值的检测框
    inds = np.where(ovr <= self.nms_thresh)[0]
    order = order[inds + 1]

In [116]:
keep

[12]