# Running inference with an MMDetection model in other formats

## 1. Inference with the MMDetection PyTorch model

In [17]:
from mmdet.apis import init_detector, inference_detector, async_inference_detector
from mmdet.utils import register_all_modules
import torch
import torchvision.transforms as transforms
from PIL import Image
import mmcv
from mmcv.transforms import Compose
import numpy as np
import os
import xmltodict
from mmdet.datasets.voc import VOCDataset
from mmengine.fileio import list_from_file

# Paths of the trained checkpoint and of the model config file it belongs to.
checkpoint_file = 'work_dirs/yolov3_mobilenetv2_pretrained/best_pascal_voc_mAP_epoch_27.pth'
config_file = 'configs/yolo/yolov3_mobilenetv2_8xb24-ms-416-300e_coco.py'

# Class name -> integer label. Labels are assigned in listing order,
# starting at 0, giving 20 entries (0..19).
class_dict = {
    name: label
    for label, name in enumerate((
        'aeroplane',
        'bicycle',
        'bird',
        'boat',
        'bottle',
        'bus',
        'car',
        'cat',
        'chair',
        'cow',
        'diningtable',
        'dog',
        'horse',
        'motorbike',
        'person',
        'pottedplant',
        'sheep',
        'sofa',
        'train',
        'tvmonitor',
    ))
}

# Register all mmdet modules into the registries before building the model.
register_all_modules()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU execution, assign torch.device("cpu") unconditionally.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the detector from the config file and checkpoint on the chosen
# device, then put it in evaluation mode.
model = init_detector(config_file, checkpoint_file, device=device)
model.eval()

# Transform pipeline handed to the dataset below.
# LoadAnnotations is deliberately placed after Resize so that the bounding
# boxes are not resized along with the image.
test_pipeline = [
    {'type': 'LoadImageFromFile'},
    {'type': 'Resize', 'scale': (224, 224), 'keep_ratio': True},
    {'type': 'LoadAnnotations', 'with_bbox': True},
    {
        'type': 'PackDetInputs',
        'meta_keys': (
            'img_id',
            'img_path',
            'ori_shape',
            'img_shape',
            'scale_factor',
        ),
    },
]

# Arguments for the test-split dataset: the image-id list file, the image and
# annotation directories, test mode enabled, and the pipeline defined above.
voc_test_kwargs = {
    'ann_file': '../data/VOCdevkit/VOC2007/ImageSets/Main/test.txt',
    'img_subdir': '../data/VOCdevkit/VOC2007/test_img/',
    'ann_subdir': '../data/VOCdevkit/VOC2007/Annotations/',
    'test_mode': True,
    'pipeline': test_pipeline,
}
data = VOCDataset(**voc_test_kwargs)

# Image ids of the test split, read from the split's id list file.
img_ids = list_from_file("../data/VOCdevkit/VOC2007/ImageSets/Main/test.txt")

# Ground truth comes from data[i] and detections from img_ids[i]; the two are
# paired purely by position, so fail fast if the sources differ in length
# instead of silently evaluating mismatched pairs.
if len(img_ids) != len(data):
    raise ValueError(
        f'id list has {len(img_ids)} entries but the dataset has '
        f'{len(data)} samples; annotations and detections would not line up')

# annotations[i]: ground-truth dict for image i.
# det_results[i]: list with one array per class for image i.
annotations, det_results  = [], []
for i in range(len(data)):
    # Index the dataset a single time per image: every data[i] access runs
    # the dataset pipeline again (image load + resize).
    data_sample = data[i]['data_samples']
    gt_instances = data_sample.gt_instances
    ignored_instances = data_sample.ignored_instances

    ann = {'bboxes': gt_instances.bboxes.cpu().numpy(),
           'labels': gt_instances.labels.cpu().numpy(),
           'bboxes_ignore': ignored_instances.bboxes.cpu().numpy(),
           'labels_ignore': ignored_instances.labels.cpu().numpy()}
    annotations.append(ann)

    img_path = os.path.join('../data/VOCdevkit/VOC2007/JPEGImages/', img_ids[i] + '.jpg')
    # Run the detector on the image.
    # NOTE(review): the image is read in RGB order, whereas mmcv.imread
    # defaults to BGR -- confirm this is the channel order the model's
    # preprocessing expects.
    img = mmcv.imread(img_path, channel_order='rgb')
    output2 = inference_detector(model, img)

    pred_bboxes = output2.pred_instances.bboxes.cpu().numpy()
    pred_scores = output2.pred_instances.scores.cpu().numpy()
    pred_labels = output2.pred_instances.labels.cpu().numpy()

    # Group the predictions by class: for each label, stack the box
    # coordinates with the score as a trailing column, giving one (k, 5)
    # array per class (k == 0 when the class has no detections).
    dets = []
    for label in range(len(class_dict)):
        index = np.where(pred_labels == label)[0]
        pred_bbox_scores = np.hstack(
            [pred_bboxes[index], pred_scores[index].reshape((-1, 1))])
        dets.append(pred_bbox_scores)

    det_results.append(dets)


print(det_results)
print(annotations)


Loads checkpoint by local backend from path: work_dirs/yolov3_mobilenetv2_pretrained/best_pascal_voc_mAP_epoch_27.pth
[[array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([[6.3758644e+01, 2.4518375e+02, 1.7130310e+02, 3.6519455e+02,
        3.3476326e-01]], dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([[ 63.758644 , 245.18375  , 171.3031   , 365.19455  ,   0.6673685]],
      dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([[  1.4123664,   2.3509777, 347.7869   , 496.03958  ,   0.999199 ]],
      dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(0, 5), dtype=float32), array([], shape=(

In [18]:
# Spot-check the ground-truth record collected for the test sample at index 4.
annotations[4]

{'bboxes': array([[186., 134., 281., 241.],
        [153., 208., 368., 374.],
        [254., 206., 365., 374.],
        [137., 210., 248., 374.]], dtype=float32),
 'labels': array([15, 10,  8,  8]),
 'bboxes_ignore': array([[297., 194., 331., 246.],
        [278., 189., 307., 230.],
        [136., 191., 150., 198.],
        [136., 197., 155., 211.]], dtype=float32),
 'labels_ignore': array([8, 8, 8, 8])}

## 2. Evaluate predictions against ground truth

In [19]:
from mmdet.evaluation.functional.mean_ap import eval_map

# Score the collected detections against the ground truth.
# eval_map returns the mean AP together with the per-class results.
mean_ap, eval_results = eval_map(
    det_results,
    annotations,
    eval_mode='11points',        # AP computed with the '11points' mode
    use_legacy_coordinate=True,  # use eval_map's legacy box-coordinate convention
)



+-------+------+------+--------+-------+
| class | gts  | dets | recall | ap    |
+-------+------+------+--------+-------+
| 0     | 285  | 705  | 0.582  | 0.496 |
| 1     | 337  | 1842 | 0.748  | 0.613 |
| 2     | 459  | 720  | 0.497  | 0.399 |
| 3     | 263  | 1654 | 0.517  | 0.319 |
| 4     | 469  | 1205 | 0.260  | 0.208 |
| 5     | 213  | 712  | 0.732  | 0.591 |
| 6     | 1201 | 2840 | 0.649  | 0.566 |
| 7     | 358  | 773  | 0.696  | 0.587 |
| 8     | 756  | 3625 | 0.549  | 0.311 |
| 9     | 244  | 477  | 0.537  | 0.404 |
| 10    | 206  | 1486 | 0.718  | 0.491 |
| 11    | 489  | 1325 | 0.714  | 0.577 |
| 12    | 348  | 899  | 0.721  | 0.591 |
| 13    | 325  | 1460 | 0.806  | 0.690 |
| 14    | 4528 | 8503 | 0.624  | 0.533 |
| 15    | 480  | 1324 | 0.448  | 0.315 |
| 16    | 242  | 511  | 0.550  | 0.435 |
| 17    | 239  | 1156 | 0.770  | 0.521 |
| 18    | 282  | 750  | 0.716  | 0.638 |
| 19    | 308  | 1147 | 0.630  | 0.482 |
+-------+------+------+--------+-------+
| mAP   |      