In [1]:
# Set OMP_NUM_THREADS=2 for this kernel process. This runs in the first cell,
# ahead of the numpy/torch/mmcv imports below.
# NOTE(review): presumably meant to limit CPU thread oversubscription across
# the loader threads started later -- confirm.
%env OMP_NUM_THREADS=2

env: OMP_NUM_THREADS=2


In [2]:
from queue import Queue
from threading import Thread

In [3]:
from mmdet.apis import init_detector, inference_detector
import mmcv

import numpy as np

In [4]:
import warnings

import matplotlib.pyplot as plt
import mmcv
import torch
from mmcv.parallel import collate, scatter
from mmcv.runner import load_checkpoint

from mmdet.core import get_classes
from mmdet.datasets.pipelines import Compose
from mmdet.models import build_detector
from mmdet.ops import RoIAlign, RoIPool

In [5]:
import time

In [6]:
# Detector config passed to init_detector() when the models are created.
config_file = (
    'mmdetection/configs/icartoonface/'
    'fr50_lite_dcn_att_gn_scratch_icf_wf.py'
)
# Checkpoint passed to init_detector() alongside the config.
checkpoint_file = (
    'work_dirs/fr50_lite_dcn_att_gn_scratch_icf_wf/'
    'latest.pth'
)

In [7]:
class LoadImage(object):
    """First pipeline step: read ``results['img']`` and fill in image metadata.

    ``results['img']`` may be a file path (``str``) or any other object that
    ``mmcv.imread`` accepts. The dict is modified in place and returned.
    """

    def __call__(self, results):
        """Load the image and record its filename, fields and shapes.

        Args:
            results (dict): must contain the key ``'img'``.

        Returns:
            dict: the same dict, with ``'img'`` replaced by the value returned
            from ``mmcv.imread`` and the keys ``'filename'``,
            ``'ori_filename'``, ``'img_fields'``, ``'img_shape'`` and
            ``'ori_shape'`` set.
        """
        source = results['img']
        # Only a string input has a filename; anything else is recorded as None.
        name = source if isinstance(source, str) else None
        results['filename'] = name
        results['ori_filename'] = name

        image = mmcv.imread(source)
        results['img'] = image
        results['img_fields'] = ['img']
        # No resizing has happened yet, so both shapes are the loaded shape.
        results['img_shape'] = image.shape
        results['ori_shape'] = image.shape
        return results

In [8]:
# data preprocessing loader
def data_loader(model, img_q, data_q):
    """Worker loop: preprocess images from ``img_q`` and put batches on ``data_q``.

    Intended to run as the target of a ``Thread``. Per-worker setup (test
    pipeline, device lookup, CPU RoI patching) is done once before the loop
    instead of once per image.

    Args:
        model: detector exposing ``cfg``, ``parameters()`` and ``modules()``.
            It is used for its config and device, and on CPU its
            RoIPool/RoIAlign modules are patched; no forward pass is run here.
        img_q (Queue): source of images (anything ``LoadImage`` accepts).
            Putting ``None`` tells this worker to stop.
        data_q (Queue): destination for the collated (and, on GPU, scattered)
            data dicts, ready to be splatted into ``model(**data)``.

    ``img_q.task_done()`` is called for every item taken, including the
    sentinel and items whose preprocessing raised, so ``img_q.join()`` cannot
    hang on a failed image. An exception still propagates and ends the worker.
    """
    cfg = model.cfg
    first_param = next(model.parameters())
    device = first_param.device  # model device
    on_gpu = first_param.is_cuda

    # build the data pipeline once; the first configured step is replaced by
    # LoadImage so inputs that are not file paths are handled too
    test_pipeline = Compose([LoadImage()] + cfg.data.test.pipeline[1:])

    if not on_gpu:
        # Use torchvision ops for CPU mode instead
        for m in model.modules():
            if isinstance(m, (RoIPool, RoIAlign)):
                if not m.aligned:
                    # aligned=False is not implemented on CPU
                    # set use_torchvision on-the-fly
                    m.use_torchvision = True
        warnings.warn('We set use_torchvision=True in CPU mode.')

    while True:
        img = img_q.get()
        try:
            if img is None:
                # shutdown sentinel; the finally clause still accounts for it
                break
            # prepare data
            data = test_pipeline(dict(img=img))
            data = collate([data], samples_per_gpu=1)
            if on_gpu:
                # scatter to specified GPU
                data = scatter(data, [device])[0]
            else:
                # just get the actual data from DataContainer
                data['img_metas'] = data['img_metas'][0].data
            data_q.put(data)
        finally:
            img_q.task_done()

In [9]:
# solo worker definition
def single_inference_detector(model, data_q, result_q):
    """Worker loop: run ``model`` on each item from ``data_q``.

    Intended to run as the target of a ``Thread``.

    Args:
        model: callable invoked as
            ``model(return_loss=False, rescale=True, **data)``.
        data_q (Queue): source of keyword-argument dicts for the model.
            Putting ``None`` tells this worker to stop.
        result_q (Queue): receives one model output per processed item.

    ``data_q.task_done()`` is called for every item taken, including the
    sentinel and items on which the model raised, so ``data_q.join()`` cannot
    hang on a failed forward pass. An exception still propagates and ends the
    worker.
    """
    while True:
        data = data_q.get()
        try:
            if data is None:
                # shutdown sentinel; the finally clause still accounts for it
                break
            # forward the model
            with torch.no_grad():
                result = model(return_loss=False, rescale=True, **data)
            result_q.put(result)
        finally:
            data_q.task_done()

In [10]:
# number of concurrent GPU workers
# NOTE(review): `n_corcurrent` is a misspelling of "concurrent"; it is kept
# because later cells reference it by this name.
n_corcurrent = 4

# one detector instance per concurrent worker, all created on device 'cuda:0'
models = [init_detector(config_file, checkpoint_file, device='cuda:0') for _ in range(n_corcurrent)]

In [11]:
# Synthetic benchmark input: one random uint8 image, enqueued n_imgs times.
# randint's upper bound is exclusive, so 256 is needed to cover the full
# 0..255 range; dtype=np.uint8 avoids building a large int64 array only to
# cast it afterwards.
img = np.random.randint(0, 256, size=(1920, 1080, 3), dtype=np.uint8)
# alternative input size:
# img = np.random.randint(0, 256, size=(1333, 800, 3), dtype=np.uint8)

n_imgs = 2000

# img_q:    raw images waiting for preprocessing (unbounded)
# data_q:   preprocessed batches waiting for a GPU worker; bounded so the
#           loaders block instead of buffering every batch in memory
# result_q: model outputs (unbounded)
img_q = Queue()
data_q = Queue(n_corcurrent * 2)
result_q = Queue()

# put imgs into queue (the same array object each time; nothing is copied here)
for _ in range(n_imgs):
    img_q.put(img)

In [12]:
# multi thread data loader
# Two loader threads per GPU worker. Every loader is given models[0]; it is
# used for its config and device, not for inference.
n_data_loader = n_corcurrent * 2
# daemon=True: data_loader loops forever, and a non-daemon thread stuck in
# Queue.get() would keep the interpreter from exiting.
data_workers = [
    Thread(target=data_loader, args=(models[0], img_q, data_q), daemon=True)
    for _ in range(n_data_loader)
]



In [13]:

# warm up
# warmup = 5
# for model in models:
#     for _ in range(warmup):
#         inference_detector(model, img)


# One inference thread per detector instance, all draining the shared data_q.
# daemon=True: single_inference_detector loops forever, and a non-daemon
# thread stuck in Queue.get() would keep the interpreter from exiting.
gpu_workers = [
    Thread(target=single_inference_detector, args=(model, data_q, result_q), daemon=True)
    for model in models
]


In [14]:
# start data loaders; each one begins pulling from img_q right away and
# blocks in data_q.put() once the bounded data_q is full
for worker in data_workers:
    worker.start()
# (optional) time preprocessing alone -- only returns once something is
# draining data_q, because the loaders block when it is full:
# %time img_q.join()

In [15]:
# wait for data queue
time.sleep(1)

# start threads
for worker in gpu_workers:
    worker.start()
# wait to finish
%time data_q.join()

CPU times: user 12min 6s, sys: 1min 49s, total: 13min 56s
Wall time: 2min 1s


In [16]:
# number of results collected; should equal n_imgs (one result per image)
result_q.qsize()

2000