In [6]:
import argparse
import os
import sys
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import common
import tensorrt as trt
import cv2 as cv
import torchvision
import torch
import torchvision.transforms.functional as TF
import time

# from utils.general import non_max_suppression

ctx = cuda.Device(0).make_context()


def img_process(img_path,batch_size):
    img = cv.imread(img_path, cv.IMREAD_COLOR)
    img  = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    img = torch.from_numpy(img).cuda()
    img = img.permute(2, 0, 1)
    img = TF.resize(img,(384,288))
    img = img.div(255)
    img = TF.normalize(img,(0.485, 0.456, 0.406), (0.229, 0.224, 0.225))         
    imgs = torch.stack([img])
    return imgs

# input / output buffer 생성
def allocate_buffers(engine, batch_size, buffer_type="output"): 
    res = None
    host_mem = None
    device_mem = None
    
    for binding in engine:
        b_shape = engine.get_binding_shape(binding)
        b_shape[0] = batch_size
        size = trt.volume(b_shape)
        dtype = trt.nptype(engine.get_binding_dtype(binding)) #numpy.float32

        if buffer_type == 'input':
            if engine.binding_is_input(binding):
                host_mem = cuda.pagelocked_empty(size, dtype)
                device_mem = cuda.mem_alloc(host_mem.nbytes)
            else:
                continue            
            
        if buffer_type == 'output':
            if engine.binding_is_input(binding):
                continue
            else:
                host_mem = cuda.pagelocked_empty(size, dtype)
                device_mem = cuda.mem_alloc(host_mem.nbytes)
            
    res = {
        'host_mem' : host_mem
        ,'device_mem' : device_mem
    }
    
    return res
      

def do_inference_v2(context, input_data, input, output, stream):
    res = None
    ctx.push()
    bindings = None     
    if input == None : # 입력 버퍼 할당 하지 않은경우(input_data == tensor cuda)
        bindings = [
            int(input_data.contiguous().data_ptr())
            ,int(output['device_mem'])
        ]        
    else :
        bindings = [
            int(input['device_mem'])
            ,int(output['device_mem'])
        ]        
        
        input['host_mem'] = input_data        
        cuda.memcpy_htod_async(input['device_mem'], input['host_mem'], stream)
    
    context.execute_async_v2(bindings,stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(output['host_mem'], output['device_mem'], stream)

    stream.synchronize()
    ctx.pop()

    res = output['host_mem']
    return res

def make_context(trt_engine_path, batch_size):
    logger = trt.Logger(trt.Logger.WARNING)
    runtime = trt.Runtime(logger)
    #엔진 로드 
    print(trt_engine_path)
    engine = load_engine(runtime, trt_engine_path)
    print(engine)
    #inference를 위한 context 만들기
    context = engine.create_execution_context()
    #context 사이즈 지정 해주기
    context.set_binding_shape(0, (batch_size, 3, 384, 256)) #바인딩의 dynamic shape을 설정한다 
    stream = cuda.Stream()
    
    return engine ,context, stream



def load_engine(trt_runtime, engine_path):
    trt.init_libnvinfer_plugins(None, "")             
    with open(engine_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

def make_output(result,batch_size):
    result = np.reshape(result,(batch_size,17,-1,-1))    
    outputs = torch.Tensor(result)
    outputs = outputs.view([batch_size, 17,-1,]) 
    num_classes = 80
    confthre =0.5
    nmsthre  =0.3
    outputs = postprocess(
        outputs, num_classes, confthre,
        nmsthre, class_agnostic=True)

    return outputs



#엔진 경로 설정해주기 
trt_engine_path = '/DATA_17/trt_test/engines/hrnet_0524/hrnet_fp16_004.trt'

#배치사이즈 설정하기
batch_size = 1
#이미지 경로 설정     
# img_path = '/DATA_17/ij/worker.jpg'   
img_path = '/DATA_17/hjjo/selftest/deep-high-resolution-net.pytorch/person_23_0_1.jpg'  




engine ,context, stream = make_context(trt_engine_path, batch_size)

#버퍼 할당해주기 
inputs = allocate_buffers(engine, batch_size, buffer_type="input")
output = allocate_buffers(engine, batch_size, buffer_type="output")

img_stack = img_process(img_path,batch_size)


# input_data = torch.tensor(img_stack).cuda() #input 버퍼할당해 주지 않고 데이터를 바로 보낼때 
result = do_inference_v2(context, img_stack, None, output, stream) #결과 생성
result = torch.from_numpy(result).cuda()
print('result', result)
pred = result.argmax(0)
print(pred)
# pred = pred.cpu().numpy()
# pred = pred.reshape(-1)

# print(pred)
# pred = pred.cpu().numpy()
# pred = pred.reshape(-1)
# pred_zip.append(pred[0])


# output = make_output(result,batch_size)








/DATA_17/trt_test/engines/hrnet_0524/hrnet_fp16_004.trt
[05/24/2022-09:36:55] [TRT] [E] 1: [stdArchiveReader.cpp::StdArchiveReader::35] Error Code 1: Serialization (Serialization assertion safeVersionRead == safeSerializationVersion failed.Version tag does not match. Note: Current Version: 0, Serialized Engine Version: 43)
None
[05/24/2022-09:36:55] [TRT] [E] 4: [runtime.cpp::deserializeCudaEngine::50] Error Code 4: Internal Error (Engine deserialization failed.)


AttributeError: 'NoneType' object has no attribute 'create_execution_context'

In [3]:
a = [1,2,3]
ac = ['a','b','c']
for idx, (i,x) in enumerate(zip(a,ac)):
    print(idx,i,x)

0 1 a
1 2 b
2 3 c
