# Object Detection with YoloV4 and TensorRT

The CSI camera frames are JPEG-encoded with hardware acceleration (`nvjpegenc`).

YoloV4 imported from ONNX.

The converted TensorRT engine is cached on disk (`../data/yolov4.trt`) and reused on later runs.
FP32 is cast to FP16 here.

**TODO:** activate the observer for object detection (the `cameras[0].observe(...)` call is currently commented out).

## Pipeline
- nvargus
- GStreamer 
`nvarguscamerasrc sensor-id=%d ! video/x-raw(memory:NVMM), width=%d, height=%d, format=(string)NV12, framerate=(fraction)%d/1 ! nvvidconv flip-method=0 !  nvjpegenc`
- manual / observer
- preprocessor
    - scaling and padding (the camera capture size already matches the network input size)
- TensorRT yoloV4
- postprocessor
    - threshold
    - intersection over union
    - nms
- jupyter image widget

In [1]:
# Install the notebook's extra dependencies: wget, and the jetcam package straight from its GitHub repository.
!pip3 install wget
!pip3 install git+https://github.com/NVIDIA-AI-IOT/jetcam

Collecting git+https://github.com/NVIDIA-AI-IOT/jetcam
  Cloning https://github.com/NVIDIA-AI-IOT/jetcam to /tmp/pip-g27uuvee-build


In [2]:
from common import download_file

# Local paths and upstream URLs (onnx/models repository) of the YOLOv4 assets.
YOLOv4_FILE = '../data/yolov4.onnx'
YOLOv4_URL = 'https://media.githubusercontent.com/media/onnx/models/master/vision/object_detection_segmentation/yolov4/model/yolov4.onnx'

YOLOv4_ANCHORS_FILE = '../data/yolov4.anchors'
YOLOv4_ANCHORS_URL = 'https://raw.githubusercontent.com/onnx/models/master/vision/object_detection_segmentation/yolov4/dependencies/yolov4_anchors.txt'

COCO_NAMES_FILE = '../data/coco.names'
COCO_NAMES_URL = 'https://raw.githubusercontent.com/onnx/models/master/vision/object_detection_segmentation/yolov4/dependencies/coco.names'

# Fetch model, anchors and class names (in that order) and display each download_file() result.
for _asset_file, _asset_url in (
    (YOLOv4_FILE, YOLOv4_URL),
    (YOLOv4_ANCHORS_FILE, YOLOv4_ANCHORS_URL),
    (COCO_NAMES_FILE, COCO_NAMES_URL),
):
    display(download_file(_asset_file, _asset_url))

'../data/yolov4.onnx'

'../data/yolov4.anchors'

'../data/coco.names'

In [3]:
ENGINE_FILE = '../data/yolov4.trt'

# FIXME https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#work_dynamic_shapes
## network.get_input(0).shape = [1, 416, 416, 3]
YOLOv4_DIMS = (416, 416)

# Set up the CUDA context, engine and buffers only once per kernel session;
# run `del engine` first to force this cell to rebuild them.
if 'engine' not in globals():
    from pycuda.tools import make_default_context
    from onnx_to_tensorrt import get_engine
    from common import allocate_buffers

    cfx = make_default_context()
    engine = get_engine(YOLOv4_FILE, ENGINE_FILE)
    inputs, outputs, bindings, stream = allocate_buffers(engine)

Reading engine from file ../data/yolov4.trt


In [4]:
from data_processing import PreprocessYOLO, PostprocessYOLO, ALL_CATEGORIES

# Keep the existing preprocessor across cell re-runs; construct it only when it is missing.
if 'preprocessor' not in globals():
    preprocessor = PreprocessYOLO(YOLOv4_DIMS)

def reshape_output(trt_output):
    """Reshape one flat output buffer into ``(1, grid, grid, 3, 85)``.

    The grid size (52, 26 or 13) is chosen by matching the buffer's exact
    element count against ``grid * grid * 3 * 85``. A plain "multiple of
    grid*grid" check would also accept buffers that cannot be reshaped and
    would then fail inside ``reshape``.

    Args:
        trt_output: numpy array holding a single batch-1 network output.

    Returns:
        The reshaped array, or an empty list (after printing a message) when
        the element count matches none of the known grids.
    """
    anchors_per_cell = 3
    # presumably 4 box values + objectness + 80 class scores — TODO confirm against the model
    values_per_anchor = 85

    element_count = trt_output.size
    for grid in (52, 26, 13):
        if element_count == grid * grid * anchors_per_cell * values_per_anchor:
            return trt_output.reshape(1, grid, grid, anchors_per_cell, values_per_anchor)

    print('unknown trt_output size {}'.format(element_count))
    return []

def infer_from_camera(widget, camera):
    """Read a single frame from ``camera`` and run detection on it for ``widget``.

    ``camera.running`` is set to False before the frame is fetched with
    ``camera.read_image()``; the frame is then handed to ``infer_from_bytes``.
    """
    camera.running = False
    infer_from_bytes(widget, camera.read_image())
    
def infer_from_change(widget, change):
    """Observer-style callback: run detection on ``change.new`` and update ``widget``."""
    new_frame = change.new
    infer_from_bytes(widget, new_frame)

def infer_from_bytes(widget, change):
    """Run YOLOv4 on one camera frame and publish the annotated JPEG to ``widget``.

    Args:
        widget: Object whose ``value`` attribute receives the JPEG bytes of the
            frame with bounding boxes drawn on it.
        change: The frame itself (despite the name, not a change event); it is
            passed unchanged to ``image_ppreprocess``. Presumably JPEG-encoded
            bytes at the network input size — TODO confirm against the camera class.

    Uses the notebook globals ``cfx``, ``engine``, ``inputs``, ``outputs``,
    ``bindings`` and ``stream``. The CUDA context is pushed for the duration of
    the call and is popped again even when any step raises.
    """
    from common import do_inference_v2
    from yolo4_inference import image_ppreprocess, postprocess_bbbox, postprocess_boxes, nms, draw_bbox, get_anchors
    import numpy as np
    # Imported locally because the notebook only imports cv2 at top level in a later cell.
    import cv2

    cfx.push()
    try:
        ## size matches, jpeg encoded
        pre_img, np_img = image_ppreprocess(change, YOLOv4_DIMS)

        with engine.create_execution_context() as context:
            inputs[0].host = pre_img
            trt_outputs = do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
            # fixed by applying preprocessors shuffle https://forums.developer.nvidia.com/t/yolo-v3-output-boxes-are-nan-both-in-python-and-c/142289/2

            # NOTE(review): only trt_outputs[2] is decoded although STRIDES lists
            # three scales — confirm this is intended.
            trt_outputs_reshaped = list(map(reshape_output, [trt_outputs[2]]))

            ANCHORS = get_anchors(YOLOv4_ANCHORS_FILE)
            STRIDES = np.array([8, 16, 32])
            XYSCALE = [0.1,0.1,1.] # ORIG [1.2, 1.1, 1.05]

            pred_bbox = postprocess_bbbox(trt_outputs_reshaped, ANCHORS, STRIDES, XYSCALE)
            pp_bboxes = postprocess_boxes(pred_bbox, YOLOv4_DIMS, YOLOv4_DIMS[0], score_threshold=0.2)
            bboxes = nms(pp_bboxes, iou_threshold=0.25, method='nms')

            boxed_image = draw_bbox(np_img, bboxes, classes=ALL_CATEGORIES)
            widget.value = cv2.imencode('.jpg', boxed_image)[1].tobytes()
    finally:
        # Balance the push above on every exit path so a failed inference
        # does not leave the context on the stack.
        cfx.pop()

In [5]:
# List the available /dev/video* device nodes.
!ls -ltrh /dev/video*

crw-rw---- 1 root video 81, 3 Sep  7 18:12 /dev/video1
crw-rw---- 1 root video 81, 0 Sep  7 18:12 /dev/video0


In [6]:
# Report the versions of the libraries this notebook runs against.
import tensorrt
print('tensorrt %s' % tensorrt.__version__)

import cv2
print('cv2 %s' % cv2.__version__)

tensorrt 7.1.3.0
cv2 4.1.1


In [7]:
from importlib import reload
# from object_detection_utilities import MyCamera, update_image, transform_image

# Re-import the helper module so edits to object_detection_utilities.py are
# picked up without restarting the kernel.
import object_detection_utilities
reload(object_detection_utilities)

<module 'object_detection_utilities' from '/nvdli-nano/jupyter_notebooks/object_detection_utilities.py'>

In [8]:
# https://www.waveshare.com/wiki/IMX219-83_Stereo_Camera
# Resolution: 3280 × 2464 (per camera)
# Capture at the network input size.
WIDTH, HEIGHT = YOLOv4_DIMS[0], YOLOv4_DIMS[1]
FPS = 1

# Open both sensors only once per kernel session.
if 'cameras' not in globals():
    cameraLeft = object_detection_utilities.MyCamera(
        capture_device=0,
        capture_width=WIDTH,
        capture_height=HEIGHT,
        capture_fps=FPS,
    )
    cameraRight = object_detection_utilities.MyCamera(
        capture_device=1,
        capture_width=WIDTH,
        capture_height=HEIGHT,
        capture_fps=FPS,
    )

    cameras = [cameraLeft, cameraRight]

display(cameras)

[<object_detection_utilities.MyCamera at 0x7f50d86b70>,
 <object_detection_utilities.MyCamera at 0x7f8c060f28>]

In [12]:
from importlib import reload
import yolo4_inference
reload(yolo4_inference)

import ipywidgets
from IPython.display import display, Image
import cv2

# del widgets
try: widgets
except NameError:
    widgets = []

if not widgets:
    # Build into a temporary list first: if a camera read or the encoding fails
    # part-way through, `widgets` stays empty instead of ending up shorter than
    # `cameras` (zip() below would then silently skip cameras on later runs).
    new_widgets = []
    for camera in cameras:
        camera.running = False
        image_data = camera.read()

        image_widget = ipywidgets.Image(
            format = 'jpeg',
        )

        # imencode reports failure through its first return value.
        encoding_ok, encoded_image = cv2.imencode('.jpg', image_data)
        if not encoding_ok:
            raise RuntimeError('could not JPEG-encode the initial frame of {}'.format(camera))
        image_widget.value = encoded_image.tobytes()

        new_widgets.append(image_widget)
    widgets.extend(new_widgets)

# Run one detection pass per camera and show the annotated frame.
for camera, widget in zip(cameras, widgets):
    infer_from_camera(widget, camera)
    camera.running = False
    display(widget)

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
from importlib import reload
import yolo4_inference
reload(yolo4_inference)

from functools import partial

# Detach a previously registered observer and pause the first camera.
# A NameError (e.g. `obs` not defined yet) or a ValueError (presumably raised by
# unobserve when the handler was never registered) is ignored.
try:
    cameras[0].unobserve(obs, names=['value'])
    cameras[0].running = False
except (NameError, ValueError):
    pass

# del obs
# Create the callback bound to the first widget only once per kernel session.
if 'obs' not in globals():
    obs = partial(infer_from_change, widgets[0])

#cameras[0].observe(obs, names=['value'])
cameras[0].running = True

print(obs)
print(cameras[0])

In [None]:
import cv2

# Set to True to release the camera devices and empty the `cameras` list.
releaseCams = False

if releaseCams:
    # Iterate over a snapshot: removing items from the list being iterated
    # shifts the remaining elements and makes the loop skip every other camera,
    # which would leave the second camera unreleased.
    for camera in list(cameras):
        camera.release()
        cameras.remove(camera)
cv2.destroyAllWindows()

display(cameras)