In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pytorch-lightning==1.9.3

# Inference with torch

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip3 install opencv-python

In [None]:
import time
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import torchvision
import pytorch_lightning as pl
import cv2

In [None]:
torch.cuda.get_device_name(0)

In [None]:
device = 'cuda:0'

In [None]:
class RetinaRehead(torch.nn.Module):
    """RetinaNet ResNet50-FPN backbone with a single convolutional detection head.

    Only the first FPN level (key ``'0'``) of the torchvision RetinaNet
    backbone is used; a 3x3 convolution maps its 256 channels to 10 output
    channels.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): weights='DEFAULT' fetches the pretrained torchvision weights;
        # in this notebook they are overwritten by the checkpoint right after construction.
        self.model = torchvision.models.detection.retinanet_resnet50_fpn_v2(weights='DEFAULT')
        # 256 input channels = FPN feature width, 10 output channels = raw detector map.
        self.detector = torch.nn.Conv2d(256, 10, kernel_size=3, padding=1)

    def forward(self, input):
        """Return the raw 10-channel detection map for an image batch.

        Args:
            input: image batch tensor accepted by the RetinaNet backbone.

        Returns:
            Tensor produced by the detector convolution on FPN level ``'0'``.
        """
        # Call the sub-modules themselves (not ``.forward``) so that any
        # registered forward hooks are executed.
        features = self.model.backbone(input)
        return self.detector(features['0'])

In [None]:
class PLModel(pl.LightningModule):
    """Minimal LightningModule wrapper around a detector network.

    The wrapped network is stored as ``self.model``; this notebook loads the
    Lightning checkpoint's ``state_dict`` into an instance of this class.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input):
        """Run the wrapped network on ``input`` and return its output."""
        # Calling the module (instead of ``.forward``) keeps forward hooks working.
        return self.model(input)

In [None]:
state = torch.load(
    '/content/drive/MyDrive/course_to_middle_old/weights/detector_checkpoint.ckpt',
    map_location='cpu')
state = state['state_dict']

model = PLModel(RetinaRehead())
model.load_state_dict(state)

## YOUR CODE HERE
model = model.model.to(device).eval().half() # model = model.model.to(device).eval()

In [None]:
## YOUR CODE HERE
_ = model.forward(torch.rand(1, 3, 512, 512).half().to(device))  #warm up # _ = model.forward(torch.rand(1, 3, 512, 512).to(device))  #warm up
print(_.shape)

In [None]:
torch.rand(1, 3, 512, 512, dtype=torch.half).to(device)

In [None]:
img = Image.open('/content/drive/MyDrive/course_to_middle_old/test_images_videos/photo1681218949.jpeg')
shape = np.array(img.size)
shape = (shape / shape[1] * 512).astype(int)
shape = shape // 32 * 32

In [None]:
shape

In [None]:
times_for_preproc = []
# PIL expects a plain (width, height) tuple of ints.
target_size = tuple(int(side) for side in shape)

for i in range(100):
  torch.cuda.synchronize(device=device)
  t0 = time.time()

  # Always resize from `img` without overwriting it inside the loop; otherwise
  # every iteration after the first would "resize" an already-resized image
  # and the measured mean would be too optimistic.
  img_resized = img.resize(size=target_size)
  t_img = (torch.tensor(np.array(img_resized)).permute([2, 0, 1]).unsqueeze(0) / 255.0 - 0.5)/0.25
  # fp16 input to match the fp16 model.
  t_img = t_img.half().to(device)

  torch.cuda.synchronize(device=device)
  times_for_preproc.append(time.time() - t0)

# Later cells draw the decoded boxes on top of `img`, so keep it at network resolution.
img = img_resized

print(f'mean time for preprocessing {np.mean(np.array(times_for_preproc))}')

In [None]:
t_img.shape

In [None]:
times_for_inf_torch = []

# Cast once, outside the timed region (a no-op if the model is already fp16).
model = model.half()

# Inference only: disable autograd so no graph is built during the benchmark.
with torch.no_grad():
  for i in range(100):
    torch.cuda.synchronize(device=device)
    t0 = time.time()

    res = model(t_img)

    torch.cuda.synchronize(device=device)
    times_for_inf_torch.append(time.time() - t0)

print(f'mean time for inference torch {np.mean(np.array(times_for_inf_torch))}')

In [None]:
def decode_result(datum, threshold=1.0, r=8, iou_threshold=0.7):
    """Decode one raw detector map into NMS-filtered bounding boxes.

    The first five channels of ``datum`` belong to label 0 and the remaining
    channels to label 1. Within each group the channels are read as:
    0 - score, 1 - y shift, 2 - x shift, 3 - log(height), 4 - log(width).

    Args:
        datum: tensor indexed as ``[channel, row, col]``.
        threshold: minimum score for a cell to produce a box.
        r: multiplier converting cell coordinates/sizes to output coordinates.
        iou_threshold: IoU threshold passed to ``torchvision.ops.nms``.

    Returns:
        dict with CPU tensors ``'boxes'`` (N, 4) as ``[x1, y1, x2, y2]``,
        ``'scores'`` (N,) and ``'labels'`` (N,, int64), ordered as returned by NMS.
    """
    boxes_per_label, scores_per_label, labels_per_label = [], [], []

    for label, head in enumerate((datum[:5, :, :], datum[5:, :, :])):
        confidence = head[0, :, :]
        mask = confidence >= threshold

        # Row/column indices of the selected cells, in the same row-major
        # order that boolean-mask indexing uses below.
        y_cell, x_cell = torch.nonzero(mask, as_tuple=True)

        x = (x_cell + head[2, :, :][mask]) * r
        y = (y_cell + head[1, :, :][mask]) * r
        w = head[4, :, :][mask].exp() * r
        h = head[3, :, :][mask].exp() * r

        # Centre/size -> corner format, built in one shot instead of a
        # per-box Python loop (which forces one host sync per scalar on GPU).
        boxes_per_label.append(
            torch.stack([x - w / 2, y - h / 2, x + w / 2, y + h / 2], dim=1))
        scores_per_label.append(confidence[mask])
        labels_per_label.append(
            torch.full((x.shape[0],), label, dtype=torch.long))

    # Results are returned on the CPU (as before) so they can be consumed
    # directly by numpy/matplotlib code.
    boxes = torch.cat(boxes_per_label).detach().cpu()
    scores = torch.cat(scores_per_label).detach().cpu()
    labels = torch.cat(labels_per_label)

    to_keep = torchvision.ops.nms(boxes, scores, iou_threshold=iou_threshold)

    return {
        'boxes': boxes[to_keep],
        'scores': scores[to_keep],
        'labels': labels[to_keep],
    }


def decode_batch(batch, threshold=0.1, iou_threshold=0.3):
    """Decode every sample of a batch with ``decode_result``.

    Args:
        batch: indexable along its first axis; ``batch.shape[0]`` is the batch size.
        threshold: score threshold forwarded to ``decode_result``.
        iou_threshold: NMS IoU threshold forwarded to ``decode_result``.

    Returns:
        List with one ``decode_result`` output per sample, in batch order.
    """
    return [
        decode_result(batch[sample_idx],
                      threshold=threshold,
                      iou_threshold=iou_threshold)
        for sample_idx in range(batch.shape[0])
    ]

def draw_box(coords, label):
    """Draw one box on the current matplotlib axes.

    Args:
        coords: sequence ``[x1, y1, x2, y2]``.
        label: class id; label 0 is drawn in red, anything else in green.
    """
    xs = np.array((coords[0], coords[2]))
    ys = np.array((coords[1], coords[3]))
    color = 'r' if label == 0 else 'g'

    # Star marker at the box centre.
    plt.plot(xs.mean(), ys.mean(), '*' + color)

    # Four edges: left, right, then the two horizontal edges (y1 first, y2 second).
    edges = (
        ([xs[0], xs[0]], [ys[0], ys[1]]),
        ([xs[1], xs[1]], [ys[0], ys[1]]),
        ([xs[0], xs[1]], [ys[0], ys[0]]),
        ([xs[0], xs[1]], [ys[1], ys[1]]),
    )
    for edge_x, edge_y in edges:
        plt.plot(edge_x, edge_y, color)

In [None]:
clone_res = res.clone().detach()

In [None]:
times_for_postproc = []
# Channels that hold scores and x/y shifts; they are squashed with a sigmoid.
sigmoid_channels = [0, 1, 2, 5, 6, 7]

for i in range(100):
  torch.cuda.synchronize(device=device)
  t0 = time.time()

  # Work on a fresh copy each iteration: the sigmoid below is written back in
  # place, so aliasing `clone_res` would re-apply it 100 times and corrupt the
  # boxes decoded on the last pass. (The tensor stays on its current device;
  # the `_cpu` suffix is kept only for compatibility with later cells.)
  clone_res_cpu = clone_res.clone()
  # Sigmoid is evaluated in fp32 and stored back as fp16.
  clone_res_cpu[:, sigmoid_channels, :, :] = torch.sigmoid(clone_res_cpu[:, sigmoid_channels, :, :].float()).half()
  bboxes = decode_result(clone_res_cpu[0].float(), threshold=0.2, iou_threshold=0.2)

  torch.cuda.synchronize(device=device)
  times_for_postproc.append(time.time() - t0)

print(f'mean time for postprocessing {np.mean(np.array(times_for_postproc))}')

In [None]:
plt.imshow(img)
for index in range(len(bboxes['boxes'])):
    draw_box(bboxes['boxes'][index], bboxes['labels'][index])

# Inference with TorchScript

https://pytorch.org/docs/stable/jit.html

In [None]:
# README https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html
# Setting fp16=False in the decoding options fixes this error, e.g.:
# options = whisper.DecodingOptions(fp16=False)
# Half precision is not supported on CPU, only on CUDA. https://stackoverflow.com/a/75144903

class CarPlatesDetector(nn.Module):
    """Traced detector bundled with the metadata needed for post-processing.

    Args:
        model: detector network to trace. NOTE: it is cast to float32 in place
            (``model.float()``) because tracing runs on float32 example input.
        classes: class names.
        size: example input size without the batch axis, e.g. ``(3, H, W)``.
        nms_thres: score threshold to use when decoding the output.
        nms_iou_thres: IoU threshold to use for NMS when decoding the output.
        fp16: when True the output of ``forward`` is cast to half precision.
    """

    def __init__(self, model: nn.Module, classes: list, size: tuple, nms_thres: float, nms_iou_thres, fp16: bool):
        super(CarPlatesDetector, self).__init__()
        # Trace in float32 with a random example of shape (1, *size).
        example_input = torch.unsqueeze(torch.rand(size), 0)
        self.model = torch.jit.trace(model.float(), example_input)
        self.size = size
        self.classes = classes
        self.nms_thres = nms_thres
        self.nms_iou_thres = nms_iou_thres
        self.fp16 = fp16

    def forward(self, x):
        """Run the traced network; the result is half precision only if ``fp16`` is set."""
        out = self.model(x)
        if self.fp16:
            out = out.half()
        return out

In [None]:
state = torch.load(
    '/content/drive/MyDrive/course_to_middle_old/weights/detector_checkpoint.ckpt',
    map_location='cpu')
state = state['state_dict']
model = PLModel(RetinaRehead())
model.load_state_dict(state)

## YOUR CODE HERE
model = model.model.eval().half() # model = model.model.eval()

In [None]:
## YOUR CODE HERE
wrapped_model = CarPlatesDetector(
    model=model,
    size=(3, 512, 736),
    classes=['car', 'plate'],
    nms_thres=0.2,
    nms_iou_thres=0.2,
    fp16=True # fp16=False
)

In [None]:
scripted_model = torch.jit.script(wrapped_model)

In [None]:
torch.jit.save(scripted_model, '/content/drive/MyDrive/course_to_middle_old/weights/detector_scripted.pt')

In [None]:
scripted_model = torch.jit.load('/content/drive/MyDrive/course_to_middle_old/weights/detector_scripted.pt')

## YOUR CODE HERE
scripted_model = scripted_model.eval().half().to(device) # scripted_model = scripted_model.eval().to(device)

In [None]:
_ = scripted_model.forward(t_img)

In [None]:
times_for_inf_torch_script = []

# Inference only: disable autograd so no graph is recorded during the benchmark.
with torch.no_grad():
  for i in range(100):
    torch.cuda.synchronize(device=device)
    t0 = time.time()

    res = scripted_model(t_img)

    torch.cuda.synchronize(device=device)
    times_for_inf_torch_script.append(time.time() - t0)

print(f'mean time for inference torch script {np.mean(np.array(times_for_inf_torch_script))}')

In [None]:
nms_thres, iou_threshold = scripted_model.nms_thres, scripted_model.nms_iou_thres

In [None]:
clone_res = res.clone().detach()

In [None]:
times_for_postproc = []

for i in range(100):
  torch.cuda.synchronize(device=device)
  t0 = time.time()

  ## YOUR CODE HERE
  clone_res_cpu = clone_res.cpu() # clone_res_cpu = clone_res.cpu()
  ## YOUR CODE HERE
  clone_res_cpu[:, [0, 1, 2, 5, 6, 7], :, :] = torch.sigmoid(clone_res_cpu[:, [0, 1, 2, 5, 6, 7], :, :].float()).half() # clone_res_cpu[:, [0, 1, 2, 5, 6, 7], :, :] = torch.sigmoid(clone_res_cpu[:, [0, 1, 2, 5, 6, 7], :, :])
  ## YOUR CODE HERE
  bboxes = decode_result(clone_res_cpu[0].float(), threshold=nms_thres, iou_threshold=iou_threshold) # bboxes = decode_result(clone_res_cpu[0], threshold=nms_thres, iou_threshold=iou_threshold)

  torch.cuda.synchronize(device=device)
  times_for_postproc.append(time.time() - t0)

print(f'mean time for postpocessing torchscript {np.mean(np.array(times_for_postproc))}')

In [None]:
plt.imshow(img)
for index in range(len(bboxes['boxes'])):
    draw_box(bboxes['boxes'][index], bboxes['labels'][index])

# Convert to ONNX

In [None]:
!pip install onnx

In [None]:
import torch.onnx

In [None]:
dummy_input=torch.randn(1, 3, 512, 736).to(device)

In [None]:
# `dummy_input` is float32, so make the float32 cast explicit instead of relying
# on an earlier cell having converted the weights as a side effect.
model = model.float().eval().to(device)

In [None]:
torch.onnx.export(model, dummy_input, "/content/drive/MyDrive/course_to_middle_old/weights/detector.onnx", verbose=False) # , opset_version=12

# Inference with cv2 DNN

In [None]:
import cv2

Инференс на cpu: <br>
https://github.com/openvinotoolkit/openvino

In [None]:
model_cv = cv2.dnn.readNetFromONNX("/content/drive/MyDrive/course_to_middle_old/weights/detector.onnx")

In [None]:
frame_path = '/content/drive/MyDrive/course_to_middle_old/test_images_videos/photo1681218949.jpeg'
frame = cv2.imread(frame_path)
if frame is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(f'cannot read image: {frame_path}')

# cv2.imread yields BGR, while the PyTorch reference path in this notebook
# feeds RGB (PIL) images to the same network, so convert before normalising.
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame_rs = cv2.resize(frame_rgb, (736, 512))
# Same normalisation as the torch path: (x / 255 - 0.5) / 0.25.
frame_rs = ((frame_rs/255) - 0.5) * 4
# HWC -> NCHW; float32 to match the float32 ONNX model (division above yields float64).
frame_rs = np.expand_dims(frame_rs.transpose(2, 0, 1), axis=0).astype(np.float32)

In [None]:
model_cv.setInput(frame_rs)

In [None]:
outputs = model_cv.forward()

In [None]:
outputs.shape

In [None]:
clone_res_cpu = torch.from_numpy(outputs)
clone_res_cpu[:, [0, 1, 2, 5, 6, 7], :, :] = torch.sigmoid(clone_res_cpu[:, [0, 1, 2, 5, 6, 7], :, :])
bboxes = decode_result(clone_res_cpu[0], threshold=0.2, iou_threshold=0.2)

In [None]:
plt.imshow(img)
for index in range(len(bboxes['boxes'])):
    draw_box(bboxes['boxes'][index], bboxes['labels'][index])

# Convert ONNX to TensorRT

https://developer.nvidia.com/tensorrt <br>
https://github.com/NVIDIA/TensorRT

In [None]:
!pip install nvidia-tensorrt pycuda

In [None]:
# ! /usr/src/tensorrt/bin/trtexec --help

In [None]:
# ! /usr/src/tensorrt/bin/trtexec --onnx=/mnt/jupyter/weights/yolov5l_640_lp_new.onnx --fp16 --saveEngine=/mnt/jupyter/weights/yolov5l_640_lp_new_trt.engine 

In [None]:
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

In [None]:
logger = trt.Logger(trt.Logger.WARNING)

In [None]:
builder = trt.Builder(logger)

In [None]:
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

In [None]:
parser = trt.OnnxParser(network, logger)

In [None]:
onnx_path = '/content/drive/MyDrive/course_to_middle_old/weights/detector.onnx'
success = parser.parse_from_file(onnx_path)
# Print every parser diagnostic before deciding whether to stop.
for idx in range(parser.num_errors):
    print(parser.get_error(idx))
if not success:
    # Stop here: building an engine from a half-parsed network only fails later
    # with a less helpful message.
    raise RuntimeError(f'error while parse ONNX: {onnx_path}')

In [None]:
config = builder.create_builder_config()

In [None]:
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)

In [None]:
## YOUR CODE HERE
config.set_flag(trt.BuilderFlag.FP16) # # config.set_flag(trt.BuilderFlag.FP16)

In [None]:
engine = builder.build_engine(network, config)
if engine is None:
    # A failed build yields None; fail with a clear message instead of an
    # obscure error from the `with` statement.
    raise RuntimeError('TensorRT engine build failed; see the TensorRT logger output')
with engine, open('/content/drive/MyDrive/course_to_middle_old/weights/detector.engine', 'wb') as t:
    t.write(engine.serialize())

# Inference with TensorRT

In [None]:
logger = trt.Logger(trt.Logger.WARNING)

In [None]:
with open('/content/drive/MyDrive/course_to_middle_old/weights/detector.engine', 'rb') as f:
    serialized_engine = f.read()

In [None]:
runtime = trt.Runtime(logger)

In [None]:
engine = runtime.deserialize_cuda_engine(serialized_engine)

In [None]:
model_context = engine.create_execution_context()

In [None]:
model_input_name = model_context.engine.get_tensor_name(0)
model_input_shape = model_context.engine.get_tensor_shape(model_input_name)
model_output_name = model_context.engine.get_tensor_name(1)
model_output_shape = model_context.engine.get_tensor_shape(model_output_name)

In [None]:
print(model_input_name, model_input_shape)

In [None]:
print(model_output_name, model_output_shape)

In [None]:
frame = cv2.imread('/content/drive/MyDrive/course_to_middle_old/test_images_videos/photo1681218949.jpeg')

In [None]:
class HostDeviceMem(object):
    """Simple holder for a host-side buffer and a device-side buffer."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Both members are rendered with str().
        return f"Host:\n{self.host!s}\nDevice:\n{self.device!s}"

    def __repr__(self):
        # repr intentionally mirrors the human-readable form.
        return self.__str__()

In [None]:
def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream, input_shapes, out_shapes,
        out_names, max_batch_size)`` where ``inputs``/``outputs`` are lists of
        ``HostDeviceMem``, ``bindings`` holds the device buffers as ints,
        ``stream`` is a new ``cuda.Stream`` and the shape lists hold the
        shapes exactly as reported by the engine.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    out_shapes, input_shapes, out_names = [], [], []
    # Batch size is read from the third shape entry of profile 0, binding 0.
    max_batch_size = engine.get_profile_shape(0, 0)[2][0]

    for name in engine:
        alloc_shape = engine.get_binding_shape(name)
        # A leading -1 is replaced by 1 so the per-sample volume can be
        # computed; the total is then scaled by max_batch_size.
        if alloc_shape[0] == -1:
            alloc_shape = (1,) + alloc_shape[1:]
        n_elements = trt.volume(alloc_shape) * max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(name))

        # Page-locked host buffer plus a device buffer of the same byte size.
        host_buf = cuda.pagelocked_empty(n_elements, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        bindings.append(int(device_buf))

        buffers = HostDeviceMem(host_buf, device_buf)
        if engine.binding_is_input(name):
            inputs.append(buffers)
            input_shapes.append(engine.get_binding_shape(name))
        else:
            outputs.append(buffers)
            # Keep the original output shapes and names from the engine.
            out_shapes.append(engine.get_binding_shape(name))
            out_names.append(name)

    return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size

In [None]:
inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size = allocate_buffers(engine)

In [None]:
model_context.active_optimization_profile = 0

In [None]:
def do_inference(context, bindings, inputs, outputs, stream):
    """Run one asynchronous inference pass and return the host output buffers.

    Steps, all enqueued on ``stream``: host->device copies of every input,
    ``context.execute_async_v2``, device->host copies of every output, then a
    stream synchronize.

    Returns:
        List with the ``host`` buffer of each entry in ``outputs``.
    """
    # Upload inputs.
    for buf in inputs:
        cuda.memcpy_htod_async(buf.device, buf.host, stream)

    # Launch the network.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Download predictions.
    for buf in outputs:
        cuda.memcpy_dtoh_async(buf.host, buf.device, stream)

    # Wait until everything enqueued above has finished.
    stream.synchronize()

    return [buf.host for buf in outputs]

In [None]:
times_for_preprocessing_trt = []
for i in range(100):
  t0 = time.time()
  # `frame` comes from cv2.imread (BGR); the PyTorch reference path in this
  # notebook feeds RGB (PIL) images to the same network, so convert first.
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
  frame_rs = cv2.resize(frame_rgb, (736, 512))
  # Same normalisation as the torch path: (x / 255 - 0.5) / 0.25.
  frame_rs = ((frame_rs/255) - 0.5) * 4
  # HWC -> NCHW with a batch axis of 1.
  frame_rs = np.expand_dims(frame_rs.transpose(2, 0, 1), axis=0)
  frame_rs = np.ascontiguousarray(frame_rs)

  batch_size = frame_rs.shape[0]
  allocate_place = np.prod(frame_rs.shape)
  # Copy the flattened float32 image into the page-locked input buffer.
  inputs[0].host[:allocate_place] = frame_rs.flatten(order='C').astype(np.float32)
  model_context.set_binding_shape(0, frame_rs.shape)

  times_for_preprocessing_trt.append(time.time() - t0)

print(f'mean time for preprocessing for TensorRT: {np.mean(np.array(times_for_preprocessing_trt))}')

In [None]:
times_for_inf_trt = []
for i in range(100):
  t0 = time.time()
  batch_size = frame_rs.shape[0]
  allocate_place = np.prod(frame_rs.shape)
  inputs[0].host[:allocate_place] = frame_rs.flatten(order='C').astype(np.float32)
  model_context.set_binding_shape(0, frame_rs.shape)
  trt_outputs = do_inference(
      model_context, 
      bindings=bindings,
      inputs=inputs, 
      outputs=outputs, 
      stream=stream
  )

  times_for_inf_trt.append(time.time() - t0)

print(f'mean time for inference in TensorRT: {np.mean(np.array(times_for_inf_trt))}')

In [None]:
out = trt_outputs[0].reshape((1, 10, 64, 92))

In [None]:
out.shape

In [None]:
res = torch.from_numpy(out.copy())

In [None]:
res[:, [0, 1, 2, 5, 6, 7], :, :] = torch.sigmoid(res[:, [0, 1, 2, 5, 6, 7], :, :])
bboxes = decode_result(res[0], threshold=0.2, iou_threshold=0.2)

plt.imshow(img)
for index in range(len(bboxes['boxes'])):
    draw_box(bboxes['boxes'][index], bboxes['labels'][index])

# Работа с видео. Захват видео с cv2

In [None]:
import cv2

In [None]:
print(cv2.getBuildInformation())

In [None]:
# Open the test video with OpenCV's default backend.
cap = cv2.VideoCapture('/content/drive/MyDrive/course_to_middle_old/test_images_videos/driving_out_30sec.mp4')

In [None]:
# Time every cap.read() call until the stream ends.
# NOTE(review): the final, failing read is also appended to the timings, and
# `cap` is never released - consider fixing both.
times_to_grab_images = []
while cap.isOpened():
  t0 = time.time()
  ret, image_np = cap.read()
  times_to_grab_images.append(time.time() - t0)

  if not ret:
    break
  # else:
  #   print(image_np.shape)

print(f'mean time for grab image: {np.mean(np.array(times_to_grab_images))}')

# GStreamer

https://gstreamer.freedesktop.org/documentation/tutorials/index.html?gi-language=c <br>
https://docs.gstreamer.com/ <br>
https://gist.github.com/hum4n0id/cda96fb07a34300cdb2c0e314c14df0a <br>
https://docs.nvidia.com/jetson/archives/r35.2.1/DeveloperGuide/text/SD/Multimedia/AcceleratedGstreamer.html

In [None]:
# README https://github.com/pyannote/pyannote-audio/issues/1269
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip uninstall opencv-python -y

*Тут нужно перезагрузить среду*

In [None]:
!apt-get install libgstreamer1.0-dev libgstreamer-plugins-base1.0-dev libgstreamer-plugins-bad1.0-dev gstreamer1.0-plugins-base gstreamer1.0-plugins-good gstreamer1.0-plugins-bad gstreamer1.0-plugins-ugly gstreamer1.0-libav gstreamer1.0-doc gstreamer1.0-tools gstreamer1.0-x gstreamer1.0-alsa gstreamer1.0-gl gstreamer1.0-gtk3 gstreamer1.0-qt5 gstreamer1.0-pulseaudio

In [None]:
!apt-get install gstreamer1.0-tools

In [None]:
# %cd /content
# !git clone https://github.com/opencv/opencv
# !git clone https://github.com/opencv/opencv_contrib
# !mkdir /content/build
# %cd /content/build
# !cmake -DOPENCV_EXTRA_MODULES_PATH=/content/opencv_contrib/modules  -DWITH_GSTREAMER=ON -DBUILD_SHARED_LIBS=OFF  -DBUILD_TESTS=OFF  -DBUILD_PERF_TESTS=OFF -DBUILD_EXAMPLES=OFF -DWITH_OPENEXR=OFF -DWITH_CUDA=ON -DWITH_CUBLAS=ON -DWITH_CUDNN=ON -DOPENCV_DNN_CUDA=ON /content/opencv
# !make -j8 install
# %cd ../

In [None]:
# !ls /content/build/lib/python3/

In [None]:
# !cp /content/build/lib/python3/cv2.cpython-39-x86_64-linux-gnu.so ./

In [None]:
# !cp ./cv2.cpython-39-x86_64-linux-gnu.so /content/drive/MyDrive/course_to_middle_old/cv2_lib/

In [None]:
# !ls /content/drive/MyDrive/course_to_middle/cv2_lib/

In [None]:
!cp /content/drive/MyDrive/course_to_middle/cv2_lib/cv2.cpython-39-x86_64-linux-gnu.so ./

In [None]:
import cv2
import time
import numpy as np

In [None]:
print(cv2.getBuildInformation())

In [None]:
# !gst-launch-1.0 uridecodebin uri=file:///content/drive/MyDrive/course_to_middle_old/test_images_videos/driving_out_30sec.mp4  ! videoconvert ! video/x-raw, format=BGRx, width=1280, height=720 ! videoconvert ! video/x-raw, format=BGR ! fakesink

In [None]:
import cv2
import time
import numpy as np

In [None]:
cap = cv2.VideoCapture(
    f"uridecodebin uri=file:///content/drive/MyDrive/course_to_middle_old/test_images_videos/driving_out_30sec.mp4  ! videoconvert ! video/x-raw, format=BGRx, width=1280, height=720 ! videoconvert ! video/x-raw, format=BGR ! appsink sync=False", 
    cv2.CAP_GSTREAMER,
)

In [None]:
times_to_grab_images_with_gst = []
try:
  while cap.isOpened():
    t0 = time.time()
    ret, image_np = cap.read()
    elapsed = time.time() - t0
    if not ret:
      # End of stream (or read error): do not count the failed read.
      break
    times_to_grab_images_with_gst.append(elapsed)
finally:
  # Always free the pipeline, even if the loop is interrupted.
  cap.release()

if times_to_grab_images_with_gst:
  print(f'mean time for grab images with GStreamer: {np.mean(np.array(times_to_grab_images_with_gst))}')
else:
  print('no frames were read with GStreamer - check the pipeline and the OpenCV build')

# Bonus

In [None]:
# README https://discuss.pytorch.org/t/converting-to-onnx-raises-cuda-out-of-memory-error/111589/4

In [None]:
import torch
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
from matplotlib import pyplot as plt
from typing import Union, Tuple

In [None]:
device = 'cuda:0'

In [None]:
checkpoint_path = '/content/drive/MyDrive/course_to_middle_old/weights/lpr_epoch_42_ts.pth'
IMG_HEIGHT, IMG_WIDTH = 224, 224

In [None]:
## YOUR CODE HERE
model = torch.jit.load(checkpoint_path, map_location='cuda').eval().half() # model = torch.jit.load(checkpoint_path, map_location='cuda').eval()

In [None]:
## YOUR CODE HERE
warm_up_sample = torch.rand(3, IMG_HEIGHT, IMG_WIDTH).unsqueeze(0).half().to(device) # warm_up_sample = torch.rand(3, IMG_HEIGHT, IMG_WIDTH).unsqueeze(0).to(device)
print(warm_up_sample.shape)

In [None]:
# README https://stackoverflow.com/a/65442993
# Reducing num_workers worked for me :D # torch.set_num_threads(1)
# The error RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR is notoriously difficult to debug, but surprisingly often it's an out of memory problem.
# torch.set_num_threads(1)
# model = torch.onnx.export(
#     model,                                    
#     warm_up_sample,                           
#     "/content/drive/MyDrive/course_to_middle_old/weights/lpr_epoch_42_ts.onnx",         
#     opset_version=12,               
#     verbose=False,
# )

In [None]:
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

onnx_path = '/content/drive/MyDrive/course_to_middle_old/weights/lpr_epoch_42_ts.onnx'
success = parser.parse_from_file(onnx_path)
# Print every parser diagnostic before deciding whether to stop.
for idx in range(parser.num_errors):
    print(parser.get_error(idx))
if not success:
    # Do not try to build an engine from a network that failed to parse.
    raise RuntimeError(f'error while parse ONNX: {onnx_path}')

config = builder.create_builder_config()
# 4 GiB workspace limit.
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)

engine = builder.build_engine(network, config)
if engine is None:
    # A failed build yields None; fail with a clear message instead of an
    # obscure error from the `with` statement.
    raise RuntimeError('TensorRT engine build failed; see the TensorRT logger output')
with engine, open('/content/drive/MyDrive/course_to_middle_old/weights/lpr_epoch_42_ts.engine', 'wb') as t:
    t.write(engine.serialize())

In [None]:
logger = trt.Logger(trt.Logger.WARNING)

In [None]:
with open('/content/drive/MyDrive/course_to_middle_old/weights/lpr_epoch_42_ts.engine', 'rb') as f:
    serialized_engine = f.read()

In [None]:
runtime = trt.Runtime(logger)

In [None]:
engine = runtime.deserialize_cuda_engine(serialized_engine)

In [None]:
model_context = engine.create_execution_context()

In [None]:
model_input_name = model_context.engine.get_tensor_name(0)
model_input_shape = model_context.engine.get_tensor_shape(model_input_name)
model_output_name = model_context.engine.get_tensor_name(1)
model_output_shape = model_context.engine.get_tensor_shape(model_output_name)

In [None]:
print(model_input_name, model_input_shape)

In [None]:
print(model_output_name, model_output_shape)

In [None]:
class HostDeviceMem(object):
    """Simple holder for a host-side buffer and a device-side buffer."""

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Both members are rendered with str().
        return f"Host:\n{self.host!s}\nDevice:\n{self.device!s}"

    def __repr__(self):
        # repr intentionally mirrors the human-readable form.
        return self.__str__()

In [None]:
def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream, input_shapes, out_shapes,
        out_names, max_batch_size)`` where ``inputs``/``outputs`` are lists of
        ``HostDeviceMem``, ``bindings`` holds the device buffers as ints,
        ``stream`` is a new ``cuda.Stream`` and the shape lists hold the
        shapes exactly as reported by the engine.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    out_shapes, input_shapes, out_names = [], [], []
    # Batch size is read from the third shape entry of profile 0, binding 0.
    max_batch_size = engine.get_profile_shape(0, 0)[2][0]

    for name in engine:
        alloc_shape = engine.get_binding_shape(name)
        # A leading -1 is replaced by 1 so the per-sample volume can be
        # computed; the total is then scaled by max_batch_size.
        if alloc_shape[0] == -1:
            alloc_shape = (1,) + alloc_shape[1:]
        n_elements = trt.volume(alloc_shape) * max_batch_size
        np_dtype = trt.nptype(engine.get_binding_dtype(name))

        # Page-locked host buffer plus a device buffer of the same byte size.
        host_buf = cuda.pagelocked_empty(n_elements, np_dtype)
        device_buf = cuda.mem_alloc(host_buf.nbytes)
        bindings.append(int(device_buf))

        buffers = HostDeviceMem(host_buf, device_buf)
        if engine.binding_is_input(name):
            inputs.append(buffers)
            input_shapes.append(engine.get_binding_shape(name))
        else:
            outputs.append(buffers)
            # Keep the original output shapes and names from the engine.
            out_shapes.append(engine.get_binding_shape(name))
            out_names.append(name)

    return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size

In [None]:
inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size = allocate_buffers(engine)

In [None]:
model_context.active_optimization_profile = 0

In [None]:
def do_inference(context, bindings, inputs, outputs, stream):
    """Run one asynchronous inference pass and return the host output buffers.

    Steps, all enqueued on ``stream``: host->device copies of every input,
    ``context.execute_async_v2``, device->host copies of every output, then a
    stream synchronize.

    Returns:
        List with the ``host`` buffer of each entry in ``outputs``.
    """
    # Upload inputs.
    for buf in inputs:
        cuda.memcpy_htod_async(buf.device, buf.host, stream)

    # Launch the network.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)

    # Download predictions.
    for buf in outputs:
        cuda.memcpy_dtoh_async(buf.host, buf.device, stream)

    # Wait until everything enqueued above has finished.
    stream.synchronize()

    return [buf.host for buf in outputs]

In [None]:
frame = cv2.imread('/content/drive/MyDrive/course_to_middle_old/test_images_videos/plate.png')
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

In [None]:
plt.imshow(frame)
plt.show()

In [None]:
def resize_and_pad(image: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
    '''
    Resize an image keeping its aspect ratio and zero-pad it to the target shape.

    The resized image is centred on a black uint8 canvas.

    Inputs:
    image: np.ndarray - source image of shape (height, width, channels)
    shape: Tuple[int, int] - target shape as (height, width)

    Returns:
    np.ndarray - uint8 array of shape (height, width, 3)
    '''

    t_h, t_w = shape
    h, w, _ = image.shape

    # Pick the side that limits the scale. Comparing w / h with t_w / t_h
    # (cross-multiplied) also handles non-square targets; for square targets
    # this reduces to the plain w >= h test.
    if w * t_h >= h * t_w:
        new_w = t_w
        new_h = int(t_w * (h / w))
    else:
        new_h = t_h
        new_w = int(t_h * (w / h))

    # Extreme aspect ratios could round a side down to 0, which cv2.resize
    # rejects; keep every side within [1, target].
    new_w = max(1, min(t_w, new_w))
    new_h = max(1, min(t_h, new_h))

    resized = cv2.resize(image, (new_w, new_h))

    # Centre the resized image on a black canvas.
    canvas = np.zeros((t_h, t_w, 3), dtype=np.uint8)
    y = (t_h - new_h) // 2
    x = (t_w - new_w) // 2
    canvas[y:y + new_h, x:x + new_w, ...] = resized

    return canvas

In [None]:
frame_rs = resize_and_pad(frame, (224, 224))
frame_rs = frame_rs.transpose((2, 0, 1))
frame_rs = frame_rs / 255
frame_rs = np.expand_dims(frame_rs, axis=0)
frame_rs = np.ascontiguousarray(frame_rs)

# batch_size = frame_rs.shape[0]
# allocate_place = np.prod(frame_rs.shape)
# inputs[0].host[:allocate_place] = frame_rs.flatten(order='C').astype(np.float32)
# model_context.set_binding_shape(0, frame_rs.shape)

In [None]:
batch_size = frame_rs.shape[0]
allocate_place = np.prod(frame_rs.shape)
inputs[0].host[:allocate_place] = frame_rs.flatten(order='C').astype(np.float32)
model_context.set_binding_shape(0, frame_rs.shape)
trt_outputs = do_inference(
    model_context, 
    bindings=bindings,
    inputs=inputs, 
    outputs=outputs, 
    stream=stream
)

In [None]:
trt_outputs[0].shape

In [None]:
trt_outputs = trt_outputs[0].reshape((1, 57, 23))

In [None]:
alphabet = '0123456789ABCEHKMOPTXY'

In [None]:
def topk(array, k, axis=-1, sorted=True):
    """Return the ``k`` largest values along ``axis`` together with their indices.

    Args:
        array: input numpy array.
        k: number of entries to keep along ``axis``.
        axis: axis to select along.
        sorted: when True the results are ordered from largest to smallest;
            otherwise they come in the order produced by ``np.argpartition``.

    Returns:
        Tuple ``(scores, ind)`` of arrays with size ``k`` along ``axis``.
    """
    # argpartition places the k largest entries in the last k slots along axis.
    top_idx = np.argpartition(array, -k, axis=axis).take(indices=range(-k, 0), axis=axis)
    top_val = np.take_along_axis(array, top_idx, axis=axis)

    if not sorted:
        return top_val, top_idx

    # Descending order inside the selected k entries.
    order = np.flip(np.argsort(top_val, axis=axis), axis=axis)
    scores = np.take_along_axis(top_val, order, axis=axis)
    ind = np.take_along_axis(top_idx, order, axis=axis)
    return scores, ind

In [None]:
confidences, symbols = [i.flatten() for i in topk(trt_outputs[0], 1, axis=1)]

In [None]:
symbols

In [None]:
blank = len(alphabet)
label, conf_list, buf = '', list(), blank

In [None]:
# Greedy decoding of the per-step predictions: consecutive repeats are
# collapsed and the blank index is dropped (CTC-style).
# Reset the accumulators here so that re-running this cell does not append
# to the result of a previous run.
label, buf = '', blank
for symbol in symbols:
    if symbol != blank and symbol != buf:
        label += alphabet[symbol]
    # Remember the previous symbol (including blanks) to detect repeats.
    buf = symbol

In [None]:
label

In [None]:
plt.imshow(frame)
plt.show()

# Application

In [None]:
!(rm -rf /content/drive/MyDrive/course_to_middle && git clone https://github.com/innovator1984/course_to_middle /content/drive/MyDrive/course_to_middle)

In [None]:
!(cd /content/drive/MyDrive/course_to_middle && git checkout feature/course_to_middle)

In [None]:
!cp /content/drive/MyDrive/Edu/5.\ Inference/cv2_lib/cv2.cpython-39-x86_64-linux-gnu.so /usr/lib/python3/dist-packages/

In [None]:
!pip install opencv-python==4.6.0.66

In [None]:
!rm -rf /content/drive/MyDrive/course_to_middle/weights/ && cp -r /content/drive/MyDrive/Edu/5.\ Inference/weights/ /content/drive/MyDrive/course_to_middle/weights/

In [None]:
!(rm -rf /content/drive/MyDrive/course_to_middle/videos/ && cp -r /content/drive/MyDrive/Edu/5.\ Inference/test_images_videos/ /content/drive/MyDrive/course_to_middle/videos/ )

In [None]:
!(ls /content/drive/MyDrive/course_to_middle/videos/)

In [None]:
!python3 /content/drive/MyDrive/course_to_middle/pursuit_detection/pursuit_detection.py --input-video /content/drive/MyDrive/course_to_middle/videos/driving_out_30sec.mp4 --output-video /content/drive/MyDrive/course_to_middle/videos/driving_out_30sec_result.mp4