In [1]:
# Switch the working directory to the parent folder; the relative paths used by
# later cells (samples/, OnnxModels/, TorchModels/) are presumably rooted there.
%cd "../"

d:\Documents\GitHub\onnx


In [2]:
import onnxruntime as ort

import torch
import numpy as np

import matplotlib.pyplot as plt
from scipy.ndimage import zoom

## Intel Compressor

### Compressed ONNX 

In [11]:
# Input and preprocessing: load slices 166..177 of sample 0001 and stack them
# into a batch of shape (N, 1, img_size, img_size).
img_size = 128
vol = []
for index in range(166, 178):
    # Named `slice_2d` (not `slice`) so the builtin `slice` is not shadowed for
    # the rest of the kernel session.
    slice_2d = np.load(f'samples/0001_{index:04d}.npy')
    x, y = slice_2d.shape
    # Resize when EITHER dimension is off; the previous `and` skipped slices
    # with only one mismatching dimension, leaving non-square inputs in the batch.
    if x != img_size or y != img_size:
        # order=0 -> nearest-neighbour interpolation.
        slice_2d = zoom(slice_2d, (img_size / x, img_size / y), order=0)
    vol.append(slice_2d)

vol = np.array(vol)
# Insert a singleton axis at position 1: (N, H, W) -> (N, 1, H, W).
inputs = vol[:, np.newaxis, :, :]

# Initialize model: CUDA is listed first, with CPU as the second provider.
provider = ['CUDAExecutionProvider', 'CPUExecutionProvider']
ort_session = ort.InferenceSession("OnnxModels/onnx_model_compressed2.onnx", providers=provider)

# Warmup step: one untimed run before the benchmark cell.
inp = {ort_session.get_inputs()[0].name: inputs}
out = ort_session.run(None, inp)
# argmax over axis 1 of the first output -- presumably the class axis (confirm).
out = np.argmax(out[0], axis=1)

In [13]:
%%timeit
# Times one run of `ort_session` on `inputs`. The feed dict is rebuilt inside
# the timed statement, so its construction is part of the measurement.
inp = {ort_session.get_inputs()[0].name: inputs}
out = ort_session.run(None, inp)

55.8 ms ± 2.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### PyTorch

In [16]:
# Input and preprocessing: load slices 166..177 of sample 0001 and stack them
# into a CUDA tensor of shape (N, 1, img_size, img_size).
img_size = 128
vol = []
for index in range(166, 178):
    # Named `slice_2d` (not `slice`) so the builtin `slice` is not shadowed for
    # the rest of the kernel session.
    slice_2d = np.load(f'samples/0001_{index:04d}.npy')
    x, y = slice_2d.shape
    # Resize when EITHER dimension is off; the previous `and` skipped slices
    # with only one mismatching dimension.
    if x != img_size or y != img_size:
        # order=0 -> nearest-neighbour interpolation.
        slice_2d = zoom(slice_2d, (img_size / x, img_size / y), order=0)
    vol.append(slice_2d)

vol = np.array(vol)
# unsqueeze(1): (N, H, W) -> (N, 1, H, W), then move to the GPU.
inputs = torch.tensor(vol).unsqueeze(1).cuda()

# Initialize model.
# The checkpoint is used directly as a module below, i.e. it is a fully pickled
# model, so `weights_only=False` must be explicit on PyTorch >= 2.6 (where the
# default became True). NOTE: this unpickles arbitrary code -- only load
# checkpoints you trust.
model = torch.load("TorchModels/model1.pth", weights_only=False).cuda()
model.eval()
model = torch.compile(model, backend="eager")

# Warmup step: the first call goes through torch.compile before executing.
with torch.inference_mode():
    output = model(inputs)
# Wait for the queued CUDA work so the warmup is really finished before timing.
torch.cuda.synchronize()

In [17]:
%%timeit
with torch.inference_mode():
    output = model(inputs)

52.6 ms ± 5.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## UNet++

### ONNX Runtime

In [9]:
# Input and preprocessing: a single slice turned into a (1, 1, H, W) float16 batch.
image = np.load('samples/0001_0170.npy')
img_size = 128
x, y = image.shape
# Resize when EITHER dimension is off (the previous `and` missed slices with
# only one mismatching dimension). Always bind `resized`: previously `inputs`
# was only assigned inside the `if`, so an already-correct image raised
# NameError or silently reused a stale `inputs` left over from another cell.
if x != img_size or y != img_size:
    # order=0 -> nearest-neighbour interpolation.
    resized = zoom(image, (img_size / x, img_size / y), order=0)
else:
    resized = image

# Add two leading singleton axes: (H, W) -> (1, 1, H, W).
inputs = resized[np.newaxis, np.newaxis, :, :]
# Cast to float16 -- presumably the dtype this ONNX model expects (confirm).
inputs = inputs.astype(np.float16)

# Initialize model: CUDA is listed first, with CPU as the second provider.
provider = ['CUDAExecutionProvider', 'CPUExecutionProvider']
ort_session = ort.InferenceSession("OnnxModels/onnx_model_sim2.onnx", providers=provider)

# Warmup step: one untimed run before the benchmark cell.
inp = {ort_session.get_inputs()[0].name: inputs}
out = ort_session.run(None, inp)
# argmax over axis 1 of the first output -- presumably the class axis (confirm).
out = np.argmax(out[0], axis=1)

In [11]:
%%timeit
# Times one run of `ort_session` on `inputs`. The feed dict is rebuilt inside
# the timed statement, so its construction is part of the measurement.
inp = {ort_session.get_inputs()[0].name: inputs}
out = ort_session.run(None, inp)

8.14 ms ± 40.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### PyTorch runtime

In [5]:
# Input and preprocessing: a single slice turned into a (1, 1, H, W) CUDA tensor.
image = np.load('samples/0001_0170.npy')
img_size = 128
x, y = image.shape
# Resize when EITHER dimension is off (the previous `and` missed slices with
# only one mismatching dimension). Always bind `resized`: previously `inputs`
# was only assigned inside the `if`, so an already-correct image raised
# NameError or silently reused a stale `inputs` left over from another cell.
if x != img_size or y != img_size:
    # order=0 -> nearest-neighbour interpolation.
    resized = zoom(image, (img_size / x, img_size / y), order=0)
else:
    resized = image

# Two unsqueeze(0) calls: (H, W) -> (1, 1, H, W), then move to the GPU.
inputs = torch.tensor(resized).unsqueeze(0).unsqueeze(0).cuda()

# Initialize model.
# The checkpoint is used directly as a module below, i.e. it is a fully pickled
# model, so `weights_only=False` must be explicit on PyTorch >= 2.6 (where the
# default became True). NOTE: this unpickles arbitrary code -- only load
# checkpoints you trust.
model = torch.load("TorchModels/model1.pth", weights_only=False).cuda()
model.eval()
model = torch.compile(model, backend="eager")

# Warmup step: the first call goes through torch.compile before executing.
with torch.inference_mode():
    output = model(inputs)
# Wait for the queued CUDA work so the warmup is really finished before timing.
torch.cuda.synchronize()

In [6]:
%%timeit
with torch.inference_mode():
    output = model(inputs)

5.82 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## RotCAtt-TransUNet-plusplus

### ONNX Runtime

In [3]:
# Input and preprocessing: load slices 167..173 of sample 0001 and stack them
# into a batch of shape (N, 1, img_size, img_size).
img_size = 128
vol = []
for index in range(167, 174):
    # Named `slice_2d` (not `slice`) so the builtin `slice` is not shadowed for
    # the rest of the kernel session.
    slice_2d = np.load(f'samples/0001_{index:04d}.npy')
    x, y = slice_2d.shape
    # Resize when EITHER dimension is off; the previous `and` skipped slices
    # with only one mismatching dimension.
    if x != img_size or y != img_size:
        # order=0 -> nearest-neighbour interpolation.
        slice_2d = zoom(slice_2d, (img_size / x, img_size / y), order=0)
    vol.append(slice_2d)

# Stack and insert a singleton axis at position 1: (N, H, W) -> (N, 1, H, W).
inputs = np.array(vol)[:, np.newaxis, :, :]

# Initialize model: CUDA is listed first, with CPU as the second provider.
provider = ['CUDAExecutionProvider', 'CPUExecutionProvider']
ort_session = ort.InferenceSession("OnnxModels/onnx_model4.onnx", providers=provider)

# Warmup step: one untimed run; `inp` is reused by the timing cell.
inp = {ort_session.get_inputs()[0].name: inputs}
out = ort_session.run(None, inp)

In [4]:
%%timeit
# Times only `ort_session.run`; `inp` is the feed dict built in the setup cell.
out = ort_session.run(None, inp)

37.7 ms ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### PyTorch runtime

#### GPU

In [5]:
# Input and preprocessing: load slices 167..173 of sample 0001 and stack them
# into a CUDA tensor of shape (N, 1, img_size, img_size).
img_size = 128
vol = []
for index in range(167, 174):
    # Named `slice_2d` (not `slice`) so the builtin `slice` is not shadowed for
    # the rest of the kernel session.
    slice_2d = np.load(f'samples/0001_{index:04d}.npy')
    x, y = slice_2d.shape
    # Resize when EITHER dimension is off; the previous `and` skipped slices
    # with only one mismatching dimension.
    if x != img_size or y != img_size:
        # order=0 -> nearest-neighbour interpolation.
        slice_2d = zoom(slice_2d, (img_size / x, img_size / y), order=0)
    vol.append(slice_2d)

vol = np.array(vol)
# unsqueeze(1): (N, H, W) -> (N, 1, H, W), then move to the GPU.
inputs = torch.tensor(vol).unsqueeze(1).cuda()


# Initialize model.
# The checkpoint is used directly as a module below, i.e. it is a fully pickled
# model, so `weights_only=False` must be explicit on PyTorch >= 2.6 (where the
# default became True). NOTE: this unpickles arbitrary code -- only load
# checkpoints you trust.
model = torch.load("TorchModels/model2.pth", weights_only=False).cuda()
model.eval()
model = torch.compile(model, backend="eager")

# Warmup step: the first call goes through torch.compile before executing.
with torch.inference_mode():
    output = model(inputs)
# Wait for the queued CUDA work so the warmup is really finished before timing.
torch.cuda.synchronize()

In [6]:
%%timeit
with torch.inference_mode():
    output = model(inputs)

86.5 ms ± 5.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### CPU

In [5]:
# Input and preprocessing: load slices 167..173 of sample 0001 and stack them
# into a CPU tensor of shape (N, 1, img_size, img_size).
img_size = 128
vol = []
for index in range(167, 174):
    # Named `slice_2d` (not `slice`) so the builtin `slice` is not shadowed for
    # the rest of the kernel session.
    slice_2d = np.load(f'samples/0001_{index:04d}.npy')
    x, y = slice_2d.shape
    # Resize when EITHER dimension is off; the previous `and` skipped slices
    # with only one mismatching dimension.
    if x != img_size or y != img_size:
        # order=0 -> nearest-neighbour interpolation.
        slice_2d = zoom(slice_2d, (img_size / x, img_size / y), order=0)
    vol.append(slice_2d)

vol = np.array(vol)
# unsqueeze(1): (N, H, W) -> (N, 1, H, W); the tensor stays on the CPU.
inputs = torch.tensor(vol).unsqueeze(1)


# Initialize model.
# Load straight onto the CPU. The previous version called `.cuda()` first and
# then `.cpu()`, which made this CPU benchmark require a CUDA device.
# The checkpoint is used directly as a module below, i.e. it is a fully pickled
# model, so `weights_only=False` must be explicit on PyTorch >= 2.6 (where the
# default became True). NOTE: this unpickles arbitrary code -- only load
# checkpoints you trust.
model = torch.load("TorchModels/model2.pth", map_location="cpu", weights_only=False)
model.eval()
model = model.cpu()
model = torch.compile(model, backend="eager")

# Warmup step: the first call goes through torch.compile before executing.
with torch.inference_mode():
    output = model(inputs)

In [6]:
%%timeit
# Forward pass under inference mode; expects `model` and `inputs` from the CPU
# setup cell directly above (model moved to the CPU, inputs never sent to a GPU).
with torch.inference_mode():
    output = model(inputs)

621 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### ONNX with OpenVINO (CPU)

In [3]:
import openvino as ov

# Input and preprocessing: load slices 167..173 of sample 0001 and stack them
# into a batch of shape (N, 1, img_size, img_size).
img_size = 128
vol = []
for index in range(167, 174):
    # Named `slice_2d` (not `slice`) so the builtin `slice` is not shadowed for
    # the rest of the kernel session.
    slice_2d = np.load(f'samples/0001_{index:04d}.npy')
    x, y = slice_2d.shape
    # Resize when EITHER dimension is off; the previous `and` skipped slices
    # with only one mismatching dimension.
    if x != img_size or y != img_size:
        # order=0 -> nearest-neighbour interpolation.
        slice_2d = zoom(slice_2d, (img_size / x, img_size / y), order=0)
    vol.append(slice_2d)

# Stack and insert a singleton axis at position 1: (N, H, W) -> (N, 1, H, W).
inputs = np.array(vol)[:, np.newaxis, :, :]

# Compile the ONNX model for the CPU device and create a reusable infer request.
# Forward slash in the path: the previous "OnnxModels\onnx_model4.onnx"
# contained the invalid escape sequence "\o" and only resolved on Windows.
core = ov.Core()
compiled_model = core.compile_model("OnnxModels/onnx_model4.onnx", "CPU")
infer_request = compiled_model.create_infer_request()

In [4]:
# Wrap the NumPy batch in an OpenVINO tensor. shared_memory=True requests that
# the tensor use the array's buffer instead of a copy -- presumably `inputs`
# must then stay alive and unmodified while the request is used (confirm
# against the OpenVINO documentation).
input_tensor = ov.Tensor(array=inputs, shared_memory=True)
# Bind the tensor as the request's input so the timing cell can call infer()
# without arguments.
infer_request.set_input_tensor(input_tensor)


In [5]:
%%timeit
# Runs inference on the input tensor previously bound to `infer_request`.
output_tensor = infer_request.infer()

404 ms ± 8.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
