In [1]:
import numpy as np
import cv2
import tritonclient.grpc as grpcclient
import sys
import argparse


def get_triton_client(url: str = 'localhost:8001'):
    try:
        keepalive_options = grpcclient.KeepAliveOptions(
            keepalive_time_ms=2**31 - 1,
            keepalive_timeout_ms=20000,
            keepalive_permit_without_calls=False,
            http2_max_pings_without_data=2
        )
        triton_client = grpcclient.InferenceServerClient(
            url=url,
            verbose=False,
            keepalive_options=keepalive_options)
    except Exception as e:
        print("channel creation failed: " + str(e))
        sys.exit()
    return triton_client


def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h):
    label = f'({class_id}: {confidence:.2f})'
    color = (255, 0, )
    cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2)
    cv2.putText(img, label, (x - 10, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)


def read_image(image_path: str, expected_image_shape) -> np.ndarray:
    expected_width = expected_image_shape[0]
    expected_height = expected_image_shape[1]
    expected_length = min((expected_height, expected_width))
    original_image: np.ndarray = cv2.imread(image_path)
    [height, width, _] = original_image.shape
    length = max((height, width))
    image = np.zeros((length, length, 3), np.uint8)
    image[0:height, 0:width] = original_image
    scale = length / expected_length

    input_image = cv2.resize(image, (expected_width, expected_height))
    input_image = (input_image / 255.0).astype(np.float32)

    # Channel first
    input_image = input_image.transpose(2, 0, 1)

    # Expand dimensions
    input_image = np.expand_dims(input_image, axis=0)
    return original_image, input_image, scale


def run_inference(model_name: str, input_image: np.ndarray,
                  triton_client: grpcclient.InferenceServerClient):
    inputs = []
    outputs = []
    inputs.append(grpcclient.InferInput('images', input_image.shape, "FP32"))
    # Initialize the data
    inputs[0].set_data_from_numpy(input_image)

    outputs.append(grpcclient.InferRequestedOutput('num_detections'))
    outputs.append(grpcclient.InferRequestedOutput('detection_boxes'))
    outputs.append(grpcclient.InferRequestedOutput('detection_scores'))
    outputs.append(grpcclient.InferRequestedOutput('detection_classes'))

    # Test with outputs
    results = triton_client.infer(model_name=model_name,
                                  inputs=inputs,
                                  outputs=outputs)

    num_detections = results.as_numpy('num_detections')
    detection_boxes = results.as_numpy('detection_boxes')
    detection_scores = results.as_numpy('detection_scores')
    detection_classes = results.as_numpy('detection_classes')
    return num_detections, detection_boxes, detection_scores, detection_classes


def main(image_path, model_name, url):
    triton_client = get_triton_client(url)
    expected_image_shape = triton_client.get_model_metadata(model_name).inputs[0].shape[-2:]
    original_image, input_image, scale = read_image(image_path, expected_image_shape)
    num_detections, detection_boxes, detection_scores, detection_classes = run_inference(
        model_name, input_image, triton_client)

    for index in range(num_detections):
        box = detection_boxes[index]

        draw_bounding_box(original_image,
                          detection_classes[index],
                          detection_scores[index],
                          round(box[0] * scale),
                          round(box[1] * scale),
                          round((box[0] + box[2]) * scale),
                          round((box[1] + box[3]) * scale))

    cv2.imwrite('output.jpg', original_image)

In [None]:
main(args.image_path, args.model_name, args.url)

In [1]:
from ultralytics import YOLO
import cv2

In [5]:
model_pt = YOLO('Models/yolodet_fp16/best.pt')
img = cv2.imread('frame.jpg')
results = model_pt.predict(img)
# Add this debug line before the failing code
print("Prediction tensor shape:", results)


0: 384x640 17 potatos, 5.2ms
Speed: 1.8ms preprocess, 5.2ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Prediction tensor shape: [ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'potato'}
obb: None
orig_img: array([[[ 96, 132,  55],
        [ 96, 132,  55],
        [ 96, 132,  55],
        ...,
        [119, 150,  95],
        [119, 150,  95],
        [119, 150,  95]],

       [[ 96, 132,  55],
        [ 96, 132,  55],
        [ 96, 132,  55],
        ...,
        [119, 150,  95],
        [119, 150,  95],
        [119, 150,  95]],

       [[ 96, 132,  55],
        [ 96, 132,  55],
        [ 96, 132,  55],
        ...,
        [119, 150,  95],
        [119, 150,  95],
        [119, 150,  95]],

       ...,

       [[117, 147,  71],
        [117, 147,  71],
        [117, 147,  71],
        ...,
        [134, 171, 101],
        [134, 171, 101],
        [134, 171, 101]],


In [None]:
# Add this debug line before the failing code
print("Prediction tensor shape:", results.shape)

In [7]:
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
inputs = [httpclient.InferInput("images", [1, 3, 640, 640], "FP32")]
outputs = [httpclient.InferRequestedOutput("output0")]

results = client.infer(model_name="model_det", inputs=inputs, outputs=outputs)
print(results.as_numpy("output0").shape)  # Should be (1, 5, 8400)

InferenceServerException: [400] Input must set only one of the following fields: 'data', 'binary_data_size' in 'parameters', 'shared_memory_region' in 'parameters'. But no field is set

In [8]:
# Load the Triton Server model
model = YOLO("http://localhost:8000/model_det_ensemble", task="detect")

img = cv2.imread('frame.jpg')
imgs = [img for i in range(20)]
results = model.predict(img)




IndexError: amax(): Expected reduction dim 1 to have non-zero size.

In [17]:
import cv2
import torch

# Read the image
img = cv2.imread('frame.jpg')

# Resize image to (640, 640) if necessary
img_resized = cv2.resize(img, (640, 640))

# Convert the image to a tensor and add a batch dimension (C, H, W)
img_tensor = torch.from_numpy(img_resized).permute(2, 0, 1).unsqueeze(0).float() / 255.0  # shape (1, 3, 640, 640)

# Repeat the image 20 times to create a batch of 20 images
imgs = img_tensor.repeat(20, 1, 1, 1)  # shape (20, 3, 640, 640)

# Now you can pass it to the model
results = model.predict(imgs)





InferenceServerException: [400] [request id: <id_unknown>] unexpected shape for input 'images' for model 'model_det'. Expected [20,3,640,640], got [1,3,640,640]. 

In [22]:
import cv2
import torch

# Step 1: Read the image and resize it
img = cv2.imread('frame.jpg')

# Resize the image to 640x640 if necessary
img_resized = cv2.resize(img, (640, 640))

results = model.predict(img_resized)

# Print the results (optional)
print(results)





InferenceServerException: [400] [request id: <id_unknown>] unexpected shape for input 'images' for model 'model_det'. Expected [20,3,640,640], got [1,3,640,640]. 

In [23]:
import numpy as np

# Convert your single image to a batch of 20 identical images
img_batch = np.stack([img_resized] * 20)
results = model.predict(img_batch)




InferenceServerException: [400] [request id: <id_unknown>] unexpected shape for input 'images' for model 'model_det'. Expected [20,3,640,640], got [1,3,640,640]. 

In [24]:
from ultralytics import YOLO

# Load the model
model = YOLO("yolov8n.pt")  # or your custom model

# Manually set warmup batch size to 20 (instead of default 1)
model.predictor.done_warmup = False  # Force re-warmup
model.predictor.model.warmup(imgsz=(20, 3, 640, 640))  # Match Triton's expected shape

# Now inference should work
results = model.predict(img_resized)

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 381MB/s]


AttributeError: 'NoneType' object has no attribute 'done_warmup'

In [18]:
imgs.shape

torch.Size([20, 3, 640, 640])

In [11]:
import ffmpegcv
import time
from pynvml import *
from contextlib import contextmanager
import torch.nn.functional as F

@contextmanager
def gpu_monitoring():
    """Context manager for GPU monitoring"""
    class GPUMonitor:
        def __init__(self, handle):
            self.handle = handle
            
        def get_stats(self):
            try:
                util = nvmlDeviceGetUtilizationRates(self.handle)
                mem = nvmlDeviceGetMemoryInfo(self.handle)
                return {
                    'gpu_util': util.gpu,
                    'mem_util': util.memory,
                    'mem_used': mem.used / 1024**2  # MB
                }
            except NVMLError as e:
                print(f"GPU Stats Error: {e}")
                return {'gpu_util': -1, 'mem_util': -1, 'mem_used': -1}
    
    handle = None
    try:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        yield GPUMonitor(handle)
    except NVMLError as e:
        print(f"NVML Init Error: {e}")
        yield None
    finally:
        if handle is not None:
            try:
                nvmlShutdown()
            except NVMLError as e:
                print(f"NVML Shutdown Error: {e}")

# cap1 = ffmpegcv.toCUDA(ffmpegcv.VideoCaptureNV('video.mp4', pix_fmt='nv12'))
# cap2 = ffmpegcv.toCUDA(ffmpegcv.VideoCaptureNV('video.mp4', pix_fmt='nv12'))

start_time = time.time()
frame_count = 0
cap = ffmpegcv.toCUDA(ffmpegcv.VideoCaptureNV('video.mp4', pix_fmt='nv12'), tensor_format='chw', gpu=0)

print(f"Processing on GPU 0 | {cap.width}x{cap.height} @ {cap.fps:.1f} FPS")
print("Frame | GPU% | MEM% | Used(MB) | FPS")
print("------------------------------------")
try:
    with gpu_monitoring() as monitor, cap:
        while True:
            ret, frame_CHW_CUDA = cap.read_torch()
            if not ret:
                break
            frame_CHW_CUDA = frame_CHW_CUDA.unsqueeze(0)
            new_height = 640  
            new_width = 640  
            resized_tensor = F.interpolate(frame_CHW_CUDA, size=(new_height, new_width), mode='bilinear', align_corners=False)/255.0
            print('-')
            inp = [resized_tensor.cpu().numpy()]*20
            results = model.track(inp, imgsz=640, verbose=False)
            print('-')
            if frame_count % 30 == 0:
                stats = monitor.get_stats() if monitor else {'gpu_util': -1, 'mem_util': -1, 'mem_used': -1}
                elapsed = max(0.001, time.time() - start_time)
                fps = frame_count / elapsed
                
                print(
                    f"{frame_count:5d} | "
                    f"{stats['gpu_util']:3d}% | "
                    f"{stats['mem_util']:3d}% | "
                    f"{stats['mem_used']:7.1f} | "
                    f"{fps:5.1f}"
                )  
            frame_count+=1   
                
except KeyboardInterrupt:
    print("\nProcess interrupted")
finally:
    total_time = time.time() - start_time
    print("\nSummary:")
    print(f"Frames processed: {frame_count}")
    print(f"Total time: {total_time:.1f}s")
    print(f"Average FPS: {frame_count/total_time:.1f}")

LogicError: cuCtxCreate failed: an illegal memory access was encountered