```bash
docker run --rm --gpus 0 -v /projects/demos/football_demo/triton/triton:/models -p 12000:12000 nvcr.io/nvidia/tritonserver:23.04-py3 tritonserver --model-repository=/models --http-port 12000
```

In [3]:
from ultralytics import YOLO
import torch
print(torch.cuda.is_available())

# Load models
player_tracking = YOLO('models/player_tracking.pt')
ball_tracking = YOLO('models/ball_tracking.pt')

player_tracking.to('cuda')
ball_tracking.to('cuda')

player_tracking_onnx = player_tracking.export(format="onnx", device="cuda", opset=18)
ball_tracking_onnx = ball_tracking.export(format="onnx", device="cuda", opset=18)

True
Ultralytics 8.3.3 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (Tesla V100-PCIE-32GB, 32501MiB)
YOLO11x summary (fused): 464 layers, 56,828,179 parameters, 0 gradients, 194.4 GFLOPs

[34m[1mPyTorch:[0m starting from 'models/player_tracking.pt' with input shape (1, 3, 1280, 1280) BCHW and output shape(s) (1, 5, 33600) (109.3 MB)

[34m[1mONNX:[0m starting export with onnx 1.13.0 opset 18...
[34m[1mONNX:[0m slimming with onnxslim 0.1.34...
[34m[1mONNX:[0m simplifier failure: FLOAT8E4M3FN
[34m[1mONNX:[0m export success ✅ 5.3s, saved as 'models/player_tracking.onnx' (217.5 MB)

Export complete (6.2s)
Results saved to [1m/projects/demos/football_demo/triton/models[0m
Predict:         yolo predict task=detect model=models/player_tracking.onnx imgsz=1280  
Validate:        yolo val task=detect model=models/player_tracking.onnx imgsz=1280 data=/projects/demos/football_demo/demo_tuesday/finetune_yolo/yolov11/final_dataset/data.yaml  
Visualize:       https://netron.app
Ultraly

In [23]:
# test out model usage
player_tracking_onnx = YOLO('models/player_tracking.onnx', task="detect")
# ball_tracking_onnx = YOLO('/projects/demos/football_demo/triton/models/ball_tracking/1/model.onnx', task="detect")
# player_tracking_onnx = YOLO('triton/player_tracking/1/model.onnx', task="detect")

frame = 'images/frame_1.jpg'

results = player_tracking_onnx(frame)
print(results)

Loading models/player_tracking.onnx for ONNX Runtime inference...

image 1/1 /projects/demos/football_demo/triton/images/frame_1.jpg: 640x640 19 players, 810.5ms
Speed: 2.9ms preprocess, 810.5ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)
[ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'player'}
obb: None
orig_img: array([[[ 50,  45,  46],
        [ 47,  42,  43],
        [ 55,  51,  50],
        ...,
        [ 79,  81,  81],
        [ 72,  74,  74],
        [ 67,  69,  69]],

       [[ 52,  47,  48],
        [ 46,  41,  42],
        [ 50,  45,  46],
        ...,
        [ 75,  77,  78],
        [ 72,  74,  75],
        [ 68,  70,  71]],

       [[ 50,  45,  47],
        [ 46,  41,  42],
        [ 48,  43,  44],
        ...,
        [ 61,  62,  66],
        [ 61,  64,  68],
        [ 64,  67,  71]],

       ...,

       [[ 85,  96,  86],
        [ 83,  94,  84],
     

In [None]:
import onnx
# Load the model
model = onnx.load('/projects/demos/football_demo/triton/triton/ball_tracking/1/model.onnx')

# Convert the model to IR version 8
model.ir_version = 8

# Save the converted model
onnx.save(model, '/projects/demos/football_demo/triton/triton/ball_tracking/1/model_1.onnx')

In [17]:
from pathlib import Path

# Define paths
model_name = "ball_tracking"
triton_repo_path = Path("triton") 
triton_model_path = triton_repo_path / model_name

# Create directories
(triton_model_path / "1").mkdir(parents=True, exist_ok=True)
print(triton_model_path)

triton/ball_tracking


In [24]:

# test out model usage
ball_tracking_onnx = YOLO('triton/ball_tracking/1/model.onnx', task="detect")
# ball_tracking_onnx = YOLO('/projects/demos/football_demo/triton/models/ball_tracking/1/model.onnx', task="detect")
# player_tracking_onnx = YOLO('triton/player_tracking/1/model.onnx', task="detect")

frame = 'images/frame_1.jpg'
results = ball_tracking_onnx(frame)
print(results)

Loading triton/ball_tracking/1/model.onnx for ONNX Runtime inference...

image 1/1 /projects/demos/football_demo/triton/images/frame_1.jpg: 640x640 (no detections), 775.4ms
Speed: 2.9ms preprocess, 775.4ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)
[ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'ball'}
obb: None
orig_img: array([[[ 50,  45,  46],
        [ 47,  42,  43],
        [ 55,  51,  50],
        ...,
        [ 79,  81,  81],
        [ 72,  74,  74],
        [ 67,  69,  69]],

       [[ 52,  47,  48],
        [ 46,  41,  42],
        [ 50,  45,  46],
        ...,
        [ 75,  77,  78],
        [ 72,  74,  75],
        [ 68,  70,  71]],

       [[ 50,  45,  47],
        [ 46,  41,  42],
        [ 48,  43,  44],
        ...,
        [ 61,  62,  66],
        [ 61,  64,  68],
        [ 64,  67,  71]],

       ...,

       [[ 85,  96,  86],
        [ 83,  94,  8

In [25]:
# Load the Triton Server model
from ultralytics import YOLO
model = YOLO("http://localhost:12000/player_tracking", task="detect")

In [27]:
while True:
    results = model.predict("images/frame_1.jpg", data="players.yaml")

# print(results)


image 1/1 /projects/demos/football_demo/triton/images/frame_1.jpg: 640x640 19 players, 640.9ms
Speed: 3.4ms preprocess, 640.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /projects/demos/football_demo/triton/images/frame_1.jpg: 640x640 19 players, 747.7ms
Speed: 3.8ms preprocess, 747.7ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /projects/demos/football_demo/triton/images/frame_1.jpg: 640x640 19 players, 754.1ms
Speed: 3.9ms preprocess, 754.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /projects/demos/football_demo/triton/images/frame_1.jpg: 640x640 19 players, 818.9ms
Speed: 2.9ms preprocess, 818.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /projects/demos/football_demo/triton/images/frame_1.jpg: 640x640 19 players, 640.2ms
Speed: 3.1ms preprocess, 640.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /projects/demos/foo

KeyboardInterrupt
2024-11-06T16:46:51Z


KeyboardInterrupt: 

In [15]:
import contextlib
import subprocess
import time

from tritonclient.http import InferenceServerClient

# Define image https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver
tag = "nvcr.io/nvidia/tritonserver:23.04-py3"  # 8.57 GB

# Pull the image
subprocess.call(f"docker pull {tag}", shell=True)

23.04-py3: Pulling from nvidia/tritonserver
5544ebdc0c7b: Pulling fs layer
5c7458fe8983: Pulling fs layer
84fe6556776e: Pulling fs layer
fd3b5161a370: Pulling fs layer
f600d053e235: Pulling fs layer
2904faaa955b: Pulling fs layer
02e937dc172c: Pulling fs layer
674931146b3f: Pulling fs layer
a7104e7e3962: Pulling fs layer
dd2a18b53db3: Pulling fs layer
f8c61d00c4fb: Pulling fs layer
c012c2787074: Pulling fs layer
953e993ff650: Pulling fs layer
fd3b5161a370: Waiting
f600d053e235: Waiting
d43c2dd3b915: Pulling fs layer
a7104e7e3962: Waiting
2904faaa955b: Waiting
f8c61d00c4fb: Waiting
dd2a18b53db3: Waiting
e27c48447f58: Pulling fs layer
c012c2787074: Waiting
d43c2dd3b915: Waiting
674931146b3f: Waiting
5c0a9d7fa918: Pulling fs layer
953e993ff650: Waiting
7558fdc023f4: Pulling fs layer
74cb7efc6ced: Pulling fs layer
e27c48447f58: Waiting
5c0a9d7fa918: Waiting
7558fdc023f4: Waiting
867e3a24cb4d: Pulling fs layer
a2b6974392b1: Pulling fs layer
867e3a24cb4d: Waiting
ed3c4c0ef681: Pulling fs lay

0

In [None]:
import contextlib
import subprocess
import time

from tritonclient.http import InferenceServerClient

# Define image https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver
tag = "nvcr.io/nvidia/tritonserver:24.09-py3"  # 8.57 GB

# Pull the image
subprocess.call(f"docker pull {tag}", shell=True)

# Run the Triton server and capture the container ID
container_id = (
    subprocess.check_output(
        f"docker run -d --rm --gpus 0 -v {triton_repo_path}:/models -p 8000:8000 {tag} tritonserver --model-repository=/models",
        shell=True,
    )
    .decode("utf-8")
    .strip()
)

# Wait for the Triton server to start
triton_client = InferenceServerClient(url="localhost:8000", verbose=False, ssl=False)

# Wait until model is ready
for _ in range(10):
    with contextlib.suppress(Exception):
        assert triton_client.is_model_ready(model_name)
        break
    time.sleep(1)

In [4]:
import tritonclient.http as httpclient
import numpy as np
from PIL import Image

# Load and preprocess the image
image_path = "images/frame_1.jpg"
image = Image.open(image_path).convert("RGB")
image = image.resize((640, 640))  # Resize based on model input requirements
input_data = np.array(image).astype(np.float32)
input_data = np.transpose(input_data, (2, 0, 1))  # Convert to CHW format if required by the model
input_data = np.expand_dims(input_data, axis=0)  # Add batch dimension

# Connect to Triton Server
triton_client = httpclient.InferenceServerClient(url="localhost:14600")

# Prepare inputs and outputs
inputs = [httpclient.InferInput("input_0", input_data.shape, "FP32")]
inputs[0].set_data_from_numpy(input_data)
outputs = [httpclient.InferRequestedOutput("output_0")]

# Run inference
response = triton_client.infer(model_name="ball_tracking", inputs=inputs, outputs=outputs)

# Retrieve results
output_data = response.as_numpy("output_0")
print("Inference output:", output_data)


ConnectionRefusedError: [Errno 111] Connection refused