Resources:
* https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#auto_complete_config
* https://github.com/triton-inference-server/python_backend/blob/29cb0f2570dcf411f4c457ea026676e020901460/src/resources/triton_python_backend_utils.py#L122

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Testing Triton server

```
docker compose up triton_server
```

In [3]:
import os

# Hide every CUDA device from this process so the TensorFlow ops used in this
# notebook (random image generation, JPEG encoding) run on the CPU.
# NOTE(review): presumably this keeps the GPU free for the serving containers
# -- confirm. The variable is set *before* TensorFlow (and `ops`, which may
# import it) is loaded, so it is in place no matter when GPU initialisation
# happens, instead of relying on TensorFlow initialising devices lazily.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import numpy as np
import tensorflow as tf

import ops

In [5]:
# Build a batch of 100 random 224x224x3 images with uint8 pixel values.
# For integer dtypes `tf.random.uniform` samples from [minval, maxval), so
# `maxval` must be 256 for the value 255 to be reachable. The op-level seed
# makes a fresh "Restart & Run All" produce the same batch.
images = tf.cast(
    tf.random.uniform(
        (100, 224, 224, 3), minval=0, maxval=256, dtype=tf.int32, seed=0
    ),
    tf.uint8,
)

# JPEG-encode the first image as raw bytes for the "predict_jpeg" request.
jpeg_image = tf.image.encode_jpeg(images[0]).numpy()

In [6]:
# Client for "resnet50-no-opt"; send the whole image batch through
# "predict_images" and display the returned "scores" as a NumPy array.
resnet50_no_opt_client = ops.TritonSavedModelClient("resnet50-no-opt")
result = resnet50_no_opt_client.predict(
    "predict_images",
    images=images,
)
result.as_numpy("scores")

array([0.001123  , 0.00112003, 0.00112299, 0.00111931, 0.00112378,
       0.00112881, 0.00112312, 0.00112267, 0.00112317, 0.00111989,
       0.00112083, 0.00111249, 0.00112129, 0.00112692, 0.00111697,
       0.00112269, 0.00111806, 0.0011159 , 0.00112861, 0.00112298,
       0.00112537, 0.00111864, 0.00111838, 0.00112509, 0.00111958,
       0.00111743, 0.00112137, 0.00111658, 0.00112167, 0.001124  ,
       0.00112375, 0.00112685, 0.00112421, 0.00111854, 0.00111348,
       0.00112904, 0.00112142, 0.00112101, 0.00111753, 0.00112497,
       0.00112569, 0.00111861, 0.00111691, 0.00111931, 0.001125  ,
       0.00112395, 0.00112398, 0.00112087, 0.00111862, 0.00112673,
       0.00111615, 0.0011141 , 0.00111529, 0.0011194 , 0.00112181,
       0.00111785, 0.00112357, 0.00112237, 0.00112502, 0.00112025,
       0.0011218 , 0.00111958, 0.0011234 , 0.00112008, 0.00111711,
       0.00112081, 0.00111542, 0.00111669, 0.00112142, 0.00112357,
       0.00112134, 0.00111784, 0.00111879, 0.00112036, 0.00112

In [7]:
# Same client, but send the single JPEG-encoded image through "predict_jpeg"
# and display the returned "scores".
result = resnet50_no_opt_client.predict(
    "predict_jpeg",
    jpeg_image=jpeg_image,
)
result.as_numpy("scores")

array([0.00111299], dtype=float32)

In [8]:
# Client for "resnet50-xla-amp"; send the whole image batch through
# "predict_images" and display the returned "scores" as a NumPy array.
resnet50_opt_client = ops.TritonSavedModelClient("resnet50-xla-amp")
result = resnet50_opt_client.predict(
    "predict_images",
    images=images,
)
result.as_numpy("scores")

array([0.00112505, 0.00112018, 0.0011232 , 0.00111902, 0.00112409,
       0.00112959, 0.00112271, 0.00112237, 0.00112395, 0.00111984,
       0.00112107, 0.00111215, 0.00112169, 0.00112546, 0.0011167 ,
       0.0011223 , 0.00111827, 0.00111541, 0.00112945, 0.00112313,
       0.00112518, 0.00111895, 0.00111957, 0.00112518, 0.00111991,
       0.00111779, 0.00112141, 0.0011165 , 0.001121  , 0.00112429,
       0.00112319, 0.00112656, 0.00112409, 0.00111957, 0.00111357,
       0.00112945, 0.00112203, 0.00112141, 0.00111759, 0.00112594,
       0.0011258 , 0.00112052, 0.00111691, 0.00111984, 0.00112608,
       0.00112457, 0.00112409, 0.00112155, 0.00111936, 0.00112759,
       0.00111725, 0.00111384, 0.00111588, 0.00111847, 0.00112135,
       0.00111868, 0.00112299, 0.00112299, 0.00112491, 0.00112073,
       0.00112217, 0.00111882, 0.00112333, 0.00112039, 0.00111779,
       0.00112141, 0.00111479, 0.00111725, 0.00112182, 0.00112409,
       0.00112203, 0.00111834, 0.00112005, 0.00112073, 0.00112

In [9]:
# Client for "efficientnetb0-no-opt"; send the whole image batch through
# "predict_images" and display the returned "scores" as a NumPy array.
efficientnetb0_no_opt_client = ops.TritonSavedModelClient("efficientnetb0-no-opt")
result = efficientnetb0_no_opt_client.predict(
    "predict_images",
    images=images,
)
result.as_numpy("scores")

array([0.00102032, 0.0010204 , 0.00102034, 0.00102034, 0.00102038,
       0.0010204 , 0.0010203 , 0.00102038, 0.00102043, 0.0010203 ,
       0.00102033, 0.00102034, 0.00102037, 0.00102029, 0.00102038,
       0.00102037, 0.00102036, 0.00102034, 0.00102035, 0.00102037,
       0.00102037, 0.00102038, 0.00102038, 0.00102038, 0.00102037,
       0.00102036, 0.00102036, 0.00102036, 0.00102037, 0.00102033,
       0.00102031, 0.00102036, 0.00102031, 0.00102029, 0.00102032,
       0.00102037, 0.00102038, 0.00102033, 0.00102033, 0.00102035,
       0.00102034, 0.00102039, 0.00102033, 0.00102035, 0.0010204 ,
       0.00102033, 0.00102037, 0.00102033, 0.00102038, 0.00102037,
       0.00102038, 0.00102037, 0.00102035, 0.00102039, 0.00102037,
       0.0010204 , 0.00102032, 0.00102038, 0.00102036, 0.00102034,
       0.00102038, 0.00102034, 0.00102036, 0.00102032, 0.00102033,
       0.00102039, 0.00102042, 0.00102036, 0.00102041, 0.00102039,
       0.00102037, 0.00102032, 0.00102034, 0.00102035, 0.00102

In [12]:
# Client for "efficientnetb0-xla-amp"; send the whole image batch through
# "predict_images" and display the returned "scores" as a NumPy array.
efficientnetb0_opt_client = ops.TritonSavedModelClient("efficientnetb0-xla-amp")
result = efficientnetb0_opt_client.predict(
    "predict_images",
    images=images,
)
result.as_numpy("scores")

array([0.00102052, 0.00102068, 0.00102057, 0.00102057, 0.0010206 ,
       0.00102065, 0.00102052, 0.00102065, 0.00102063, 0.00102052,
       0.00102057, 0.00102056, 0.0010206 , 0.00102052, 0.0010206 ,
       0.00102065, 0.00102062, 0.00102057, 0.00102057, 0.0010206 ,
       0.0010206 , 0.00102056, 0.00102065, 0.0010206 , 0.0010206 ,
       0.00102057, 0.0010206 , 0.00102056, 0.0010206 , 0.00102057,
       0.00102057, 0.00102065, 0.00102048, 0.00102052, 0.00102057,
       0.00102057, 0.00102065, 0.00102057, 0.0010206 , 0.00102057,
       0.00102057, 0.00102065, 0.00102052, 0.00102057, 0.00102065,
       0.00102057, 0.00102057, 0.00102048, 0.00102065, 0.0010206 ,
       0.0010206 , 0.0010206 , 0.0010206 , 0.0010206 , 0.00102068,
       0.00102065, 0.00102057, 0.0010206 , 0.00102062, 0.00102057,
       0.00102065, 0.00102052, 0.0010206 , 0.00102057, 0.00102056,
       0.00102065, 0.00102065, 0.0010206 , 0.00102065, 0.00102065,
       0.00102063, 0.00102052, 0.00102057, 0.00102057, 0.00102

## Benchmarks with `instance_group` count = 1

In [11]:
# Benchmark each Triton client in turn on the same image batch, in the same
# order as before: ResNet50 (no-opt, xla-amp), then EfficientNetB0.
for client in (
    resnet50_no_opt_client,
    resnet50_opt_client,
    efficientnetb0_no_opt_client,
    efficientnetb0_opt_client,
):
    ops.benchmark_client(client, images)

resnet50-no-opt: 100%|██████████| 500/500 [01:13<00:00,  6.84it/s]
resnet50-xla-amp: 100%|██████████| 500/500 [00:23<00:00, 20.94it/s]
efficientnetb0-no-opt: 100%|██████████| 500/500 [00:54<00:00,  9.11it/s]
efficientnetb0-xla-amp: 100%|██████████| 500/500 [00:17<00:00, 27.92it/s]


## Benchmarks with `instance_group` count = 2
This requires changing the `instance_group` count in [config.pbtxt](../models/saved_model/config.pbtxt) and restarting the server.

In [13]:
# Re-run the per-client benchmark on the same image batch, in the same order:
# ResNet50 (no-opt, xla-amp), then EfficientNetB0 (no-opt, xla-amp).
for client in (
    resnet50_no_opt_client,
    resnet50_opt_client,
    efficientnetb0_no_opt_client,
    efficientnetb0_opt_client,
):
    ops.benchmark_client(client, images)

resnet50-no-opt: 100%|██████████| 500/500 [01:16<00:00,  6.51it/s]
resnet50-xla-amp: 100%|██████████| 500/500 [00:20<00:00, 24.10it/s]
efficientnetb0-no-opt: 100%|██████████| 500/500 [00:54<00:00,  9.14it/s]
efficientnetb0-xla-amp: 100%|██████████| 500/500 [00:13<00:00, 36.37it/s]


In [14]:
# Benchmark the two "*-xla-amp" clients through a single `benchmark_clients`
# call on the same image batch.
ops.benchmark_clients(
    (resnet50_opt_client, efficientnetb0_opt_client),
    images,
)

100%|██████████| 500/500 [00:35<00:00, 14.19it/s]


# Testing TFServing
```
docker compose up tf_serving
```

In [15]:
# TF Serving gRPC client for "resnet50-no-opt". Note that this rebinds the
# name previously bound to the Triton client.
resnet50_no_opt_client = ops.TFServingGRPCClient("resnet50-no-opt")
result = resnet50_no_opt_client.predict(
    "predict_images",
    images=images,
)

# Copy the response outputs into a plain dict, then turn the "scores"
# entry's `float_val` sequence into a NumPy array and display it.
outputs = dict(result.outputs.items())
scores = np.array(outputs["scores"].float_val)
scores

array([0.00112347, 0.00111938, 0.00112282, 0.00111921, 0.00112437,
       0.00112868, 0.00112235, 0.00112249, 0.00112319, 0.00111921,
       0.00112008, 0.00111161, 0.0011212 , 0.00112503, 0.00111673,
       0.0011218 , 0.00111844, 0.001115  , 0.00112891, 0.00112266,
       0.00112505, 0.00111806, 0.00111901, 0.001125  , 0.00111949,
       0.0011174 , 0.00112132, 0.0011163 , 0.00112183, 0.00112356,
       0.00112334, 0.00112505, 0.00112348, 0.00111869, 0.00111363,
       0.00112868, 0.00112156, 0.00112057, 0.00111733, 0.00112499,
       0.00112544, 0.00111915, 0.00111677, 0.00111873, 0.00112461,
       0.00112377, 0.00112351, 0.00112111, 0.00111804, 0.00112716,
       0.00111665, 0.00111384, 0.00111493, 0.001119  , 0.00112109,
       0.00111768, 0.00112303, 0.00112228, 0.00112454, 0.00112079,
       0.00112175, 0.00111884, 0.00112348, 0.00111964, 0.00111755,
       0.00112049, 0.00111502, 0.00111681, 0.00112122, 0.00112348,
       0.0011214 , 0.00111762, 0.00111999, 0.00112016, 0.00112

In [16]:
# Intentionally disabled: requesting the "resnet50-xla-amp" model through
# TF Serving did not work in this setup. The call is kept for reference.
# client = ops.TFServingGRPCClient("resnet50-xla-amp")
# result = client.predict("predict_images", images=images)
# Recorded error:
# UNIMPLEMENTED: Could not find compiler for platform CUDA: NOT_FOUND: could not find registered compiler for platform CUDA

In [17]:
# TF Serving gRPC client for "efficientnetb0-no-opt". Note that this rebinds
# the name previously bound to the Triton client.
efficientnetb0_no_opt_client = ops.TFServingGRPCClient("efficientnetb0-no-opt")
result = efficientnetb0_no_opt_client.predict(
    "predict_images",
    images=images,
)

# Copy the response outputs into a plain dict, then turn the "scores"
# entry's `float_val` sequence into a NumPy array and display it.
outputs = dict(result.outputs.items())
scores = np.array(outputs["scores"].float_val)
scores

array([0.00102034, 0.00102041, 0.00102036, 0.00102036, 0.00102039,
       0.00102041, 0.00102031, 0.00102039, 0.00102045, 0.00102031,
       0.00102035, 0.00102037, 0.00102038, 0.00102032, 0.00102039,
       0.00102039, 0.00102038, 0.00102036, 0.00102037, 0.00102038,
       0.00102038, 0.00102039, 0.00102039, 0.00102039, 0.00102038,
       0.00102037, 0.00102037, 0.00102037, 0.00102038, 0.00102035,
       0.00102032, 0.00102038, 0.00102032, 0.0010203 , 0.00102033,
       0.00102039, 0.0010204 , 0.00102035, 0.00102035, 0.00102037,
       0.00102036, 0.0010204 , 0.00102034, 0.00102036, 0.00102041,
       0.00102034, 0.00102039, 0.00102035, 0.00102039, 0.00102039,
       0.0010204 , 0.00102039, 0.00102037, 0.0010204 , 0.00102039,
       0.00102041, 0.00102035, 0.00102038, 0.00102038, 0.00102035,
       0.00102039, 0.00102036, 0.00102037, 0.00102034, 0.00102036,
       0.0010204 , 0.00102042, 0.00102037, 0.00102041, 0.0010204 ,
       0.00102039, 0.00102034, 0.00102036, 0.00102037, 0.00102

In [18]:
# Known limitations noted for this TF Serving setup:
# - OOM when allocating tensor with shape[100,56,56,256] when using num_workers=10
# - XLA compiled models fail with
#   "UNIMPLEMENTED: Could not find compiler for platform CUDA: NOT_FOUND",
#   so only the non-optimized models are benchmarked below.
for client in (resnet50_no_opt_client, efficientnetb0_no_opt_client):
    ops.benchmark_client(client, images)

resnet50-no-opt: 100%|██████████| 500/500 [00:59<00:00,  8.35it/s]
efficientnetb0-no-opt: 100%|██████████| 500/500 [00:51<00:00,  9.63it/s]
