# How to run multiple deep learning models on GPUs with Azure Machine Learning Multi-model endpoints (MME) using Triton Inference Server

# Install required libraries

In [None]:
!pip install azureml-sdk
!pip install azureml-core
!pip install azureml-contrib-server[all]==1.14.0
!pip install azureml-contrib-triton==1.14.0

# Import required libraries

In [None]:
from azureml.contrib.server.dashboard import serve_dashboard
from azureml.contrib.server.utils import get_auth_header
from azureml.contrib.triton.deploy import deploy_triton, AksInferenceCluster
from azureml.core import Workspace, Environment, Model, Webservice
from azureml.core.compute import AksCompute, ComputeTarget
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice


# Set up Azure Machine Learning workspace

In [None]:
# Name of the compute target that the later cells deploy to.
# NOTE(review): presumably an AKS cluster that already exists — confirm.
aks_compute_name = 'triton-demo-aks-cluster'

# Workspace handle obtained through Workspace.from_config().
ws = Workspace.from_config()

# Look the compute target up by name inside that workspace.
aks_compute = ComputeTarget(name=aks_compute_name, workspace=ws)

# Set up Triton Inference Server configuration

In [None]:
# Configuration dictionary handed to deploy_triton() in a later cell.
#   "compute": where the server runs (resource group and cluster name come from
#              the workspace cell) plus per-deployment sizing values.
#   "models":  one entry per model to be served.
# NOTE(review): the key names are whatever deploy_triton() expects — confirm
# them against that helper, which is not visible here.
triton_config = {
    "name": "ms-reactor-demo-triton-server",
    "compute": {
        "resource_group": ws.resource_group,
        "cluster_name": aks_compute_name,
        "namespace": "my-namespace",
        "pvc_name": "my-pvc-name",
        "max_concurrent_requests_per_replica": 4,
        "replicas": 1,
        "gpus_per_node": 1,
        "cpu_cores": 4,
        "memory_gb": 16
    },
    "models": [{
        "model_name": "model-1",
        # Pass the workspace explicitly: without it get_model_path() only
        # searches the local model cache, which is empty on a fresh kernel
        # outside a deployed scoring container.
        "model_path": Model.get_model_path("model-1", _workspace=ws),
        "model_type": "tensorflow",
        "input_type": "fp32",
        "output_type": "fp32",
        "max_batch_size": 16,
        "version": 1
    }, {
        "model_name": "model-2",
        "model_path": Model.get_model_path("model-2", _workspace=ws),
        "model_type": "onnx",
        "input_type": "fp32",
        "output_type": "fp32",
        "max_batch_size": 8,
        "version": 1
    }]
}

# Deploy Triton Inference Server to the AKS cluster for Azure Machine Learning multi-model endpoints (MME)

In [None]:
# Build the cluster handle from the workspace and the AKS cluster name, then
# hand it to deploy_triton() together with the workspace and triton_config.
# NOTE(review): the return value of deploy_triton() is discarded — confirm
# whether the call blocks until the server is ready before later cells run.
aks_config = AksInferenceCluster(ws, aks_compute_name)
deploy_triton(ws, triton_config, aks_config)

# Set up and deploy the Azure Machine Learning Multi-model endpoint

In [None]:
# Scoring script run by the endpoint.
# NOTE(review): "score.py" is an assumed path — point this at the real entry
# script before running.
ENTRY_SCRIPT = "score.py"

# Environment for the endpoint, built from the conda specification file.
env = Environment.from_conda_specification(name="myenv", file_path="conda_env.yaml")

# Model.deploy() has no `environment` parameter (passing one raises TypeError);
# the environment is attached to the deployment through an InferenceConfig.
inference_config = InferenceConfig(entry_script=ENTRY_SCRIPT, environment=env)

# Sizing of the ACI web service that hosts the endpoint.
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Registered models served together behind the single endpoint.
models = [
    Model(ws, "model-1"),
    Model(ws, "model-2"),
]

mme = Model.deploy(
    workspace=ws,
    name="my-mme-endpoint",
    models=models,
    inference_config=inference_config,
    deployment_config=deployment_config,
)
# Wait for the deployment to finish, showing its output in the cell.
mme.wait_for_deployment(show_output=True)

# Test the deployed endpoint using the Triton Inference Server client 

In [None]:
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException
from tritonclient.utils import triton_to_np_dtype

def test_endpoint():
    try:
        triton_client = grpcclient.InferenceServerClient("my-triton-server:8001")
        inputs = [
            grpcclient.InferInput("input_1", [1, 224, 224, 3], "FP32"),
            grpcclient