# Deploy multiple Python backend models on SageMaker using Multi-Model Endpoints and Triton Inference Server

In [None]:
# Show free disk space before the Docker image is built in the next cell.
!df -h

In [None]:
# Build the image and push it to ECR.
# build-and-push.sh takes one argument: the image tag. Here we tag the image with 1, but feel free to change the tag
# (if you do, use the same tag at the end of mme_triton_image_uri further down).
# See docker/Dockerfile.sagemaker.gpu for details about the image.
!cd docker && sudo bash build-and-push.sh 1

In [None]:
!pip install -qU pip awscli boto3 sagemaker
!pip install nvidia-pyindex
!pip install tritonclient[http]

In [None]:
import json
import time

import boto3
import sagemaker
from sagemaker import get_execution_role

# AWS handles shared by the rest of the notebook.
sess = boto3.Session()
sm = sess.client('sagemaker')  # control plane: create/describe/delete model, config, endpoint
sagemaker_session = sagemaker.Session(boto_session=sess)  # used for the S3 upload
role = get_execution_role()  # passed as ExecutionRoleArn when the model is created
client = boto3.client('sagemaker-runtime')  # data plane: invoke_endpoint

## PyTorch HuggingFace T5 models

For a simple use case we will take the pre-trained T5 model from [HuggingFace](https://huggingface.co/transformers/model_doc/t5.html) and deploy it on SageMaker with Triton as the model server. We use the pre-configured `config.pbtxt` file provided with this repo [here](./t5_triton/models/t5/config.pbtxt) to specify the model [configuration](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md) that Triton uses to load the model. We tar the model directory and upload it to S3 to later create a [SageMaker Model](https://sagemaker.readthedocs.io/en/stable/api/inference/model.html).

In [None]:
# Build the ECR image URI from the caller's own account and region instead of a
# hard-coded account ID, so the notebook also works outside the original account.
# The repository name and tag must match what docker/build-and-push.sh pushed.
account_id = sess.client('sts').get_caller_identity()['Account']
region = sess.region_name
mme_triton_image_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/mme-triton-sagemaker-t5:1'

### Packaging model files and uploading to s3

In [None]:
!tar -C t5_triton/models -czf model.tar.gz t5
model_uri = sagemaker_session.upload_data(path="model.tar.gz", key_prefix="mme-triton-t5-python")

In [None]:
# Display the value returned by upload_data for the uploaded archive.
model_uri

In [None]:
!aws s3 cp $model_uri s3://sagemaker-ap-south-1-917092859813/mme-triton-t5-python/model1.tar.gz

In [None]:
!aws s3 cp $model_uri s3://sagemaker-ap-south-1-917092859813/mme-triton-t5-python/model2.tar.gz
!aws s3 cp $model_uri s3://sagemaker-ap-south-1-917092859813/mme-triton-t5-python/model3.tar.gz
!aws s3 cp $model_uri s3://sagemaker-ap-south-1-917092859813/mme-triton-t5-python/model4.tar.gz

### Create SageMaker Endpoint

We start off by creating a [sagemaker model](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateModel.html) from the model files we uploaded to s3 in the previous step.

In this step we also provide an additional Environment Variable i.e. `SAGEMAKER_TRITON_DEFAULT_MODEL_NAME` which specifies the name of the model to be loaded by Triton. **The value of this key should match the folder name in the model package uploaded to s3**. This variable is optional in case of a single model. In case of ensemble models, this key **has to be** specified for Triton to startup in Sagemaker.

*Note*: The current release of Triton (21.06-py3) on Sagemaker doesn't support running instances of different models on the same server, except in case of [ensembles](https://github.com/triton-inference-server/server/blob/main/docs/architecture.md#ensemble-models). Only multiple model instances of the same model are supported, which can be specified under the [instance-groups](https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md#instance-groups) section of the config.pbtxt file.

In [None]:
# S3 prefix (with trailing slash) under which the model archives were uploaded and copied.
# Derived from model_uri so it always points at the bucket the upload actually used,
# instead of a bucket name hard-coded for one account and region.
model_data_url = model_uri.rsplit("/", 1)[0] + "/"

In [None]:
# A UTC timestamp suffix gives each run its own model name.
sm_model_name = "mme-triton-t5-python-{}".format(
    time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
)

# Environment passed to the container; the two SHM sizes are byte counts given as strings
# (16777216 = 16 MiB, 1048576 = 1 MiB).
triton_environment = {
    "SAGEMAKER_TRITON_DEFAULT_MODEL_NAME": "t5",
    "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE": "16777216",
    "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576",
}

# "MultiModel" mode: ModelDataUrl is the S3 prefix set in model_data_url, not a single archive.
container = dict(
    Image=mme_triton_image_uri,
    ModelDataUrl=model_data_url,
    Mode="MultiModel",
    Environment=triton_environment,
)

create_model_response = sm.create_model(
    ModelName=sm_model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=container,
)

print(f"Model Arn: {create_model_response['ModelArn']}")

Using the model above, we create an [endpoint configuration](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateEndpointConfig.html) where we can specify the type and number of instances we want in the endpoint.

In [None]:
# A UTC timestamp suffix gives each run its own endpoint-config name.
endpoint_config_name = "mme-triton-t5-python-{}".format(
    time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
)

# Single production variant: one ml.g4dn.xlarge instance serving the model created above.
all_traffic_variant = {
    'VariantName': 'AllTraffic',
    'ModelName': sm_model_name,
    'InstanceType': 'ml.g4dn.xlarge',
    'InitialInstanceCount': 1,
    'InitialVariantWeight': 1,
}

create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[all_traffic_variant],
)

print(f"Endpoint Config Arn: {create_endpoint_config_response['EndpointConfigArn']}")

Using the above endpoint configuration we create a new sagemaker endpoint and wait for the deployment to finish. The status will change to **InService** once the deployment is successful.

In [None]:
# A UTC timestamp suffix gives each run its own endpoint name.
endpoint_name = "mme-triton-t5-python-{}".format(
    time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
)

# Start the deployment; the next cell polls the endpoint status.
create_endpoint_response = sm.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

print(f"Endpoint Arn: {create_endpoint_response['EndpointArn']}")

In [None]:
# Poll once a minute until the endpoint leaves the 'Creating' state.
while True:
    resp = sm.describe_endpoint(EndpointName=endpoint_name)
    status = resp['EndpointStatus']
    print("Status: " + status)
    if status != 'Creating':
        break
    time.sleep(60)

print("Arn: " + resp['EndpointArn'])

# Stop here if the deployment did not succeed, so that a "Run All" does not go on to
# invoke (and attach auto-scaling to) an endpoint that is not in service.
if status != 'InService':
    raise RuntimeError(
        "Endpoint {} ended in status {}: {}".format(
            endpoint_name, status, resp.get('FailureReason', 'no failure reason reported')
        )
    )

In [None]:
# Display the generated endpoint name.
endpoint_name

### Run inference

Once the endpoint is running, we can send it a sample text to summarize, using Triton's binary+JSON payload format. For the inference request format, Triton uses the KFServing community standard [inference protocols](https://github.com/triton-inference-server/server/blob/main/docs/protocol/README.md).

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import numpy as np
import tritonclient.http as httpclient
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Text to summarize; the "summarize: " prefix is part of the prompt sent to the tokenizer.
prompt = "summarize: SageMaker enables customers to deploy a model using custom code with NVIDIA Triton Inference Server. This functionality is available through the development of Triton Inference Server Containers. These containers include NVIDIA Triton Inference Server, support for common ML frameworks, and useful environment variables that let you optimize performance on SageMaker. For a list of all available Deep Learning Containers images, see Available Deep Learning Containers Images. Deep Learning Containers images are maintained and regularly updated with security patches."

# Tokenize, then convert the ids to an int32 numpy array to match the "INT32" input declared below.
input_ids = tokenizer(prompt, return_tensors='pt').input_ids
input_data = input_ids.numpy().astype(np.int32)

input_name = 'input'
output_name = "output"

# One binary input tensor and one requested binary output tensor.
infer_input = httpclient.InferInput(input_name, input_data.shape, "INT32")
infer_input.set_data_from_numpy(input_data, binary_data=True)
inputs = [infer_input]
outputs = [httpclient.InferRequestedOutput(output_name, binary_data=True)]

# header_length is embedded in the request ContentType by the invoke cells.
request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
    inputs, outputs=outputs
)

In [None]:
# Content-type prefix shared by the request and the response; the JSON header size follows the "=".
header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="

# TargetModel selects which archive under the endpoint's S3 prefix serves this request.
response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=header_length_prefix + str(header_length),
    Body=request_body,
    TargetModel='model.tar.gz',
)

# The response ContentType carries the JSON header size after the same prefix.
header_length_str = response['ContentType'][len(header_length_prefix):]

# Read the response body and split it into JSON header + binary tensors.
response_body = response['Body'].read()
result = httpclient.InferenceServerClient.parse_response_body(
    response_body, header_length=int(header_length_str)
)
output_data = result.as_numpy(output_name)

In [None]:
# Display the raw numpy output parsed from the endpoint response.
output_data

In [None]:
# Decode the first row of the returned array back into text with the same tokenizer.
first_sequence = output_data[0]
decoded_output = tokenizer.decode(
    first_sequence, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
decoded_output

#### Benchmark

In [None]:
!for i in {1..5}; do aws s3 cp s3://sagemaker-ap-south-1-917092859813/mme-triton-t5-python/model.tar.gz s3://sagemaker-ap-south-1-917092859813/mme-triton-t5-python/t5_python_v"$i".tar.gz; done

In [None]:
# Perform auto-scaling of the endpoint based on GPU memory utilization

import boto3
import sagemaker
from sagemaker import get_execution_role

# Application Auto Scaling is the common service that scales SageMaker variants (among other AWS resources).
auto_scaling_client = boto3.client('application-autoscaling')

# Application Auto Scaling addresses the variant as endpoint/<endpoint-name>/variant/<variant-name>.
resource_id = f"endpoint/{endpoint_name}/variant/AllTraffic"

# Allow the variant to run between 1 and 2 instances.
response = auto_scaling_client.register_scalable_target(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredInstanceCount',
    MinCapacity=1,
    MaxCapacity=2,
)

# Metric tracked by the policy: average GPUMemoryUtilization (percent) of this endpoint variant.
# 'Statistic' may be one of 'Average'|'Minimum'|'Maximum'|'SampleCount'|'Sum'.
gpu_memory_metric = {
    'MetricName': 'GPUMemoryUtilization',
    'Namespace': '/aws/sagemaker/Endpoints',
    'Dimensions': [
        {'Name': 'EndpointName', 'Value': endpoint_name},
        {'Name': 'VariantName', 'Value': 'AllTraffic'},
    ],
    'Statistic': 'Average',
    'Unit': 'Percent',
}

# Target tracking: scale out when GPU memory utilization reaches the 50% target value.
# Cooldowns are in seconds.
target_tracking_config = {
    'TargetValue': 50.0,
    'CustomizedMetricSpecification': gpu_memory_metric,
    'ScaleInCooldown': 600,
    'ScaleOutCooldown': 200,
}

response = auto_scaling_client.put_scaling_policy(
    PolicyName='GPUMemoryUtil-ScalingPolicy',
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredInstanceCount',  # SageMaker supports only Instance Count
    PolicyType='TargetTrackingScaling',  # 'StepScaling'|'TargetTrackingScaling'
    TargetTrackingScalingPolicyConfiguration=target_tracking_config,
)

In [None]:
import random

# Same content-type prefix for every request and response; defined once outside the loop.
header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="

# Send 50 requests, each to a randomly chosen archive among t5_python_v1..v3.
for request_index in range(50):
    version = random.randint(1, 3)
    model_name = f"t5_python_v{version}.tar.gz"
    print(model_name)

    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType=header_length_prefix + str(header_length),
        Body=request_body,
        TargetModel=model_name,
    )

    # The response ContentType carries the JSON header size after the prefix.
    header_length_str = response['ContentType'][len(header_length_prefix):]

    # Read and parse the response body, then decode the first returned sequence.
    response_body = response['Body'].read()
    result = httpclient.InferenceServerClient.parse_response_body(
        response_body, header_length=int(header_length_str)
    )
    output_data = result.as_numpy(output_name)

    decoded_output = tokenizer.decode(
        output_data[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(decoded_output)
    

### Terminate endpoint and clean up artifacts

In [None]:
# Tear down in the reverse order of creation: endpoint, then its endpoint config, then the model.
# NOTE: this cell does not delete the model archives that were uploaded/copied to S3.
sm.delete_endpoint(EndpointName=endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm.delete_model(ModelName=sm_model_name)