# Triton on SageMaker - Ensemble + FIL Backend

## Set up Environment

Install the dependencies required to package the model and run inference with Triton Inference Server.

Also define the IAM role that will give SageMaker access to the model artifacts and the NVIDIA Triton ECR image.

In [20]:
# Show the GPU(s) visible to this notebook instance, along with the driver and CUDA versions.
!nvidia-smi

Wed Jun 29 23:16:14 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            On   | 00000000:00:1E.0 Off |                    0 |
| N/A   42C    P0    27W /  70W |   5091MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install -qU pip awscli boto3 sagemaker 
!pip install nvidia-pyindex
!pip install tritonclient[http]

In [2]:
import boto3
import json
import sagemaker
import time
import os
from sagemaker import get_execution_role

# IAM role that gives SageMaker access to the model artifacts and the Triton ECR image.
role = get_execution_role()

# A single boto3 session backs both the low-level SageMaker client and the SDK session.
sess = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=sess)
sm = sess.client("sagemaker")

# Runtime client used later to invoke the deployed endpoint.
client = boto3.client("sagemaker-runtime")

In [3]:
# Region -> AWS account id used to build the regional ECR URI of the
# SageMaker Triton Inference Server image.
account_id_map = {
    # United States
    "us-east-1": "785573368785",
    "us-east-2": "007439368137",
    "us-west-1": "710691900526",
    "us-west-2": "301217895009",
    # Europe
    "eu-west-1": "802834080501",
    "eu-west-2": "205493899709",
    "eu-west-3": "254080097072",
    "eu-north-1": "601324751636",
    "eu-south-1": "966458181534",
    "eu-central-1": "746233611703",
    # Asia Pacific
    "ap-east-1": "110948597952",
    "ap-south-1": "763008648453",
    "ap-northeast-1": "941853720454",
    "ap-northeast-2": "151534178276",
    "ap-southeast-1": "324986816169",
    "ap-southeast-2": "355873309152",
    # China
    "cn-northwest-1": "474822919863",
    "cn-north-1": "472730292857",
    # South America, Canada, Middle East, Africa
    "sa-east-1": "756306329178",
    "ca-central-1": "464438896020",
    "me-south-1": "836785723513",
    "af-south-1": "774647643957",
}

In [4]:
# Region of the default boto3 session; the Triton image account is looked up per region.
region = boto3.Session().region_name

# `raise` needs an exception instance or class: raising a plain string is itself a
# TypeError and would mask the real problem. The membership test also rejects a
# region of None (no region configured).
if region not in account_id_map:
    raise ValueError(f"UNSUPPORTED REGION: {region!r}")

In [5]:
# Regions whose name starts with "cn-" use a different DNS suffix.
if region.startswith("cn-"):
    base = "amazonaws.com.cn"
else:
    base = "amazonaws.com"

# Fully qualified ECR URI of the SageMaker Triton image (tag 22.05-py3) for this region.
triton_image_uri = (
    f"{account_id_map[region]}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.05-py3"
)

In [None]:
#triton_image_uri = "354625738399.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tritonserver:22.05-py"

In [6]:
# Display the resolved image URI as a sanity check.
triton_image_uri

'301217895009.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tritonserver:22.05-py3'

## Package models and dependencies and upload to S3

First we create the Triton config file for the XGBoost model being served by the FIL Backend.

**TODO**: Note the reshape we had to do for output here. This is specific to this example.

In [8]:
# Generate the Triton config.pbtxt for the XGBoost model served by the FIL backend.
USE_GPU = True
FIL_MODEL_DIR = './model_repository/fil'

# Maximum size in bytes for input and output arrays
MAX_MEMORY_BYTES = 60_000_000
NUM_FEATURES = 15
NUM_CLASSES = 2
# Per-sample budget: (features + classes) values at 4 bytes each (the tensors are TYPE_FP32).
bytes_per_sample = (NUM_FEATURES + NUM_CLASSES) * 4
max_batch_size = MAX_MEMORY_BYTES // bytes_per_sample

IS_CLASSIFIER = True
model_format = 'xgboost_json'

# Select deployment hardware (GPU or CPU)
instance_kind = 'KIND_GPU' if USE_GPU else 'KIND_CPU'

# Whether the model is doing classification or regression
classifier_string = 'true' if IS_CLASSIFIER else 'false'

# Whether to predict probabilities or not
predict_proba = False
predict_proba_string = 'true' if predict_proba else 'false'

# Doubled braces are literal braces in the f-string; single braces are substitutions.
config_text = f"""backend: "fil"
max_batch_size: {max_batch_size}
input [                                 
 {{  
    name: "input__0"
    data_type: TYPE_FP32
    dims: [ {NUM_FEATURES} ]                    
  }} 
]
output [
 {{
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: {{ shape: [ 1 ] }}
  }}
]
instance_group [{{ kind: {instance_kind} }}]
parameters [
  {{
    key: "model_type"
    value: {{ string_value: "{model_format}" }}
  }},
  {{
    key: "predict_proba"
    value: {{ string_value: "{predict_proba_string}" }}
  }},
  {{
    key: "output_class"
    value: {{ string_value: "{classifier_string}" }}
  }},
  {{
    key: "threshold"
    value: {{ string_value: "0.5" }}
  }},
  {{
    key: "storage_type"
    value: {{ string_value: "AUTO" }}
  }}
]

dynamic_batching {{}}"""

# The model directory may not exist yet on a fresh run, so create it before writing;
# otherwise open() fails with FileNotFoundError.
os.makedirs(FIL_MODEL_DIR, exist_ok=True)
config_path = os.path.join(FIL_MODEL_DIR, 'config.pbtxt')
with open(config_path, 'w') as file_:
    file_.write(config_text)

In [13]:
# Download the RAPIDS 22.06 conda-pack environment (CUDA 11.5, Python 3.8) to be used in Python preprocessing
!wget -q -P model_repository/preprocessing https://rapidsai-data.s3.us-east-2.amazonaws.com/conda-pack/rapidsai/rapids22.06_cuda11.5_py3.8.tar.gz

In [None]:
# Copy the pickled label encoders into version directory 1 of the Python preprocessing model
# (cp leaves the original file in place; the target directory must already exist)
!cp label_encoders.pkl model_repository/preprocessing/1/

In [10]:
# Create version directory 1 of the FIL model and copy the trained XGBoost model into it
!mkdir -p model_repository/fil/1
!cp xgboost.json model_repository/fil/1/

In [11]:
# Create the model version directory for the ensemble model
# (only the directory is created here; no files are copied into it)
!mkdir -p model_repository/ensemble/1

In [14]:
# Bundle the contents of model_repository into model.tar.gz, with paths relative to that
# directory (-C), leaving out Jupyter's .ipynb_checkpoints entries.
!tar --exclude='.ipynb_checkpoints' -czf model.tar.gz -C model_repository .

./
./postprocessing/
./postprocessing/1/
./postprocessing/1/model.py
./postprocessing/1/__pycache__/
./postprocessing/1/__pycache__/model.cpython-38.pyc
./postprocessing/.ipynb_checkpoints/
./postprocessing/.ipynb_checkpoints/config-checkpoint.pbtxt
./postprocessing/config.pbtxt
./ensemble/
./ensemble/1/
./ensemble/.ipynb_checkpoints/
./ensemble/.ipynb_checkpoints/config-checkpoint.pbtxt
./ensemble/config.pbtxt
./.ipynb_checkpoints/
./fil/
./fil/1/
./fil/1/xgboost.json
./fil/.ipynb_checkpoints/
./fil/.ipynb_checkpoints/config-checkpoint.pbtxt
./fil/config.pbtxt
./preprocessing/
./preprocessing/1/
./preprocessing/1/model.py
./preprocessing/1/__pycache__/
./preprocessing/1/__pycache__/model.cpython-38.pyc
./preprocessing/1/label_encoders.pkl
./preprocessing/.ipynb_checkpoints/
./preprocessing/.ipynb_checkpoints/config-checkpoint.pbtxt
./preprocessing/config.pbtxt
./preprocessing/rapids22.06_cuda11.5_py3.8.tar.gz


In [15]:
# Upload the packaged model repository to S3 under the "triton-fil-ensemble" key prefix;
# the returned value is later passed as the model's ModelDataUrl.
model_uri = sagemaker_session.upload_data(
    path="model.tar.gz",
    key_prefix="triton-fil-ensemble",
)

## Create SageMaker Endpoint

We start off by creating a SageMaker model from the model files we uploaded to S3 in the previous step.

In this step we also provide an additional environment variable, `SAGEMAKER_TRITON_DEFAULT_MODEL_NAME`, which specifies the name of the model to be loaded by Triton. **The value of this key should match the folder name in the model package uploaded to S3.** This variable is optional in the case of a single model. In the case of ensemble models, this **key has to be specified** for Triton to start up in SageMaker.

Additionally, customers can set `SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT` and `SAGEMAKER_TRITON_THREAD_COUNT` for optimizing the thread counts.

In [16]:
# A UTC timestamp suffix keeps the model name unique across runs.
model_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
sm_model_name = f"triton-fil-ensemble-{model_timestamp}"

# Container definition: Triton image, packaged model repository, and the name of the
# model Triton should load by default (the "ensemble" folder in the package).
container = dict(
    Image=triton_image_uri,
    ModelDataUrl=model_uri,
    Environment={"SAGEMAKER_TRITON_DEFAULT_MODEL_NAME": "ensemble"},
)

create_model_response = sm.create_model(
    ModelName=sm_model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=container,
)

print(f"Model Arn: {create_model_response['ModelArn']}")

Model Arn: arn:aws:sagemaker:us-west-2:354625738399:model/triton-fil-ensemble-2022-06-29-22-38-34


Using the model above, we create an endpoint configuration where we can specify the type and number of instances we want in the endpoint.

In [17]:
# A UTC timestamp suffix keeps the endpoint configuration name unique across runs.
config_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_config_name = f"triton-fil-ensemble-{config_timestamp}"

# Single production variant: one ml.g4dn.4xlarge instance receiving all traffic.
all_traffic_variant = {
    "VariantName": "AllTraffic",
    "ModelName": sm_model_name,
    "InstanceType": "ml.g4dn.4xlarge",
    "InitialInstanceCount": 1,
    "InitialVariantWeight": 1,
}

create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[all_traffic_variant],
)

print(f"Endpoint Config Arn: {create_endpoint_config_response['EndpointConfigArn']}")

Endpoint Config Arn: arn:aws:sagemaker:us-west-2:354625738399:endpoint-config/triton-fil-ensemble2022-06-29-22-38-56


Using the above endpoint configuration, we create a new SageMaker endpoint and wait for the deployment to finish. The status will change to InService once the deployment is successful.

In [18]:
# A UTC timestamp suffix keeps the endpoint name unique across runs.
endpoint_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_name = f"triton-fil-ensemble-{endpoint_timestamp}"

# Start the deployment; the call returns right away and the status is polled afterwards.
create_endpoint_response = sm.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

print(f"Endpoint Arn: {create_endpoint_response['EndpointArn']}")

Endpoint Arn: arn:aws:sagemaker:us-west-2:354625738399:endpoint/triton-fil-ensemble-2022-06-29-22-39-31


In [19]:
# Poll the endpoint once a minute until it leaves the "Creating" state.
while True:
    resp = sm.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)
    if status != "Creating":
        break
    time.sleep(60)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Fail loudly if the deployment did not succeed, so the inference cells below are not
# run against an endpoint that never came up.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating


KeyboardInterrupt: 

## Run Inference

In [21]:
import pandas as pd

In [24]:
# Read the inference samples, then cast every column to str: the request built later
# sends the rows with datatype "STRING".
infer_frame = pd.read_csv("X_infer.csv")
data_infer = infer_frame.astype('str')

In [25]:
# Preview the first rows of the inference data.
data_infer.head()

Unnamed: 0,User,Card,Year,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Hour,Minute
0,767,0,2012,11,12,25.37000084,1,22776,8313,145,26128,35,0,9,15
1,1676,0,2016,1,2,103.75,0,61835,5977,28,23436,74,0,5,12
2,1395,0,2017,5,11,6.699999809,0,21061,11313,214,25784,78,0,7,57
3,1073,1,2010,8,24,2.849999905,2,39706,5783,128,4953,56,0,8,5
4,123,0,2009,12,17,80.77999878,2,51879,6749,215,13903,54,0,13,7


In [26]:
# Take the first four rows as a 2-D array to use as the request batch.
sample_data = data_infer.head(4).values

In [27]:
# Display the sampled rows that will be sent to the endpoint.
sample_data

array([['767', '0', '2012', '11', '12', '25.37000084', '1', '22776',
        '8313', '145', '26128', '35', '0', '9', '15'],
       ['1676', '0', '2016', '1', '2', '103.75', '0', '61835', '5977',
        '28', '23436', '74', '0', '5', '12'],
       ['1395', '0', '2017', '5', '11', '6.699999809', '0', '21061',
        '11313', '214', '25784', '78', '0', '7', '57'],
       ['1073', '1', '2010', '8', '24', '2.849999905', '2', '39706',
        '5783', '128', '4953', '56', '0', '8', '5']], dtype=object)

In [None]:
# Build the JSON inference request: one input tensor named "INPUT" holding the sampled
# rows as strings.
payload = {
    "inputs": [
        {
            "name": "INPUT",
            "shape": list(sample_data.shape),
            "datatype": "STRING",
            # json.dumps cannot serialize a numpy ndarray (TypeError), so send the
            # rows as nested Python lists instead.
            "data": sample_data.tolist(),
        },
    ]
}

# Send the serialized request to the deployed endpoint.
response = client.invoke_endpoint(
    EndpointName=endpoint_name, ContentType="application/octet-stream", Body=json.dumps(payload)
)

In [None]:
# Read the response body, decode it as UTF-8 and print the parsed JSON.
raw_body = response["Body"].read()
parsed_response = json.loads(raw_body.decode("utf8"))
print(parsed_response)