# Step 1 - Installation of the model required libraries

In [1]:
!pip install torch -U
!pip install -qU pip awscli boto3 sagemaker transformers
!pip install nvidia-pyindex
!pip install tritonclient[http]
!pip install pickle5
!pip install datasets
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com, https://pypi.ngc.nvidia.com
Collecting torch
  Downloading torch-1.11.0-cp38-cp38-manylinux1_x86_64.whl (750.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m201.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torch
Successfully installed torch-1.11.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.0.1 requires botocore<1.22.9,>=1.22.8, but you have botocore 1.27.13 which is incompatible.[0m[31m
[0mLooking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com, https://pypi.ngc.nvidia.com
Collecting nvidia-pyindex
  Download

# Step 2 - Train the model

In [2]:
import pickle5 as pickle
import logging
import pandas as pd
import sklearn
import nltk
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, TensorDataset
import sys
from datasets import load_metric
import numpy as np

TRAIN_DIR = "./distilbert_train_intermediate"
FINAL_DIR = "./workspace-trt"
DEFAULT_FILENAME = "./spam_training_dataset_43k.pkl"
BATCH_SIZE = 128
COL_DATA = "text"  # Name of the column with the spam text
LABEL = "is_spam"  # Name of the column with the label 0 (ham) or 1 (spam)
NUM_EPOCHS = 4

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)

def clean_text(text):
    import re
    stopwords = nltk.corpus.stopwords.words('english')
    text = str(text).lower()  # Convert to lower case
    text = re.sub(r'[^\w\s]', '', text)  # Remove everything except words
    words = [word for word in text.split() if word not in stopwords]  # Remove stopwords
    text = " ".join(words)
    return text

def download_dataset():  
    nltk.download("stopwords")

    with open(DEFAULT_FILENAME, "rb") as fh:
        data = pickle.load(fh)
        data = data[[COL_DATA, LABEL]]  
        data[COL_DATA] = data[COL_DATA].apply(clean_text)
        data.reset_index()
        return data

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)    
    
logging.info('Loading the Dataset')
dataset = download_dataset()

logging.info('Loading the Metric')
metric = load_metric("accuracy")

logging.info('Loading the pretrained tokenizer and model')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

logging.info('Preparing the training and evaluation dataset')
train_data, val_data, train_labels, val_labels = train_test_split(dataset[COL_DATA].values, dataset[LABEL].values)
train_tokens = tokenizer(list(train_data), return_tensors="pt", padding=True, truncation=True, max_length=BATCH_SIZE)
val_tokens = tokenizer(list(val_data), return_tensors="pt", padding=True, truncation=True, max_length=BATCH_SIZE)

train_dataset = ClassificationDataset(train_tokens, train_labels)
val_dataset = ClassificationDataset(val_tokens, val_labels)

logging.info('Training Started')
trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir=TRAIN_DIR, num_train_epochs=NUM_EPOCHS),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

model.save_pretrained(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)
logging.info('Training Completed')

print("**************** Evaluation ************")
metrics = trainer.evaluate()
metrics["eval_samples"] = len(val_dataset)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

2022-06-21 17:53:54,585 [INFO] Loading the Dataset


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2022-06-21 17:54:12,809 [INFO] Loading the Metric
2022-06-21 17:54:12,889 [INFO] Loading the pretrained tokenizer and model


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

2022-06-21 17:54:13,917 [INFO] Preparing the training and evaluation dataset
2022-06-21 17:56:48,785 [INFO] Training Started


***** Running training *****
  Num examples = 32580
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16292
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
500,0.2799
1000,0.233
1500,0.2213
2000,0.1848
2500,0.177
3000,0.1805
3500,0.1732
4000,0.1438
4500,0.1168
5000,0.1034


Saving model checkpoint to ./distilbert_train_intermediate/checkpoint-500
Configuration saved in ./distilbert_train_intermediate/checkpoint-500/config.json
Model weights saved in ./distilbert_train_intermediate/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./distilbert_train_intermediate/checkpoint-1000
Configuration saved in ./distilbert_train_intermediate/checkpoint-1000/config.json
Model weights saved in ./distilbert_train_intermediate/checkpoint-1000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./distilbert_train_intermediate/checkpoint-1500
Configuration saved in ./distilbert_train_intermediate/checkpoint-1500/config.json
Model weights saved in ./distilbert_train_intermediate/checkpoint-1500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to .

2022-06-21 18:11:06,809 [INFO] Training Completed


***** Running Evaluation *****
  Num examples = 10860
  Batch size = 8


**************** Evaluation ************


***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =     0.9611
  eval_loss               =      0.262
  eval_runtime            = 0:00:15.25
  eval_samples            =      10860
  eval_samples_per_second =    712.027
  eval_steps_per_second   =     89.036


# Step 3 - Generate the ONNX file

In [11]:
!docker run --gpus=all --rm -it -v `pwd`/workspace-trt:/workspace nvcr.io/nvidia/pytorch:21.08-py3 /bin/bash generate_models.sh


== PyTorch ==

NVIDIA Release 21.08 (build 26011915)
PyTorch Version 1.10.0a0+3fd9dcf

Container image Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.

Copyright (c) 2014-2021 Facebook Inc.
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU                      (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Copyright (c) 2015      Google Inc.
Copyright (c) 2015      Yangqing Jia
Copyright (c) 2013-2016 The Caffe contributors
All rights reserved.

NVIDIA Deep Learning Profiler (dlprof) Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.

Various fil

# Step 4 - Create SageMaker model package and upload it to SageMaker

In [21]:
!mkdir -p triton-serve-trt/bert/1/ 
!cp workspace-trt/model_bs16.plan triton-serve-trt/bert/1/model.plan 
!tar -C triton-serve-trt/ -czf model.tar.gz bert 

import boto3, json, sagemaker, time
from sagemaker import get_execution_role

sess = boto3.Session()
sm = sess.client("sagemaker")
sagemaker_session = sagemaker.Session(boto_session=sess)
role = get_execution_role()
client = boto3.client("sagemaker-runtime")

model_uri = sagemaker_session.upload_data(path="model.tar.gz", key_prefix="triton-serve-trt")

# Step 5 - Create SageMaker Inference endpoint

In [25]:
account_id_map = {
    'us-east-1': '785573368785',
    'us-east-2': '007439368137',
    'us-west-1': '710691900526',
    'us-west-2': '301217895009',
    'eu-west-1': '802834080501',
    'eu-west-2': '205493899709',
    'eu-west-3': '254080097072',
    'eu-north-1': '601324751636',
    'eu-south-1': '966458181534',
    'eu-central-1': '746233611703',
    'ap-east-1': '110948597952',
    'ap-south-1': '763008648453',
    'ap-northeast-1': '941853720454',
    'ap-northeast-2': '151534178276',
    'ap-southeast-1': '324986816169',
    'ap-southeast-2': '355873309152',
    'cn-northwest-1': '474822919863',
    'cn-north-1': '472730292857',
    'sa-east-1': '756306329178',
    'ca-central-1': '464438896020',
    'me-south-1': '836785723513',
    'af-south-1': '774647643957'
}

region = boto3.Session().region_name
if region not in account_id_map.keys():
    raise("UNSUPPORTED REGION")
    
base = "amazonaws.com.cn" if region.startswith("cn-") else "amazonaws.com"
triton_image_uri = "{account_id}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:21.08-py3".format(
    account_id=account_id_map[region], region=region, base=base
)

sm_model_name = "triton-nlp-bert-trt-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

container = {
    "Image": triton_image_uri,
    "ModelDataUrl": model_uri,
    "Environment": {"SAGEMAKER_TRITON_DEFAULT_MODEL_NAME": "bert"},
}

create_model_response = sm.create_model(
    ModelName=sm_model_name, ExecutionRoleArn=role, PrimaryContainer=container
)

print("Model Arn: " + create_model_response["ModelArn"])

endpoint_config_name = "triton-nlp-bert-trt-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "InstanceType": "ml.p3.2xlarge",
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": sm_model_name,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

endpoint_name = "triton-nlp-bert-trt-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

create_endpoint_response = sm.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Model Arn: arn:aws:sagemaker:us-east-1:806460758762:model/triton-nlp-bert-trt-2022-06-21-18-45-34
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:806460758762:endpoint-config/triton-nlp-bert-trt-2022-06-21-18-45-34
Endpoint Arn: arn:aws:sagemaker:us-east-1:806460758762:endpoint/triton-nlp-bert-trt-2022-06-21-18-45-34
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-east-1:806460758762:endpoint/triton-nlp-bert-trt-2022-06-21-18-45-34
Status: InService


# Step 6 - Test Triton SageMaker Inference Endpoint

In [14]:
!pip install retry

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com, https://pypi.ngc.nvidia.com
Collecting retry
  Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)
Installing collected packages: retry
Successfully installed retry-0.9.2


In [30]:
import tritonclient.http as httpclient
from transformers import DistilBertTokenizer
import torch.nn.functional as F 
import numpy as np
from retry import retry
import botocore
import concurrent
import time


enc = DistilBertTokenizer.from_pretrained("./workspace-trt/")
    
def tokenize_text(text):
    encoded_text = enc(clean_text(text), padding="max_length", max_length=128, truncation=True)
    return encoded_text["input_ids"], encoded_text["attention_mask"]


def get_sample_tokenized_text_binary(text):
    inputs = []
    outputs = []
    input_names =  ["input_ids", "attention_mask"]
    output_names = ["logits"]
    
    inputs.append(httpclient.InferInput(input_names[0], [1, 128], "INT32"))
    inputs.append(httpclient.InferInput(input_names[1], [1, 128], "INT32"))
    indexed_tokens, attention_mask = tokenize_text(text)

    indexed_tokens = np.array(indexed_tokens, dtype=np.int32)
    indexed_tokens = np.expand_dims(indexed_tokens, axis=0)
    inputs[0].set_data_from_numpy(indexed_tokens, binary_data=True)

    attention_mask = np.array(attention_mask, dtype=np.int32)
    attention_mask = np.expand_dims(attention_mask, axis=0)
    inputs[1].set_data_from_numpy(attention_mask, binary_data=True)

    outputs.append(httpclient.InferRequestedOutput(output_names[0], binary_data=True))
    outputs.append(httpclient.InferRequestedOutput(output_names[1], binary_data=True))
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(inputs, outputs=outputs)
    return request_body, header_length


@retry(botocore.exceptions.ClientError, tries=5, delay=1)
def get_prediction(text):
    input_ids, attention_mask = tokenize_text(text)

    payload = {
        "inputs": [
            {"name": "input_ids", "shape": [1, 128], "datatype": "INT32", "data": input_ids},
            {"name": "attention_mask", "shape": [1, 128], "datatype": "INT32", "data": attention_mask},
        ]
    }

    response = client.invoke_endpoint(EndpointName=endpoint_name, ContentType="application/octet-stream", Body=json.dumps(payload))

    result = json.loads(response["Body"].read().decode("utf8"))
    predictions = F.softmax(torch.tensor(result['outputs'][0]['data']),dim=-1)
    return torch.argmax(predictions, dim=-1).numpy()
    
test_texts = [
                "Oh k...i'''m watching here:)",
                "As a valued customer, I am pleased to advise you that following recent review of your Mob No. you are awarded with a £1500 Bonus Prize, call 09066364589",
                "I HAVE A DATE ON SUNDAY WITH WILL!!",
                "England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ú1.20 POBOXox36504W45WQ 16+"
]


num_inferences = 1000
start = time.time() 
with concurrent.futures.ThreadPoolExecutor() as exe: 
    fut_list = []
    for _ in range (num_inferences):
        for test_text in test_texts:
            fut = exe.submit(get_prediction, test_text)         
            fut_list.append(fut)     
    for fut in fut_list:         
        rslt = fut.result() 
        
elapsed_time = time.time() - start 
print('num_inferences:{:>6}[texts], elapsed_time:{:6.2f}[sec], Throughput:{:8.2f}[texts/sec]'.format(num_inferences * len(test_texts), elapsed_time, num_inferences * len(test_texts)/ elapsed_time))



Didn't find file ./workspace-trt/added_tokens.json. We won't load it.
loading file ./workspace-trt/vocab.txt
loading file None
loading file ./workspace-trt/special_tokens_map.json
loading file ./workspace-trt/tokenizer_config.json


num_inferences:  4000[texts], elapsed_time: 11.79[sec], Throughput:  339.35[texts/sec]


# Step 7 - Delete the SageMaker Inference Endpoint

In [31]:
sm.delete_endpoint(EndpointName=endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm.delete_model(ModelName=sm_model_name)

{'ResponseMetadata': {'RequestId': 'e54e00b3-d5b4-4cd4-ba5f-690c4794e023',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e54e00b3-d5b4-4cd4-ba5f-690c4794e023',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Tue, 21 Jun 2022 18:56:00 GMT'},
  'RetryAttempts': 0}}

# Step 8 - Zip model

In [None]:
!rm -r ./distilbert_train_intermediate
!rm -r ./distilbert_train_intermediate-torchscript
!zip -r ./sagemaker.zip .