# Step 1 - Installation of the model required libraries

In [1]:
!pip install torch -U
!pip install -U sagemaker
!pip install -qU pip awscli boto3 transformers
!pip3 install pickle5
!pip install datasets
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting torch
  Downloading torch-1.11.0-cp38-cp38-manylinux1_x86_64.whl (750.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m750.6/750.6 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.10.0
    Uninstalling torch-1.10.0:
      Successfully uninstalled torch-1.10.0
Successfully installed torch-1.11.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mLooking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.96.0.tar.gz (534 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m534.4/534.4 KB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ..

# Step 2 - Train the model

In [1]:
import pickle5 as pickle
import logging
import pandas as pd
import sklearn
import nltk
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, TensorDataset
import sys
from datasets import load_metric
import numpy as np

TRAIN_DIR = "./distilbert_train_intermediate-torchscript"
FINAL_DIR = "./workspace-torchscript"
DEFAULT_FILENAME = "./spam_training_dataset_43k.pkl"
BATCH_SIZE = 128
COL_DATA = "text"  # Name of the column with the spam text
LABEL = "is_spam"  # Name of the column with the label 0 (ham) or 1 (spam)
NUM_EPOCHS = 4

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)

def clean_text(text):
    import re
    stopwords = nltk.corpus.stopwords.words('english')
    text = str(text).lower()  # Convert to lower case
    text = re.sub(r'[^\w\s]', '', text)  # Remove everything except words
    words = [word for word in text.split() if word not in stopwords]  # Remove stopwords
    text = " ".join(words)
    return text

def download_dataset():  
    nltk.download("stopwords")

    with open(DEFAULT_FILENAME, "rb") as fh:
        data = pickle.load(fh)
        data = data[[COL_DATA, LABEL]]  
        data[COL_DATA] = data[COL_DATA].apply(clean_text)
        data.reset_index()
        return data

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)    
    
logging.info('Loading the Dataset')
dataset = download_dataset()

logging.info('Loading the Metric')
metric = load_metric("accuracy")

logging.info('Loading the pretrained tokenizer and model')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

logging.info('Preparing the training and evaluation dataset')
train_data, val_data, train_labels, val_labels = train_test_split(dataset[COL_DATA].values, dataset[LABEL].values)
train_tokens = tokenizer(list(train_data), return_tensors="pt", padding=True, truncation=True, max_length=BATCH_SIZE)
val_tokens = tokenizer(list(val_data), return_tensors="pt", padding=True, truncation=True, max_length=BATCH_SIZE)

train_dataset = ClassificationDataset(train_tokens, train_labels)
val_dataset = ClassificationDataset(val_tokens, val_labels)

logging.info('Training Started')
trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir=TRAIN_DIR, num_train_epochs=NUM_EPOCHS),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

model.save_pretrained(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)
logging.info('Training Completed')

print("**************** Evaluation ************")
metrics = trainer.evaluate()
metrics["eval_samples"] = len(val_dataset)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

2022-06-21 13:45:13,480 [INFO] Loading the Dataset


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2022-06-21 13:45:31,658 [INFO] Loading the Metric
2022-06-21 13:45:31,726 [INFO] Loading the pretrained tokenizer and model


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

2022-06-21 13:45:32,722 [INFO] Preparing the training and evaluation dataset
2022-06-21 13:48:07,621 [INFO] Training Started


***** Running training *****
  Num examples = 32580
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16292
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
500,0.2873
1000,0.2046
1500,0.2017
2000,0.1696
2500,0.1577
3000,0.1751
3500,0.1476
4000,0.1686
4500,0.0974
5000,0.1148


Saving model checkpoint to ./distilbert_train_intermediate-torchscript/checkpoint-500
Configuration saved in ./distilbert_train_intermediate-torchscript/checkpoint-500/config.json
Model weights saved in ./distilbert_train_intermediate-torchscript/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./distilbert_train_intermediate-torchscript/checkpoint-1000
Configuration saved in ./distilbert_train_intermediate-torchscript/checkpoint-1000/config.json
Model weights saved in ./distilbert_train_intermediate-torchscript/checkpoint-1000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Saving model checkpoint to ./distilbert_train_intermediate-torchscript/checkpoint-1500
Configuration saved in ./distilbert_train_intermediate-torchscript/checkpoint-1500/config.json
Model weights saved in ./distilbert_train_intermediate-torchscript/checkpoint-1500/pytorch_model.bi

2022-06-21 14:01:38,655 [INFO] Training Completed


***** Running Evaluation *****
  Num examples = 10860
  Batch size = 8


**************** Evaluation ************


***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =     0.9581
  eval_loss               =     0.2565
  eval_runtime            = 0:00:15.01
  eval_samples            =      10860
  eval_samples_per_second =    723.116
  eval_steps_per_second   =     90.423


# Step 3 - Generate the PT file

In [3]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

loaded_model = DistilBertForSequenceClassification.from_pretrained("./workspace-torchscript/", torchscript=True)
device = torch.device("cuda")
loaded_model.to(device)

bs = 1
seq_len = 128
dummy_inputs = [
    torch.randint(1000, (bs, seq_len)).to(device),
    torch.zeros(bs, seq_len, dtype=torch.int).to(device),
]

loaded_model = loaded_model.eval()
loaded_model.to(device)

traced_model = torch.jit.trace(loaded_model, dummy_inputs)
torch.jit.save(traced_model, './spamdetection-torchscript/model/model.pt')

loading configuration file ./workspace-torchscript/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "torchscript": true,
  "transformers_version": "4.20.0",
  "vocab_size": 30522
}

loading weights file ./workspace-torchscript/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the model checkpoint at ./workspa

# Step 4 - Create SageMaker model package and upload it to SageMaker

In [67]:
!cd spamdetection-torchscript/model && tar czvf ../model.tar.gz *

model.pt


# Step 5 - Create SageMaker Inference endpoint

In [68]:
import boto3, json, sagemaker, time
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorchModel
from sagemaker import get_execution_role
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

sess = boto3.Session()
sm = sess.client("sagemaker")
sagemaker_session = sagemaker.Session(boto_session=sess)
role = get_execution_role()

model_uri = sagemaker_session.upload_data(path="./spamdetection-torchscript/model.tar.gz", key_prefix="spamdetection-torchscript")
print(model_uri)

sm_model_name = "spamdetection-torchscript-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

model = PyTorchModel(
              role=role,
              name=sm_model_name,
              sagemaker_session=sagemaker_session,
              model_data=model_uri,
              framework_version='1.11.0',
              py_version='py38',
              entry_point="serve.py",
              source_dir="spamdetection-torchscript",
              )

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.p3.2xlarge",
    endpoint_name=sm_model_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),    
)

s3://sagemaker-us-east-1-806460758762/spamdetection-torchscript/model.tar.gz
2022-06-21 16:32:41,690 [INFO] Creating model with name: spamdetection-torchscript-2022-06-21-16-31-54
2022-06-21 16:32:41,975 [INFO] Creating endpoint-config with name spamdetection-torchscript-2022-06-21-16-31-54
2022-06-21 16:32:42,050 [INFO] Creating endpoint with name spamdetection-torchscript-2022-06-21-16-31-54
------------!

# Step 6 - Test Inference Point

In [9]:
!pip install retry

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting retry
  Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)
Installing collected packages: retry
Successfully installed retry-0.9.2


In [87]:
from retry import retry
import botocore
import concurrent
import torch.nn.functional as F    

def tokenize_text(text):
    encoded_text = enc(clean_text(text), padding="max_length", max_length=128, truncation=True)
    return encoded_text["input_ids"], encoded_text["attention_mask"]

@retry(botocore.exceptions.ClientError, tries=5, delay=1)
def get_prediction(test_text):
    input_ids, attention_mask = tokenize_text(test_text)
    result = predictor.predict({"input_ids" : input_ids, "attention_mask" : attention_mask}, initial_args={'ContentType': 'application/json'})
    predictions = F.softmax(torch.tensor(result[0]),dim=-1)
    return torch.argmax(predictions, dim=-1).numpy()

enc = DistilBertTokenizer.from_pretrained("./workspace-torchscript")

test_texts = [
                "Oh k...i'''m watching here:)",
                "As a valued customer, I am pleased to advise you that following recent review of your Mob No. you are awarded with a £1500 Bonus Prize, call 09066364589",
                "I HAVE A DATE ON SUNDAY WITH WILL!!",
                "England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ú1.20 POBOXox36504W45WQ 16+"
]

num_inferences = 1000
start = time.time() 
with concurrent.futures.ThreadPoolExecutor() as exe: 
    fut_list = []
    for _ in range (num_inferences):
        for test_text in test_texts:
            fut = exe.submit(get_prediction, test_text)         
            fut_list.append(fut)     
    for fut in fut_list:         
        rslt = fut.result() 
        
elapsed_time = time.time() - start 
print('num_inferences:{:>6}[texts], elapsed_time:{:6.2f}[sec], Throughput:{:8.2f}[texts/sec]'.format(num_inferences * len(test_texts), elapsed_time, num_inferences * len(test_texts)/ elapsed_time))


Didn't find file ./workspace-torchscript/added_tokens.json. We won't load it.
loading file ./workspace-torchscript/vocab.txt
loading file None
loading file ./workspace-torchscript/special_tokens_map.json
loading file ./workspace-torchscript/tokenizer_config.json


num_inferences:  4000[texts], elapsed_time: 28.49[sec], Throughput:  140.40[texts/sec]


# Step 7 - Delete the Inference Endpoint

In [None]:
predictor.delete_endpoint()
predictor.delete_model(ModelName=sm_model_name)