## Imports and Variables

In [1]:
import os

import sagemaker
from dotenv import load_dotenv
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import CategoricalParameter, HyperparameterTuner

# Load variables from a local .env file (python-dotenv) before reading os.environ below.
load_dotenv()

# One SageMaker session is shared by the cells in this notebook; its default
# bucket is where model artifacts and uploaded source code are stored.
sess = sagemaker.Session()

# Required settings: indexing os.environ raises KeyError if either is missing.
ROLE = os.environ["SM_ARN_ROLE"]
WAND_API_KEY = os.environ["WANDB_API_KEY"]

# Instance type used for the cloud training jobs.
instance_type = "ml.c4.xlarge"

# S3 prefix for model artifacts; the packaged training source is uploaded
# underneath that same prefix.
output_path = f"s3://{sess.default_bucket()}/digit_classification/models"
code_location = f"{output_path}/digit_classification/source"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## Hyperparameter Tuning

In [2]:
# Base estimator: everything here is held fixed across the tuning jobs; only
# the entries of `hyperparameter_ranges` below vary from job to job.
estimator = TensorFlow(
    role=ROLE,
    source_dir="../digit_classification",
    entry_point="train.py",
    framework_version="2.3.1",
    py_version="py37",
    model_dir="/opt/ml/model",
    instance_count=1,
    instance_type=instance_type,
    code_location=code_location,
    output_path=output_path,
    hyperparameters=dict(epochs=3, beta_1=0.9, beta_2=0.999),
    environment=dict(WANDB_API_KEY=WAND_API_KEY),
)

# Search space: both hyperparameters are tuned over a small discrete set.
hyperparameter_ranges = dict(
    lr=CategoricalParameter([0.0001, 0.001, 0.01]),
    batch_size=CategoricalParameter([128, 256, 512]),
)

# The tuner minimizes the training loss reported in the job logs.
objective_metric_name, objective_type = "train loss", "Minimize"

# Each metric is scraped from the training log with "<metric name>: <number>".
metric_definitions = [
    {"Name": metric, "Regex": metric + ": ([0-9\\.]+)"}
    for metric in ("train loss", "train accuracy", "test loss", "test accuracy")
]

# At most 3 jobs in total, run one at a time.
# early_stopping_type: set to 'AUTO' to enable early stopping (in that case the
# objective metric should be the test loss/accuracy rather than a train metric).
# autotune: set to True to let SageMaker do automatic tuning.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=3,
    max_parallel_jobs=1,
    early_stopping_type="Off",
    autotune=False,
)

# Launch the tuning job, then report the winning training job and its artifact.
tuner.fit()
print(f"best model job is at {tuner.best_training_job()}")
print(f"best model is saved at {tuner.best_estimator().model_data}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
Using provided s3_resource


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.......................................................................................................................................................................!
best model job is at tensorflow-training-230926-0310-002-944c49e0

2023-09-26 03:20:51 Starting - Found matching resource for reuse
2023-09-26 03:20:51 Downloading - Downloading input data
2023-09-26 03:20:51 Training - Training image download completed. Training in progress.
2023-09-26 03:20:51 Uploading - Uploading generated training model
2023-09-26 03:20:51 Completed - Resource reused by training job: tensorflow-training-230926-0310-003-c66ba9ba
best model is saved at s3://sagemaker-us-east-1-633875729936/digit_classification/models/tensorflow-training-230926-0310-002-944c49e0/output/model.tar.gz


## Train One Model Locally or on the Cloud

If you want to test your code locally, or train a single model with SageMaker using a specified set of hyperparameters, you can run the block below. Unfortunately, SageMaker does not support hyperparameter tuning in local mode.

In [4]:
# True  -> run the training container on this machine (SageMaker local mode).
# False -> submit the same training job to a managed cloud instance.
local_mode = True

# Default artifact destination (S3); the local branch below may override it.
output_path = f"s3://{sess.default_bucket()}/digit_classification/models"

# Every mode-dependent setting is assigned in BOTH branches so this cell is
# idempotent. Previously only the local branch set `instance_type` and
# `code_location`, so after one local run, switching `local_mode` to False and
# re-running this cell still used the leftover "local" / None values.
if local_mode:
    instance_type = "local"
    output_path = "file://models/digit_classification"  ## comment it if you want to upload the model to the cloud for production
    code_location = None
else:
    instance_type = "ml.c4.xlarge"
    code_location = output_path + "/digit_classification/source"

# Single training job with explicit hyperparameters (no tuner involved).
estimator = TensorFlow(
    entry_point="train.py",
    source_dir="../digit_classification",
    role=ROLE,
    framework_version="2.3.1",
    # NOTE(review): per the SageMaker SDK docs, model_dir=False means no
    # model_dir argument is passed to the training script - confirm train.py
    # does not rely on it.
    model_dir=False,
    py_version="py37",
    instance_type=instance_type,
    instance_count=1,
    volume_size=50,
    output_path=output_path,
    code_location=code_location,
    hyperparameters={
        "batch_size": 256,
        "epochs": 1,
        "lr": 1e-3,
        "beta_1": 0.9,
        "beta_2": 0.999,
    },
    environment={"WANDB_API_KEY": WAND_API_KEY},
)

# Run the job, then report where the packaged model (model.tar.gz) was written.
estimator.fit()
tf_mnist_model_data = estimator.model_data
print("Model artifact saved at:\n", tf_mnist_model_data)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2023-09-26-03-27-07-653
INFO:sagemaker.local.image:'Docker Compose' found using Docker CLI.
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:Using the long-lived AWS credentials found in session
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-6rne8:
    command: train
    container_name: uwaro26ocq-algo-1-6rne8
    environment:
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.3.1-cpu-py

 Container uwaro26ocq-algo-1-6rne8  Creating
 Container uwaro26ocq-algo-1-6rne8  Created
Attaching to uwaro26ocq-algo-1-6rne8
uwaro26ocq-algo-1-6rne8  | 2023-09-26 03:27:09.581509: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
uwaro26ocq-algo-1-6rne8  | 2023-09-26 03:27:09.581649: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
uwaro26ocq-algo-1-6rne8  | 2023-09-26 03:27:09.601823: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
uwaro26ocq-algo-1-6rne8  | 2023-09-26 03:27:10,529 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
uwaro26ocq-algo-1-6rne8  | 2023-09-26 03:27:10,546 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
uwaro26ocq-algo-1-6rne8  | 2023-09-26 03

INFO:root:copying /tmp/tmpzzhbu2i7/algo-1-6rne8/output/success -> /tmp/tmpzzhbu2i7/artifacts/output
INFO:root:creating /tmp/tmpzzhbu2i7/artifacts/output/data
INFO:root:creating /tmp/tmpzzhbu2i7/artifacts/model/mnist
INFO:root:copying /tmp/tmpzzhbu2i7/model/mnist/config.pbtxt -> /tmp/tmpzzhbu2i7/artifacts/model/mnist
INFO:root:creating /tmp/tmpzzhbu2i7/artifacts/model/mnist/1
INFO:root:creating /tmp/tmpzzhbu2i7/artifacts/model/mnist/1/model.savedmodel
INFO:root:creating /tmp/tmpzzhbu2i7/artifacts/model/mnist/1/model.savedmodel/assets
INFO:root:copying /tmp/tmpzzhbu2i7/model/mnist/1/model.savedmodel/saved_model.pb -> /tmp/tmpzzhbu2i7/artifacts/model/mnist/1/model.savedmodel
INFO:root:creating /tmp/tmpzzhbu2i7/artifacts/model/mnist/1/model.savedmodel/variables
INFO:root:copying /tmp/tmpzzhbu2i7/model/mnist/1/model.savedmodel/variables/variables.data-00000-of-00001 -> /tmp/tmpzzhbu2i7/artifacts/model/mnist/1/model.savedmodel/variables
INFO:root:copying /tmp/tmpzzhbu2i7/model/mnist/1/model.

uwaro26ocq-algo-1-6rne8 exited with code 0
Aborting on container exit...
 Container uwaro26ocq-algo-1-6rne8  Stopping
 Container uwaro26ocq-algo-1-6rne8  Stopped


INFO:root:copying /tmp/tmpzzhbu2i7/compressed_artifacts/model.tar.gz -> /app/sagemaker_scripts/models/digit_classification
INFO:root:copying /tmp/tmpzzhbu2i7/compressed_artifacts/output.tar.gz -> /app/sagemaker_scripts/models/digit_classification


===== Job Complete =====
Model artifact saved at:
 file://models/digit_classification/model.tar.gz


## Deploy Nvidia Triton locally


In [5]:
import os
import time

working_directory = os.getcwd()
model_path = os.path.join(working_directory, "models/digit_classification/model.tar.gz")

# unzip the model:
!tar -xzf $model_path -C /tmp/models/digit_classification

# run docker nvidia triton server
command = f"""
            docker run --gpus=all --shm-size=24g --ulimit memlock=-1 --rm -d --net host \
            -p 8000:8000 -p 8001:8001 -p 8002:8002 -p 8080:8080 \
            --ulimit stack=67108864 \
            --env SAGEMAKER_MULTI_MODEL=false \
            --env SAGEMAKER_TRITON_DEFAULT_MODEL_NAME="mnist" \
            -v /tmp/models/digit_classification:/opt/ml/model \
            --name triton-server -t nvcr.io/nvidia/tritonserver:23.08-py3 \
            tritonserver --model-repository /opt/ml/model
          """
!docker pull nvcr.io/nvidia/tritonserver:23.08-py3
!{command}
time.sleep(2)
!docker logs triton-server

23.08-py3: Pulling from nvidia/tritonserver
Digest: sha256:b88ed20fe7daa16c4d45cbc931b3e3fc18be8b206a4aac6f04e40022151eaa93
Status: Image is up to date for nvcr.io/nvidia/tritonserver:23.08-py3
nvcr.io/nvidia/tritonserver:23.08-py3
a78fcc72ae771455b889d4de4675c0d48f02dde8a2b4fe54d37e104dfba36548

== Triton Inference Server ==

NVIDIA Release 23.08 (build 66820947)
Triton Server Version 2.37.0

Copyright (c) 2018-2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.

Various files include modifications (c) NVIDIA CORPORATION & AFFILIATES.  All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

I0926 03:27:42.512057 1 libtorch.cc:2507] TRITONBACKEND_Initialize: pytorch
I0926 03:27:42.512101 1 libtorch.cc:2517] Triton TRITONBACKEND API version: 1.15
I0926 03:

## Test model speed using perf_analyzer


In [6]:
# Fetch the Triton SDK image; the next line runs the perf_analyzer tool from it.
!docker pull nvcr.io/nvidia/tritonserver:23.08-py3-sdk
# Benchmark the model named "mnist", declaring the shape of its "input_1" input
# as 1,28,28,1. --net host puts the container on the host's network, presumably
# so it can reach the Triton server started in the previous section - confirm
# that server is up before running this cell.
!docker run --gpus all --rm --net host nvcr.io/nvidia/tritonserver:23.08-py3-sdk perf_analyzer -m mnist --shape input_1:1,28,28,1


23.08-py3-sdk: Pulling from nvidia/tritonserver
Digest: sha256:a0c56ad380d1d9c19c87cbcd3940ab27d511ca979d3538f357e4e1be8cdba94f
Status: Image is up to date for nvcr.io/nvidia/tritonserver:23.08-py3-sdk
nvcr.io/nvidia/tritonserver:23.08-py3-sdk

== Triton Inference Server SDK ==

NVIDIA Release 23.08 (build 66821657)

Copyright (c) 2018-2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.

Various files include modifications (c) NVIDIA CORPORATION & AFFILIATES.  All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

*** Measurement Settings ***
  Batch size: 1
  Service Kind: Triton
  Using "time_windows" mode for stabilization
  Measurement window: 5000 msec
  Using synchronous calls for inference
  Stabilizing using average latency

Request concurrency: 