## Imports and Variables

In [8]:
import os

import sagemaker
from dotenv import load_dotenv
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import CategoricalParameter, HyperparameterTuner

# Run python-dotenv's loader first so the os.environ lookups below can see its values.
load_dotenv()

# Default instance type handed to the TensorFlow estimators defined further down.
instance_type = "ml.c4.xlarge"

sess = sagemaker.Session()

# Indexing os.environ (rather than .get()) makes a missing variable fail right here
# with a KeyError instead of surfacing later inside a training job.
ROLE = os.environ["SM_ARN_ROLE"]
WAND_API_KEY = os.environ["WANDB_API_KEY"]

# S3 destinations inside the session's default bucket.
output_path = f"s3://{sess.default_bucket()}/digit_classification/models"
# NOTE(review): this nests ".../models/digit_classification/source" under output_path,
# repeating the "digit_classification" segment — confirm that layout is intended.
code_location = output_path + "/digit_classification/source"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/xanier/.config/sagemaker/config.yaml


In [6]:
# Hyperparameter tuning only runs as a managed SageMaker job. The single-model cell
# further down can switch to local mode, so fail fast with a clear message if this
# cell is executed while `instance_type` still holds a local-mode value.
if instance_type.startswith("local"):
    raise RuntimeError(
        "Hyperparameter tuning is not supported in SageMaker local mode, but "
        f"instance_type is {instance_type!r}. Re-run the configuration cell to restore "
        "instance_type, output_path and code_location before tuning."
    )

# Base estimator: every tuning job runs train.py with these fixed hyperparameters,
# plus the tuned ones ("lr", "batch_size") supplied by the tuner.
estimator = TensorFlow(
    entry_point="train.py",
    source_dir="../digit_classification",
    role=ROLE,
    framework_version="2.3.1",
    model_dir="/opt/ml/model",
    py_version="py37",
    instance_type=instance_type,
    instance_count=1,
    output_path=output_path,
    code_location=code_location,
    environment={"WANDB_API_KEY": WAND_API_KEY},
    hyperparameters={
        "epochs": 1,
        "beta_1": 0.9,
        "beta_2": 0.999,
    },
)

# Search space: 3 learning rates x 3 batch sizes, all treated as categorical choices.
hyperparameter_ranges = {
    "lr": CategoricalParameter([0.0001, 0.001, 0.01]),
    "batch_size": CategoricalParameter([128, 256, 512]),
}

# The tuner ranks jobs by the objective metric, which must be one of the names
# declared in metric_definitions.
objective_metric_name = "train loss"
objective_type = "Minimize"

# Each regex is matched against the training job's log output; the capture group
# is the metric value.
# NOTE(review): assumes train.py logs lines such as "train loss: 0.1234" — confirm.
metric_definitions = [
    {"Name": "train loss", "Regex": "train loss: ([0-9\\.]+)"},
    {"Name": "train accuracy", "Regex": "train accuracy: ([0-9\\.]+)"},
    {"Name": "test loss", "Regex": "test loss: ([0-9\\.]+)"},
    {"Name": "test accuracy", "Regex": "test accuracy: ([0-9\\.]+)"},
]

tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    max_jobs=1,  # only one of the nine combinations is tried; raise to explore more
    max_parallel_jobs=1,
    objective_type=objective_type,
    early_stopping_type="Off",  # we can turn on early stopping by setting to 'AUTO' (we need to do it on test loss/accuracy)
    autotune=False,  # we can turn this to true to make it automatic tuning
)

# Launch the tuning job, then report the winning training job and its model artifact.
tuner.fit()
print(f"best model job is at {tuner.best_training_job()}")
print(f"best model is saved at {tuner.best_estimator().model_data}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/xanier/.config/sagemaker/config.yaml
Using provided s3_resource


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...................................................!
best model job is at tensorflow-training-230924-2059-001-5cdc8dfe

2023-09-25 01:03:50 Starting - Preparing the instances for training
2023-09-25 01:03:50 Downloading - Downloading input data
2023-09-25 01:03:50 Training - Training image download completed. Training in progress.
2023-09-25 01:03:50 Uploading - Uploading generated training model
2023-09-25 01:03:50 Completed - Resource retained for reuse
best model is saved at s3://sagemaker-us-east-1-633875729936/digit_classification/models/tensorflow-training-230924-2059-001-5cdc8dfe/output/model.tar.gz


## Train One Model Locally or on the Cloud

If you want to test your code locally and train it using SageMaker with a specified set of hyperparameters, you can run the block below. Unfortunately, SageMaker does not support hyperparameter tuning locally.

In [7]:
# Toggle: True trains with instance_type "local"; False launches a SageMaker
# training job using the instance type and S3 locations from the configuration cell.
local_mode = True

# Derive this cell's settings under their own names instead of overwriting the shared
# `instance_type` / `output_path` / `code_location` globals. Overwriting them left
# `instance_type == "local"` and `code_location is None` behind, so flipping
# `local_mode` back to False (or re-running the tuning cell) silently reused the
# local-mode values.
if local_mode:
    train_instance_type = "local"
    # A file:// output path keeps the artifacts on local disk instead of uploading to S3.
    train_output_path = "file://models/digit_classification"
    train_code_location = None
else:
    train_instance_type = instance_type
    train_output_path = f"s3://{sess.default_bucket()}/digit_classification/models"
    train_code_location = code_location

estimator = TensorFlow(
    entry_point="train.py",
    source_dir="../digit_classification",
    role=ROLE,
    framework_version="2.3.1",
    # NOTE(review): per the SageMaker TensorFlow estimator docs, False stops the SDK from
    # passing a model_dir argument to the script — confirm train.py does not require it.
    model_dir=False,
    py_version="py37",
    instance_type=train_instance_type,
    instance_count=1,
    volume_size=50,
    output_path=train_output_path,
    code_location=train_code_location,
    # Unlike the tuning cell, every hyperparameter is pinned explicitly here.
    hyperparameters={
        "batch_size": 512,
        "epochs": 1,
        "lr": 1e-3,
        "beta_1": 0.9,
        "beta_2": 0.999,
    },
    environment={"WANDB_API_KEY": WAND_API_KEY},
)

# Run the training job, then report where the model artifact was written.
estimator.fit()
tf_mnist_model_data = estimator.model_data
print("Model artifact saved at:\n", tf_mnist_model_data)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/xanier/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/xanier/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2023-09-25-01-05-27-479
INFO:sagemaker.local.image:'Docker Compose' found using Docker CLI.
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:Using the long-lived AWS credentials found in session
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-boq2n:
    command: train
    container_name: exwx5s2g3r-algo-1-boq2n
    environment:
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.3.1-cpu-py37
    networks:
      sagemaker-local:
        aliases:
        - algo-1-boq2n
    stdin_open: true
    tty: true
    volumes:
    - /tmp/tmpea8id_m3/algo-1-boq2n/input:/opt/ml/input
    - /tmp/tmpea8id_m

 Container exwx5s2g3r-algo-1-boq2n  Creating
 Container exwx5s2g3r-algo-1-boq2n  Created
Attaching to exwx5s2g3r-algo-1-boq2n
exwx5s2g3r-algo-1-boq2n  | 2023-09-25 01:05:30.906279: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
exwx5s2g3r-algo-1-boq2n  | 2023-09-25 01:05:30.906441: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
exwx5s2g3r-algo-1-boq2n  | 2023-09-25 01:05:30.926916: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
exwx5s2g3r-algo-1-boq2n  | 2023-09-25 01:05:31,832 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
exwx5s2g3r-algo-1-boq2n  | 2023-09-25 01:05:31,846 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
exwx5s2g3r-algo-1-boq2n  | 2023-09-25 01

INFO:root:copying /tmp/tmpea8id_m3/algo-1-boq2n/output/success -> /tmp/tmpea8id_m3/artifacts/output
INFO:root:creating /tmp/tmpea8id_m3/artifacts/output/data
INFO:root:creating /tmp/tmpea8id_m3/artifacts/model/mnist
INFO:root:copying /tmp/tmpea8id_m3/model/mnist/config.pbtxt -> /tmp/tmpea8id_m3/artifacts/model/mnist
INFO:root:creating /tmp/tmpea8id_m3/artifacts/model/mnist/1
INFO:root:creating /tmp/tmpea8id_m3/artifacts/model/mnist/1/model.savedmodel
INFO:root:creating /tmp/tmpea8id_m3/artifacts/model/mnist/1/model.savedmodel/assets
INFO:root:copying /tmp/tmpea8id_m3/model/mnist/1/model.savedmodel/saved_model.pb -> /tmp/tmpea8id_m3/artifacts/model/mnist/1/model.savedmodel
INFO:root:creating /tmp/tmpea8id_m3/artifacts/model/mnist/1/model.savedmodel/variables
INFO:root:copying /tmp/tmpea8id_m3/model/mnist/1/model.savedmodel/variables/variables.data-00000-of-00001 -> /tmp/tmpea8id_m3/artifacts/model/mnist/1/model.savedmodel/variables
INFO:root:copying /tmp/tmpea8id_m3/model/mnist/1/model.

exwx5s2g3r-algo-1-boq2n exited with code 0
Aborting on container exit...
 Container exwx5s2g3r-algo-1-boq2n  Stopping
 Container exwx5s2g3r-algo-1-boq2n  Stopped


INFO:root:creating /home/xanier/git/test_ci_cd/sagemaker_scripts/digit_classification
INFO:root:creating /home/xanier/git/test_ci_cd/sagemaker_scripts/digit_classification/models
INFO:root:copying /tmp/tmpea8id_m3/compressed_artifacts/model.tar.gz -> /home/xanier/git/test_ci_cd/sagemaker_scripts/digit_classification/models
INFO:root:copying /tmp/tmpea8id_m3/compressed_artifacts/output.tar.gz -> /home/xanier/git/test_ci_cd/sagemaker_scripts/digit_classification/models


===== Job Complete =====
Model artifact saved at:
 file://digit_classification/models/model.tar.gz
