## Imports and Variables

In [5]:
import os

import sagemaker
from dotenv import load_dotenv
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import CategoricalParameter, HyperparameterTuner

# Load variables from a local .env file before anything reads the environment.
load_dotenv()

# Session used to resolve the account's default S3 bucket.
sess = sagemaker.Session()

# Both variables are required: a missing one raises KeyError right here.
ROLE = os.environ["SM_ARN_ROLE"]
WAND_API_KEY = os.environ["WANDB_API_KEY"]

# Instance type shared by the cloud training / tuning cells below.
instance_type = "ml.c4.xlarge"

# S3 prefixes for model artifacts and for the uploaded training source.
# NOTE(review): code_location nests a second "digit_classification" segment
# under ".../digit_classification/models" - confirm this layout is intended.
output_path = "s3://" + sess.default_bucket() + "/digit_classification/models"
code_location = f"{output_path}/digit_classification/source"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/xanier/.config/sagemaker/config.yaml


In [4]:
# Base estimator reused by every tuning trial. Only the hyperparameters that
# stay fixed are listed here; "lr" and "batch_size" are supplied by the tuner.
estimator = TensorFlow(
    role=ROLE,
    entry_point="train.py",
    source_dir="../digit_classification",
    framework_version="2.3.1",
    py_version="py37",
    model_dir="/opt/ml/model",
    instance_count=1,
    instance_type=instance_type,
    output_path=output_path,
    code_location=code_location,
    hyperparameters=dict(epochs=1, beta_1=0.9, beta_2=0.999),
    environment={"WANDB_API_KEY": WAND_API_KEY},
)

# Objective the tuner optimises.
# NOTE(review): this ranks trials by *training* loss; the early-stopping remark
# below suggests test loss/accuracy may be the intended target - confirm.
objective_metric_name = "train loss"
objective_type = "Minimize"

# Each metric is scraped from the training log with a "<name>: <number>" regex.
metric_definitions = [
    {"Name": metric, "Regex": f"{metric}: ([0-9\\.]+)"}
    for metric in ("train loss", "train accuracy", "test loss", "test accuracy")
]

# Discrete search space explored by the tuner.
hyperparameter_ranges = dict(
    lr=CategoricalParameter([0.0001, 0.001, 0.01]),
    batch_size=CategoricalParameter([128, 256, 512]),
)

tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=1,
    max_parallel_jobs=1,
    # Early stopping can be enabled with "Auto"; it should then be driven by
    # test loss/accuracy rather than a training metric.
    early_stopping_type="Off",
    # Set to True to let SageMaker choose the tuning configuration automatically.
    autotune=False,
)

# Launch the tuning job, then report the winning trial and its artifact.
tuner.fit()
print(f"best model job is at {tuner.best_training_job()}")
print(f"best model is saved at {tuner.best_estimator().model_data}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/xanier/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating hyperparameter tuning job with name: tensorflow-training-230925-0155


..

KeyboardInterrupt: 

## Train One Model Locally or on the Cloud

If you want to test your code locally, or train a single model with SageMaker using a specified set of hyperparameters, you can run the block below. Unfortunately, SageMaker does not support hyperparameter tuning in local mode.

In [6]:
# Toggle: True trains inside a local Docker container ("local" instance type);
# False submits a regular SageMaker training job on the shared `instance_type`.
local_mode = True

# Derive this run's settings from the shared configuration instead of
# overwriting it. Reassigning `instance_type` / `code_location` here would leak
# "local" / None into every later re-run of this cell (even with
# `local_mode = False`) and into the hyperparameter-tuning cell.
train_instance_type = "local" if local_mode else instance_type
train_code_location = None if local_mode else code_location
train_output_path = f"s3://{sess.default_bucket()}/digit_classification/models"
# if local_mode:
#     train_output_path = "file://models/digit_classification"  ## Uncomment if you want to skip uploading to cloud

estimator = TensorFlow(
    entry_point="train.py",
    source_dir="../digit_classification",
    role=ROLE,
    framework_version="2.3.1",
    # False stops the SDK from passing a model_dir argument to train.py.
    model_dir=False,
    py_version="py37",
    instance_type=train_instance_type,
    instance_count=1,
    volume_size=50,
    output_path=train_output_path,
    code_location=train_code_location,
    # Fixed hyperparameters for this single training run (no tuner involved).
    hyperparameters={
        "batch_size": 256,
        "epochs": 5,
        "lr": 1e-3,
        "beta_1": 0.9,
        "beta_2": 0.999,
    },
    environment={"WANDB_API_KEY": WAND_API_KEY},
)

# Run the training job, then record where the model artifact ended up.
estimator.fit()
tf_mnist_model_data = estimator.model_data
print("Model artifact saved at:\n", tf_mnist_model_data)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/xanier/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/xanier/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2023-09-25-05-55-53-638
INFO:sagemaker.local.image:'Docker Compose' found using Docker CLI.
INFO:sagemaker.local.local_session:Starting training job
INFO:sagemaker.local.image:Using the long-lived AWS credentials found in session
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-r6kgf:
    command: train
    container_name: 1la2huhuop-algo-1-r6kgf
    environment:
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    image: 763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.3.1-cpu-py

 Container 1la2huhuop-algo-1-r6kgf  Creating
 Container 1la2huhuop-algo-1-r6kgf  Created
Attaching to 1la2huhuop-algo-1-r6kgf
1la2huhuop-algo-1-r6kgf  | 2023-09-25 05:55:56.616539: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
1la2huhuop-algo-1-r6kgf  | 2023-09-25 05:55:56.616694: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
1la2huhuop-algo-1-r6kgf  | 2023-09-25 05:55:56.637703: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
1la2huhuop-algo-1-r6kgf  | 2023-09-25 05:55:57,569 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training
1la2huhuop-algo-1-r6kgf  | 2023-09-25 05:55:57,584 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
1la2huhuop-algo-1-r6kgf  | 2023-09-25 05

INFO:root:copying /tmp/tmpdh82t4wn/algo-1-r6kgf/output/success -> /tmp/tmpdh82t4wn/artifacts/output
INFO:root:creating /tmp/tmpdh82t4wn/artifacts/output/data
INFO:root:creating /tmp/tmpdh82t4wn/artifacts/model/mnist
INFO:root:copying /tmp/tmpdh82t4wn/model/mnist/config.pbtxt -> /tmp/tmpdh82t4wn/artifacts/model/mnist
INFO:root:creating /tmp/tmpdh82t4wn/artifacts/model/mnist/1
INFO:root:creating /tmp/tmpdh82t4wn/artifacts/model/mnist/1/model.savedmodel
INFO:root:creating /tmp/tmpdh82t4wn/artifacts/model/mnist/1/model.savedmodel/assets
INFO:root:copying /tmp/tmpdh82t4wn/model/mnist/1/model.savedmodel/saved_model.pb -> /tmp/tmpdh82t4wn/artifacts/model/mnist/1/model.savedmodel
INFO:root:creating /tmp/tmpdh82t4wn/artifacts/model/mnist/1/model.savedmodel/variables
INFO:root:copying /tmp/tmpdh82t4wn/model/mnist/1/model.savedmodel/variables/variables.data-00000-of-00001 -> /tmp/tmpdh82t4wn/artifacts/model/mnist/1/model.savedmodel/variables
INFO:root:copying /tmp/tmpdh82t4wn/model/mnist/1/model.

1la2huhuop-algo-1-r6kgf exited with code 0
Aborting on container exit...
 Container 1la2huhuop-algo-1-r6kgf  Stopping
 Container 1la2huhuop-algo-1-r6kgf  Stopped
===== Job Complete =====
Model artifact saved at:
 s3://sagemaker-us-east-1-633875729936/digit_classification/models/tensorflow-training-2023-09-25-05-55-53-638/model.tar.gz
