In [9]:
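# Optional one-time environment setup: uncomment to (re)install the pinned SDK versions.
# `%pip` installs into the environment of the running kernel.
# %pip install --upgrade sagemaker sagemaker-experiments botocore==1.24.42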

In [1]:
import os
from datetime import datetime

import boto3
from sagemaker import analytics, image_uris
from sagemaker.tensorflow import TensorFlow
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker
from sagemaker import get_execution_role
from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    TensorBoardOutputConfig,
    CollectionConfig,
    ProfilerConfig,
    FrameworkProfile,
    DetailedProfilingConfig,
    DataloaderProfilingConfig,
    rule_configs,
)
from smdebug.core.collection import CollectionKeys

Extension horovod.torch has not been built: /home/ec2-user/anaconda3/envs/tensorflow2_p38/lib/python3.8/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-38-x86_64-linux-gnu.so not found
If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.


In [2]:
# One boto3 session provides both the region name and the SageMaker client.
boto_sess = boto3.Session()
region = boto_sess.region_name
sm = boto_sess.client('sagemaker')

s3_bucket = "s3://jbsnyder-sagemaker-us-east/"

base_job_name = "jbsnyder-tf-resnet-debugger"

# Take a single timestamp so date_str and time_str can never disagree
# (two separate now() calls could straddle midnight).
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")
job_name = f"{base_job_name}-{time_str}"

# S3 prefixes for this run: <bucket>/<kind>/<date>/<job name>.
# NOTE(review): os.path.join only yields "/"-separated URIs on POSIX hosts.
output_path = os.path.join(s3_bucket, "sagemaker-output", date_str, job_name)
code_location = os.path.join(s3_bucket, "sagemaker-code", date_str, job_name)

In [3]:
# Get the experiment that groups every run of this job family: create it on
# first use, otherwise fall back to loading the existing one.
try:  # Create new experiment
    experiment = Experiment.create(
        experiment_name=base_job_name,
        description='Resnet50 Classifier Training',
        sagemaker_boto_client=sm)
except sm.exceptions.ClientError:  # Or reload existing
    # Only service-side errors trigger the fallback; KeyboardInterrupt and
    # programming errors are no longer swallowed. If the load fails as well,
    # its error propagates.
    experiment = Experiment.load(
        experiment_name=base_job_name,
        sagemaker_boto_client=sm)

# Register one trial per job under the experiment; its name is the job name.
trial = Trial.create(
    experiment_name=experiment.experiment_name,
    trial_name=job_name,
    sagemaker_boto_client=sm,
)

# Passed to estimator.fit() so the training job is attached to this trial.
experiment_config = dict(
    TrialName=trial.trial_name,
    TrialComponentDisplayName='Training',
)

In [4]:
# TensorBoard output goes to a "tensorboard" prefix under this run's output path.
tensorboard_s3_path = os.path.join(output_path, 'tensorboard')
tensorboard_output_config = TensorBoardOutputConfig(s3_output_path=tensorboard_s3_path)

In [5]:
# Debugger tensor collections. Both are mean-reduced; losses are saved with
# save_interval 25 and gradients with save_interval 100.
losses_collection = CollectionConfig(
    name="losses",
    parameters={"save_interval": "25", "reductions": "mean"},
)
gradients_collection = CollectionConfig(
    name=CollectionKeys.GRADIENTS,
    parameters={"save_interval": "100", "reductions": "mean"},
)
collection_configs = [losses_collection, gradients_collection]

# Hook configuration handed to the estimator.
debugger_hook_config = DebuggerHookConfig(collection_configs=collection_configs)

In [6]:
# Profiler settings: system monitor interval of 500 ms.
profiler_config = ProfilerConfig(system_monitor_interval_millis=500)

In [7]:
# Hyperparameters passed to the training script.
# The data directories point at the TFRecords in S3; the local alternatives
# would be "/opt/ml/input/data/train/" and "/opt/ml/input/data/val/".
imagenet_tfrecords = os.path.join(s3_bucket, "data", "imagenet", "tfrecord")

hyperparameters = {
    # data locations
    "train_data_dir": os.path.join(imagenet_tfrecords, "train"),
    "validation_data_dir": os.path.join(imagenet_tfrecords, "validation"),
    # schedule
    "batch_size": 512,
    "num_epochs": 5,
    "model_dir": "/opt/ml/model",
    # optimisation / regularisation
    "learning_rate": 0.08,
    "momentum": 0.9,
    "label_smoothing": 0.1,
    "mixup_alpha": 0.1,
    "l2_weight_decay": 1e-4,
    # numerics / compilation switches
    "fp16": True,
    "xla": False,
    "tf32": False,
    # architecture
    "model": "resnet50",
}

In [8]:
# Script inside source_dir that the estimator runs, with MPI distribution enabled.
entry_point = "train.py"
distribution = dict(mpi=dict(enabled=True))

In [9]:
# Training hardware: one ml.p3.16xlarge instance.
instance_count = 1
instance_type = 'ml.p3.16xlarge'

# Look up the TensorFlow 2.7 / py38 training image for this region and instance type.
image_request = dict(
    framework='tensorflow',
    region=region,
    version='2.7',
    py_version='py38',
    image_scope='training',
    instance_type=instance_type,
)
image_uri = image_uris.retrieve(**image_request)

In [10]:
# Estimator arguments, grouped by concern and merged into one constructor call.

# What to run and on which hardware.
job_settings = dict(
    source_dir="./src",
    entry_point=entry_point,
    base_job_name=job_name,
    role=get_execution_role(),
    instance_count=instance_count,
    instance_type=instance_type,
    distribution=distribution,
    volume_size=400,
    max_run=7200,
    hyperparameters=hyperparameters,
    image_uri=image_uri,
)

# Where artifacts go in S3.
# NOTE(review): `hyperparameters` also carries a "model_dir" entry
# ("/opt/ml/model"); confirm which of the two values the training script
# actually receives.
storage_settings = dict(
    output_path=os.path.join(output_path, 'training-output'),
    checkpoint_s3_uri=os.path.join(output_path, 'training-checkpoints'),
    model_dir=os.path.join(output_path, 'training-model'),
    code_location=code_location,
)

# Debugger parameters (no built-in rules are attached: `rules` is left unset).
debugger_settings = dict(
    enable_sagemaker_metrics=True,
    debugger_hook_config=debugger_hook_config,
    disable_profiler=False,
    tensorboard_output_config=tensorboard_output_config,
    profiler_config=profiler_config,
    input_mode='File',
)

estimator = TensorFlow(**job_settings, **storage_settings, **debugger_settings)

In [11]:
# Run training
# When train_data_dir is an S3 URI no input channels are attached to the job
# (presumably train.py reads the data location from its hyperparameters).
if hyperparameters['train_data_dir'].startswith('s3'):
    fit_inputs = None
else:
    # `channels` is not defined anywhere in this notebook; fail with an
    # actionable message instead of a bare NameError from hidden kernel state.
    try:
        fit_inputs = channels
    except NameError as err:
        raise NameError(
            "train_data_dir is not an S3 URI, so `channels` (the input channels "
            "for estimator.fit) must be defined before running this cell"
        ) from err

# wait=False returns as soon as the job is submitted.
estimator.fit(
    inputs=fit_inputs,
    wait=False,
    job_name=job_name,
    experiment_config=experiment_config,
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: jbsnyder-tf-resnet-debugger-17-06-2022-23-24-32


In [None]:
# Show the training job's logs in the notebook output; fit() above was called
# with wait=False, so it did not print them itself.
estimator.logs()

2022-06-17 23:24:38 Starting - Starting the training job...ProfilerReport-1655508275: InProgress
...
2022-06-17 23:25:34 Starting - Preparing the instances for training.........
2022-06-17 23:27:00 Downloading - Downloading input data...
2022-06-17 23:27:20 Training - Downloading the training image...............
2022-06-17 23:30:07 Training - Training image download completed. Training in progress..[34m2022-06-17 23:30:11.218027: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2022-06-17 23:30:11.228351: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2022-06-17 23:30:11.536312: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2022-06-17 23:30:16,618 sagemaker-training-toolkit INFO     Imported framework sagemaker_ten