In [None]:
import os
from datetime import datetime

import boto3
from sagemaker import analytics, image_uris
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    TensorBoardOutputConfig,
    CollectionConfig,
    ProfilerConfig,
    FrameworkProfile,
    DetailedProfilingConfig,
    rule_configs,
    ProfilerRule,
)
from smdebug.core.collection import CollectionKeys

In [None]:
# Run configuration: AWS session, S3 bucket, a unique job name and the S3
# prefixes under which this run's outputs and packaged code are stored.

# Build a single boto3 session and take both the region and the SageMaker
# client from it, instead of constructing a second, redundant default session.
boto_sess = boto3.Session()
region = boto_sess.region_name
sm = boto_sess.client('sagemaker')

s3_bucket = "s3://jbsnyder-sagemaker-us-east/"

base_job_name = "jbsnyder-resnet-debugger"

# Take ONE timestamp and format it twice. Calling datetime.now() separately for
# the date and for the time lets the two strings disagree when the cell runs
# across midnight, which would file the job under the wrong date prefix.
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")
job_name = f"{base_job_name}-{time_str}"

# NOTE(review): os.path.join only yields "/"-separated S3 URIs on POSIX hosts;
# confirm this notebook is never executed from a Windows kernel.
output_path = os.path.join(s3_bucket, "sagemaker-output", date_str, job_name)
code_location = os.path.join(s3_bucket, "sagemaker-code", date_str, job_name)

In [None]:
# --- Debugger hook: tensor collections saved during training -----------------
# Only the weights collection is registered on the hook itself; the rules below
# request the extra collections they need through `collections_to_save`.
weights_collection = CollectionConfig(
    name=CollectionKeys.WEIGHTS,
    parameters={"save_interval": "100"},
)
collection_configs = [weights_collection]
debugger_hook_config = DebuggerHookConfig(collection_configs=collection_configs)

# --- Debugger rules ----------------------------------------------------------
# Rule parameters are passed as strings, exactly as written here.
loss_not_decreasing_rule = Rule.sagemaker(
    base_config=rule_configs.loss_not_decreasing(),
    rule_parameters={
        "tensor_regex": ".*",
        "use_losses_collection": "True",
        "num_steps": "10",
        "diff_percent": "0.1",
        "increase_threshold_percent": "5",
        "mode": "GLOBAL",
    },
    collections_to_save=[
        CollectionConfig(
            name=CollectionKeys.LOSSES,
            parameters={"save_interval": "10"},
        ),
    ],
    # NOTE(review): placeholder address — replace before relying on notifications.
    actions=rule_configs.ActionList(rule_configs.Email("email@email.com")),
)

exploding_tensor_rule = Rule.sagemaker(
    base_config=rule_configs.exploding_tensor(),
    rule_parameters={
        "tensor_regex": ".*gradient",
        "only_nan": "False",
    },
    collections_to_save=[
        CollectionConfig(
            name="gradients",
            parameters={"save_interval": "100"},
        ),
    ],
    # NOTE(review): placeholder address — replace before relying on notifications.
    actions=rule_configs.ActionList(rule_configs.Email("email@email.com")),
)

# Two debugger rules followed by two profiler rules, in the original order.
rules = [
    loss_not_decreasing_rule,
    exploding_tensor_rule,
    ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

# --- Profiler ----------------------------------------------------------------
# System metrics are sampled at a 500 ms interval; detailed framework profiling
# is requested for 10 steps starting at step 50.
detailed_profiling = DetailedProfilingConfig(start_step=50, num_steps=10)
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(
        detailed_profiling_config=detailed_profiling,
    ),
)

# --- TensorBoard -------------------------------------------------------------
# TensorBoard output goes to a 'tensorboard' prefix under this run's output path.
tensorboard_s3_path = os.path.join(output_path, 'tensorboard')
tensorboard_output_config = TensorBoardOutputConfig(s3_output_path=tensorboard_s3_path)

In [None]:
# Hyperparameters forwarded to the training entry point.
# The data sources are S3 prefixes under the bucket; the original cell kept the
# container-local alternatives "/opt/ml/input/data/train/" and
# "/opt/ml/input/data/val/" as commented-out hints.
imagenet_prefix = os.path.join(s3_bucket, "data", "imagenet")

hyperparameters = {
    "train_data_src": os.path.join(imagenet_prefix, "train"),
    "val_data_src": os.path.join(imagenet_prefix, "val"),
    "num_epochs": 4,
    "learning_rate": 0.004,
    "batch_size": 512,
    "dataloader_workers": 12,
    "precision": "float16",
    "dist": "smddp",
}

# Enable the SageMaker distributed data-parallel library for this job.
distribution = {
    "smdistributed": {
        "dataparallel": {
            "enabled": True,
        },
    },
}

# Script inside the estimator's source_dir that is run as the training program.
entry_point = "train.py"

In [None]:
# Training cluster shape: two ml.p4d.24xlarge instances.
instance_count = 2
instance_type = "ml.p4d.24xlarge"

# Look up the PyTorch 1.12 / py38 training container image for this region and
# instance type.
image_uri = image_uris.retrieve(
    framework="pytorch",
    version="1.12",
    py_version="py38",
    image_scope="training",
    region=region,
    instance_type=instance_type,
)

In [None]:
# Build the PyTorch estimator. The keyword arguments are grouped by concern and
# unpacked into a single constructor call; the values are the same as before.

# Where the training code comes from and how it is launched.
code_options = dict(
    source_dir="./src",
    entry_point=entry_point,
    hyperparameters=hyperparameters,
    image_uri=image_uri,
    distribution=distribution,
)

# Cluster size, storage and time limit (max_run is 7200, i.e. two hours if the
# unit is seconds — confirm against the SDK).
compute_options = dict(
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size=400,
    max_run=7200,
    input_mode='File',
)

# S3 locations for this run, all rooted at the per-job output/code prefixes.
s3_options = dict(
    output_path=os.path.join(output_path, 'training-output'),
    checkpoint_s3_uri=os.path.join(output_path, 'training-checkpoints'),
    model_dir=os.path.join(output_path, 'training-model'),
    code_location=code_location,
)

# Debugger, profiler and TensorBoard settings defined in the earlier cell.
debugger_options = dict(
    enable_sagemaker_metrics=True,
    rules=rules,
    debugger_hook_config=debugger_hook_config,
    tensorboard_output_config=tensorboard_output_config,
    profiler_config=profiler_config,
)

estimator = PyTorch(
    base_job_name=job_name,
    role=get_execution_role(),
    **code_options,
    **compute_options,
    **s3_options,
    **debugger_options,
)

In [None]:
# Launch the training job under the explicit job name built earlier.
# No input channels are passed (inputs=None); the data locations are supplied
# to the script through the train_data_src / val_data_src hyperparameters.
# wait=False submits the job without blocking this cell on its completion.
estimator.fit(inputs=None, wait=False, job_name=job_name)

In [None]:
# Display the S3 URI that the estimator's TensorBoard output config points at
# (the 'tensorboard' prefix under this run's output path).
estimator.tensorboard_output_config.s3_output_path

In [None]:
# Show the training job's logs in the notebook. The job was submitted with
# wait=False, so this is the cell where its log output is actually displayed.
# NOTE(review): presumably blocks until the job finishes — confirm against the SDK.
estimator.logs()