In [1]:
import os
from datetime import datetime
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    TensorBoardOutputConfig,
    CollectionConfig,
    rule_configs,
)

In [2]:
# --- Run configuration -------------------------------------------------------
# Sets the default AWS region for this process via the environment.
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'


def _s3_join(*parts):
    """Join S3 URI components with "/" regardless of the host OS.

    os.path.join uses the platform separator, so on Windows it would insert
    backslashes and produce invalid S3 URIs. Leading/trailing slashes on each
    component are stripped so a bucket written with a trailing "/" does not
    yield a doubled separator.
    """
    return "/".join(str(part).strip("/") for part in parts)


# Take ONE timestamp and derive every name from it, so the experiment name,
# the job name and the dated S3 prefixes can never disagree (separate
# datetime.now() calls can straddle a second or a midnight boundary).
launch_time = datetime.now()
date_str = launch_time.strftime("%d-%m-%Y")
time_str = launch_time.strftime("%d-%m-%Y-%H-%M-%S")

experiment_name = f"jbsnyder_pl_{time_str}"

s3_bucket = "s3://jbsnyder-sagemaker-us-east/"

base_job_name = "jbsnyder-pl-resnet"
job_name = f"{base_job_name}-{time_str}"

# Per-job S3 prefixes: <bucket>/<kind>/<date>/<job_name>
output_path = _s3_join(s3_bucket, "sagemaker-output", date_str, job_name)
code_location = _s3_join(s3_bucket, "sagemaker-code", date_str, job_name)

# Hyperparameters handed to the training entry point.
hyperparameters = {
    "train_file_dir": _s3_join(s3_bucket, "data", "imagenet", "train"),
    "validation_file_dir": _s3_join(s3_bucket, "data", "imagenet", "val"),
    "max_epochs": 2,
    'optimizer': 'adamw',
    'lr': 0.032,
    'batch_size': 64,
    'dataloader_workers': 4,
    'warmup_epochs': 1,
    'mixup_alpha': 0.1,
    'precision': 16,
}

# "local_gpu" selects SageMaker local mode; switch to an ml.* instance type
# to run as a managed training job.
instance_type = "local_gpu"
instance_count = 1

distribution = None

# The estimator runs entry_point; the actual training script name travels as a
# hyperparameter (presumably consumed by launch_ddp.py -- confirm in ./src).
entry_point = "launch_ddp.py"
hyperparameters['training_script'] = "train.py"

training_image = "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker"

# --- Debugger / TensorBoard configuration ------------------------------------
# TensorBoard artifacts go under this job's output prefix. The path is built
# with an explicit "/" because os.path.join would emit "\" on Windows and
# produce an invalid S3 URI.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=f"{output_path.rstrip('/')}/tensorboard"
)

# Debugger hook: save interval of 100 for both the train and eval modes
# (values are passed as strings, as given here).
hook_config = DebuggerHookConfig(
    hook_parameters={
        "train.save_interval": "100",
        "eval.save_interval": "100"
    },
)

# Built-in Debugger rules to evaluate during training. Only
# loss_not_decreasing is active; uncomment any of the others to enable them.
rules = [
    #Rule.sagemaker(rule_configs.vanishing_gradient()),
    #Rule.sagemaker(rule_configs.overfit()),
    #Rule.sagemaker(rule_configs.overtraining()),
    #Rule.sagemaker(rule_configs.poor_weight_initialization()),
    #Rule.sagemaker(rule_configs.all_zero()),
    #Rule.sagemaker(rule_configs.check_input_images()),
    #Rule.sagemaker(rule_configs.class_imbalance()),
    #Rule.sagemaker(rule_configs.dead_relu()),
    #Rule.sagemaker(rule_configs.exploding_tensor()),
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    #Rule.sagemaker(rule_configs.saturated_activation()),
    #Rule.sagemaker(rule_configs.weight_update_ratio()),
    #Rule.sagemaker(rule_configs.tensor_variance()),
]

In [3]:
# Checkpoint syncing to S3 is skipped whenever a local instance type is used.
if 'local' in instance_type:
    _checkpoint_s3_uri = None
else:
    _checkpoint_s3_uri = output_path

estimator = PyTorch(
    # -- What to run --
    entry_point=entry_point,
    source_dir="./src",
    image_uri=training_image,
    hyperparameters=hyperparameters,
    # -- Identity and compute --
    role=get_execution_role(),
    # NOTE(review): job_name already carries a timestamp; it is passed here as
    # the base name -- confirm this is intended rather than base_job_name.
    base_job_name=job_name,
    instance_type=instance_type,
    instance_count=instance_count,
    distribution=distribution,
    volume_size=400,  # storage volume size (GB per the SageMaker Estimator docs)
    max_run=3600,     # run-time limit (seconds per the SageMaker Estimator docs)
    # -- S3 locations --
    output_path=output_path,
    code_location=code_location,
    checkpoint_s3_uri=_checkpoint_s3_uri,
    # NOTE(review): model_dir is forwarded as an extra keyword argument --
    # confirm the installed SDK version accepts it for PyTorch.
    model_dir=output_path,
    # -- Debugger / profiler --
    rules=rules,
    debugger_hook_config=hook_config,
    tensorboard_output_config=tensorboard_output_config,
    disable_profiler=False,
)

In [4]:
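# Launching is left commented out so that "Run All" does not start a training job.
# Uncomment the next line to launch it:
# estimator.fit(inputs=None, wait=True, job_name=job_name)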