# Docker Build

In [92]:
# Change into the project directory; the build cell below invokes
# build_and_push.sh relative to the current working directory.
# NOTE(review): the recorded output shows
# /home/ec2-user/SageMaker/docker_test_folder, which does not match this
# path - re-run the cell to confirm which directory is intended.
%cd ~/SageMaker/CIFAR

/home/ec2-user/SageMaker/docker_test_folder


In [97]:
# Build the training image and push it via build_and_push.sh. The recorded
# output suggests the two arguments are the ECR repository name and the image
# tag - verify against the script.
# NOTE(review): the recorded output contains a Docker Hub login error and
# "build_and_push.sh: line 54: ... command not found"; line 54 of the script
# should be checked even though the build itself continued.
!bash build_and_push.sh cifar-extended-container-test-gpu v1

383750972175.dkr.ecr.us-west-2.amazonaws.com/cifar-extended-container-test-gpu:v1
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Error response from daemon: Get "https://registry-1.docker.io/v2/": unauthorized: incorrect username or password
build_and_push.sh: line 54: 763104351884.dkr.ecr.us-west-2.amazonaws.com: command not found
Sending build context to Docker daemon  203.3kB
Step 1/5 : FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker
 ---> 8b18de602b3c
Step 2/5 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Using cache
 ---> 91f529c5cd05
Step 3/5 : ENV SAGEMAKER_SUBMIT_DIRECTORY /opt/ml/code
 ---> Using cache
 ---> e398e7fb6e71
Step 4/5 : COPY cifar10.py /opt/ml/code/cifar10.py
 ---> 22b449d3d37d
Step 5/5 : ENV SAGEMAKER_PROGRAM cifar10.py
 ---> Running in c5853eb7de6f
Removing intermediate container c5853eb7de6f
 ---> abd595ac83b9
Successfully built abd595ac83b9
Successfully tagg

# Data

In [15]:
import torch
import torchvision
import torchvision.transforms as transforms

def _get_transform():
    """Build the preprocessing pipeline shared by the train and test loaders.

    Returns:
        A ``transforms.Compose`` that first converts an image to a tensor and
        then normalizes it with 0.5 for every entry of both three-element
        tuples passed to ``transforms.Normalize``.
    """
    half = (0.5, 0.5, 0.5)
    steps = [
        transforms.ToTensor(),
        transforms.Normalize(half, half),
    ]
    return transforms.Compose(steps)


def get_train_data_loader(data_dir='/tmp/pytorch/cifar-10-data',
                          batch_size=4, num_workers=2):
    """Return a shuffled DataLoader over the CIFAR-10 training split.

    Args:
        data_dir: Root directory passed to ``torchvision.datasets.CIFAR10``;
            the dataset is requested with ``download=True``.
        batch_size: Number of samples per batch (defaults to the previously
            hard-coded value of 4).
        num_workers: Number of loader worker processes (defaults to the
            previously hard-coded value of 2).

    Returns:
        A ``torch.utils.data.DataLoader`` with ``shuffle=True``.
    """
    transform = _get_transform()
    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform
    )
    return torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )


def get_test_data_loader(data_dir='/tmp/pytorch/cifar-10-data',
                         batch_size=4, num_workers=2):
    """Return a non-shuffled DataLoader over the CIFAR-10 test split.

    Args:
        data_dir: Root directory passed to ``torchvision.datasets.CIFAR10``;
            the dataset is requested with ``download=True``.
        batch_size: Number of samples per batch (defaults to the previously
            hard-coded value of 4).
        num_workers: Number of loader worker processes (defaults to the
            previously hard-coded value of 2).

    Returns:
        A ``torch.utils.data.DataLoader`` with ``shuffle=False``.
    """
    transform = _get_transform()
    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform
    )
    return torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

# Both loaders read from (and download into) the same local directory.
cifar_dir = '/tmp/pytorch-example/cifar-10-data'
trainloader = get_train_data_loader(cifar_dir)
testloader = get_test_data_loader(cifar_dir)

Files already downloaded and verified
Files already downloaded and verified


In [37]:
# Import in this cell so it runs on a fresh kernel (Restart & Run All);
# without it the call below raises NameError when executed top to bottom.
import sagemaker

# Upload the local CIFAR-10 directory to the bucket under the given key prefix
# and keep the value returned by upload_data for later reference.
s3_path_to_data = sagemaker.Session().upload_data(
    bucket='datasets-rf1',
    path='/tmp/pytorch-example/cifar-10-data',
    key_prefix='cifar-10-data',
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


# Train

In [29]:
!pip install -qU smdebug sagemaker

In [2]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# One Debugger rule (loss not decreasing) followed by two profiler rules
# (low GPU utilization and the profiler report), in that order.
loss_rule = Rule.sagemaker(rule_configs.loss_not_decreasing())
profiler_rules = [
    ProfilerRule.sagemaker(rule_config)
    for rule_config in (
        rule_configs.LowGPUUtilization(),
        rule_configs.ProfilerReport(),
    )
]
rules = [loss_rule, *profiler_rules]

from sagemaker.debugger import ProfilerConfig, FrameworkProfile

# Single profiler configuration: system metrics are sampled every 500 ms and
# framework profiling covers 10 steps. No start_step is given, so profiling
# starts at the default step (the recorded job configuration shows StartStep 0).
# Defining profiler_config only once avoids a dead assignment that is
# immediately overwritten.
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(num_steps=10),
)

In [3]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator

# Hyperparameters handed to the training job.
training_hyperparameters = {"epoch": 3, "batch-size": 1024}

# Estimator for the custom container image, on a single ml.g4dn.12xlarge
# instance, with the Debugger/profiler rules and profiler configuration
# defined in the earlier cell attached.
estimator = Estimator(
    image_uri='383750972175.dkr.ecr.us-west-2.amazonaws.com/cifar-extended-container-test-gpu:v1',
    role=get_execution_role(),
    base_job_name='cifar-extended-container-test-gpu',
    instance_type='ml.g4dn.12xlarge',
    instance_count=1,
    hyperparameters=training_hyperparameters,
    rules=rules,
    profiler_config=profiler_config,
)

In [51]:
# Start the training job, using the S3 prefix the dataset was uploaded to
# (bucket datasets-rf1, key prefix cifar-10-data) as its input.
estimator.fit(inputs='s3://datasets-rf1/cifar-10-data/')

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: cifar-extended-container-test-gpu-2023-02-11-06-50-02-853


2023-02-11 06:50:03 Starting - Starting the training job...
2023-02-11 06:50:30 Starting - Preparing the instances for trainingLossNotDecreasing: InProgress
LowGPUUtilization: InProgress
ProfilerReport: InProgress
......
2023-02-11 06:51:30 Downloading - Downloading input data...
2023-02-11 06:51:50 Training - Downloading the training image..................
2023-02-11 06:55:00 Training - Training image download completed. Training in progress.......[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-02-11 06:55:49,536 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-02-11 06:55:49,571 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-02-11 06:55:49,581 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-02-11 06:55:49,583 sagemaker_pytorch_c

# Profile Report

In [8]:
import boto3

# The job to analyse is pinned by name so this section can be run without
# re-training. To follow the job launched from this kernel instead, use:
#   training_job_name = estimator.latest_training_job.name
training_job_name = 'cifar-extended-container-test-gpu-2023-02-11-06-50-02-853'

# The region is read from a default boto3 session.
session = boto3.session.Session()
region = session.region_name

print(
    f"Training jobname: {training_job_name}",
    f"Region: {region}",
    sep="\n",
)

Training jobname: cifar-extended-container-test-gpu-2023-02-11-06-50-02-853
Region: us-west-2


In [9]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

# Handle to the training job's profiler data, identified by job name and region.
tj = TrainingJob(training_job_name, region)
# Presumably blocks until system profiling data is available (the recorded
# output ends with "Profiler data from system is available") - confirm the
# exact waiting behaviour against the smdebug documentation.
tj.wait_for_sys_profiling_data_to_be_available()

[2023-02-11 07:11:40.301 ip-172-16-34-190.us-west-2.compute.internal:25418 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None


ProfilerConfig:{'S3OutputPath': 's3://sagemaker-us-west-2-383750972175/', 'ProfilingIntervalInMilliseconds': 500, 'ProfilingParameters': {'DataloaderProfilingConfig': '{"StartStep": 0, "NumSteps": 10, "MetricsRegex": ".*", }', 'DetailedProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }', 'FileOpenFailThreshold': '50', 'HorovodProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }', 'LocalPath': '/opt/ml/output/profiler', 'PythonProfilingConfig': '{"StartStep": 0, "NumSteps": 10, "ProfilerName": "cprofile", "cProfileTimer": "total_time", }', 'RotateFileCloseIntervalInSeconds': '60', 'RotateMaxFileSizeInBytes': '10485760', 'SMDataParallelProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }'}, 'DisableProfiler': False}
s3 path:s3://sagemaker-us-west-2-383750972175/cifar-extended-container-test-gpu-2023-02-11-06-50-02-853/profiler-output


Profiler data from system is available


In [10]:
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts

# Fetch the system-metrics reader for the job and refresh its list of event
# files before charting.
system_metrics_reader = tj.get_systems_metrics_reader()
system_metrics_reader.refresh_event_file_list()

# Chart only the "total" events for the CPU and GPU dimensions; no framework
# metrics reader is supplied.
selected_dimensions = ["CPU", "GPU"]
selected_events = ["total"]
view_timeline_charts = TimelineCharts(
    system_metrics_reader,
    framework_metrics_reader=None,
    select_dimensions=selected_dimensions,
    select_events=selected_events,
)

[2023-02-11 07:11:51.309 ip-172-16-34-190.us-west-2.compute.internal:25418 INFO metrics_reader_base.py:134] Getting 6 event files
select events:['total']
select dimensions:['CPU', 'GPU']
filtered_events:{'total'}
filtered_dimensions:{'GPUUtilization-nodeid:algo-1', 'GPUMemoryUtilization-nodeid:algo-1', 'CPUUtilization-nodeid:algo-1'}
