# Debugger and Profiler

In [1]:
# install dependencies
!pip install smdebug

Collecting smdebug
[?25l  Downloading https://files.pythonhosted.org/packages/70/bf/1ed1299d0d653bc67979d09cc85bef71397631d882cd793e3d7021528b2b/smdebug-1.0.12-py2.py3-none-any.whl (270kB)
[K    100% |████████████████████████████████| 276kB 3.4MB/s ta 0:00:01
[?25hCollecting protobuf>=3.6.0 (from smdebug)
[?25l  Downloading https://files.pythonhosted.org/packages/c6/1c/f18d97fc479b4fb6f72bbb0e41188575362e3bbd31014cf294ef0fdec8bf/protobuf-3.19.4-py2.py3-none-any.whl (162kB)
[K    100% |████████████████████████████████| 163kB 17.3MB/s ta 0:00:01
[?25hCollecting numpy>=1.16.0 (from smdebug)
[?25l  Downloading https://files.pythonhosted.org/packages/45/b2/6c7545bb7a38754d63048c7696804a0d947328125d81bf12beaa692c3ae3/numpy-1.19.5-cp36-cp36m-manylinux1_x86_64.whl (13.4MB)
[K    100% |████████████████████████████████| 13.4MB 2.5MB/s eta 0:00:01 0% |▏                               | 51kB 17.0MB/s eta 0:00:01    8% |██▋                             | 1.1MB 21.1MB/s eta 0:00:01    16% |███

In [2]:
# Training settings gathered in a single dict: batch size, GPU flag,
# epoch count and model name.
hyperparameters = dict(
    batch_size=2048,
    gpu=True,
    epoch=2,
    model="resnet50",
)

In [6]:
!pip install sagemaker

Collecting sagemaker
[?25l  Downloading https://files.pythonhosted.org/packages/28/a2/850e5bcf16354a04cd46ce098ae2d75947c8f38ba1aeb7c7f4b8e3476ce3/sagemaker-2.74.0.tar.gz (481kB)
[K    100% |████████████████████████████████| 491kB 15.8MB/s ta 0:00:01
Collecting google-pasta (from sagemaker)
[?25l  Downloading https://files.pythonhosted.org/packages/a3/de/c648ef6835192e6e2cc03f40b19eeda4382c49b5bafb43d88b931c4c74ac/google_pasta-0.2.0-py3-none-any.whl (57kB)
[K    100% |████████████████████████████████| 61kB 13.6MB/s ta 0:00:01
Collecting protobuf3-to-dict>=0.1.5 (from sagemaker)
  Downloading https://files.pythonhosted.org/packages/6b/55/522bb43539fed463275ee803d79851faaebe86d17e7e3dbc89870d0322b9/protobuf3-to-dict-0.1.5.tar.gz
Collecting smdebug_rulesconfig==1.0.1 (from sagemaker)
  Downloading https://files.pythonhosted.org/packages/26/a1/45a13a05198bbe9527bab2c5e5daa8bd02678aa825eec14783e767bfa7d1/smdebug_rulesconfig-1.0.1-py2.py3-none-any.whl
Collecting importlib-metadata>=1.4.0

In [7]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# Built-in rule configurations evaluated through Rule.sagemaker.
debugger_rule_configs = (
    rule_configs.vanishing_gradient(),
    rule_configs.overfit(),
    rule_configs.overtraining(),
    rule_configs.poor_weight_initialization(),
    rule_configs.loss_not_decreasing(),
)

# Built-in rule configurations evaluated through ProfilerRule.sagemaker.
profiler_rule_configs = (
    rule_configs.LowGPUUtilization(),
    rule_configs.ProfilerReport(),
)

# Debugger rules first, then profiler rules, in the order listed above.
rules = [Rule.sagemaker(config) for config in debugger_rule_configs]
rules += [ProfilerRule.sagemaker(config) for config in profiler_rule_configs]

AttributeError: module 'google.protobuf.internal.containers' has no attribute 'MutableMapping'

In [None]:
from sagemaker.debugger import DebuggerHookConfig, ProfilerConfig, FrameworkProfile

# Profiler configuration: system monitoring interval of 500 ms plus a
# framework profile created with num_steps=10.
framework_profile = FrameworkProfile(num_steps=10)
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=framework_profile,
)

# Debugger hook configuration: save intervals for the train and eval modes.
hook_parameters = {
    'train.save_interval': '100',
    'eval.save_interval': '10',
}
debugger_config = DebuggerHookConfig(hook_parameters=hook_parameters)

In [None]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

# Create the estimator used to train the model.
# TODO: set `entry_point` to the path of your training script; the empty
# string below is a placeholder and must be replaced before calling fit().
# NOTE(review): `hyperparameters` sets "gpu": True and `rules` includes
# LowGPUUtilization, but 'ml.m5.xlarge' looks like a CPU-only instance type —
# confirm the intended instance type.
estimator = PyTorch(
    role=get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.xlarge',
    entry_point='',  # TODO: path to the training script
    # NOTE(review): framework/Python versions were left blank in the template;
    # '1.8' with 'py36' is assumed here — confirm against your training script.
    framework_version='1.8',
    py_version='py36',
    # Dict defined in the hyperparameters cell earlier in this notebook.
    hyperparameters=hyperparameters,
    # debugger & profiler parameters
    rules=rules,
    debugger_hook_config=debugger_config,
    profiler_config=profiler_config,
)

In [None]:
# Launch the training job for the estimator configured above.
# wait=True presumably blocks this cell until the job finishes — confirm
# against the SageMaker SDK documentation.
estimator.fit(wait=True)

In [None]:
import boto3

# Name of the most recent training job launched by the estimator.
training_job_name = estimator.latest_training_job.name

# Region configured for the default boto3 session.
session = boto3.session.Session()
region = session.region_name

print(
    f"Training jobname: {training_job_name}",
    f"Region: {region}",
    sep="\n",
)

In [None]:
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys

# Create an smdebug trial from the debugger artifacts path of the latest job.
debugger_artifacts_path = estimator.latest_job_debugger_artifacts_path()
trial = create_trial(debugger_artifacts_path)

In [None]:
# TODO: Can you print the names of all the tensors that were tracked?
# TODO: Can you print the number of data points for one of those tensors,
# for both train and eval modes?

In [None]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

# Build the profiler helper for the training job, using the job name and
# region computed in an earlier cell.
tj = TrainingJob(training_job_name, region)
# Judging by the method name, this waits until system profiling data can be
# read — confirm the exact blocking behavior in the smdebug documentation.
tj.wait_for_sys_profiling_data_to_be_available()

In [None]:
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts

# Fetch the system metrics reader and refresh its event file list before charting.
system_metrics_reader = tj.get_systems_metrics_reader()
system_metrics_reader.refresh_event_file_list()

# Chart options: no framework metrics reader, CPU and GPU dimensions,
# and the "total" event.
timeline_options = {
    "framework_metrics_reader": None,
    "select_dimensions": ["CPU", "GPU"],
    "select_events": ["total"],
}
view_timeline_charts = TimelineCharts(system_metrics_reader, **timeline_options)

In [None]:
# The rule output location is the estimator output path followed by the
# latest training job name and the "/rule-output" suffix.
latest_job_name = estimator.latest_training_job.job_name
rule_output_path = "".join([estimator.output_path, latest_job_name, "/rule-output"])
print(f"You will find the profiler report in {rule_output_path}")

In [None]:
# Recursively list everything stored under the rule output S3 path
# (rule_output_path is interpolated from the Python namespace).
! aws s3 ls {rule_output_path} --recursive

In [None]:
# Recursively copy the rule output from S3 into the current working directory.
! aws s3 cp {rule_output_path} ./ --recursive

In [None]:
import os

# Pick the first rule configuration name containing "Profiler"; it is used as
# the folder name of the profiler report in the cell that renders the report.
rule_configuration_names = []
for rule_summary in estimator.latest_training_job.rule_job_summary():
    if "Profiler" in rule_summary["RuleConfigurationName"]:
        rule_configuration_names.append(rule_summary["RuleConfigurationName"])
profiler_report_name = rule_configuration_names[0]

In [None]:
import IPython

# Render the profiler report HTML file inline as the cell's output.
profiler_report_html = profiler_report_name + "/profiler-output/profiler-report.html"
IPython.display.HTML(filename=profiler_report_html)