In [None]:
!pip install sagemaker

In [None]:
!pip install pandas

In [1]:
from __future__ import absolute_import

import boto3
import pytest
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, rule_configs

In [2]:
# Look up the execution role; it is passed to the PyTorch estimator as `role`.
role = get_execution_role()
# NOTE(review): echoing the role stores the account-specific ARN in the saved
# notebook output -- consider clearing outputs before sharing.
role

'arn:aws:iam::926857016169:role/pllarroy'

In [3]:
# Hyperparameters handed to the PyTorch estimator (entry point: train.py).
hyperparameters = {
    'random_seed': True,
    'num_steps': 50,
    'epochs': 5,
    'data_dir': '/tmp/pytorch-smdebug',
}

In [4]:
# Debugger rule attached to the training job: the SageMaker-provided
# loss_not_decreasing configuration.
rules = [Rule.sagemaker(rule_configs.loss_not_decreasing())]

# NOTE(review): the recorded output of the fit cell shows train.py failing
# inside smd.Hook.create_from_json_file(); presumably the Debugger hook config
# is not provided to the container when train_instance_type='local' -- confirm
# whether Debugger rules are supported in local mode before relying on this.
estimator = PyTorch(
    entry_point='train.py',
    role=role,
    train_instance_count=1,
    train_instance_type='local',
    train_volume_size=400,
    train_max_run=3600,
    hyperparameters=hyperparameters,
    framework_version='1.3.1',
    py_version='py3',
    rules=rules,
)



In [5]:
# Start the training job. wait=False is passed so the call is not expected to
# block until the job completes (TODO confirm behaviour for the 'local'
# instance type: the recorded run streamed the container logs and raised).
estimator.fit(wait=False)

Creating tmprmjgokvr_algo-1-prn7w_1 ... 
[1BAttaching to tmprmjgokvr_algo-1-prn7w_12mdone[0m
[36malgo-1-prn7w_1  |[0m 2020-03-04 01:11:21,618 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training
[36malgo-1-prn7w_1  |[0m 2020-03-04 01:11:21,621 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-prn7w_1  |[0m 2020-03-04 01:11:21,635 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
[36malgo-1-prn7w_1  |[0m 2020-03-04 01:11:21,644 sagemaker_pytorch_container.training INFO     Invoking user training script.
[36malgo-1-prn7w_1  |[0m 2020-03-04 01:11:22,048 sagemaker-containers INFO     Module default_user_module_name does not provide a setup.py. 
[36malgo-1-prn7w_1  |[0m Generating setup.py
[36malgo-1-prn7w_1  |[0m 2020-03-04 01:11:22,048 sagemaker-containers INFO     Generating setup.cfg
[36malgo-1-prn7w_1  |[0m 2020-03-04 01:11:22,049 sagemaker-containers INFO     

[36malgo-1-prn7w_1  |[0m Create neural network module
[36malgo-1-prn7w_1  |[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)
[36malgo-1-prn7w_1  |[0m INFO:__main__:Create neural network module
[36malgo-1-prn7w_1  |[0m Traceback (most recent call last):
[36malgo-1-prn7w_1  |[0m   File "train.py", line 195, in <module>
[36malgo-1-prn7w_1  |[0m     main()
[36malgo-1-prn7w_1  |[0m   File "train.py", line 187, in main
[36malgo-1-prn7w_1  |[0m     hook = create_smdebug_hook()
[36malgo-1-prn7w_1  |[0m   File "train.py", line 121, in create_smdebug_hook
[36malgo-1-prn7w_1  |[0m     hook = smd.Hook.create_from_json_file()
[36malgo-1-prn7w_1  |[0m   File "/opt/conda/lib/python3.6/site-packages/smdebug/core/hook.py", line 248, in create_from_json_file
[36malgo-1-prn7w_1  |[0m     return create_hook_from_json_config(cls, json_config_path=json_file_path)
[36malgo-1-prn7w_1  |[0m   File "/opt/conda/lib/python3.6/site-packages/smdebug/core/json_con

RuntimeError: Failed to run: ['docker-compose', '-f', '/tmp/tmprmjgokvr/docker-compose.yaml', 'up', '--build', '--abort-on-container-exit'], Process exited with code: 1

In [None]:
# Ask the estimator where the latest job's Debugger artifacts are stored and
# print that location.
path = estimator.latest_job_debugger_artifacts_path()
print('Tensors are stored in: {}'.format(path))

In [None]:
# Display the rule job summary for the most recent training job
# (the rules are the ones passed to the estimator via `rules`).
estimator.latest_training_job.rule_job_summary()

In [None]:
from smdebug.trials import create_trial

# Recomputed here with the same call as the earlier cell that prints the path.
path = estimator.latest_job_debugger_artifacts_path()
# Build an smdebug trial from the artifacts location; `trial` is what the
# custom rule below is constructed with.
trial = create_trial(path)

In [None]:
# First Party
from smdebug import modes
from smdebug.analysis.utils import parse_bool, parse_list_from_str
from smdebug.core.modes import ALLOWED_MODE_NAMES
from smdebug.rules.rule import Rule
import numpy as np
import sys

class LossNotDecreasing(Rule):
    """Rule that flags loss tensors which stop decreasing or increase too much.

    Roughly every ``num_steps`` steps the rule compares each tracked loss with
    its value at the most recent saved step that is at least ``num_steps``
    behind, and reports the tensor when the decrease is smaller than
    ``diff_percent`` or the increase is larger than
    ``increase_threshold_percent``.
    """

    def __init__(
        self,
        base_trial,
        collection_names=None,
        tensor_regex=None,
        use_losses_collection=True,
        num_steps=10,
        diff_percent=0.1,
        increase_threshold_percent=5.0,
        mode=None,
        absolute_val=None,
    ):
        """
        This rule helps you identify if you are running into a situation
        where loss is not going down fast enough as termed by diff_percent or
        loss has increased by increase_threshold_percent.
        Note that if a loss tensor is not scalar, its mean is used to convert it to a scalar.

        :param base_trial: the trial whose execution will invoke the rule
        :param collection_names: List of str representing collection names.
              The tensors belonging to these collections will be considered.
              Note that only scalar tensors will be picked.
        :param tensor_regex: List of str representing regex patterns.
              The tensors matching these patterns will be considered.
              If both collection_names and tensor_regex are specified,
              the rule will check for union of tensors.
              Note that only scalar tensors will be picked.
        :param use_losses_collection: bool
              Tries to use the collection 'losses' to fetch the losses
        :param num_steps: int
              The minimum number of steps after which we check whether the loss has decreased.
              The rule evaluation happens every num_steps, and
              the rule compares the loss for this step with the loss at the
              newest step which is at least num_steps behind the current step.
              For example, if the loss is being saved every 4 steps and steps saved are 0,4,8,12,16,20,24,28,32,36
              but num_steps is 10. First invocation will happen at step 12 and loss at step 12 is compared to loss at step 0
              Next invocation would happen at step:24(since 10 steps after 12 would be 22 and there is no 22 && 23) and loss at step:24 is compared with loss at step 12
              Next invocation would happen at step:36(since 10 steps after 24 would be 34 and there is no 34 && 35) and loss at step:36 is compared with loss at step 24
              Default: 10
        :param diff_percent: float
            The minimum difference in percentage that loss should be lower by.
            Must be between 0.0 and 100.0. If None is passed, 0.0 is used and
            the rule just checks if loss is going down.
            If you want to specify a stricter check that loss is
            going down fast enough, you might want to pass diff_percent. Default: 0.1
        :param increase_threshold_percent: float
            The maximum threshold percent that loss is allowed to increase in case loss has been
            increasing. Default: 5
        :param mode: string
            name of mode to query tensor values for rule checking.
            If this is not passed, the rule checks for GLOBAL mode. Allowed values are TRAIN, EVAL, GLOBAL
            Default: GLOBAL
        :param absolute_val: string
            If this is False, rule checks for original values of loss tensors. By default: rule checks if absolute loss is decreasing or not
            Default: True
        :raises ValueError: if diff_percent is outside [0.0, 100.0] or mode is not an allowed mode name
        """
        super().__init__(base_trial)
        self.tensor_regex = parse_list_from_str(tensor_regex)
        self.collection_names = parse_list_from_str(collection_names)
        self.use_losses_collection = parse_bool(use_losses_collection, True)
        self.num_steps = int(num_steps)
        self._set_mode(mode)
        self._set_min_diff(diff_percent)
        self._load_tensor_names()

        # Step at which the rule last had tensors to evaluate; None until then.
        self.last_evaluated_step = None
        self.increase_threshold_percent = float(increase_threshold_percent)
        self.absolute_val = parse_bool(absolute_val, True)
        self.logger.info(
            "LossNotDecreasing rule created with num_steps: {},"
            " diff_percent: {}, increase_threshold_percent: {}, mode: {}, tensor_regex: {}, "
            "collection_names: {} absolute_val:{}".format(
                self.num_steps,
                self.min_diff_percent,
                self.increase_threshold_percent,
                self.mode.name,
                ",".join(self.tensor_regex),
                ",".join(self.collection_names),
                self.absolute_val,
            )
        )

    def _set_min_diff(self, min_diff_percent):
        """Store ``min_diff_percent`` as a float (0.0 when None) and validate its range.

        :raises ValueError: if the value is outside [0.0, 100.0]
        """
        if min_diff_percent is not None:
            self.min_diff_percent = float(min_diff_percent)
        else:
            self.min_diff_percent = 0.0
        if self.min_diff_percent < 0.0 or self.min_diff_percent > 100.0:
            # The original code raised SageMakerDebuggerRuleConfigValidationError,
            # which is never imported in this notebook and therefore surfaced as a
            # NameError. A built-in exception carries the same information.
            raise ValueError(
                "{}: diff_percent {} has to be between 0.0 and 100.0".format(
                    self.rule_name, self.min_diff_percent
                )
            )

    def _set_mode(self, mode):
        """Resolve ``mode`` (a mode name or None) to a member of ``modes``.

        None selects ``modes.GLOBAL``.

        :raises ValueError: if ``mode`` is not one of ALLOWED_MODE_NAMES
        """
        if mode is None:
            self.mode = modes.GLOBAL
        elif mode in ALLOWED_MODE_NAMES:
            self.mode = modes[mode]
        else:
            # See _set_min_diff: the previously referenced exception class is not
            # defined anywhere in this notebook.
            raise ValueError(
                "{}: mode can only be one of {}".format(
                    self.rule_name, ",".join(ALLOWED_MODE_NAMES)
                )
            )

    def _load_tensor_names(self):
        """Seed ``self.tensor_names`` from the configured collections.

        Adds the 'losses' collection when ``use_losses_collection`` is set, and
        falls back to the regex "loss" when neither a regex nor any collection
        tensor is available.
        """
        self.tensor_names = set()

        if self.use_losses_collection and "losses" not in self.collection_names:
            self.collection_names.append("losses")

        for cname in self.collection_names:
            try:
                c = self.base_trial.collection(cname)
                tn = c.tensor_names
                self.tensor_names.update(tn)
            except KeyError:
                self.logger.info("Could not find collection {}".format(cname))

        if not self.tensor_regex and not self.tensor_names:
            self.tensor_regex.append("loss")

    def _update_tensor_names(self):
        """Add every trial tensor name matching ``self.tensor_regex`` to ``self.tensor_names``."""
        for tname in self.base_trial.tensor_names(regex=self.tensor_regex):
            self.tensor_names.add(tname)

    def _get_prev_global_step(self, prev_mode_steps, curr_mode_step):
        """Return the global step of the newest mode step at least ``num_steps`` behind.

        Returns None when ``prev_mode_steps`` is empty, when its last entry is
        not ``curr_mode_step``, or when no entry is far enough behind.
        """
        # now go and find the last step
        # which is at least num_steps behind
        if len(prev_mode_steps) > 0:
            last_step = prev_mode_steps[-1]
            if last_step == curr_mode_step:
                for prev_mode_step in reversed(prev_mode_steps):
                    step_diff = curr_mode_step - prev_mode_step
                    if step_diff >= self.num_steps:
                        prev_global_step = self.base_trial.global_step(
                            mode=self.mode, mode_step=prev_mode_step
                        )
                        return prev_global_step
        return None

    def set_required_tensors(self, step):
        """Register, for ``step``, the tensors and the pair of steps to compare.

        Nothing is registered unless at least ``num_steps`` have passed since
        the last evaluated step and ``step`` belongs to the configured mode.
        """
        # we call this on every step as there may be
        # new tensors after some steps
        # since we are querying tensors by regex
        self._update_tensor_names()
        # for every num_steps
        if self.last_evaluated_step is None or self.last_evaluated_step + self.num_steps <= step:
            # get mode and mode_step for the current step
            curr_mode, curr_mode_step = self.base_trial.mode_modestep(step)
            self.logger.debug(
                "Current mode is: {} and mode_step is {} ".format(curr_mode, curr_mode_step)
            )
            if curr_mode == self.mode:
                for tname in self.tensor_names:
                    # get steps before mode_step in the current mode
                    prev_mode_steps = self.base_trial.tensor(tname).prev_steps(
                        curr_mode_step, mode=self.mode
                    )
                    self.logger.debug(
                        "tname:{} Previous mode steps :{}".format(tname, prev_mode_steps)
                    )
                    prev_global_step = self._get_prev_global_step(prev_mode_steps, curr_mode_step)
                    self.logger.debug("Previous global step:{}".format(prev_global_step))
                    if prev_global_step is not None:
                        steps = [prev_global_step, step]
                        self.logger.info(
                            "Adding tname:{} and steps:{} to req_tensors".format(tname, steps)
                        )
                        self.req_tensors.add(tname, steps=steps)

    def _add_failed_tname(self, failed_tnames, tname, step, old_val, new_val):
        """Log the failure for ``tname``, append it to ``failed_tnames`` and return the list."""
        self.logger.info(
            "Loss {} is not decreasing over the last {} steps "
            "at step {}. It was {} and now is {}".format(
                tname, self.num_steps, step, old_val, new_val
            )
        )
        failed_tnames.append(tname)
        return failed_tnames

    def invoke_at_step(self, step):
        """Compare old and new loss values for every required tensor.

        :param step: the step being evaluated
        :return: True if at least one loss failed the check, False otherwise
        """
        failed_tnames = []
        for tensor in self.req_tensors.get():
            steps = self.req_tensors.get_tensor_steps(tensor.name)
            self.logger.info(
                "Checking loss values(tensor_name:{}) at step:{} and step:{} ".format(
                    tensor.name, steps[0], steps[1]
                )
            )
            old_val = tensor.value(steps[0])
            new_val = tensor.value(steps[1])
            # Non-scalar losses are reduced to a single-element array holding their mean.
            if old_val.size != 1:
                old_val = np.ones(shape=(1,)) * np.mean(old_val)
                self.logger.info("Calculated mean:{} from old_val loss vector".format(old_val[0]))
            if new_val.size != 1:
                new_val = np.ones(shape=(1,)) * np.mean(new_val)
                self.logger.info("Calculated mean:{} from new_val loss vector".format(new_val[0]))
            if old_val.size == 1 and new_val.size == 1:
                if self.absolute_val:
                    old_val = abs(float(old_val[0]))
                    new_val = abs(float(new_val[0]))
                else:
                    old_val = float(old_val[0])
                    new_val = float(new_val[0])

                # Thresholds are expressed relative to the older loss value.
                min_decreased_loss_allowed = old_val - old_val * (self.min_diff_percent / 100.0)
                max_increased_loss_allowed = old_val + old_val * (
                    self.increase_threshold_percent / 100.0
                )
                diff = old_val - new_val
                if diff >= 0 and new_val > min_decreased_loss_allowed + sys.float_info.epsilon:
                    self.logger.info(
                        "Loss is not decreasing fast enough, diff is:{} , abs_old_val:{} , abs_new_val:{} min_required_change_percent:{} min_decreased_loss_allowed:{}".format(
                            diff,
                            old_val,
                            new_val,
                            self.min_diff_percent,
                            min_decreased_loss_allowed,
                        )
                    )
                    self._add_failed_tname(failed_tnames, tensor.name, step, old_val, new_val)
                elif diff < 0 and new_val > max_increased_loss_allowed + sys.float_info.epsilon:
                    self.logger.info(
                        "Loss is increasing and breached increase_threshold_percent:{}, diff is:{} , abs_old_val:{} , abs_new_val:{} , max_increased_loss_allowed:{} ".format(
                            self.increase_threshold_percent,
                            diff,
                            old_val,
                            new_val,
                            max_increased_loss_allowed,
                        )
                    )
                    self._add_failed_tname(failed_tnames, tensor.name, step, old_val, new_val)
                self.logger.info(
                    "loss_mode:{} step:{} loss_value:{} previous_step:{} previous_loss_value:{} min_decreased_loss_allowed:{} , max_increased_loss_allowed:{}".format(
                        self.mode,
                        steps[1],
                        new_val,
                        steps[0],
                        old_val,
                        min_decreased_loss_allowed,
                        max_increased_loss_allowed,
                    )
                )
            else:
                # steps holds step numbers, so they must be stringified before joining.
                self.logger.warning(
                    "Tensor {} was not a scalar or 1-D array as expected at the "
                    "steps {}".format(tensor.name, ",".join(str(s) for s in steps))
                )

        # Only advance the evaluation marker when there was something to evaluate.
        if len(self.req_tensors.get_names()):
            self.last_evaluated_step = step

        if failed_tnames:
            message = "{} {} not decreasing over the last {} steps at step {}"
            s = "losses are" if len(failed_tnames) > 1 else "loss is"
            self.logger.info(message.format(len(failed_tnames), s, self.num_steps, step))
            return True
        else:
            return False


In [None]:
from smdebug.rules.rule_invoker import invoke_rule

# Build the locally defined rule for TRAIN-mode steps. tensor_regex adds tensors
# matching this pattern; the 'losses' collection is also consulted because
# use_losses_collection defaults to True.
rule = LossNotDecreasing(trial, tensor_regex="CrossEntropyLoss_output_0", mode="TRAIN")
# Hand the rule to smdebug's invoker to run it against the trial.
invoke_rule(rule)