In [1]:
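# One-time setup (uncomment to run): editable install of the local package in ./src.
# Prefer the %pip magic over !pip so the install targets this kernel's environment.
# %pip install -e ./src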

In [1]:
import os
import json
import argparse
import sys
import warnings
from pathlib import Path
from ast import literal_eval
warnings.filterwarnings('ignore')

import torch
import torchvision as tv
import pytorch_lightning as pl
import webdataset as wds
from sm_resnet.models import ResNet
from sm_resnet.callbacks import PlSageMakerLogger, ProfilerCallback, SMDebugCallback
import smdebug.pytorch as smd
from smdebug.core.reduction_config import ReductionConfig
from smdebug.core.save_config import SaveConfig
from smdebug.core.collection import CollectionKeys
from smdebug.core.config_constants import DEFAULT_CONFIG_FILE_PATH

# Process-topology values taken from the environment. Each one falls back to
# its single-process default (1 process, rank 0, local rank 0) when unset.
world_size = int(os.getenv("WORLD_SIZE", default=1))
rank = int(os.getenv("RANK", default=0))
local_rank = int(os.getenv("LOCAL_RANK", default=0))

[2022-06-03 04:58:25.517 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2022-06-03 04:58:25.644 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


In [2]:
# Root S3 location of the training data (a trailing slash is tolerated).
s3_bucket = "s3://jbsnyder-sagemaker-us-east/"

# S3 URIs always use '/' as the separator, so join the components explicitly
# instead of using os.path.join, whose separator depends on the host OS.
imagenet_prefix = "/".join([s3_bucket.rstrip("/"), "data", "imagenet"])

# Keyword arguments forwarded verbatim to ResNet(**model_params).
model_params = {
    'num_classes': 1000,
    'resnet_version': 50,
    'train_path': f"{imagenet_prefix}/train",
    'val_path': f"{imagenet_prefix}/val",
    'optimizer': 'adamw',
    'lr': 0.004,
    'batch_size': 64,
    'dataloader_workers': 0,
    'max_epochs': 2,
    'warmup_epochs': 1,
    'mixup_alpha': 0.1,
}

# smdebug settings for this run: only the LOSSES collection is tracked,
# tensors are stored as their 'mean' reduction, and the save interval is 25.
include_collections = [CollectionKeys.LOSSES]
save_config = SaveConfig(save_interval=25)
reduction_config = ReductionConfig(['mean'])

# Callback that carries the debugger settings into the Lightning trainer;
# debugger output goes to ./smdebugger and TensorBoard events to ./tensorboard.
smd_callback = SMDebugCallback(
    out_dir='./smdebugger',
    tensorboard_dir='./tensorboard',
    export_tensorboard=True,
    save_all=False,
    include_regex=None,
    include_collections=include_collections,
    save_config=save_config,
    reduction_config=reduction_config,
)

# Keyword arguments forwarded verbatim to pl.Trainer(**trainer_params).
trainer_params = {
    # Pin this process to the GPU index given by LOCAL_RANK.
    'gpus': [local_rank],
    # Single source of truth: reuse the epoch budget handed to the model so the
    # two values cannot drift apart.
    'max_epochs': model_params['max_epochs'],
    'precision': 16,
    # Disables the progress bar. Replaces the deprecated
    # `progress_bar_refresh_rate: 0`, which triggers a deprecation warning.
    'enable_progress_bar': False,
    'replace_sampler_ddp': False,
    'callbacks': [
        PlSageMakerLogger(),
        smd_callback,
    ],
}

In [3]:
# Instantiate the model, forwarding every entry of model_params as a keyword argument.
model = ResNet(**model_params)

In [4]:
# Disabled legacy debugger setup. The whole cell is a bare string literal, so
# none of it executes; the notebook only echoes the string back as cell output.
# The same hook settings are passed to SMDebugCallback in the configuration cell.
'''# Setup Debugger
if Path(DEFAULT_CONFIG_FILE_PATH).exists():
    smd.Hook.register_hook(model.model, model.criterion)
else:
    reduction_config = ReductionConfig(['mean'])
    save_config = SaveConfig(save_interval=25)
    include_collections = [CollectionKeys.LOSSES]
    hook = smd.Hook(out_dir='./smdebugger',
                    export_tensorboard=True,
                    tensorboard_dir='./tensorboard',
                    reduction_config=reduction_config,
                    save_config=save_config,
                    include_regex=None,
                    include_collections=include_collections,
                    save_all=False,)
    hook.register_module(model.model)
    hook.register_loss(model.criterion)'''

"# Setup Debugger\nif Path(DEFAULT_CONFIG_FILE_PATH).exists():\n    smd.Hook.register_hook(model.model, model.criterion)\nelse:\n    reduction_config = ReductionConfig(['mean'])\n    save_config = SaveConfig(save_interval=25)\n    include_collections = [CollectionKeys.LOSSES]\n    hook = smd.Hook(out_dir='./smdebugger',\n                    export_tensorboard=True,\n                    tensorboard_dir='./tensorboard',\n                    reduction_config=reduction_config,\n                    save_config=save_config,\n                    include_regex=None,\n                    include_collections=include_collections,\n                    save_all=False,)\n    hook.register_module(model.model)\n    hook.register_loss(model.criterion)"

In [5]:
# Build the Lightning trainer from the keyword arguments collected in trainer_params.
trainer = pl.Trainer(**trainer_params)

Using 16bit native Automatic Mixed Precision (AMP)
  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [6]:
# Run training. No dataloaders are passed here, so the model presumably
# supplies its own -- TODO confirm in sm_resnet.models.
trainer.fit(model)

Missing logger folder: /root/sagemaker_lightning/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[2022-06-03 04:58:35.198 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO hook.py:254] Saving to ./smdebugger
[2022-06-03 04:58:35.199 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.
[2022-06-03 04:58:35.203 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO hook.py:560] name:model.conv1.weight count_params:9408
[2022-06-03 04:58:35.204 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO hook.py:560] name:model.bn1.weight count_params:64
[2022-06-03 04:58:35.204 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO hook.py:560] name:model.bn1.bias count_params:64
[2022-06-03 04:58:35.205 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO hook.py:560] name:model.layer1.0.conv1.weight count_params:4096
[2022-06-03 04:58:35.205 pytorch-1-10-gpu-py-m


  | Name      | Type             | Params
-----------------------------------------------
0 | criterion | CrossEntropyLoss | 0     
1 | model     | ResNet           | 25.6 M
-----------------------------------------------
25.6 M    Trainable params
0         Non-trainable params
25.6 M    Total params
51.114    Total estimated model params size (MB)


[2022-06-03 04:58:36.971 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO hook.py:421] Monitoring the collections: losses
[2022-06-03 04:58:36.989 pytorch-1-10-gpu-py-ml-g4dn-xlarge-53638dc8bb3d40f5ed7ac21aaf03:1512 INFO hook.py:485] Hook is writing from the hook with pid: 1512

Validation
val_loss: 68.0511 val_acc: 0.0000
Step : 10 of epoch 1
Training Losses:
train_loss_step: 9.8896 train_acc_step: 0.0000
Step time: 934.00 milliseconds
Step : 20 of epoch 1
Training Losses:
train_loss_step: 7.6051 train_acc_step: 0.0000
Step time: 830.52 milliseconds
Step : 30 of epoch 1
Training Losses:
train_loss_step: 7.1223 train_acc_step: 0.0156
Step time: 813.91 milliseconds
Step : 40 of epoch 1
Training Losses:
train_loss_step: 6.8978 train_acc_step: 0.0000
Step time: 777.52 milliseconds
Step : 50 of epoch 1
Training Losses:
train_loss_step: 7.0656 train_acc_step: 0.0000
Step time: 801.17 milliseconds
Step : 60 of epoch 1
Training Losses:
train_loss_step: 6.9351 train_ac

In [7]:
# Check whether the smdebug config file exists at its default path
# (the recorded run of this cell displayed False).
Path(DEFAULT_CONFIG_FILE_PATH).exists()

False