In [5]:
import sagemaker
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig
# One collection config, named 'all', is attached to the hook configuration.
collection_config_1 = CollectionConfig(name="all")

# Describe where debugger data is emitted (S3 and container-local paths),
# the hook parameters, and which collections are configured.
d = DebuggerHookConfig(
    s3_output_path="s3://path/for/data/emission",
    container_local_output_path="/local/path/for/data/emission",
    hook_parameters={"save_interval": "10"},
    collection_configs=[collection_config_1],
)

# Show the request dictionary produced from the configuration.
request_dict = d._to_request_dict()
print(request_dict)

{'S3OutputPath': 's3://path/for/data/emission', 'LocalPath': '/local/path/for/data/emission', 'HookParameters': {'save_interval': '10'}, 'CollectionConfigurations': [{'CollectionName': 'all'}]}


In [None]:
import sys
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

import torch
import torchvision as tv
import pytorch_lightning as pl
import webdataset as wds

In [6]:
import smdebug.pytorch as smd
# Build an smdebug hook that writes to the local 'out_dir' directory.
# Every option is spelled out explicitly; save_all=True is the one that
# is switched on.
hook = smd.Hook(
    "out_dir",
    export_tensorboard=False,
    tensorboard_dir=None,
    dry_run=False,
    reduction_config=None,
    save_config=None,
    include_regex=None,
    include_collections=None,
    save_all=True,
    include_workers="one",
)

[2023-06-08 15:01:12.987 ip-172-31-93-109:2987 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-06-08 15:01:13.066 ip-172-31-93-109:2987 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
TEST :out_dir 
[2023-06-08 15:01:13.067 ip-172-31-93-109:2987 INFO hook.py:207] tensorboard_dir has not been set for the hook. SMDebug will not be exporting tensorboard summaries.
[2023-06-08 15:01:13.068 ip-172-31-93-109:2987 INFO hook.py:260] Saving to out_dir
[2023-06-08 15:01:13.068 ip-172-31-93-109:2987 INFO state_store.py:77] The checkpoint config file /opt/ml/input/config/checkpointconfig.json does not exist.


In [6]:
from smdebug.trials import create_trial
from smdebug import modes
# Directory holding the debugger output of a previous run.
debugger_log_dir = "/home/ubuntu/resnet-sagemaker/pytorch/debugger_logs/"
smdebug_trial = create_trial(debugger_log_dir)

[2023-06-07 20:26:11.373 ip-172-31-93-109:94668 INFO local_trial.py:35] Loading trial  at path /home/ubuntu/resnet-sagemaker/pytorch/debugger_logs/


In [7]:
# Fetch the steps recorded in TRAIN mode, then print them one per line.
train_steps = smdebug_trial.steps(mode=modes.TRAIN)
for step in train_steps:
    print(step)


[2023-06-07 20:26:12.196 ip-172-31-93-109:94668 INFO trial.py:197] Training has ended, will refresh one final time in 1 sec.
[2023-06-07 20:26:13.197 ip-172-31-93-109:94668 INFO trial.py:210] Loaded all steps
0
500
1000
1500
2000
2500


In [8]:
# Display the names of the tensors available in the loaded trial.
smdebug_trial.tensor_names()

['NLLLoss_output_0']

In [None]:

class ResNet(pl.LightningModule):
    """torchvision ResNet classifier packaged as a PyTorch Lightning module.

    A ResNet of the requested depth is created, its final fully-connected
    layer is replaced by one with ``num_classes`` outputs, and training /
    validation samples are streamed from WebDataset shards.

    Args:
        num_classes: Number of outputs of the replacement ``fc`` layer.
        resnet_version: ResNet depth; one of 18, 34, 50, 101 or 152.
        train_path: WebDataset shard specification for the training data.
        val_path: WebDataset shard specification for the validation data.
        optimizer: Optimizer name; one of ``'adam'``, ``'adamw'``, ``'sgd'``.
        lr: Learning rate handed to the optimizer.
        batch_size: Batch size used by both data loaders.
        dataloader_workers: ``num_workers`` used by both data loaders.
        *args: Accepted for call-site compatibility; ignored.
        **kwargs: Accepted for call-site compatibility; ignored.

    Raises:
        KeyError: If ``resnet_version`` or ``optimizer`` is not supported.
    """

    # Per-channel normalisation statistics (the usual ImageNet values).
    _NORM_MEAN = (0.485, 0.456, 0.406)
    _NORM_STD = (0.229, 0.224, 0.225)

    def __init__(self, num_classes, resnet_version,
                 train_path, val_path, optimizer='adamw',
                 lr=1e-3, batch_size=64,
                 dataloader_workers=4,
                 *args, **kwargs):
        super().__init__()

        # Store the settings explicitly. Copying locals() into __dict__ also
        # stored `self` (a reference cycle), `__class__`, `args` and `kwargs`
        # on the instance and hid which attributes actually exist.
        self.num_classes = num_classes
        self.resnet_version = resnet_version
        self.train_path = train_path
        self.val_path = val_path
        self.lr = lr
        self.batch_size = batch_size
        self.dataloader_workers = dataloader_workers

        resnets = {
            18: tv.models.resnet18,
            34: tv.models.resnet34,
            50: tv.models.resnet50,
            101: tv.models.resnet101,
            152: tv.models.resnet152,
        }

        optimizers = {
            'adam': torch.optim.Adam,
            'adamw': torch.optim.AdamW,
            'sgd': torch.optim.SGD,
        }

        # The optimizer *class* is kept; it is instantiated in
        # configure_optimizers once the parameters exist.
        self.optimizer = optimizers[optimizer]
        self.criterion = torch.nn.CrossEntropyLoss()

        # Build the backbone and swap its classification head so the output
        # width matches num_classes.
        self.model = resnets[resnet_version]()
        linear_size = self.model.fc.in_features
        self.model.fc = torch.nn.Linear(linear_size, num_classes)

    def forward(self, X):
        """Return the backbone's output for the input batch ``X``."""
        return self.model(X)

    def configure_optimizers(self):
        """Instantiate the selected optimizer over all module parameters."""
        return self.optimizer(self.parameters(), lr=self.lr)

    def _make_loader(self, shards):
        """Build the WebDataset pipeline and loader for the given shards.

        Shared by the training and validation loaders, which use the same
        preprocessing and pipeline settings.
        """
        preproc = tv.transforms.Compose([
            tv.transforms.ToTensor(),
            tv.transforms.Normalize(self._NORM_MEAN, self._NORM_STD),
            tv.transforms.Resize((224, 224)),
        ])
        # Decode to PIL, keep the ("jpeg", "cls") pair, preprocess only the
        # image, and cap one epoch at 10000 samples.
        dataset = (
            wds.WebDataset(shards, resampled=True)
            .shuffle(1024)
            .decode("pil")
            .to_tuple("jpeg", "cls")
            .map_tuple(preproc, lambda x: x)
            .with_epoch(10000)
        )
        return wds.WebLoader(dataset,
                             num_workers=self.dataloader_workers,
                             batch_size=self.batch_size)

    def train_dataloader(self):
        """Return the loader streaming the training shards."""
        return self._make_loader(self.train_path)

    def val_dataloader(self):
        """Return the loader streaming the validation shards.

        NOTE(review): validation is also resampled and shuffled, exactly as
        before; confirm that sampling with replacement is intended here.
        """
        return self._make_loader(self.val_path)

    def _loss_and_accuracy(self, batch):
        """Compute the criterion loss and top-1 accuracy for one batch."""
        x, y = batch
        preds = self(x)
        loss = self.criterion(preds, y)
        # .float() keeps the result on the tensor's own device, whereas
        # .type(torch.FloatTensor) casts to a CPU tensor type.
        acc = (y == torch.argmax(preds, 1)).float().mean()
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log per-step and per-epoch training metrics; return the loss."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=False, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log per-epoch validation metrics under their own ``val_*`` keys."""
        loss, acc = self._loss_and_accuracy(batch)
        # Previously logged as "train_loss"/"train_acc", which collided with
        # the training metrics.
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log("val_acc", acc, on_step=False, on_epoch=True, prog_bar=False, logger=True)
    

In [None]:
class PlSageMakerLogger(pl.Callback):
    """Lightning callback that periodically prints the trainer's logged metrics.

    Every ``frequency`` training batches, counted from the start of each
    training epoch, the entries of ``trainer.logged_metrics`` are printed on
    a single line as ``name: value`` pairs with four decimals.

    Args:
        frequency: Print once every this many training batches.
    """

    def __init__(self, frequency=10):
        self.frequency = frequency
        # Batches seen over the whole run.
        self.step = 0
        # Batches seen in the current epoch. Initialised here so that
        # on_train_batch_end works even if no epoch-start hook has fired.
        self.inner_step = 0

    def on_train_epoch_start(self, trainer, module, *args, **kwargs):
        """Restart the per-epoch batch counter.

        This replaces the generic ``on_epoch_start`` hook, which current
        PyTorch Lightning releases no longer invoke; without it the counter
        was never created and the first batch raised ``AttributeError``.
        """
        self.inner_step = 0

    def on_train_batch_end(self, trainer, module, *args, **kwargs):
        """Print the logged metrics on every ``frequency``-th batch."""
        if self.inner_step % self.frequency == 0:
            print(' '.join(["{0}: {1:.4f}".format(i, float(j)) for i, j in trainer.logged_metrics.items()]))
        self.inner_step += 1
        self.step += 1

In [None]:
# Keyword arguments for the ResNet module: model size, data shard
# locations, optimizer settings and loader settings.
model_params = dict(
    num_classes=1000,
    resnet_version=50,
    train_path='pipe:aws s3 cp s3://jbsnyder-sagemaker-us-east/data/imagenet/train/train_{0000..2047}.tar -',
    val_path='pipe:aws s3 cp s3://jbsnyder-sagemaker-us-east/data/imagenet/val/val_{0000..0127}.tar -',
    optimizer='adamw',
    lr=1e-3,
    batch_size=64,
    dataloader_workers=0,
)

# Keyword arguments for the Lightning trainer.
trainer_params = dict(
    accelerator='gpu',
    max_epochs=4,
    num_nodes=1,
    precision=16,
    # Currently disabled:
    #   callbacks=[PlSageMakerLogger()],
)

In [None]:
# Instantiate the model and the Lightning trainer from their parameter dicts.
model = ResNet(**model_params)
trainer = pl.Trainer(**trainer_params)

In [None]:
# Start training; no loaders are passed here, the model defines
# train_dataloader() / val_dataloader() itself.
trainer.fit(model)

In [None]:
# Build the training loader directly from the model, outside the trainer.
dataloader = model.train_dataloader()

In [None]:
# Pull the first batch from the validation loader, report how many
# elements it holds, and stop.
val_loader = model.val_dataloader()
for batch_idx, batch in enumerate(val_loader):
    print(len(batch))
    break