In [1]:
import time

import torch
import torch.utils.data

import composer
import matplotlib.pyplot as plt

from torchvision import datasets, transforms
from composer.loggers import InMemoryLogger, LogLevel
from composer.core.time import Time, Timestamp

torch.manual_seed(42) # For replicability

<torch._C.Generator at 0x7f17549bbd90>

In [2]:
data_directory = "../data"

# Normalization constants
mean = (0.507, 0.487, 0.441)
std = (0.267, 0.256, 0.276)

batch_size = 512

cifar10_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])

train_dataset = datasets.CIFAR10(data_directory, train=True, download=True, transform=cifar10_transforms)
test_dataset = datasets.CIFAR10(data_directory, train=False, download=True, transform=cifar10_transforms)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
logger = InMemoryLogger(log_level=LogLevel.BATCH)


In [27]:
from composer import models
model = models.composer_resnet_cifar(model_name='resnet_56', num_classes=10)

In [28]:
optimizer = composer.optim.DecoupledSGDW(
    model.parameters(), # Model parameters to update
    lr=0.05, # Peak learning rate
    momentum=0.9,
    weight_decay=2.0e-3 # If this looks large, it's because its not scaled by the LR as in non-decoupled weight decay
)

You are using a high value of `weight_decay=0.002` for the `DecoupledSGDW` optimizer. Are you sure you want to do this? Your model's weights will be multiplied by 0.998 on every step!


In [29]:
lr_scheduler = composer.optim.LinearWithWarmupScheduler(
    t_warmup="1ep", # Warm up over 1 epoch
    alpha_i=1.0, # Flat LR schedule achieved by having alpha_i == alpha_f
    alpha_f=1.0
)

In [30]:
train_epochs = "3ep" # Train for 3 epochs because we're assuming Colab environment and hardware
device = "gpu" if torch.cuda.is_available() else "cpu" # select the device

trainer = composer.trainer.Trainer(
    model=model,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    max_duration=train_epochs,
    optimizers=optimizer,
    schedulers=lr_scheduler,
    device=device

)

In [31]:
start_time = time.perf_counter()
trainer.fit()
end_time = time.perf_counter()
print(f"It took {end_time - start_time:0.4f} seconds to train")

train          Epoch   0:    0%|| 0/98 [00:00<?, ?ba/s]         

eval           Epoch   0:    0%|| 0/20 [00:00<?, ?ba/s]         

train          Epoch   1:    0%|| 0/98 [00:00<?, ?ba/s]         

eval           Epoch   1:    0%|| 0/20 [00:00<?, ?ba/s]         

train          Epoch   2:    0%|| 0/98 [00:00<?, ?ba/s]         

eval           Epoch   2:    0%|| 0/20 [00:00<?, ?ba/s]         

It took 58.4973 seconds to train


In [8]:
timeseries_raw = logger.get_timeseries("metrics/eval/Accuracy")
plt.plot(timeseries_raw['epoch'], timeseries_raw["metrics/eval/Accuracy"])
plt.xlabel("Epoch")
plt.ylabel("Validation Accuracy")
plt.title("Accuracy per epoch without Composer")
plt.show()

NameError: name 'logger' is not defined

In [32]:
label_smoothing = composer.algorithms.LabelSmoothing(0.1)

In [33]:
blurpool = composer.algorithms.BlurPool(
    replace_convs=True, # Blur before convs
    replace_maxpools=True, # Blur before max-pools
    blur_first=True # Blur before conv/max-pool
)

In [34]:
prog_resize = composer.algorithms.ProgressiveResizing(
    initial_scale=.6, # Size of images at the beginning of training = .6 * default image size
    finetune_fraction=0.34, # Train on default size images for 0.34 of total training time.
)

In [35]:
algorithms = [label_smoothing, blurpool, prog_resize]

In [41]:
model = models.composer_resnet_cifar(model_name="resnet_56", num_classes=10)



optimizer = composer.optim.DecoupledSGDW(
    model.parameters(),
    lr=0.05,
    momentum=0.9,
    weight_decay=2.0e-3
)

trainer = composer.trainer.Trainer(
    model=model,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    max_duration=train_epochs,
    optimizers=optimizer,
    schedulers=lr_scheduler,
    device=device,
    precision='amp',
    algorithms=algorithms # Adding algorithms this time
)

You are using a high value of `weight_decay=0.002` for the `DecoupledSGDW` optimizer. Are you sure you want to do this? Your model's weights will be multiplied by 0.998 on every step!


In [42]:
start_time = time.perf_counter()
trainer.fit()
end_time = time.perf_counter()
three_epochs_accelerated_time = end_time - start_time
print(f"It took {three_epochs_accelerated_time:0.4f} seconds to train")

train          Epoch   0:    0%|| 0/98 [00:00<?, ?ba/s]         



eval           Epoch   0:    0%|| 0/20 [00:00<?, ?ba/s]         

It took 19.6964 seconds to train


In [43]:
train_epochs = "1ep"

In [44]:
lr_scheduler = composer.optim.scheduler.ConstantScheduler(alpha=1.0, t_max='1dur')

In [45]:
algorithms = [label_smoothing]

In [46]:


trainer = composer.trainer.Trainer(
    model=model,
    train_dataloader=train_dataloader,
    eval_dataloader=test_dataloader,
    max_duration=train_epochs,
    optimizers=optimizer,
    schedulers=lr_scheduler,
    device=device,
    precision='amp',
    algorithms=algorithms
)

start_time = time.perf_counter()
trainer.fit()

end_time = time.perf_counter()
final_epoch_accelerated_time = end_time - start_time
# Time for four epochs = time for three epochs + time for fourth epoch
four_epochs_accelerated_time = three_epochs_accelerated_time + final_epoch_accelerated_time
print(f"It took {four_epochs_accelerated_time:0.4f} seconds to train")

train          Epoch   0:    0%|| 0/98 [00:00<?, ?ba/s]         



eval           Epoch   0:    0%|| 0/20 [00:00<?, ?ba/s]         

It took 39.5432 seconds to train
