# Setup PyTorch and Ray Tune



In [3]:
!pip install torch torchvision
!pip install ray



In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
from torchvision import datasets, transforms
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
import torchvision.models as models
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

# Helper code

## Data loader

In [15]:
def data_loader(batch_size=4):
  normalize = transforms.Normalize(mean=0.2859,
                                      std=0.3530)

  fmnist_training_data = torchvision.datasets.FashionMNIST("/content", 
                                              train=True,
                                              transform = transforms.Compose([
                                                                              transforms.ToTensor(),
                                                                              normalize,]), 
                                              download=True)
  
  fmnist_val_data = torchvision.datasets.FashionMNIST("/content", 
                                              train=True,
                                              transform = transforms.Compose([transforms.ToTensor(),
                                                                              normalize,]), 
                                              download=True)

  fmnist_testing_data = torchvision.datasets.FashionMNIST("/content", 
                                              train=False,
                                              transform = transforms.Compose([transforms.ToTensor(),
                                                                              normalize,]), 
                                              download=True)
  # data = [torch.Size([10, 3, 32, 32]), torch.Size([10])]
  num_train = len(fmnist_training_data)
  indices = list(range(num_train))
  split = 5000 #45k/5k train/val split
  train_idx, valid_idx = indices[split:], indices[:split]
  train_sampler = SubsetRandomSampler(train_idx)
  valid_sampler = SubsetRandomSampler(valid_idx)
  fmnist_training_data_loader = torch.utils.data.DataLoader(fmnist_training_data, batch_size, sampler=train_sampler, shuffle=False)
  fmnist_val_data_loader = torch.utils.data.DataLoader(fmnist_val_data, batch_size, sampler=valid_sampler, shuffle=False)
  fmnist_testing_data_loader = torch.utils.data.DataLoader(fmnist_testing_data, batch_size, shuffle=True)
  return fmnist_training_data_loader, fmnist_val_data_loader, fmnist_testing_data_loader

  # for data in cifar10_training_data_loader:
  #   # print("data: ", data)
  #   images, labels = data[0], data[1]
  #   print("images.shape: {}, labels.shape: {}".format(images.shape, labels.shape))
  #   break

In [13]:
_, _, _ = data_loader()

## Model

In [17]:
# A 2-conv-layer block of ResNet 
class block(nn.Module):
  def __init__(self, num_filters, enable_subsample):
    super().__init__()
    if enable_subsample:
      self.conv1 = nn.Conv2d(num_filters // 2, num_filters, kernel_size=3, stride=2, padding=1, bias=False)
    else:
      self.conv1 = nn.Conv2d(num_filters, num_filters, kernel_size=3, padding=1, bias=False)
    self.bn1 = nn.BatchNorm2d(num_filters)
    self.relu1 = nn.ReLU()

    self.conv2 = nn.Conv2d(num_filters, num_filters, kernel_size=3, padding=1, bias=False)
    self.bn2 = nn.BatchNorm2d(num_filters)
    self.relu2 = nn.ReLU()

    # Weight initialization as in https://github.com/a-martyn/resnet/blob/master/resnet.py
    for m in self.modules():
      if isinstance(m, nn.Conv2d):
          nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
      elif isinstance(m, (nn.BatchNorm2d)):
          nn.init.constant_(m.weight,1)
          nn.init.constant_(m.bias, 0) 

  def forward(self, x, enable_skip_connections=False):
    out = self.conv1(x)
    out = self.bn1(out)
    out = self.relu1(out)
    out = self.conv2(out)
    out = self.bn2(out)
    if enable_skip_connections:
      # print(out.shape, x.shape)
      if out.shape != x.shape:
        W_s = nn.Conv2d(x.shape[1], out.shape[1], kernel_size=1, stride=2).to(device='cuda')
        x = W_s(x)
        # print("after", out.shape, x.shape)
      else:
        out = x + out
    out = self.relu2(out)
    return out

# ResNet for CIFAR-10 as in paper
class ResNet(nn.Module):
  def __init__(self, n, enable_skip_connections=False):
    super().__init__()
    self.skip_connection = enable_skip_connections

    self.num_layer1_filters = 16
    self.num_layer2_filters = 32
    self.num_layer3_filters = 64

    self.layer0 = nn.Sequential(
        nn.Conv2d(1, self.num_layer1_filters, kernel_size=3, padding=1, bias=False),
        nn.BatchNorm2d(self.num_layer1_filters),
        nn.ReLU()
    )
    
    self.layer1 = nn.ModuleList([block(self.num_layer1_filters, enable_subsample=False) for i in range(n)])
    self.layer2_subsample = block(self.num_layer2_filters, enable_subsample=True)
    self.layer2 = nn.ModuleList([block(self.num_layer2_filters, enable_subsample=False) for i in range(n-1)])
    self.layer3_subsample = block(self.num_layer3_filters, enable_subsample=True)
    self.layer3 = nn.ModuleList([block(self.num_layer3_filters, enable_subsample=False) for i in range(n-1)])

    self.avgpooling = nn.AdaptiveAvgPool2d(1)
    self.fc_layer = nn.Linear(self.num_layer3_filters, 10)
    self.softmax = nn.Softmax(dim=1)    
  
  def forward(self, x):
    out = self.layer0(x)
    for block in self.layer1:
      out = block(out, self.skip_connection)
    out = self.layer2_subsample(out, self.skip_connection)
    for block in self.layer2:
      out = block(out, self.skip_connection)
    out = self.layer3_subsample(out, self.skip_connection)
    for block in self.layer3:
      out = block(out, self.skip_connection)
    out = self.avgpooling(out)
    out = out.reshape((-1, self.num_layer3_filters))
    out = self.fc_layer(out)
    out = self.softmax(out)

    return out

# curr_model = block(32, enable_subsample=False)
# print(curr_model)

# resnet20_plain = ResNet(3)
# resnet20 = ResNet(3, enable_skip_connections=True)

# resnet18 = models.resnet18()
# print(resnet18)
# print(sum(p.numel() for p in resnet20_plain.parameters()))
# print(sum(p.numel() for p in resnet20.parameters()))

## Test Training

In [18]:
def resnet_training():

  batch_size = 128
  net = ResNet(3, True)
  #net = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)

  trainloader, valloader, testloader = data_loader(batch_size)
  print(len(trainloader), len(valloader), len(testloader))
  classes = ('plane', 'car', 'bird', 'cat',
            'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

  device = "cpu"
  if torch.cuda.is_available():
      device = "cuda:0"
      if torch.cuda.device_count() > 1:
          net = nn.DataParallel(net)
  net = net.to(device)
  # https://discuss.pytorch.org/t/how-to-increase-the-learning-rate-without-using-cyclical-learning-rates/140208/4
  def _lr_lambda(current_step):
        """
        _lr_lambda returns a multiplicative factor given an interger parameter epochs.
        """
        if current_step < 400:
            _lr =.1
        elif current_step < 32000:
            _lr = 1
        elif current_step < 48000:
            _lr = .1
        else:
            _lr = .01

        return _lr

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0001)
  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, _lr_lambda, last_epoch=-1, verbose=False)
  for epoch in range(182):  # loop over the dataset multiple times
      running_loss = 0.0
      for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # print statistics
        running_loss += loss.item()
        if i % 10 == 9:    # print every 10 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 10:.3f}')
            running_loss = 0.0
      
      val_loss = 0.0
      val_steps = 0
      total = 0
      correct = 0
      for i, data in enumerate(valloader, 0):
          with torch.no_grad():
              inputs, labels = data
              inputs, labels = inputs.to(device), labels.to(device)

              outputs = net(inputs)
              _, predicted = torch.max(outputs.data, 1)
              total += labels.size(0)
              correct += (predicted == labels).sum().item()

              loss = criterion(outputs, labels)
              val_loss += loss.cpu().numpy()
              val_steps += 1
      print("epoch {} val_loss {} val_steps {} val_acc {}".format(epoch, val_loss, val_steps, correct / total))
  print('Finished Training')
  return net
trained_net = resnet_training()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[69,   180] loss: 1.507
[69,   190] loss: 1.504
[69,   200] loss: 1.510
[69,   210] loss: 1.515
[69,   220] loss: 1.506
[69,   230] loss: 1.505
[69,   240] loss: 1.507
[69,   250] loss: 1.523
[69,   260] loss: 1.511
[69,   270] loss: 1.516
[69,   280] loss: 1.510
[69,   290] loss: 1.511
[69,   300] loss: 1.494
[69,   310] loss: 1.515
[69,   320] loss: 1.505
[69,   330] loss: 1.503
[69,   340] loss: 1.493
[69,   350] loss: 1.505
[69,   360] loss: 1.510
[69,   370] loss: 1.501
[69,   380] loss: 1.515
[69,   390] loss: 1.505
[69,   400] loss: 1.513
[69,   410] loss: 1.506
[69,   420] loss: 1.502
[69,   430] loss: 1.509
epoch 68 val_loss 61.42504286766052 val_steps 40 val_acc 0.9288
[70,    10] loss: 1.506
[70,    20] loss: 1.507
[70,    30] loss: 1.501
[70,    40] loss: 1.507
[70,    50] loss: 1.506
[70,    60] loss: 1.500
[70,    70] loss: 1.500
[70,    80] loss: 1.503
[70,    90] loss: 1.498
[70,   100] loss: 1.508
[70,   

## Test accuracy


In [19]:
def test(net, testloader, device, epoch):
    global best_acc
    net.eval()
    test_loss=0
    correct=0
    total=0
    criterion=nn.CrossEntropyLoss()
    test_steps=0
    with torch.no_grad():
        for batch_idx,(inputs,targets)in enumerate(testloader):
            inputs,targets=inputs.to(device),targets.to(device)
            outputs=net(inputs)
            loss=criterion(outputs,targets)

            test_loss+=loss.item()
            _,predicted=outputs.max(1)
            total+=targets.size(0)
            correct+=predicted.eq(targets).sum().item()
            test_steps+=1
    print("epoch {} test_loss {} test_steps {} test_acc {}".format(epoch,test_loss,test_steps,correct/total))

batch_size = 128
_, _, testloader = data_loader(batch_size)
device = "cpu"
if torch.cuda.is_available():
  device = "cuda:0"
test(trained_net, testloader, device, 1)
#print("test acc {}".format(test_accuracy(trained_net)))

epoch 1 test_loss 120.62423348426819 test_steps 79 test_acc 0.9343


# ...


## DenseNet

Model

In [None]:
import math

class Bottleneck(nn.Module):
    def __init__(self, in_planes, growth_rate):
        super(Bottleneck, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(4*growth_rate)
        self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        out = self.conv1(F.relu(self.bn1(x)))
        out = self.conv2(F.relu(self.bn2(out)))
        out = torch.cat([out,x], 1)
        return out


class Transition(nn.Module):
    def __init__(self, in_planes, out_planes):
        super(Transition, self).__init__()
        self.bn = nn.BatchNorm2d(in_planes)
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)

    def forward(self, x):
        out = self.conv(F.relu(self.bn(x)))
        out = F.avg_pool2d(out, 2)
        return out


class DenseNet(nn.Module):
    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate

        num_planes = 2*growth_rate
        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)

        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
        num_planes += nblocks[0]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans1 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
        num_planes += nblocks[1]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans2 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
        num_planes += nblocks[2]*growth_rate
        out_planes = int(math.floor(num_planes*reduction))
        self.trans3 = Transition(num_planes, out_planes)
        num_planes = out_planes

        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
        num_planes += nblocks[3]*growth_rate

        self.bn = nn.BatchNorm2d(num_planes)
        self.linear = nn.Linear(num_planes, num_classes)

    def _make_dense_layers(self, block, in_planes, nblock):
        layers = []
        for i in range(nblock):
            layers.append(block(in_planes, self.growth_rate))
            in_planes += self.growth_rate
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.trans1(self.dense1(out))
        out = self.trans2(self.dense2(out))
        out = self.trans3(self.dense3(out))
        out = self.dense4(out)
        out = F.avg_pool2d(F.relu(self.bn(out)), 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

def DenseNet121():
    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12)

Train


batch size 64 for 300 and 40 epochs
init lr 0.1 divided by 10 at 50% and 75%
weight decay 10^-4
momentum 0.9
dropout 0.2

In [None]:
def train_dense(weight_decay=0.0001, momentum=0.9, batch_size=128):
  net = torch.hub.load('pytorch/vision:v0.10.0', 'densenet121', pretrained=False)
  
  trainloader, valloader, testloader = data_loader(batch_size)
  classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')
  
  device = "cpu"
  if torch.cuda.is_available():
      device = "cuda:0"
      if torch.cuda.device_count() > 1:
          net = nn.DataParallel(net)
  net.to(device)

  def _lr_lambda(current_step):
      """
      _lr_lambda returns a multiplicative factor given an interger parameter epochs.
      """
      if current_step < 25000:
          _lr = 1
      elif current_step < 37500:
          _lr = .1
      else:
          _lr = .01
      return _lr

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), lr=0.1,
                        momentum, weight_decay)
  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, _lr_lambda, last_epoch=-1, verbose=True)

  for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0
        

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1
        print("epoch {} val_loss {} val_steps {} val_acc {}".format(epoch, val_loss, val_steps, correct / total))
  print("Finished Training")


In [None]:
train_dense()

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
epoch 0 val_loss 35.10935813188553 val_steps 40 val_acc 0.7032
epoch 1 val_loss 29.122162997722626 val_steps 40 val_acc 0.7608
epoch 2 val_loss 25.626320630311966 val_steps 40 val_acc 0.7912


KeyboardInterrupt: ignored

Config the Search space for Ray Tune

In [None]:
# import torch.optim as optim
# from ray import tune
# from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test


# def train_mnist(config):
#     train_loader, test_loader = get_data_loaders()
#     model = ConvNet()
#     optimizer = optim.SGD(model.parameters(), lr=config["lr"])

#     device = "cpu"
#     if torch.cuda.is_available():
#         device = "cuda:0"
#         if torch.cuda.device_count() > 1:
#             net = nn.DataParallel(net)
#     net.to(device)

#     for i in range(10):
#         train(model, optimizer, train_loader)
#         acc = test(model, test_loader)
#         tune.report(mean_accuracy=acc)


# analysis = tune.run(
#     train_mnist, config={"lr": tune.grid_search([0.001, 0.01, 0.1])})

# print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

# # Get a dataframe for analyzing trial results.
# df = analysis.dataframe()

In [None]:
import numpy as np
import os
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from filelock import FileLock
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
import psutil
import ray
ray._private.utils.get_system_memory = lambda: psutil.virtual_memory().total
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [None]:
def load_data1(data_dir="./data"):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform)

    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform)

    return trainset, testset

In [None]:
# class Net(nn.Module):
#     def __init__(self, l1=120, l2=84):
#         super(Net, self).__init__()
#         self.conv1 = nn.Conv2d(3, 6, 5)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.conv2 = nn.Conv2d(6, 16, 5)
#         self.fc1 = nn.Linear(16 * 5 * 5, 32)
#         self.fc2 = nn.Linear(32, 16)
#         self.fc3 = nn.Linear(16, 10)

#     def forward(self, x):
#         x = self.pool(F.relu(self.conv1(x)))
#         x = self.pool(F.relu(self.conv2(x)))
#         x = x.view(-1, 16 * 5 * 5)
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.fc3(x)
#         return x

In [None]:
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    net = ResNet(3, enable_skip_connections=True) # resnet20
    if torch.cuda.is_available():
      net.cuda()

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainset, testset = load_data1(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])
    
    trainloader, valloader, testloader = data_loader()

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            print("device:", device)
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [None]:
def test_accuracy(net, device="cpu"):
    trainset, testset = load_data1()

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        net.cuda()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In Original paper
(Hyper) SGD wiht mini-batch size 128  
(Hyper) learning rate starts from 0.1, divide it by 10 at 32K and 48K iterations,   
terminate at 64k iterations  
45k/5k train/val split
(Done) Image agumentation: 4 pixels are padded on each side, and a 32x32 crop is randomly sampled from the padded image or its horizontal flip. for testing, only evaluate the single view of the original 32x32 image.  
(Hyper) weight decay : 0.0001  
(Hyper) momentum : 0.9  


In [None]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data1(data_dir)
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = ResNet(3, enable_skip_connections=True)
    # best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)
    best_trained_model.cuda()

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))

main(num_samples=5, max_num_epochs=10, gpus_per_trial=1)

Files already downloaded and verified
Files already downloaded and verified


2022-04-20 02:36:56,574	INFO trial_runner.py:803 -- starting train_cifar_bee56_00000


== Status ==
Current time: 2022-04-20 02:36:56 (running for 00:00:00.25)
Memory usage on this node: 2.1/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.35 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/train_cifar_2022-04-20_02-36-56
Number of trials: 5/5 (4 PENDING, 1 RUNNING)
+-------------------------+----------+-----------------+--------------+-------------+
| Trial name              | status   | loc             |   batch_size |          lr |
|-------------------------+----------+-----------------+--------------+-------------|
| train_cifar_bee56_00000 | RUNNING  | 172.28.0.2:2625 |            4 | 0.0285835   |
| train_cifar_bee56_00001 | PENDING  |                 |            4 | 0.00398609  |
| train_cifar_bee56_00002 | PENDING  |                 |            2 | 0.000279323 |
| train_cifar_bee56_00003 

[2m[36m(func pid=2625)[0m   cpuset_checked))


[2m[36m(func pid=2625)[0m device: cuda:0


[2m[36m(func pid=2625)[0m 2022-04-20 02:37:04,439	ERROR function_runner.py:281 -- Runner Thread raised error.
[2m[36m(func pid=2625)[0m Traceback (most recent call last):
[2m[36m(func pid=2625)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 272, in run
[2m[36m(func pid=2625)[0m     self._entrypoint()
[2m[36m(func pid=2625)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 351, in entrypoint
[2m[36m(func pid=2625)[0m     self._status_reporter.get_checkpoint(),
[2m[36m(func pid=2625)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
[2m[36m(func pid=2625)[0m     return method(self, *_args, **_kwargs)
[2m[36m(func pid=2625)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 640, in _trainable_func
[2m[36m(func pid=2625)[0m     output = fn()
[2m[36m(func pid=2625)[0m   File "<ipython-input-

== Status ==
Current time: 2022-04-20 02:37:04 (running for 00:00:08.16)
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.35 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/train_cifar_2022-04-20_02-36-56
Number of trials: 5/5 (4 PENDING, 1 RUNNING)
+-------------------------+----------+-----------------+--------------+-------------+
| Trial name              | status   | loc             |   batch_size |          lr |
|-------------------------+----------+-----------------+--------------+-------------|
| train_cifar_bee56_00000 | RUNNING  | 172.28.0.2:2625 |            4 | 0.0285835   |
| train_cifar_bee56_00001 | PENDING  |                 |            4 | 0.00398609  |
| train_cifar_bee56_00002 | PENDING  |                 |            2 | 0.000279323 |
| train_cifar_bee56_00003 

2022-04-20 02:37:05,395	INFO trial_runner.py:803 -- starting train_cifar_bee56_00001


== Status ==
Current time: 2022-04-20 02:37:10 (running for 00:00:13.99)
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.35 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/train_cifar_2022-04-20_02-36-56
Number of trials: 5/5 (1 ERROR, 3 PENDING, 1 RUNNING)
+-------------------------+----------+-----------------+--------------+-------------+
| Trial name              | status   | loc             |   batch_size |          lr |
|-------------------------+----------+-----------------+--------------+-------------|
| train_cifar_bee56_00001 | RUNNING  | 172.28.0.2:2710 |            4 | 0.00398609  |
| train_cifar_bee56_00002 | PENDING  |                 |            2 | 0.000279323 |
| train_cifar_bee56_00003 | PENDING  |                 |            4 | 0.0333555   |
| train_cifar_bee

[2m[36m(func pid=2710)[0m   cpuset_checked))


[2m[36m(func pid=2710)[0m device: cuda:0


[2m[36m(func pid=2710)[0m 2022-04-20 02:37:13,125	ERROR function_runner.py:281 -- Runner Thread raised error.
[2m[36m(func pid=2710)[0m Traceback (most recent call last):
[2m[36m(func pid=2710)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 272, in run
[2m[36m(func pid=2710)[0m     self._entrypoint()
[2m[36m(func pid=2710)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 351, in entrypoint
[2m[36m(func pid=2710)[0m     self._status_reporter.get_checkpoint(),
[2m[36m(func pid=2710)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
[2m[36m(func pid=2710)[0m     return method(self, *_args, **_kwargs)
[2m[36m(func pid=2710)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 640, in _trainable_func
[2m[36m(func pid=2710)[0m     output = fn()
[2m[36m(func pid=2710)[0m   File "<ipython-input-

Result for train_cifar_bee56_00001:
  date: 2022-04-20_02-37-08
  experiment_id: a84939f2765d44518c53d0e5c839a937
  hostname: 753d77ac8148
  node_ip: 172.28.0.2
  pid: 2710
  timestamp: 1650422228
  trial_id: bee56_00001
  
== Status ==
Current time: 2022-04-20 02:37:18 (running for 00:00:22.01)
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.35 GiB heap, 0.0/3.67 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/train_cifar_2022-04-20_02-36-56
Number of trials: 5/5 (2 ERROR, 2 PENDING, 1 RUNNING)
+-------------------------+----------+-----------------+--------------+-------------+
| Trial name              | status   | loc             |   batch_size |          lr |
|-------------------------+----------+-----------------+--------------+-------------|
| train_cifar_bee56_00002 | RUNNING  | 172.28.0.2:2

[2m[36m(func pid=2794)[0m   cpuset_checked))


[2m[36m(func pid=2794)[0m device: cuda:0


[2m[36m(func pid=2794)[0m 2022-04-20 02:37:21,145	ERROR function_runner.py:281 -- Runner Thread raised error.
[2m[36m(func pid=2794)[0m Traceback (most recent call last):
[2m[36m(func pid=2794)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 272, in run
[2m[36m(func pid=2794)[0m     self._entrypoint()
[2m[36m(func pid=2794)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 351, in entrypoint
[2m[36m(func pid=2794)[0m     self._status_reporter.get_checkpoint(),
[2m[36m(func pid=2794)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/util/tracing/tracing_helper.py", line 462, in _resume_span
[2m[36m(func pid=2794)[0m     return method(self, *_args, **_kwargs)
[2m[36m(func pid=2794)[0m   File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 640, in _trainable_func
[2m[36m(func pid=2794)[0m     output = fn()
[2m[36m(func pid=2794)[0m   File "<ipython-input-

Result for train_cifar_bee56_00002:
  date: 2022-04-20_02-37-16
  experiment_id: b624c444353d4fa29dde8771f42b3652
  hostname: 753d77ac8148
  node_ip: 172.28.0.2
  pid: 2794
  timestamp: 1650422236
  trial_id: bee56_00002
  




KeyboardInterrupt: ignored

# Test

In [None]:
# Change these values if you want the training to run quicker or slower.
EPOCH_SIZE = 512
TEST_SIZE = 256

def train(model, optimizer, train_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # We set this just for the example to run quickly.
        if batch_idx * len(data) > EPOCH_SIZE:
            return
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()


def test(model, data_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            # We set this just for the example to run quickly.
            if batch_idx * len(data) > TEST_SIZE:
                break
            data, target = data.to(device), target.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    return correct / total

In [None]:
def train_cifar1(config):
    # Data Setup
    mnist_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    train_loader = DataLoader(
        torchvision.datasets.CIFAR10("~/data", train=True, download=True, transform=mnist_transforms),
        batch_size=64,
        shuffle=True)
    test_loader = DataLoader(
        torchvision.datasets.CIFAR10("~/data", train=False, transform=mnist_transforms),
        batch_size=64,
        shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = ResNet(3, enable_skip_connections=True)
    model.to(device)

    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])
    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)

        # Send the current training result back to Tune
        tune.report(mean_accuracy=acc)

        if i % 5 == 0:
            # This saves the model to the trial directory
            torch.save(model.state_dict(), "./model.pth")

In [None]:
search_space = {
    "lr": tune.sample_from(lambda spec: 10 ** (-10 * np.random.rand())),
    "momentum": tune.uniform(0.1, 0.9),
}

# Uncomment this to enable distributed execution
# `ray.init(address="auto")`

# Download the dataset first
torchvision.datasets.CIFAR10("~/data", train=True, download=True)

analysis = tune.run(train_cifar1, config=search_space)

Files already downloaded and verified


2022-04-20 02:45:42,927	INFO trial_runner.py:803 -- starting train_cifar1_f8a47_00000


Trial name,status,loc,lr,momentum
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584


[2m[36m(train_cifar1 pid=3168)[0m Files already downloaded and verified


Trial name,status,loc,lr,momentum
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-45-52
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 1
  mean_accuracy: 0.109375
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 6.664595127105713
  time_this_iter_s: 6.664595127105713
  time_total_s: 6.664595127105713
  timestamp: 1650422752
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584,0.109375,1,6.6646


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-45-57
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 2
  mean_accuracy: 0.109375
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 11.78224802017212
  time_this_iter_s: 5.117652893066406
  time_total_s: 11.78224802017212
  timestamp: 1650422757
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584,0.109375,2,11.7822


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-46-02
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 3
  mean_accuracy: 0.10625
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 16.856648206710815
  time_this_iter_s: 5.074400186538696
  time_total_s: 16.856648206710815
  timestamp: 1650422762
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584,0.10625,3,16.8566


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-46-07
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 4
  mean_accuracy: 0.10625
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 21.949237823486328
  time_this_iter_s: 5.092589616775513
  time_total_s: 21.949237823486328
  timestamp: 1650422767
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584,0.10625,4,21.9492


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-46-12
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 5
  mean_accuracy: 0.090625
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 27.060617208480835
  time_this_iter_s: 5.111379384994507
  time_total_s: 27.060617208480835
  timestamp: 1650422772
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584,0.090625,5,27.0606


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-46-17
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 6
  mean_accuracy: 0.10625
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 32.14216995239258
  time_this_iter_s: 5.081552743911743
  time_total_s: 32.14216995239258
  timestamp: 1650422777
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584,0.10625,6,32.1422


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-46-23
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 7
  mean_accuracy: 0.103125
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 37.28174901008606
  time_this_iter_s: 5.1395790576934814
  time_total_s: 37.28174901008606
  timestamp: 1650422783
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584,0.103125,7,37.2817


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-46-28
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 8
  mean_accuracy: 0.10625
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 42.40127110481262
  time_this_iter_s: 5.1195220947265625
  time_total_s: 42.40127110481262
  timestamp: 1650422788
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


Trial name,status,loc,lr,momentum,acc,iter,total time (s)
train_cifar1_f8a47_00000,RUNNING,172.28.0.2:3168,1.06222e-07,0.628584,0.10625,8,42.4013


Result for train_cifar1_f8a47_00000:
  date: 2022-04-20_02-46-33
  done: false
  experiment_id: c79eb786ed0e4c59be1a3212475a4b14
  hostname: 753d77ac8148
  iterations_since_restore: 9
  mean_accuracy: 0.084375
  node_ip: 172.28.0.2
  pid: 3168
  time_since_restore: 47.52450728416443
  time_this_iter_s: 5.123236179351807
  time_total_s: 47.52450728416443
  timestamp: 1650422793
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: f8a47_00000
  warmup_time: 0.003980159759521484
  


In [None]:
dfs = analysis.trial_dataframes
[d.mean_accuracy.plot() for d in dfs.values()]