<!--BOOK_INFORMATION-->
<img align="left" style="width:80px;height:98px;padding-right:20px;" src="https://raw.githubusercontent.com/joe-papa/pytorch-book/main/files/pytorch-book-cover.jpg">

This notebook contains an excerpt from the [PyTorch Pocket Reference](http://pytorchbook.com) book by [Joe Papa](http://joepapa.ai); content is available [on GitHub](https://github.com/joe-papa/pytorch-book).

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/joe-papa/pytorch-book/blob/main/06_03_Hyperparameter_Tuning.ipynb)

# Chapter 6 - PyTorch Acceleration & Optimization

## Model Optimization

### Hyperparameter Tuning

In [None]:
!pip install tensorboardX
!pip install ray

Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)
[K     |█                               | 10kB 24.6MB/s eta 0:00:01[K     |██▏                             | 20kB 30.3MB/s eta 0:00:01[K     |███▏                            | 30kB 23.0MB/s eta 0:00:01[K     |████▎                           | 40kB 26.6MB/s eta 0:00:01[K     |█████▎                          | 51kB 25.4MB/s eta 0:00:01[K     |██████▍                         | 61kB 17.8MB/s eta 0:00:01[K     |███████▍                        | 71kB 18.0MB/s eta 0:00:01[K     |████████▌                       | 81kB 19.5MB/s eta 0:00:01[K     |█████████▌                      | 92kB 18.0MB/s eta 0:00:01[K     |██████████▋                     | 102kB 18.0MB/s eta 0:00:01[K     |███████████▊                    | 112kB 18.0MB/s eta 0:00:01[K     |████████████▊                   

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small convolutional classifier with tunable fully connected widths.

    Two 5x5 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers that end in 10 outputs. The first linear
    layer expects 16 * 5 * 5 features, which is what the convolutional
    stack produces for 3-channel 32x32 inputs.

    Args:
        nodes_1: Number of output units of the first fully connected layer.
        nodes_2: Number of output units of the second fully connected layer.
    """

    def __init__(self, nodes_1=120, nodes_2=84):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, nodes_1) # <1>
        self.fc2 = nn.Linear(nodes_1, nodes_2) # <2>
        self.fc3 = nn.Linear(nodes_2, 10)

    def forward(self, x):
        """Return the 10 unnormalised class scores for each input image.

        Raises:
            RuntimeError: If the convolutional stack does not produce a
                16x5x5 feature map, i.e. the input is not 3x32x32.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Without this guard, view(-1, 400) below would silently fold extra
        # spatial positions into the batch dimension whenever the element
        # count happens to be divisible by 400.
        if x.shape[-3:] != (16, 5, 5):
            raise RuntimeError(
                "Net expects 3x32x32 inputs; got a feature map of shape "
                f"{tuple(x.shape[-3:])} before the fully connected layers")
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [None]:
from ray import tune
import numpy as np

# Hyperparameter search space handed to Ray Tune.
#   nodes_1 / nodes_2: a power of two, 2**k with k drawn from [2, 9)
#   lr:                tune.loguniform over [1e-4, 1e-1]
#   batch_size:        one of the listed values
config = dict(
  nodes_1=tune.sample_from(
      lambda _spec: 2 ** np.random.randint(2, 9)),
  nodes_2=tune.sample_from(
      lambda _spec: 2 ** np.random.randint(2, 9)),
  lr=tune.loguniform(1e-4, 1e-1),
  batch_size=tune.choice([2, 4, 8, 16]),
)

In [None]:
import os

import torch
import torchvision
from torchvision import transforms

def load_data(data_dir=os.path.abspath("./data")):
  """Return the CIFAR-10 training and test datasets, downloading if needed.

  The default ``data_dir`` is resolved to an absolute path once, when this
  function is defined, so every later call uses the same directory whatever
  the caller's working directory is. With a relative default, each Ray Tune
  trial process resolved ``./data`` on its own, and the recorded run of this
  notebook shows the CIFAR-10 archive being downloaded again for every trial.

  Args:
    data_dir: Directory where the dataset is stored (and downloaded to).

  Returns:
    A ``(trainset, testset)`` tuple. The training set uses random crop and
    horizontal flip augmentation; both sets are converted to tensors and
    normalised with the same per-channel mean and standard deviation.
  """
  # Shared by both pipelines so the statistics cannot drift apart.
  normalize = transforms.Normalize(
      (0.4914, 0.4822, 0.4465),
      (0.2023, 0.1994, 0.2010))

  train_transforms = transforms.Compose([
      transforms.RandomCrop(32, padding=4),
      transforms.RandomHorizontalFlip(),
      transforms.ToTensor(),
      normalize])

  test_transforms = transforms.Compose([
      transforms.ToTensor(),
      normalize])

  trainset = torchvision.datasets.CIFAR10(
      root=data_dir, train=True,
      download=True, transform=train_transforms)

  testset = torchvision.datasets.CIFAR10(
      root=data_dir, train=False,
      download=True, transform=test_transforms)

  return trainset, testset

In [None]:
from torch import optim
from torch import nn
from torch.utils.data import random_split

def train_model(config):
  """Train ``Net`` on CIFAR-10 with one hyperparameter configuration.

  Written as a Ray Tune trainable: after each of the 10 epochs the mean
  validation loss and the validation accuracy are printed and passed to
  ``tune.report`` as ``loss`` and ``accuracy``.

  Args:
    config: Mapping with the keys ``nodes_1`` and ``nodes_2`` (hidden layer
      widths passed to ``Net``), ``lr`` (SGD learning rate) and
      ``batch_size`` (used for both data loaders).
  """
  device = torch.device("cuda" if
    torch.cuda.is_available() else "cpu")

  model = Net(config['nodes_1'],
      config['nodes_2']).to(device=device) #<1>

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(),
                        lr=config['lr'],
                        momentum=0.9) #<2>

  # Only the training set is needed; the test set is never touched here.
  trainset, _ = load_data()

  # Hold out 20% of the training data for validation.
  # NOTE: both subsets wrap the same dataset object, so validation images
  # go through the training transforms (random crop / flip) as well.
  n_train = int(len(trainset) * 0.8)
  train_subset, val_subset = random_split(
      trainset,
      [n_train, len(trainset) - n_train])

  trainloader = torch.utils.data.DataLoader(
      train_subset,
      batch_size=int(config["batch_size"]),
      shuffle=True) # <3>

  # Batch order does not change the averaged validation metrics, so the
  # validation loader is not shuffled.
  valloader = torch.utils.data.DataLoader(
      val_subset,
      batch_size=int(config["batch_size"]),
      shuffle=False) # <3>

  for epoch in range(10):
      train_loss = 0.0
      for inputs, labels in trainloader:
          inputs = inputs.to(device)
          labels = labels.to(device)

          optimizer.zero_grad()

          outputs = model(inputs)
          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()
          train_loss += loss.item()

      val_loss = 0.0
      total = 0
      correct = 0
      # No gradients are needed for evaluation.
      with torch.no_grad():
          for inputs, labels in valloader:
              inputs = inputs.to(device)
              labels = labels.to(device)

              outputs = model(inputs)
              _, predicted = torch.max(outputs, 1)
              total += labels.size(0)
              correct += \
                (predicted == labels).sum().item()

              # .item() keeps the running sum a plain Python float, the
              # same way train_loss is accumulated above.
              val_loss += criterion(outputs, labels).item()

      print(f'epoch: {epoch} ',
            f'train_loss: ',
            f'{train_loss/len(trainloader)}',
            f'val_loss: ',
            f'{val_loss/len(valloader)}',
            f'val_acc: {correct/total}')
      tune.report(loss=(val_loss / len(valloader)),
                  accuracy=correct / total)

In [None]:
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

# Trial scheduler: compares trials on the reported "loss" value (lower is
# better), with at most 10 reported iterations per trial.
scheduler = ASHAScheduler(
    reduction_factor=2,
    grace_period=1,
    max_t=10,
    mode="min",
    metric="loss",
)

# Console progress reporter restricted to the metric columns listed here.
reporter = CLIReporter(
    metric_columns=["loss", "accuracy", "training_iteration"],
)

In [None]:
from functools import partial

# Run 10 sampled configurations of `train_model` under the scheduler.
# The trainable is passed directly: wrapping it in an argument-less
# `partial` binds nothing and only hides the function name from Ray Tune.
# A GPU is requested only when one exists, matching the CPU fallback inside
# `train_model`; a fixed {"gpu": 1} request can never be satisfied on a
# CPU-only machine.
result = tune.run(
    train_model,
    resources_per_trial={
        "cpu": 2,
        "gpu": 1 if torch.cuda.is_available() else 0},
    config=config,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter)

2021-03-11 17:25:04,376	INFO services.py:1174 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2021-03-11 17:25:07,206	INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Memory usage on this node: 1.3/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.32 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/DEFAULT_2021-03-11_17-25-07
Number of trials: 1/10 (1 RUNNING)
+---------------------+----------+-------+--------------+------------+-----------+-----------+
| Trial name          | status   | loc   |   batch_size |         lr |   nodes_1 |   nodes_2 |
|---------------------+----------+-------+--------------+------------+-----------+-----------|
| DEFAULT_b965e_00000 | RUNNING  |       |            8 | 0.00166288 |       256 |       256 |
+---------------------+----------+-------+--------------+------------+-----------+-----------+


[2m[36m(pid=215)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


[2m[36m(pid=215)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:40, 4227791.30it/s]
  3%|▎         | 4921344/170498071 [00:00<00:28, 5804819.29it/s]
  6%|▋         | 10663936/170498071 [00:00<00:20, 7948264.90it/s]
 10%|▉         | 16496640/170498071 [00:00<00:14, 10728118.25it/s]
 13%|█▎        | 22266880/170498071 [00:00<00:10, 14191392.49it/s]
 16%|█▋        | 28115968/170498071 [00:00<00:07, 18363496.59it/s]
 20%|█▉        | 33852416/170498071 [00:00<00:05, 23068416.41it/s]
 23%|██▎       | 39688192/170498071 [00:00<00:04, 28180465.64it/s]
 27%|██▋       | 45433856/170498071 [00:00<00:03, 33241972.89it/s]
 30%|███       | 51262464/170498071 [00:01<00:03, 38160842.95it/s]
 33%|███▎      | 57017344/170498071 [00:01<00:02, 42404419.11it/s]
 37%|███▋      | 62826496/170498071 [00:01<00:02, 46141653.19it/s]
 40%|████      | 68593664/170498071 [00:01<00:02, 49084929.60it/s]
 44%|████▎     | 74335232/170498071 [00:01<00:01, 51289332.61it/s

[2m[36m(pid=215)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=215)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00000:
  accuracy: 0.4254
  date: 2021-03-11_17-25-52
  done: false
  experiment_id: bf4ddddc774a48a9b2664cf5751076fe
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 1.5670228345394135
  node_ip: 172.28.0.2
  pid: 215
  time_since_restore: 44.2513267993927
  time_this_iter_s: 44.2513267993927
  time_total_s: 44.2513267993927
  timestamp: 1615483552
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00000
  
== Status ==
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -1.5670228345394135
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.32 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/DEFAULT_2021-03-11_17-25-07
Number of trials: 2/10 (1 PENDING, 1 

[2m[36m(pid=216)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:42, 4045157.78it/s]
  4%|▍         | 6996992/170498071 [00:00<00:29, 5629840.38it/s]
 10%|▉         | 17027072/170498071 [00:00<00:19, 7853695.93it/s]
 16%|█▌        | 26980352/170498071 [00:00<00:13, 10852557.90it/s]
 21%|██        | 36176896/170498071 [00:00<00:09, 14757199.25it/s]
 26%|██▋       | 44925952/170498071 [00:00<00:06, 19660317.24it/s]
 31%|███▏      | 53315584/170498071 [00:00<00:04, 25522859.58it/s]
 37%|███▋      | 62484480/170498071 [00:00<00:03, 32574987.28it/s]
 42%|████▏     | 71297024/170498071 [00:00<00:02, 40171278.30it/s]
 47%|████▋     | 80265216/170498071 [00:01<00:01, 48144840.77it/s]
 52%|█████▏    | 89115648/170498071 [00:01<00:01, 55774953.07it/s]
 58%|█████▊    | 98416640/170498071 [00:01<00:01, 63386854.24it/s]
 63%|██████▎   | 108186624/170498071 [00:01<00:00, 70851972.84it/s]
 69%|██████▉   | 118089728/170498071 [00:01<00:00, 77463413.24it

[2m[36m(pid=216)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=216)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00001:
  accuracy: 0.3296
  date: 2021-03-11_17-30-26
  done: true
  experiment_id: f7d400dc114a4bdfab8a32f6d002e44a
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 1.7470308704257012
  node_ip: 172.28.0.2
  pid: 216
  time_since_restore: 47.09719753265381
  time_this_iter_s: 47.09719753265381
  time_total_s: 47.09719753265381
  timestamp: 1615483826
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00001
  
[2m[36m(pid=216)[0m epoch: 0  train_loss:  1.9550833574175834 val_loss:  1.7470308704257012 val_acc: 0.3296
== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.4834082144737244 | Iter 1.000: -1.6570268524825573
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.32

[2m[36m(pid=277)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:42, 3976685.51it/s]
  4%|▍         | 6718464/170498071 [00:00<00:29, 5530752.70it/s]
 10%|█         | 17212416/170498071 [00:00<00:19, 7726541.70it/s]
 16%|█▋        | 27723776/170498071 [00:00<00:13, 10700794.14it/s]
 22%|██▏       | 37737472/170498071 [00:00<00:09, 14617367.52it/s]
 29%|██▊       | 48886784/170498071 [00:00<00:06, 19771036.57it/s]
 34%|███▍      | 58411008/170498071 [00:00<00:04, 25936524.60it/s]
 40%|███▉      | 68046848/170498071 [00:00<00:03, 33219795.27it/s]
 46%|████▌     | 77855744/170498071 [00:00<00:02, 41441712.35it/s]
 51%|█████▏    | 87557120/170498071 [00:01<00:01, 50040658.97it/s]
 57%|█████▋    | 97101824/170498071 [00:01<00:01, 58368604.93it/s]
 62%|██████▏   | 106557440/170498071 [00:01<00:00, 65296224.20it/s]
 68%|██████▊   | 115886080/170498071 [00:01<00:00, 69374403.13it/s]
 73%|███████▎  | 125290496/170498071 [00:01<00:00, 75299840.24i

[2m[36m(pid=277)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=277)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00002:
  accuracy: 0.3814
  date: 2021-03-11_17-31-15
  done: true
  experiment_id: ba1c7ef8088e4968b4286ba728f93aff
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 1.6622767435073853
  node_ip: 172.28.0.2
  pid: 277
  time_since_restore: 48.03993368148804
  time_this_iter_s: 48.03993368148804
  time_total_s: 48.03993368148804
  timestamp: 1615483875
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00002
  
== Status ==
Memory usage on this node: 2.8/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.4834082144737244 | Iter 1.000: -1.6622767435073853
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/DEFAULT_2021-03-11

[2m[36m(pid=301)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:42, 4040517.35it/s]
  4%|▍         | 6637568/170498071 [00:00<00:29, 5615166.05it/s]
  9%|▉         | 15935488/170498071 [00:00<00:19, 7819271.61it/s]
 15%|█▍        | 25516032/170498071 [00:00<00:13, 10792841.95it/s]
 20%|██        | 34472960/170498071 [00:00<00:09, 14661212.05it/s]
 25%|██▌       | 43420672/170498071 [00:00<00:06, 19570265.72it/s]
 31%|███       | 52233216/170498071 [00:00<00:04, 25526209.10it/s]
 35%|███▌      | 60195840/170498071 [00:00<00:03, 32060654.93it/s]
 40%|███▉      | 68058112/170498071 [00:00<00:02, 38807515.93it/s]
 45%|████▍     | 76170240/170498071 [00:01<00:02, 45996953.53it/s]
 49%|████▉     | 84125696/170498071 [00:01<00:01, 52660679.84it/s]
 54%|█████▍    | 92219392/170498071 [00:01<00:01, 58825594.96it/s]
 59%|█████▉    | 100451328/170498071 [00:01<00:01, 64329764.82it/s]
 64%|██████▎   | 108493824/170498071 [00:01<00:00, 63147815.32it

[2m[36m(pid=301)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=301)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00003:
  accuracy: 0.3349
  date: 2021-03-11_17-32-05
  done: true
  experiment_id: 7dffd4c2108049488b8bad91154fef6d
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 1.8008183712244035
  node_ip: 172.28.0.2
  pid: 301
  time_since_restore: 48.2304310798645
  time_this_iter_s: 48.2304310798645
  time_total_s: 48.2304310798645
  timestamp: 1615483925
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00003
  
[2m[36m(pid=301)[0m epoch: 0  train_loss:  1.9018759758770465 val_loss:  1.8008183712244035 val_acc: 0.3349
== Status ==
Memory usage on this node: 2.8/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.4834082144737244 | Iter 1.000: -1.7046538069665433
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.32 Gi

[2m[36m(pid=325)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:41, 4084013.39it/s]
  4%|▍         | 7586816/170498071 [00:00<00:28, 5694617.33it/s]
 11%|█▏        | 19193856/170498071 [00:00<00:18, 7967634.97it/s]
 18%|█▊        | 30739456/170498071 [00:00<00:12, 11055363.48it/s]
 25%|██▍       | 42062848/170498071 [00:00<00:08, 15159077.27it/s]
 31%|███       | 53016576/170498071 [00:00<00:05, 20443258.42it/s]
 38%|███▊      | 64252928/170498071 [00:00<00:03, 27092100.65it/s]
 44%|████▍     | 75318272/170498071 [00:00<00:02, 35026846.52it/s]
 51%|█████     | 86359040/170498071 [00:00<00:01, 44049095.32it/s]
 57%|█████▋    | 96806912/170498071 [00:01<00:01, 53295736.28it/s]
 63%|██████▎   | 107225088/170498071 [00:01<00:01, 60469503.48it/s]
 69%|██████▊   | 117161984/170498071 [00:01<00:00, 66800577.42it/s]
 74%|███████▍  | 126770176/170498071 [00:01<00:00, 72459702.01it/s]
 80%|███████▉  | 136205312/170498071 [00:01<00:00, 76652475.62

[2m[36m(pid=325)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=325)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00004:
  accuracy: 0.3773
  date: 2021-03-11_17-32-54
  done: false
  experiment_id: b0c185effe364ea4adb4da674c8205d8
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 1.6883320343971253
  node_ip: 172.28.0.2
  pid: 325
  time_since_restore: 47.526949882507324
  time_this_iter_s: 47.526949882507324
  time_total_s: 47.526949882507324
  timestamp: 1615483974
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00004
  
[2m[36m(pid=325)[0m epoch: 0  train_loss:  1.947762777340412 val_loss:  1.6883320343971253 val_acc: 0.3773
== Status ==
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.4834082144737244 | Iter 1.000: -1.6883320343971253
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7

[2m[36m(pid=349)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:42, 3961480.20it/s]
  3%|▎         | 5872640/170498071 [00:00<00:29, 5487647.52it/s]
  7%|▋         | 12306432/170498071 [00:00<00:20, 7563011.86it/s]
 11%|█▏        | 19448832/170498071 [00:00<00:14, 10335187.17it/s]
 15%|█▌        | 25887744/170498071 [00:00<00:10, 13813098.96it/s]
 19%|█▉        | 32766976/170498071 [00:00<00:07, 18169353.07it/s]
 23%|██▎       | 38716416/170498071 [00:00<00:05, 22934002.25it/s]
 27%|██▋       | 45614080/170498071 [00:00<00:04, 28655049.12it/s]
 30%|███       | 51624960/170498071 [00:00<00:03, 33616683.93it/s]
 34%|███▍      | 58295296/170498071 [00:01<00:02, 39469365.66it/s]
 38%|███▊      | 64419840/170498071 [00:01<00:02, 43330684.11it/s]
 42%|████▏     | 71028736/170498071 [00:01<00:02, 48322748.37it/s]
 45%|████▌     | 77192192/170498071 [00:01<00:01, 50467117.29it/s]
 50%|█████     | 85405696/170498071 [00:01<00:01, 57067694.12it/s

[2m[36m(pid=349)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=349)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00005:
  accuracy: 0.2815
  date: 2021-03-11_17-34-20
  done: true
  experiment_id: 7fabbb0aeb584e1fbd85dbbf2bdaa92c
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 1.9031587094306945
  node_ip: 172.28.0.2
  pid: 349
  time_since_restore: 48.68596124649048
  time_this_iter_s: 48.68596124649048
  time_total_s: 48.68596124649048
  timestamp: 1615484060
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00005
  
== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.5345812916874886 | Iter 1.000: -1.7176814524114132
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/DEFAULT_2021-03-11

[2m[36m(pid=373)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:41, 4133760.20it/s]
  4%|▍         | 6890496/170498071 [00:00<00:28, 5747432.22it/s]
  9%|▉         | 16086016/170498071 [00:00<00:19, 7996411.25it/s]
 15%|█▌        | 26100736/170498071 [00:00<00:13, 11045383.19it/s]
 21%|██        | 35701760/170498071 [00:00<00:08, 15037678.61it/s]
 26%|██▋       | 45120512/170498071 [00:00<00:06, 20106574.23it/s]
 32%|███▏      | 54188032/170498071 [00:00<00:04, 26230623.36it/s]
 37%|███▋      | 63357952/170498071 [00:00<00:03, 33379241.29it/s]
 43%|████▎     | 72534016/170498071 [00:00<00:02, 41253198.17it/s]
 48%|████▊     | 81441792/170498071 [00:01<00:01, 49173108.61it/s]
 53%|█████▎    | 90309632/170498071 [00:01<00:01, 56690228.63it/s]
 58%|█████▊    | 99070976/170498071 [00:01<00:01, 63203737.43it/s]
 63%|██████▎   | 107795456/170498071 [00:01<00:00, 67847588.39it/s]
 68%|██████▊   | 116520960/170498071 [00:01<00:00, 72672461.98it

[2m[36m(pid=373)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=373)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00006:
  accuracy: 0.0985
  date: 2021-03-11_17-35-09
  done: true
  experiment_id: 8f51d29b0a7b4b83bc0ad4bc17a49800
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 2.3105966159820555
  node_ip: 172.28.0.2
  pid: 373
  time_since_restore: 47.326388120651245
  time_this_iter_s: 47.326388120651245
  time_total_s: 47.326388120651245
  timestamp: 1615484109
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00006
  
[2m[36m(pid=373)[0m epoch: 0  train_loss:  2.3046072990894317 val_loss:  2.3105966159820555 val_acc: 0.0985
== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.5345812916874886 | Iter 1.000: -1.7470308704257012
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7

[2m[36m(pid=397)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:41, 4066270.12it/s]
  4%|▍         | 6644736/170498071 [00:00<00:28, 5650176.76it/s]
  9%|▉         | 15208448/170498071 [00:00<00:19, 7849714.29it/s]
 14%|█▍        | 23818240/170498071 [00:00<00:13, 10792185.46it/s]
 19%|█▉        | 32631808/170498071 [00:00<00:09, 14648667.59it/s]
 24%|██▍       | 41326592/170498071 [00:00<00:06, 19517309.66it/s]
 29%|██▉       | 50251776/170498071 [00:00<00:04, 25492666.75it/s]
 35%|███▍      | 58942464/170498071 [00:00<00:03, 32350804.47it/s]
 40%|███▉      | 67715072/170498071 [00:00<00:02, 39907859.60it/s]
 45%|████▍     | 76484608/170498071 [00:01<00:01, 47706440.05it/s]
 50%|█████     | 85263360/170498071 [00:01<00:01, 55275217.18it/s]
 55%|█████▌    | 93923328/170498071 [00:01<00:01, 62001903.21it/s]
 60%|██████    | 102464512/170498071 [00:01<00:01, 67320560.05it/s]
 65%|██████▌   | 110968832/170498071 [00:01<00:00, 69174734.47it

[2m[36m(pid=397)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=397)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00007:
  accuracy: 0.2192
  date: 2021-03-11_17-35-47
  done: true
  experiment_id: 1e1613d7aa584689b768ddc8b8955811
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 1.9529639487266541
  node_ip: 172.28.0.2
  pid: 397
  time_since_restore: 36.35672903060913
  time_this_iter_s: 36.35672903060913
  time_total_s: 36.35672903060913
  timestamp: 1615484147
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00007
  
[2m[36m(pid=397)[0m epoch: 0  train_loss:  2.0102850666046144 val_loss:  1.9529639487266541 val_acc: 0.2192
== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.5345812916874886 | Iter 1.000: -1.7739246208250523
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.32

[2m[36m(pid=421)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:42, 3972226.21it/s]
  4%|▍         | 6860800/170498071 [00:00<00:29, 5527964.48it/s]
  9%|▉         | 15973376/170498071 [00:00<00:20, 7696981.36it/s]
 15%|█▌        | 26335232/170498071 [00:00<00:13, 10656385.17it/s]
 22%|██▏       | 36878336/170498071 [00:00<00:09, 14591312.34it/s]
 28%|██▊       | 48146432/170498071 [00:00<00:06, 19748631.81it/s]
 35%|███▌      | 60014592/170498071 [00:00<00:04, 26334264.21it/s]
 42%|████▏     | 71728128/170498071 [00:00<00:02, 34314030.56it/s]
 49%|████▊     | 82719744/170498071 [00:00<00:02, 43235266.21it/s]
 55%|█████▍    | 93150208/170498071 [00:01<00:01, 52201399.83it/s]
 61%|██████    | 104139776/170498071 [00:01<00:01, 61959612.58it/s]
 67%|██████▋   | 114671616/170498071 [00:01<00:00, 67973040.66it/s]
 73%|███████▎  | 124985344/170498071 [00:01<00:00, 75717713.94it/s]
 79%|███████▉  | 135086080/170498071 [00:01<00:00, 80983113.54

[2m[36m(pid=421)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=421)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00008:
  accuracy: 0.0995
  date: 2021-03-11_17-36-59
  done: true
  experiment_id: 0257b1567ea0497db7dbd75d3ee5367d
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 2.413724879360199
  node_ip: 172.28.0.2
  pid: 421
  time_since_restore: 70.95776081085205
  time_this_iter_s: 70.95776081085205
  time_total_s: 70.95776081085205
  timestamp: 1615484219
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00008
  
[2m[36m(pid=421)[0m epoch: 0  train_loss:  2.4061736549198627 val_loss:  2.413724879360199 val_acc: 0.0995
== Status ==
Memory usage on this node: 2.7/12.7 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.5345812916874886 | Iter 1.000: -1.8008183712244035
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.32 G

[2m[36m(pid=445)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:40, 4155197.21it/s]
  4%|▍         | 7455744/170498071 [00:00<00:28, 5788981.86it/s]
 11%|█         | 18044928/170498071 [00:00<00:18, 8080639.50it/s]
 17%|█▋        | 29270016/170498071 [00:00<00:12, 11198265.67it/s]
 23%|██▎       | 39843840/170498071 [00:00<00:08, 15302887.96it/s]
 30%|██▉       | 50900992/170498071 [00:00<00:05, 20637153.06it/s]
 36%|███▌      | 61712384/170498071 [00:00<00:03, 27252040.31it/s]
 43%|████▎     | 73155584/170498071 [00:00<00:02, 35325790.98it/s]
 49%|████▉     | 83978240/170498071 [00:00<00:01, 44271437.24it/s]
 55%|█████▌    | 94503936/170498071 [00:01<00:01, 53584552.99it/s]
 61%|██████▏   | 104842240/170498071 [00:01<00:01, 60172937.03it/s]
 67%|██████▋   | 114709504/170498071 [00:01<00:00, 68147923.52it/s]
 74%|███████▎  | 125428736/170498071 [00:01<00:00, 76506224.31it/s]
 80%|████████  | 136685568/170498071 [00:01<00:00, 84640654.50

[2m[36m(pid=445)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
[2m[36m(pid=445)[0m Files already downloaded and verified
Result for DEFAULT_b965e_00009:
  accuracy: 0.3751
  date: 2021-03-11_17-38-10
  done: false
  experiment_id: 5d967723116f4ec59fd2d465d42d611e
  hostname: fa4a1775811d
  iterations_since_restore: 1
  loss: 1.6837590220183134
  node_ip: 172.28.0.2
  pid: 445
  time_since_restore: 69.59433126449585
  time_this_iter_s: 69.59433126449585
  time_total_s: 69.59433126449585
  timestamp: 1615484290
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: b965e_00009
  
[2m[36m(pid=445)[0m epoch: 0  train_loss:  1.8940870077863337 val_loss:  1.6837590220183134 val_acc: 0.3751
== Status ==
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.5345812916874886 | Iter 1.000: -1.7739246208250523
Resources requested: 2/2 CPUs, 1/1 GPUs, 0.0/7.3

2021-03-11 17:39:10,379	INFO tune.py:450 -- Total run time: 846.56 seconds (842.94 seconds for the tuning loop).


Result for DEFAULT_b965e_00009:
  accuracy: 0.4342
  date: 2021-03-11_17-39-10
  done: true
  experiment_id: 5d967723116f4ec59fd2d465d42d611e
  hostname: fa4a1775811d
  iterations_since_restore: 2
  loss: 1.557911741732061
  node_ip: 172.28.0.2
  pid: 445
  time_since_restore: 129.24746680259705
  time_this_iter_s: 59.653135538101196
  time_total_s: 129.24746680259705
  timestamp: 1615484350
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: b965e_00009
  
[2m[36m(pid=445)[0m epoch: 1  train_loss:  1.5901369889194146 val_loss:  1.557911741732061 val_acc: 0.4342
== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 8.000: -1.2140959628105163 | Iter 4.000: -1.3276116551160813 | Iter 2.000: -1.557911741732061 | Iter 1.000: -1.7739246208250523
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/2.49 GiB objects (0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/DEFAULT_2021-03-11_17-25-07
Number of tri

In [None]:
# Look up the best trial by "loss" (mode "min", scope "last") and print its
# configuration together with its last reported metrics.
best_trial = result.get_best_trial("loss", "min", "last")
print(f"Best trial config: {best_trial.config}")
print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

Best trial config: {'nodes_1': 256, 'nodes_2': 256, 'lr': 0.0016628841178862304, 'batch_size': 8}
Best trial final validation loss: 1.1886363194227219
Best trial final validation accuracy: 0.5855
