<a href="https://colab.research.google.com/github/josealgruiz/SYDE770_Project1/blob/main/SYDE770_Project_CNN1_Working.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Required Libraries

In [None]:
!pip install medmnist
from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

import medmnist
from medmnist import INFO, Evaluator

Collecting medmnist
  Downloading medmnist-2.0.2-py3-none-any.whl (21 kB)
Collecting fire
  Downloading fire-0.4.0.tar.gz (87 kB)
[K     |████████████████████████████████| 87 kB 3.4 MB/s 
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.4.0-py2.py3-none-any.whl size=115942 sha256=599d83bedcc3b3466913c616e030131736059e57cb24e376762cd0444220bf1a
  Stored in directory: /root/.cache/pip/wheels/8a/67/fb/2e8a12fa16661b9d5af1f654bd199366799740a85c64981226
Successfully built fire
Installing collected packages: fire, medmnist
Successfully installed fire-0.4.0 medmnist-2.0.2


Testing if the medmnist import is ok


In [None]:
print(f"MedMNIST v{medmnist.__version__} @ {medmnist.HOMEPAGE}")

MedMNIST v2.0.2 @ https://github.com/MedMNIST/MedMNIST/


Define Data to be used

In [None]:
data_flag = 'pathmnist' # To decide which dataset will be used. Like data_flag = 'breastmnist'
download = True

BATCH_SIZE = 128

info = INFO[data_flag]
task = info['task']
n_channels = info['n_channels']
n_classes = len(info['label'])

DataClass = getattr(medmnist, info['python_class'])

Read the MedMNIST data, preprocess them and encapsulate them into dataloader form.

In [None]:
# preprocessing
data_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[.5], std=[.5])
])

# load the data
train_dataset = DataClass(split='train', transform=data_transform, download=download)
test_dataset = DataClass(split='test', transform=data_transform, download=download)

pil_dataset = DataClass(split='train', download=download)

# encapsulate data into dataloader form
train_loader = data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader_at_eval = data.DataLoader(dataset=train_dataset, batch_size=2*BATCH_SIZE, shuffle=False)
test_loader = data.DataLoader(dataset=test_dataset, batch_size=2*BATCH_SIZE, shuffle=False)

Downloading https://zenodo.org/record/5208230/files/pathmnist.npz?download=1 to /root/.medmnist/pathmnist.npz


  0%|          | 0/205615438 [00:00<?, ?it/s]

Using downloaded and verified file: /root/.medmnist/pathmnist.npz
Using downloaded and verified file: /root/.medmnist/pathmnist.npz


In [None]:
print(train_dataset)
print('      ')
print(test_dataset)

Dataset PathMNIST (pathmnist)
    Number of datapoints: 89996
    Root location: /root/.medmnist
    Split: train
    Task: multi-class
    Number of channels: 3
    Meaning of labels: {'0': 'adipose', '1': 'background', '2': 'debris', '3': 'lymphocytes', '4': 'mucus', '5': 'smooth muscle', '6': 'normal colon mucosa', '7': 'cancer-associated stroma', '8': 'colorectal adenocarcinoma epithelium'}
    Number of samples: {'train': 89996, 'val': 10004, 'test': 7180}
    Description: The PathMNIST is based on a prior study for predicting survival from colorectal cancer histology slides, providing a dataset (NCT-CRC-HE-100K) of 100,000 non-overlapping image patches from hematoxylin & eosin stained histological images, and a test dataset (CRC-VAL-HE-7K) of 7,180 image patches from a different clinical center. The dataset is comprised of 9 types of tissues, resulting in a multi-class classification task. We resize the source images of 3×224×224 into 3×28×28, and split NCT-CRC-HE-100K into train

Optimizers

In [None]:
#@title ADAM
import math
from torch.optim import Optimizer

class ADAMITOptimizer(Optimizer):
    """
    implements ADAM Algorithm, as a preceding step.
    """
    def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super(ADAMITOptimizer, self).__init__(params, defaults)
        
    def step(self):
        """
        Performs a single optimization step.
        """
        loss = None
        for group in self.param_groups:

            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Momentum (Exponential MA of gradients)
                    state['exp_avg'] = torch.zeros_like(p.data)
                    #print(p.data.size())
                    # RMS Prop componenet. (Exponential MA of squared gradients). Denominator.
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                b1, b2 = group['betas']
                state['step'] += 1
                
                # L2 penalty. Gotta add to Gradient as well.
                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                # Momentum
                exp_avg = torch.mul(exp_avg, b1) + (1 - b1)*grad
                # RMS
                exp_avg_sq = torch.mul(exp_avg_sq, b2) + (1-b2)*(grad*grad)
                
                denom = exp_avg_sq.sqrt() + group['eps']

                bias_correction1 = 1 / (1 - b1 ** state['step'])
                bias_correction2 = 1 / (1 - b2 ** state['step'])
                
                adapted_learning_rate = group['lr'] * bias_correction1 / math.sqrt(bias_correction2)

                p.data = p.data - adapted_learning_rate * exp_avg / denom 
                
                if state['step']  % 10000 ==0:
                    print ("group:", group)
                    print("p: ",p)
                    print("p.data: ", p.data) # W = p.data
                
        return loss

In [None]:
#@title AdaBound
import math
import torch
from torch.optim import Optimizer


class AdaBound(Optimizer):
    """Implements AdaBound algorithm.
    It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): Adam learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        final_lr (float, optional): final (SGD) learning rate (default: 0.1)
        gamma (float, optional): convergence speed of the bound functions (default: 1e-3)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm
    .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate:
        https://openreview.net/forum?id=Bkg3g2R9FX
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3,
                 eps=1e-8, weight_decay=0, amsbound=False):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= final_lr:
            raise ValueError("Invalid final learning rate: {}".format(final_lr))
        if not 0.0 <= gamma < 1.0:
            raise ValueError("Invalid gamma parameter: {}".format(gamma))
        defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps,
                        weight_decay=weight_decay, amsbound=amsbound)
        super(AdaBound, self).__init__(params, defaults)

        self.base_lrs = list(map(lambda group: group['lr'], self.param_groups))

    def __setstate__(self, state):
        super(AdaBound, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsbound', False)

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group, base_lr in zip(self.param_groups, self.base_lrs):
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError(
                        'Adam does not support sparse gradients, please consider SparseAdam instead')
                amsbound = group['amsbound']

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                    if amsbound:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                if amsbound:
                    max_exp_avg_sq = state['max_exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                if amsbound:
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                else:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                # Applies bounds on actual learning rate
                # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay
                final_lr = group['final_lr'] * group['lr'] / base_lr
                lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
                upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step']))
                step_size = torch.full_like(denom, step_size)
                step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg)

                p.data.add_(-step_size)

        return loss

In [None]:
#@title ND-Adam
import math
import torch
from torch.optim import Optimizer


class NDAdam(Optimizer):

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, vec_axes=None):
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, vec_axes=vec_axes)
        super(NDAdam, self).__init__(params, defaults)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('ND-Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    if group['vec_axes']:
                        shape = list(p.size())
                        for i in group['vec_axes']:
                            shape[i] = 1
                        state['exp_avg_sq'] = torch.zeros(shape)
                        if torch.cuda.is_available():
                            state['exp_avg_sq'] = state['exp_avg_sq'].cuda()
                    else:
                        state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                if group['vec_axes']:
                    g_proj = grad * p.data
                    for i in group['vec_axes']:
                        g_proj = torch.sum(g_proj, i, True)
                    grad.add_(-g_proj * p.data)
                    exp_avg.mul_(beta1).add_(1 - beta1, grad)
                    g_sqr = grad * grad
                    for i in group['vec_axes']:
                        g_sqr = torch.sum(g_sqr, i, True)
                    exp_avg_sq.mul_(beta2).add_(1 - beta2, g_sqr)
                else:
                    exp_avg.mul_(beta1).add_(1 - beta1, grad)
                    exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

                if group['vec_axes']:
                    norm = p.data * p.data
                    for i in group['vec_axes']:
                        norm = torch.sum(norm, i, True)
                    norm.sqrt_()
                    p.data.div_(norm)

        return loss

Define CNN model

In [None]:
class Net(nn.Module):
    def __init__(self, in_channels, num_classes):
        super(Net, self).__init__()

        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels, 16, kernel_size=3),
            nn.BatchNorm2d(16),
            nn.ReLU())

        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 16, kernel_size=3),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.layer3 = nn.Sequential(
            nn.Conv2d(16, 64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU())
        
        self.layer4 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU())

        self.layer5 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.fc = nn.Sequential(
            nn.Linear(64 * 4 * 4, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes))

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

model = Net(in_channels=n_channels, num_classes=n_classes)

#device=torch.device("cuda:0" if torch.cuda.is_available else "cpu")
device=torch.device("cpu")
model.to(device)    
# define loss function and optimizer
if task == "multi-label, binary-class":
    criterion = nn.BCEWithLogitsLoss()
else:
    loss_fn=nn.CrossEntropyLoss()
    criterion = nn.CrossEntropyLoss()
print(model.parameters())
#Optimizers selection 
#optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
#optimizer = ADAMITOptimizer(model.parameters())
#optimizer = AdaBound(model.parameters())
#optimizer = NDAdam(model.parameters())

<generator object Module.parameters at 0x7f834a0b7150>


Train 

In [None]:
def train(epochs):
  print('\nEpoch : %d'%epoch)
  
  model.train()

  running_loss=0
  correct=0
  total=0

  for data in tqdm(train_loader):
    
    inputs,labels=data[0].to(device),data[1].to(device)
    
    optimizer.zero_grad()
    outputs=model(inputs)

    if task == 'multi-label, binary-class':
      labels = labels.to(torch.float32)
      loss = criterion(outputs, labels)
    else: 
      labels = labels.squeeze().long()
      loss = criterion(outputs, labels)
 
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    
    _, predicted = outputs.max(1)
    total += labels.size(0)
    correct += predicted.eq(labels).sum().item()
      
  train_loss=running_loss/len(train_loader)
  accu=100.*correct/total
  print('Train Loss: %.3f | Accuracy: %.3f'%(train_loss,accu))
  return accu,train_loss

Test

In [None]:
def test(epoch):
  model.eval()

  running_loss=0
  correct=0
  total=0

  with torch.no_grad():
    for data in tqdm(test_loader):
      images=data[0].to(device)

      
      labels = data[1].to(device)
      labels = labels.type(torch.LongTensor) 
      outputs=model(images)

      if task == 'multi-label, binary-class':
        labels = labels.to(torch.float32)
        outputs = outputs.softmax(dim=-1)
      else:
        labels = labels.squeeze().long()
        outputs = outputs.softmax(dim=-1)
        labels = labels.float().resize_(len(labels), 1)

      labels = torch.flatten(labels)
      loss= loss_fn(outputs,labels.long())
      running_loss+=loss.item()
      
      _, predicted = outputs.max(1)
      total += labels.size(0)
      correct += predicted.eq(labels).sum().item()
  
  test_loss=running_loss/len(test_loader)
  accu=100.*correct/total

  print('Test Loss: %.3f | Accuracy: %.3f'%(test_loss,accu)) 
  return accu,test_loss

In [None]:
def SelectOpt(x):
  if x == 1:
    optimizer = ADAMITOptimizer(model.parameters())
    return optimizer
  if x == 2:
    optimizer = AdaBound(model.parameters())
    return optimizer
  else:
    optimizer = NDAdam(model.parameters())
    return optimizer

In [None]:
train_accu_Adam=[0,]
train_losses_Adam=[0,]
eval_losses_Adam=[0,]
eval_accu_Adam=[0,]
train_accu_AdaBound=[0,]
train_losses_AdaBound=[0,]
eval_losses_AdaBound=[0,]
eval_accu_AdaBound=[0,]
train_accu_NDAdam=[0,]
train_losses_NDAdam=[0,]
eval_losses_NDAdam=[0,]
eval_accu_NDAdam=[0,]

epochs=50

for x in range(1,4):
  print(x)
  optimizer = SelectOpt(x)
  print('Optimizer: %s' % optimizer)

  for epoch in range(1,epochs+1):
    train_accu,train_loss = train(epoch)
    test_accu,test_loss = test(epoch)
    if x == 1:
      train_accu_Adam.append(train_accu)
      train_losses_Adam.append(train_loss)
      eval_losses_Adam.append(test_loss)
      eval_accu_Adam.append(test_accu)
    if x == 2:
      train_accu_AdaBound.append(train_accu)
      train_losses_AdaBound.append(train_loss)
      eval_losses_AdaBound.append(test_loss)
      eval_accu_AdaBound.append(test_accu)
    if x == 3:
      train_accu_NDAdam.append(train_accu)
      train_losses_NDAdam.append(train_loss)
      eval_losses_NDAdam.append(test_loss)
      eval_accu_NDAdam.append(test_accu)

1
Optimizer: ADAMITOptimizer (
Parameter Group 0
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

Epoch : 1


100%|██████████| 704/704 [02:56<00:00,  3.98it/s]


Train Loss: 0.936 | Accuracy: 67.801


100%|██████████| 29/29 [00:05<00:00,  5.41it/s]


Test Loss: 1.804 | Accuracy: 58.886

Epoch : 2


100%|██████████| 704/704 [02:49<00:00,  4.16it/s]


Train Loss: 1.321 | Accuracy: 69.359


100%|██████████| 29/29 [00:05<00:00,  5.39it/s]


Test Loss: 1.705 | Accuracy: 65.808

Epoch : 3


100%|██████████| 704/704 [02:50<00:00,  4.13it/s]


Train Loss: 1.495 | Accuracy: 70.091


100%|██████████| 29/29 [00:05<00:00,  5.46it/s]


Test Loss: 1.801 | Accuracy: 58.287

Epoch : 4


100%|██████████| 704/704 [02:57<00:00,  3.96it/s]


Train Loss: 1.599 | Accuracy: 71.180


100%|██████████| 29/29 [00:05<00:00,  5.46it/s]


Test Loss: 1.717 | Accuracy: 65.446

Epoch : 5


100%|██████████| 704/704 [03:06<00:00,  3.77it/s]


Train Loss: 1.798 | Accuracy: 72.161


100%|██████████| 29/29 [00:05<00:00,  5.44it/s]


Test Loss: 1.624 | Accuracy: 74.150

Epoch : 6


100%|██████████| 704/704 [03:01<00:00,  3.87it/s]


Train Loss: 1.928 | Accuracy: 73.689


100%|██████████| 29/29 [00:05<00:00,  5.39it/s]


Test Loss: 1.886 | Accuracy: 49.763

Epoch : 7


100%|██████████| 704/704 [03:04<00:00,  3.82it/s]


Train Loss: 1.978 | Accuracy: 75.112


100%|██████████| 29/29 [00:05<00:00,  5.45it/s]


Test Loss: 1.644 | Accuracy: 72.187

Epoch : 8


100%|██████████| 704/704 [03:04<00:00,  3.82it/s]


Train Loss: 2.257 | Accuracy: 76.770


100%|██████████| 29/29 [00:05<00:00,  5.52it/s]


Test Loss: 1.783 | Accuracy: 58.774

Epoch : 9


100%|██████████| 704/704 [03:04<00:00,  3.82it/s]


Train Loss: 1.936 | Accuracy: 78.545


100%|██████████| 29/29 [00:05<00:00,  5.46it/s]


Test Loss: 1.805 | Accuracy: 56.031

Epoch : 10


100%|██████████| 704/704 [03:01<00:00,  3.88it/s]


Train Loss: 1.982 | Accuracy: 78.135


100%|██████████| 29/29 [00:05<00:00,  5.46it/s]


Test Loss: 1.690 | Accuracy: 68.705

Epoch : 11


100%|██████████| 704/704 [02:58<00:00,  3.95it/s]


Train Loss: 2.316 | Accuracy: 78.639


100%|██████████| 29/29 [00:05<00:00,  5.47it/s]


Test Loss: 1.713 | Accuracy: 65.223

Epoch : 12


100%|██████████| 704/704 [03:01<00:00,  3.88it/s]


Train Loss: 2.277 | Accuracy: 80.006


100%|██████████| 29/29 [00:05<00:00,  5.44it/s]


Test Loss: 1.564 | Accuracy: 80.195

Epoch : 13


100%|██████████| 704/704 [03:03<00:00,  3.83it/s]


Train Loss: 2.363 | Accuracy: 80.588


100%|██████████| 29/29 [00:05<00:00,  5.46it/s]


Test Loss: 1.726 | Accuracy: 64.749

Epoch : 14


100%|██████████| 704/704 [03:00<00:00,  3.89it/s]


Train Loss: 1.907 | Accuracy: 81.161


100%|██████████| 29/29 [00:05<00:00,  5.56it/s]


Test Loss: 1.905 | Accuracy: 45.446

Epoch : 15


 20%|██        | 143/704 [00:35<02:16,  4.11it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
          [ 3.9667e-01,  7.0356e-01,  5.2928e-01]],

         [[-1.6006e-01,  4.3180e-02,  1.2574e-02],
          [ 3.4219e-02, -9.8534e-02, -4.2681e-01],
          [ 7.7380e-01, -5.7061e-02,  4.0285e-02]]],


        [[[ 8.5832e-03, -2.3039e-01, -4.4235e-01],
          [-4.5491e-02, -1.7488e-01, -3.7034e-01],
          [ 1.8145e-01, -7.2036e-02, -1.5896e-01]],

         [[ 3.3476e-01, -8.0876e-02, -4.6738e-02],
          [ 1.8877e-01, -3.3798e-01, -1.9834e-01],
          [-4.7851e-02, -5.5604e-02, -1.3832e-01]],

         [[-4.7322e-02, -3.8186e-02,  2.4021e-01],
          [-8.2685e-02, -3.7242e-01, -1.5755e-01],
          [-3.0270e-01, -2.2066e-01, -9.0877e-02]],

         ...,

         [[ 6.9889e-01, -5.7631e-01, -7.5757e-01],
          [ 7.0252e-01, -3.4568e-01, -5.1236e-01],
          [ 7.4583e-01,  1.6170e-01, -2.5867e-01]],

         [[ 4.5161e-01,  7.0441e-01,  6.2013e-01],
          [ 3.6607e-01,  2.4685e-01,  4

 20%|██        | 144/704 [00:37<05:39,  1.65it/s]

{'params': [Parameter containing:
tensor([[[[-0.0938, -0.0087, -0.0537],
          [ 0.0136,  0.1913,  0.0227],
          [-0.0350,  0.2050,  0.1047]],

         [[ 0.1007, -0.2436, -0.0891],
          [ 0.5411, -0.0219, -0.5111],
          [ 0.0245, -0.0081, -0.4873]],

         [[ 0.3645,  0.1468,  0.1118],
          [ 1.6081,  0.6157,  0.1723],
          [ 0.4834,  0.2051,  0.0199]]],


        [[[-0.3244, -0.7818, -0.2061],
          [-0.0105,  0.2515, -0.3334],
          [-0.0991,  0.1167,  0.1719]],

         [[ 0.1069,  0.1282,  0.1084],
          [-0.0875,  0.2135,  0.0719],
          [-0.0408, -0.2772,  0.0226]],

         [[-0.0034,  0.1633, -0.0739],
          [ 0.1409,  0.1855,  0.0469],
          [ 0.3180, -0.1589, -0.1949]]],


        [[[-0.0661,  0.0999, -0.2063],
          [-0.0890, -0.1009, -0.2893],
          [-0.1720, -0.0533, -0.0976]],

         [[-0.0572,  0.1216,  0.3849],
          [ 0.2308,  0.1401,  0.0749],
          [ 0.1953,  0.0730,  0.2266]],

         [

100%|██████████| 704/704 [02:59<00:00,  3.91it/s]


Train Loss: 1.799 | Accuracy: 81.381


100%|██████████| 29/29 [00:05<00:00,  5.39it/s]


Test Loss: 1.641 | Accuracy: 72.883

Epoch : 16


100%|██████████| 704/704 [03:04<00:00,  3.81it/s]


Train Loss: 1.860 | Accuracy: 81.115


100%|██████████| 29/29 [00:05<00:00,  5.36it/s]


Test Loss: 1.698 | Accuracy: 66.407

Epoch : 17


100%|██████████| 704/704 [03:19<00:00,  3.52it/s]


Train Loss: 1.946 | Accuracy: 81.276


100%|██████████| 29/29 [00:05<00:00,  5.43it/s]


Test Loss: 1.605 | Accuracy: 76.922

Epoch : 18


100%|██████████| 704/704 [03:08<00:00,  3.73it/s]


Train Loss: 1.961 | Accuracy: 81.019


100%|██████████| 29/29 [00:05<00:00,  5.40it/s]


Test Loss: 1.783 | Accuracy: 59.011

Epoch : 19


100%|██████████| 704/704 [03:11<00:00,  3.67it/s]


Train Loss: 2.246 | Accuracy: 81.400


100%|██████████| 29/29 [00:05<00:00,  5.36it/s]


Test Loss: 1.790 | Accuracy: 57.465

Epoch : 20


100%|██████████| 704/704 [03:09<00:00,  3.72it/s]


Train Loss: 2.026 | Accuracy: 81.973


100%|██████████| 29/29 [00:05<00:00,  5.40it/s]


Test Loss: 1.594 | Accuracy: 77.660

Epoch : 21


 63%|██████▎   | 441/704 [01:56<01:09,  3.76it/s]

In [None]:
plt.plot(train_accu_Adam)
plt.plot(train_accu_AdaBound)
plt.plot(train_accu_NDAdam)
plt.xlabel('epoch')
plt.xlim(1, epoch)
plt.ylabel('accuracy')
plt.legend(['Adam','AdaBound','NDAdam'])
plt.title('Train Accuracy')

plt.show()

In [None]:
plt.plot(train_losses_Adam)
plt.plot(train_losses_AdaBound)
plt.plot(train_losses_NDAdam)
plt.xlabel('epoch')
plt.xlim(1, epoch)
plt.ylabel('loss')
plt.legend(['Adam','AdaBound','NDAdam'])
plt.title('Train Losses')

plt.show()

In [None]:
plt.plot(eval_accu_Adam)
plt.plot(eval_accu_AdaBound)
plt.plot(eval_accu_NDAdam)
plt.xlabel('epoch')
plt.xlim(1, epoch)
plt.ylabel('accuracy')
plt.legend(['Adam','AdaBound','NDAdam'])
plt.title('Test Accuracy')

plt.show()

In [None]:
plt.plot(eval_losses_Adam)
plt.plot(eval_losses_AdaBound)
plt.plot(eval_losses_NDAdam)
plt.xlabel('epoch')
plt.xlim(1, epoch)
plt.ylabel('loss')
plt.legend(['Adam','AdaBound','NDAdam'])
plt.title('Test Losses')

plt.show()