In [1]:
!nvidia-smi

Thu Aug 12 12:55:04 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  RTX A6000           On   | 00000000:09:00.0 Off |                  Off |
| 63%   79C    P2   110W / 300W |   1281MiB / 48682MiB |     19%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
# Training hyper-parameters shared by the cells below.
batch, n_epochs = 1024, 100    # DataLoader batch size; epochs per trained configuration
opt, init_lr = "adam", 1e-3    # optimizer selector; initial learning rate
use_amp = True                 # enables torch.cuda.amp autocast and the GradScaler

In [3]:
import argparse
import os
import shutil
import time
import pandas as pd
import network.resnet_orig as resnet
import matplotlib.pyplot as plt

import time
import numpy as np
import pandas as pd
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler, RandomSampler, SequentialSampler
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau

# Sorted names of the lower-case, non-dunder, "resnet"-prefixed callables
# exported by the local `network.resnet_orig` module.
model_names = sorted(
    attr_name
    for attr_name, attr in vars(resnet).items()
    if attr_name.startswith("resnet")
    and attr_name.islower()
    and not attr_name.startswith("__")
    and callable(attr)
)

# Directory holding the training images, stored as "<id>.png".
DATA_DIR = "train"

# A progress line is printed every `print_freq` batches.
print_freq = 50

# Mixup augmentation switch (disabled).
mixup = False

In [4]:
# Load the label table (one row per image: `id` and class-name `label`)
# and preview its first rows.
train_df = pd.read_csv("trainLabels.csv")
train_df.head()

Unnamed: 0,id,label
0,1,frog
1,2,truck
2,3,truck
3,4,deer
4,5,automobile


In [5]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the class names, then store the integer class id of each
# row in a new `label_enc` column. `le` is kept for mapping ids back to names.
le = LabelEncoder()
le.fit(train_df['label'])
train_df['label_enc'] = le.transform(train_df['label'])
train_df.head()

Unnamed: 0,id,label,label_enc
0,1,frog,6
1,2,truck,9
2,3,truck,9
3,4,deer,4
4,5,automobile,1


In [6]:
# 5-fold stratified split: each row gets the number of the fold in which it is
# held out for validation.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_df["fold"] = -1
fold_col = train_df.columns.get_loc("fold")
# `split` yields *positional* row indices, so assign through `.iloc`; label-based
# `.loc` is only equivalent while `train_df` keeps its default RangeIndex.
for fold, (_train_pos, val_pos) in enumerate(skf.split(train_df.id, train_df.label_enc)):
    train_df.iloc[val_pos, fold_col] = fold
train_df.head()

Unnamed: 0,id,label,label_enc,fold
0,1,frog,6,1
1,2,truck,9,1
2,3,truck,9,2
3,4,deer,4,1
4,5,automobile,1,2


# prepare dataset

In [7]:
mixup = False

class cifarDataset(Dataset):
    """Dataset yielding ``(image, label)`` tensor pairs for the rows of a label table.

    Every row of ``df`` must provide ``id`` (the image is read from
    ``DATA_DIR/<id>.png``) and ``label_enc`` (the encoded class).

    Args:
        df: Table of samples. Its index is reset, so items are addressed by
            position ``0 .. len(df) - 1``.
        rand: Stored on the instance; not used by this class.
        transform: Optional callable invoked as ``transform(image=img)``; the
            ``'image'`` entry of its result replaces the loaded image.
        test: Stored on the instance; not used by this class.
    """

    def __init__(self,
                 df,
                 rand=False,
                 transform=None,
                 test=False
                ):

        self.df = df.reset_index(drop=True)
        self.rand = rand
        self.transform = transform
        self.test = test

    def __len__(self):
        """Number of rows (samples) in the table."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Load, augment and normalise the sample at position ``index``.

        Returns:
            ``(image, label)``: ``image`` is a float32 tensor scaled by 1/255 with
            the channel axis moved first (channel order is whatever
            ``cv2.imread`` produced, i.e. BGR); ``label`` is a 0-dim float32
            tensor holding ``label_enc``.

        Raises:
            FileNotFoundError: if the image file is missing or cannot be decoded.
        """
        row = self.df.iloc[index]
        img_path = os.path.join(DATA_DIR, str(row.id) + ".png")

        images = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising; fail
        # here with the offending path rather than later with an AttributeError.
        if images is None:
            raise FileNotFoundError("Could not read image: {}".format(img_path))

        # aug
        if self.transform is not None:
            images = self.transform(image=images)['image']

        # (The per-sample mixup prototype that was kept here as a dead string
        # literal has been removed; it was never executed.)

        # HWC uint8 -> CHW float32 in [0, 1]
        images = images.astype(np.float32) / 255
        images = images.transpose(2, 0, 1)

        # np.float32(...) accepts both NumPy scalars and plain Python ints.
        label = np.float32(row.label_enc)
        return torch.tensor(images), torch.tensor(label)

In [8]:
import albumentations as A
import albumentations

# Side length (pixels) every image is resized to at the end of both pipelines.
imsize = 32

# Training-time augmentation. Each transform fires independently with its own `p`.
# NOTE(review): the HueSaturationValue limits of 0.2 look like fractions, while the
# albumentations defaults are 20 / 30 / 20 -- confirm the intended magnitude.
transforms_train = A.Compose([
    A.ShiftScaleRotate(scale_limit=0.3, rotate_limit=180, p=0.5),
    # Colour jitter: when this group fires, exactly one member is applied.
    A.OneOf(
        [
            A.HueSaturationValue(
                hue_shift_limit=0.2,
                sat_shift_limit=0.2,
                val_shift_limit=0.2,
                p=0.5,
            ),
            A.RandomBrightnessContrast(
                brightness_limit=0.2,
                contrast_limit=0.2,
                p=0.5,
            ),
        ],
        p=0.9,
    ),
    A.Cutout(num_holes=12, max_h_size=4, max_w_size=4, fill_value=0, p=0.5),
    A.Rotate(p=0.5),
    A.Transpose(p=0.5),
    # A.VerticalFlip(p=0.5),  # disabled
    A.HorizontalFlip(p=0.5),
    A.Resize(imsize, imsize, p=1.0),
])

# Validation only resizes; no random augmentation.
transforms_val = A.Compose([A.Resize(imsize, imsize, p=1.0)])



# Show sample augmented images

In [9]:
# Preview 3 rows x 5 columns of randomly drawn, augmented training samples.
dataset_show = cifarDataset(train_df, transform=transforms_train)
from pylab import rcParams
rcParams['figure.figsize'] = 20,10
n_rows, n_cols = 3, 5
for row_no in range(n_rows):
    fig, axes = plt.subplots(1, n_cols)
    for ax in axes:
        sample_idx = np.random.randint(0, len(dataset_show))
        img, label = dataset_show[sample_idx]
        # Reverse the channel axis (BGR -> RGB), then CHW -> HWC for imshow.
        ax.imshow(img.flip(0).permute(1, 2, 0))
        ax.set_title(str(label))

# Trainer

In [10]:
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    '''Returns mixed inputs, pairs of targets, and lambda

    Args:
        x: batch of inputs; the first axis is the batch axis.
        y: targets, indexable along their first axis by the same permutation.
        alpha: Beta(alpha, alpha) parameter for drawing the mixing weight.
            When ``alpha <= 0`` the weight is fixed to 1 (inputs unchanged).
        use_cuda: kept for backward compatibility and no longer consulted; the
            permutation is placed on the device of ``x`` / ``y``, so the
            function also works for CPU tensors.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is ``y`` and ``y_b`` is ``y[perm]``.
    '''
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    batch_size = x.size(0)
    # Draw the permutation on the CPU (same RNG stream as before) and then move
    # it to wherever the data lives, instead of unconditionally calling .cuda().
    index = torch.randperm(batch_size)

    x_index = index.to(x.device)
    mixed_x = lam * x + (1 - lam) * x[x_index, :]
    y_a, y_b = y, y[index.to(y.device)]
    return mixed_x, y_a, y_b, lam

In [11]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
        Run one train epoch

        Relies on the notebook-level globals ``scaler`` (gradient scaler used for
        the backward pass and optimizer step), ``use_amp`` and ``print_freq``.

        Args:
            train_loader: iterable yielding ``(images, target)`` batches.
            model: network to optimise; it is switched to train mode.
            criterion: callable ``criterion(output, target)`` returning the loss.
            optimizer: optimizer stepped once per batch through ``scaler``.
            epoch: epoch number, used only in the progress lines.

        Returns:
            Sample-weighted average over the epoch of the per-batch loss
            (criterion value plus the penalty on "alpha" parameters).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    # Weight of the squared penalty on every parameter whose name contains "alpha".
    lambda_alpha = 0.00002
    for i, (images, target) in enumerate(train_loader):

        # measure data loading time
        data_time.update(time.time() - end)

        target = target.long().cuda()
        images = images.cuda()

        with torch.cuda.amp.autocast(enabled=use_amp):
            # compute output
            output = model(images)
            loss = criterion(output, target)

        # L2 regularization (computed outside autocast). `.sum()` keeps the
        # penalty, and therefore the loss, a scalar even if an "alpha"
        # parameter has more than one element.
        l2_alpha = 0.0
        for name, param in model.named_parameters():
            if "alpha" in name:
                l2_alpha += torch.pow(param, 2).sum()
        loss += lambda_alpha * l2_alpha

        # compute gradient and do the optimizer step through the grad scaler
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        output = output.float()
        loss = loss.float()
        # measure accuracy and record loss
        prec1 = accuracy(output.detach(), target)[0]
        losses.update(loss.item(), images.size(0))
        top1.update(prec1.item(), images.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1))
    return losses.avg

def validate(val_loader, model, criterion):
    """
    Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Prints a progress line every ``print_freq`` batches, then the overall
    top-1 precision and the value of every parameter whose name contains
    "alpha".

    Returns:
        ``(top1_avg, loss_avg)``: sample-weighted averages over the loader.
    """
    timer = AverageMeter()
    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    progress_line = ('Test: [{0}/{1}]\t'
                     'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                     'Prec@1 {top1.val:.3f} ({top1.avg:.3f})')

    # evaluation mode
    model.eval()

    tick = time.time()
    with torch.no_grad():
        for step, (images, labels) in enumerate(val_loader):
            n_samples = images.size(0)
            labels = labels.long().cuda()

            # forward pass; the loss is computed on the raw model output
            raw_output = model(images.cuda())
            batch_loss = criterion(raw_output, labels).float()
            scores = raw_output.float()

            # accumulate accuracy and loss, weighted by batch size
            batch_prec1 = accuracy(scores.data, labels)[0]
            loss_meter.update(batch_loss.item(), n_samples)
            acc_meter.update(batch_prec1.item(), n_samples)

            # per-batch wall time
            timer.update(time.time() - tick)
            tick = time.time()

            if step % print_freq != 0:
                continue
            print(progress_line.format(
                step, len(val_loader), batch_time=timer, loss=loss_meter,
                top1=acc_meter))

    print(' * Prec@1 {top1.avg:.3f}'
          .format(top1=acc_meter))
    for param_name, param in model.named_parameters():
        if "alpha" not in param_name:
            continue
        print(param_name, param.item())
    return acc_meter.avg, loss_meter.avg

def save_checkpoint(state, filename='checkpoint.pth'):
    """
    Serialize ``state`` to ``filename`` with ``torch.save``.

    ``filename`` is handed to ``torch.save`` unchanged.
    """
    torch.save(obj=state, f=filename)

class AverageMeter(object):
    """Computes and stores the average and current value

    Attributes:
        val: most recent value passed to ``update``.
        sum: running sum of ``val * n`` over all updates.
        count: running sum of the weights ``n``.
        avg: ``sum / count`` (0 while ``count`` is 0).
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` (e.g. a batch mean and the batch size)."""
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. an empty first batch) must not raise.
        self.avg = self.sum / self.count if self.count else 0


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k

    Args:
        output: ``(batch, classes)`` tensor of scores.
        target: ``(batch,)`` tensor of class indices.
        topk: values of k to evaluate.

    Returns:
        List with one 0-dim tensor per k: the percentage of samples whose
        target is among the k highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # `correct` inherits the transposed (non-contiguous) layout of `pred`, so
        # `.view(-1)` raises for k > 1; `.reshape(-1)` copies only when needed.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

# Train loop

In [12]:
# Fold 0 is held out for validation; the remaining folds form the training set.
is_val_row = train_df.fold == 0
train_dataset = cifarDataset(train_df[~is_val_row], transform=transforms_train)
val_dataset = cifarDataset(train_df[is_val_row], transform=transforms_val, test=True)

# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch, num_workers=8, pin_memory=True)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = torch.utils.data.DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
import wandb


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Weight the loss against both mixup targets by ``lam`` and ``1 - lam``."""
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)


# Fail before any model is built or any wandb run is opened: only "adam" is
# wired up, and any other value used to leave `optimizer` undefined (or stale
# from a previous run of this cell).
if opt != "adam":
    raise ValueError("Unsupported optimizer {!r}; only 'adam' is implemented".format(opt))

os.makedirs("models", exist_ok=True)

# Train every (architecture, K) combination. K is passed to the model
# constructor and appears as the bit count in the wandb run name.
for net in ["resnet34", "resnet50"]:
    for K in [3,4,6,8]:
        model = resnet.__dict__[net](K)
        model = model.cuda()

        watermark = "{}_pact{}bit".format(net, K)
        wandb.init(project="pact_forpaper",
                    name=watermark)
        try:
            # define loss function (criterion) and optimizer
            criterion = nn.CrossEntropyLoss().cuda()
            optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)

            # `train` reads this module-level name, so it must stay `scaler`.
            scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
            # Named `scheduler` so the imported `lr_scheduler` module is not shadowed.
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs-1)

            best_prec1 = 0
            checkpoint_path = os.path.join("models", f"{net}_quant{K}.pth")

            for epoch in range(n_epochs):
                # train for one epoch
                start = time.time()
                # Learning rate actually used during this epoch (read before stepping).
                epoch_lr = optimizer.param_groups[0]['lr']
                print('current lr {:.5e}'.format(epoch_lr))
                tloss = train(train_loader, model, criterion, optimizer, epoch)
                scheduler.step()

                # evaluate on validation set
                prec1, valloss = validate(val_loader, model, criterion)

                epoch_time = time.time() - start
                # wandb
                wandb.log({'epoch': epoch, "prec":prec1, "train_loss": tloss, 'val_loss': valloss, "epoch_time":epoch_time, "lr": epoch_lr,})

                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)

                print("Best prec1 : ", best_prec1)
                if is_best:
                    torch.save(model.state_dict(), checkpoint_path)
        finally:
            # Close the run explicitly so each configuration gets its own
            # completed wandb run, even if training is interrupted.
            wandb.finish()

bit width: 3


[34m[1mwandb[0m: Currently logged in as: [33marutema47[0m (use `wandb login --relogin` to force relogin)


current lr 1.00000e-03
Epoch: [0][0/40]	Time 2.675 (2.675)	Data 1.630 (1.630)	Loss 7.9151 (7.9151)	Prec@1 10.254 (10.254)
Test: [0/10]	Time 0.708 (0.708)	Loss 3.7535 (3.7535)	Prec@1 15.430 (15.430)
 * Prec@1 14.070
alpha1 9.961016654968262
layer1.0.alpha1 9.961016654968262
layer1.0.alpha2 9.963092803955078
layer1.1.alpha1 9.961016654968262
layer1.1.alpha2 9.967028617858887
layer1.2.alpha1 9.961016654968262
layer1.2.alpha2 9.986981391906738
layer2.0.alpha1 9.961016654968262
layer2.0.alpha2 9.961400985717773
layer2.1.alpha1 9.961016654968262
layer2.1.alpha2 9.964095115661621
layer2.2.alpha1 9.961016654968262
layer2.2.alpha2 9.974077224731445
layer2.3.alpha1 9.961016654968262
layer2.3.alpha2 9.991313934326172
layer3.0.alpha1 9.961016654968262
layer3.0.alpha2 9.961020469665527
layer3.1.alpha1 9.961016654968262
layer3.1.alpha2 9.961971282958984
layer3.2.alpha1 9.961016654968262
layer3.2.alpha2 9.967643737792969
layer3.3.alpha1 9.961016654968262
layer3.3.alpha2 9.980507850646973
layer3.4.alp

Test: [0/10]	Time 0.720 (0.720)	Loss 1.7971 (1.7971)	Prec@1 37.891 (37.891)
 * Prec@1 39.480
alpha1 9.723559379577637
layer1.0.alpha1 9.72354793548584
layer1.0.alpha2 9.728409767150879
layer1.1.alpha1 9.723597526550293
layer1.1.alpha2 9.741388320922852
layer1.2.alpha1 9.723562240600586
layer1.2.alpha2 9.866499900817871
layer2.0.alpha1 9.723562240600586
layer2.0.alpha2 9.724542617797852
layer2.1.alpha1 9.723562240600586
layer2.1.alpha2 9.732161521911621
layer2.2.alpha1 9.723562240600586
layer2.2.alpha2 9.765347480773926
layer2.3.alpha1 9.723562240600586
layer2.3.alpha2 9.885980606079102
layer3.0.alpha1 9.723562240600586
layer3.0.alpha2 9.72352123260498
layer3.1.alpha1 9.723562240600586
layer3.1.alpha2 9.725334167480469
layer3.2.alpha1 9.723562240600586
layer3.2.alpha2 9.740091323852539
layer3.3.alpha1 9.723562240600586
layer3.3.alpha2 9.811277389526367
layer3.4.alpha1 9.723562240600586
layer3.4.alpha2 9.889022827148438
layer3.5.alpha1 9.723562240600586
layer3.5.alpha2 9.982885360717773


current lr 9.58054e-04
Epoch: [13][0/40]	Time 1.581 (1.581)	Data 1.219 (1.219)	Loss 1.7720 (1.7720)	Prec@1 38.867 (38.867)
Test: [0/10]	Time 0.802 (0.802)	Loss 1.5923 (1.5923)	Prec@1 44.629 (44.629)
 * Prec@1 43.270
alpha1 9.455976486206055
layer1.0.alpha1 9.456046104431152
layer1.0.alpha2 9.46408748626709
layer1.1.alpha1 9.455964088439941
layer1.1.alpha2 9.48037052154541
layer1.2.alpha1 9.455748558044434
layer1.2.alpha2 9.67140007019043
layer2.0.alpha1 9.455869674682617
layer2.0.alpha2 9.45744800567627
layer2.1.alpha1 9.455892562866211
layer2.1.alpha2 9.466222763061523
layer2.2.alpha1 9.455869674682617
layer2.2.alpha2 9.51450252532959
layer2.3.alpha1 9.455869674682617
layer2.3.alpha2 9.721048355102539
layer3.0.alpha1 9.455869674682617
layer3.0.alpha2 9.45604419708252
layer3.1.alpha1 9.455869674682617
layer3.1.alpha2 9.460098266601562
layer3.2.alpha1 9.455869674682617
layer3.2.alpha2 9.481864929199219
layer3.3.alpha1 9.455869674682617
layer3.3.alpha2 9.604792594909668
layer3.4.alpha1 9

Test: [0/10]	Time 0.803 (0.803)	Loss 1.3570 (1.3570)	Prec@1 53.809 (53.809)
 * Prec@1 53.730
alpha1 9.23940372467041
layer1.0.alpha1 9.23880386352539
layer1.0.alpha2 9.251248359680176
layer1.1.alpha1 9.238682746887207
layer1.1.alpha2 9.268304824829102
layer1.2.alpha1 9.238452911376953
layer1.2.alpha2 9.504947662353516
layer2.0.alpha1 9.23854923248291
layer2.0.alpha2 9.240507125854492
layer2.1.alpha1 9.238579750061035
layer2.1.alpha2 9.247025489807129
layer2.2.alpha1 9.23854923248291
layer2.2.alpha2 9.312469482421875
layer2.3.alpha1 9.23854923248291
layer2.3.alpha2 9.594168663024902
layer3.0.alpha1 9.23854923248291
layer3.0.alpha2 9.238606452941895
layer3.1.alpha1 9.23854923248291
layer3.1.alpha2 9.246543884277344
layer3.2.alpha1 9.23854923248291
layer3.2.alpha2 9.280101776123047
layer3.3.alpha1 9.23854923248291
layer3.3.alpha2 9.452621459960938
layer3.4.alpha1 9.23854923248291
layer3.4.alpha2 9.671212196350098
layer3.5.alpha1 9.23854923248291
layer3.5.alpha2 9.940933227539062
layer4.0.

current lr 8.39255e-04
Epoch: [26][0/40]	Time 1.673 (1.673)	Data 1.311 (1.311)	Loss 1.2413 (1.2413)	Prec@1 59.180 (59.180)
Test: [0/10]	Time 0.710 (0.710)	Loss 1.0412 (1.0412)	Prec@1 63.672 (63.672)
 * Prec@1 62.460
alpha1 9.004776954650879
layer1.0.alpha1 9.003866195678711
layer1.0.alpha2 9.016007423400879
layer1.1.alpha1 9.003510475158691
layer1.1.alpha2 9.035140991210938
layer1.2.alpha1 9.00344181060791
layer1.2.alpha2 9.328361511230469
layer2.0.alpha1 9.003539085388184
layer2.0.alpha2 9.005593299865723
layer2.1.alpha1 9.003579139709473
layer2.1.alpha2 9.016892433166504
layer2.2.alpha1 9.003546714782715
layer2.2.alpha2 9.10080337524414
layer2.3.alpha1 9.003546714782715
layer2.3.alpha2 9.462677001953125
layer3.0.alpha1 9.003546714782715
layer3.0.alpha2 9.003273010253906
layer3.1.alpha1 9.003546714782715
layer3.1.alpha2 9.018997192382812
layer3.2.alpha1 9.003548622131348
layer3.2.alpha2 9.084328651428223
layer3.3.alpha1 9.003546714782715
layer3.3.alpha2 9.299187660217285
layer3.4.alph

Test: [0/10]	Time 0.699 (0.699)	Loss 1.1544 (1.1544)	Prec@1 60.547 (60.547)
 * Prec@1 60.040
alpha1 8.821176528930664
layer1.0.alpha1 8.821073532104492
layer1.0.alpha2 8.836234092712402
layer1.1.alpha1 8.820929527282715
layer1.1.alpha2 8.859259605407715
layer1.2.alpha1 8.8208589553833
layer1.2.alpha2 9.19146728515625
layer2.0.alpha1 8.820907592773438
layer2.0.alpha2 8.823227882385254
layer2.1.alpha1 8.820951461791992
layer2.1.alpha2 8.843689918518066
layer2.2.alpha1 8.820917129516602
layer2.2.alpha2 8.936962127685547
layer2.3.alpha1 8.820917129516602
layer2.3.alpha2 9.387287139892578
layer3.0.alpha1 8.820917129516602
layer3.0.alpha2 8.821667671203613
layer3.1.alpha1 8.820916175842285
layer3.1.alpha2 8.85511589050293
layer3.2.alpha1 8.820898056030273
layer3.2.alpha2 8.944924354553223
layer3.3.alpha1 8.820917129516602
layer3.3.alpha2 9.200823783874512
layer3.4.alpha1 8.820917129516602
layer3.4.alpha2 9.538865089416504
layer3.5.alpha1 8.820917129516602
layer3.5.alpha2 9.901671409606934
la

 * Prec@1 69.350
alpha1 8.658278465270996
layer1.0.alpha1 8.657696723937988
layer1.0.alpha2 8.671545028686523
layer1.1.alpha1 8.657767295837402
layer1.1.alpha2 8.697294235229492
layer1.2.alpha1 8.65772819519043
layer1.2.alpha2 9.080001831054688
layer2.0.alpha1 8.657708168029785
layer2.0.alpha2 8.662351608276367
layer2.1.alpha1 8.657752990722656
layer2.1.alpha2 8.688854217529297
layer2.2.alpha1 8.657716751098633
layer2.2.alpha2 8.801848411560059
layer2.3.alpha1 8.65771484375
layer2.3.alpha2 9.309135437011719
layer3.0.alpha1 8.657716751098633
layer3.0.alpha2 8.658821105957031
layer3.1.alpha1 8.657715797424316
layer3.1.alpha2 8.706208229064941
layer3.2.alpha1 8.657681465148926
layer3.2.alpha2 8.81749439239502
layer3.3.alpha1 8.657716751098633
layer3.3.alpha2 9.093826293945312
layer3.4.alpha1 8.657716751098633
layer3.4.alpha2 9.487510681152344
layer3.5.alpha1 8.657716751098633
layer3.5.alpha2 9.876951217651367
layer4.0.alpha1 8.657716751098633
layer4.0.alpha2 8.660270690917969
layer4.1.alp

current lr 5.71157e-04
Epoch: [45][0/40]	Time 1.527 (1.527)	Data 1.165 (1.165)	Loss 1.0252 (1.0252)	Prec@1 66.797 (66.797)
Test: [0/10]	Time 0.693 (0.693)	Loss 0.9529 (0.9529)	Prec@1 68.555 (68.555)
 * Prec@1 65.320
alpha1 8.492066383361816
layer1.0.alpha1 8.493112564086914
layer1.0.alpha2 8.50615406036377
layer1.1.alpha1 8.493728637695312
layer1.1.alpha2 8.539142608642578
layer1.2.alpha1 8.493714332580566
layer1.2.alpha2 8.98875617980957
layer2.0.alpha1 8.493563652038574
layer2.0.alpha2 8.499723434448242
layer2.1.alpha1 8.493638038635254
layer2.1.alpha2 8.533878326416016
layer2.2.alpha1 8.493598937988281
layer2.2.alpha2 8.671133041381836
layer2.3.alpha1 8.493606567382812
layer2.3.alpha2 9.250310897827148
layer3.0.alpha1 8.49360179901123
layer3.0.alpha2 8.495264053344727
layer3.1.alpha1 8.493610382080078
layer3.1.alpha2 8.560054779052734
layer3.2.alpha1 8.493597030639648
layer3.2.alpha2 8.69347095489502
layer3.3.alpha1 8.493602752685547
layer3.3.alpha2 9.011168479919434
layer3.4.alpha1

Test: [0/10]	Time 0.800 (0.800)	Loss 0.6974 (0.6974)	Prec@1 76.270 (76.270)
 * Prec@1 75.480
alpha1 8.372021675109863
layer1.0.alpha1 8.375829696655273
layer1.0.alpha2 8.391371726989746
layer1.1.alpha1 8.376425743103027
layer1.1.alpha2 8.429255485534668
layer1.2.alpha1 8.376311302185059
layer1.2.alpha2 8.932674407958984
layer2.0.alpha1 8.376167297363281
layer2.0.alpha2 8.385574340820312
layer2.1.alpha1 8.376291275024414
layer2.1.alpha2 8.43358325958252
layer2.2.alpha1 8.376236915588379
layer2.2.alpha2 8.591208457946777
layer2.3.alpha1 8.376254081726074
layer2.3.alpha2 9.2060546875
layer3.0.alpha1 8.376238822937012
layer3.0.alpha2 8.378664016723633
layer3.1.alpha1 8.376246452331543
layer3.1.alpha2 8.45755672454834
layer3.2.alpha1 8.37619400024414
layer3.2.alpha2 8.606171607971191
layer3.3.alpha1 8.37624454498291
layer3.3.alpha2 8.95334243774414
layer3.4.alpha1 8.376240730285645
layer3.4.alpha2 9.407678604125977
layer3.5.alpha1 8.376240730285645
layer3.5.alpha2 9.839092254638672
layer4.0

 * Prec@1 76.740
alpha1 8.274807929992676
layer1.0.alpha1 8.279826164245605
layer1.0.alpha2 8.292598724365234
layer1.1.alpha1 8.280461311340332
layer1.1.alpha2 8.347981452941895
layer1.2.alpha1 8.280370712280273
layer1.2.alpha2 8.885651588439941
layer2.0.alpha1 8.280192375183105
layer2.0.alpha2 8.294291496276855
layer2.1.alpha1 8.280351638793945
layer2.1.alpha2 8.352629661560059
layer2.2.alpha1 8.28029727935791
layer2.2.alpha2 8.51950740814209
layer2.3.alpha1 8.280314445495605
layer2.3.alpha2 9.178206443786621
layer3.0.alpha1 8.280299186706543
layer3.0.alpha2 8.283313751220703
layer3.1.alpha1 8.280306816101074
layer3.1.alpha2 8.368911743164062
layer3.2.alpha1 8.280251502990723
layer3.2.alpha2 8.535609245300293
layer3.3.alpha1 8.28031063079834
layer3.3.alpha2 8.899867057800293
layer3.4.alpha1 8.280301094055176
layer3.4.alpha2 9.384349822998047
layer3.5.alpha1 8.280301094055176
layer3.5.alpha2 9.819148063659668
layer4.0.alpha1 8.280343055725098
layer4.0.alpha2 8.284207344055176
layer4.1.

current lr 2.77967e-04
Epoch: [64][0/40]	Time 1.741 (1.741)	Data 1.380 (1.380)	Loss 0.7809 (0.7809)	Prec@1 74.512 (74.512)
Test: [0/10]	Time 0.711 (0.711)	Loss 0.5513 (0.5513)	Prec@1 81.152 (81.152)
 * Prec@1 78.570
alpha1 8.188345909118652
layer1.0.alpha1 8.193784713745117
layer1.0.alpha2 8.204623222351074
layer1.1.alpha1 8.194661140441895
layer1.1.alpha2 8.274359703063965
layer1.2.alpha1 8.194559097290039
layer1.2.alpha2 8.846793174743652
layer2.0.alpha1 8.194478034973145
layer2.0.alpha2 8.21268367767334
layer2.1.alpha1 8.194568634033203
layer2.1.alpha2 8.276836395263672
layer2.2.alpha1 8.194513320922852
layer2.2.alpha2 8.46652889251709
layer2.3.alpha1 8.19451904296875
layer2.3.alpha2 9.158225059509277
layer3.0.alpha1 8.1945161819458
layer3.0.alpha2 8.198034286499023
layer3.1.alpha1 8.194520950317383
layer3.1.alpha2 8.295705795288086
layer3.2.alpha1 8.194491386413574
layer3.2.alpha2 8.473134994506836
layer3.3.alpha1 8.194544792175293
layer3.3.alpha2 8.85500431060791
layer3.4.alpha1 8

Test: [0/10]	Time 0.788 (0.788)	Loss 0.5147 (0.5147)	Prec@1 82.324 (82.324)
 * Prec@1 80.800
alpha1 8.134001731872559
layer1.0.alpha1 8.1410493850708
layer1.0.alpha2 8.152087211608887
layer1.1.alpha1 8.141799926757812
layer1.1.alpha2 8.232278823852539
layer1.2.alpha1 8.141716957092285
layer1.2.alpha2 8.827899932861328
layer2.0.alpha1 8.141727447509766
layer2.0.alpha2 8.163111686706543
layer2.1.alpha1 8.141773223876953
layer2.1.alpha2 8.231544494628906
layer2.2.alpha1 8.141717910766602
layer2.2.alpha2 8.431575775146484
layer2.3.alpha1 8.1417236328125
layer2.3.alpha2 9.147039413452148
layer3.0.alpha1 8.14172077178955
layer3.0.alpha2 8.146195411682129
layer3.1.alpha1 8.141725540161133
layer3.1.alpha2 8.246376037597656
layer3.2.alpha1 8.141695976257324
layer3.2.alpha2 8.432709693908691
layer3.3.alpha1 8.141749382019043
layer3.3.alpha2 8.822367668151855
layer3.4.alpha1 8.141709327697754
layer3.4.alpha2 9.322465896606445
layer3.5.alpha1 8.141721725463867
layer3.5.alpha2 9.788116455078125
lay

 * Prec@1 80.040
alpha1 8.098967552185059
layer1.0.alpha1 8.105289459228516
layer1.0.alpha2 8.115494728088379
layer1.1.alpha1 8.10596752166748
layer1.1.alpha2 8.202220916748047
layer1.2.alpha1 8.105854034423828
layer1.2.alpha2 8.812073707580566
layer2.0.alpha1 8.105859756469727
layer2.0.alpha2 8.128802299499512
layer2.1.alpha1 8.105915069580078
layer2.1.alpha2 8.202320098876953
layer2.2.alpha1 8.105859756469727
layer2.2.alpha2 8.41067886352539
layer2.3.alpha1 8.105863571166992
layer2.3.alpha2 9.137365341186523
layer3.0.alpha1 8.105862617492676
layer3.0.alpha2 8.111330032348633
layer3.1.alpha1 8.105855941772461
layer3.1.alpha2 8.214804649353027
layer3.2.alpha1 8.10583782196045
layer3.2.alpha2 8.407793045043945
layer3.3.alpha1 8.105890274047852
layer3.3.alpha2 8.80375862121582
layer3.4.alpha1 8.105857849121094
layer3.4.alpha2 9.303479194641113
layer3.5.alpha1 8.105863571166992
layer3.5.alpha2 9.778326988220215
layer4.0.alpha1 8.105913162231445
layer4.0.alpha2 8.109601974487305
layer4.1.a

Epoch: [83][0/40]	Time 1.741 (1.741)	Data 1.379 (1.379)	Loss 0.6702 (0.6702)	Prec@1 78.418 (78.418)
Test: [0/10]	Time 0.737 (0.737)	Loss 0.4738 (0.4738)	Prec@1 83.594 (83.594)
 * Prec@1 82.580
alpha1 8.075174331665039
layer1.0.alpha1 8.081049919128418
layer1.0.alpha2 8.093271255493164
layer1.1.alpha1 8.081897735595703
layer1.1.alpha2 8.182172775268555
layer1.2.alpha1 8.081811904907227
layer1.2.alpha2 8.803154945373535
layer2.0.alpha1 8.081840515136719
layer2.0.alpha2 8.106327056884766
layer2.1.alpha1 8.081844329833984
layer2.1.alpha2 8.183069229125977
layer2.2.alpha1 8.081803321838379
layer2.2.alpha2 8.396696090698242
layer2.3.alpha1 8.081796646118164
layer2.3.alpha2 9.129985809326172
layer3.0.alpha1 8.081802368164062
layer3.0.alpha2 8.087589263916016
layer3.1.alpha1 8.081796646118164
layer3.1.alpha2 8.195082664489746
layer3.2.alpha1 8.081789016723633
layer3.2.alpha2 8.390138626098633
layer3.3.alpha1 8.081830024719238
layer3.3.alpha2 8.786763191223145
layer3.4.alpha1 8.081799507141113


Test: [0/10]	Time 0.793 (0.793)	Loss 0.4678 (0.4678)	Prec@1 84.766 (84.766)
 * Prec@1 82.690
alpha1 8.065866470336914
layer1.0.alpha1 8.071732521057129
layer1.0.alpha2 8.085362434387207
layer1.1.alpha1 8.07264232635498
layer1.1.alpha2 8.176408767700195
layer1.2.alpha1 8.072565078735352
layer1.2.alpha2 8.79951000213623
layer2.0.alpha1 8.072595596313477
layer2.0.alpha2 8.097341537475586
layer2.1.alpha1 8.072575569152832
layer2.1.alpha2 8.17585563659668
layer2.2.alpha1 8.07253360748291
layer2.2.alpha2 8.391183853149414
layer2.3.alpha1 8.07253360748291
layer2.3.alpha2 9.128350257873535
layer3.0.alpha1 8.072532653808594
layer3.0.alpha2 8.078411102294922
layer3.1.alpha1 8.07253646850586
layer3.1.alpha2 8.18690299987793
layer3.2.alpha1 8.072527885437012
layer3.2.alpha2 8.383131980895996
layer3.3.alpha1 8.07256031036377
layer3.3.alpha2 8.780062675476074
layer3.4.alpha1 8.072529792785645
layer3.4.alpha2 9.2841215133667
layer3.5.alpha1 8.07253360748291
layer3.5.alpha2 9.767273902893066
layer4.0.

 * Prec@1 83.470
alpha1 8.063238143920898
layer1.0.alpha1 8.069098472595215
layer1.0.alpha2 8.082926750183105
layer1.1.alpha1 8.070008277893066
layer1.1.alpha2 8.174545288085938
layer1.2.alpha1 8.06993293762207
layer1.2.alpha2 8.798820495605469
layer2.0.alpha1 8.069963455200195
layer2.0.alpha2 8.09475040435791
layer2.1.alpha1 8.06994342803955
layer2.1.alpha2 8.173737525939941
layer2.2.alpha1 8.069901466369629
layer2.2.alpha2 8.389547348022461
layer2.3.alpha1 8.069901466369629
layer2.3.alpha2 9.127410888671875
layer3.0.alpha1 8.069900512695312
layer3.0.alpha2 8.075773239135742
layer3.1.alpha1 8.069904327392578
layer3.1.alpha2 8.184571266174316
layer3.2.alpha1 8.06989574432373
layer3.2.alpha2 8.380906105041504
layer3.3.alpha1 8.069928169250488
layer3.3.alpha2 8.77812671661377
layer3.4.alpha1 8.069897651672363
layer3.4.alpha2 9.282471656799316
layer3.5.alpha1 8.069901466369629
layer3.5.alpha2 9.76623249053955
layer4.0.alpha1 8.07004451751709
layer4.0.alpha2 8.074067115783691
layer4.1.alph

VBox(children=(Label(value=' 0.77MB of 0.77MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,99.0
prec,83.43
train_loss,0.61667
val_loss,0.49971
epoch_time,18.10552
lr,0.0
_runtime,1806.0
_timestamp,1628742315.0
_step,99.0


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
prec,▁▃▃▄▃▄▄▅▅▅▆▆▆▆▆▇▆▇▆▇▇▇▇▇████████████████
train_loss,█▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▄▄▄▃▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_time,█▄▂▄▄▄▂▃▃▅▄▅▂▄▃▃▂▂▁▄▁▃▅▃▂▂▄▅▄▃▂▂▃▃▄▅▅▅▁▅
lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


current lr 1.00000e-03
Epoch: [0][0/40]	Time 1.639 (1.639)	Data 1.274 (1.274)	Loss 9.9516 (9.9516)	Prec@1 9.082 (9.082)
Test: [0/10]	Time 0.706 (0.706)	Loss 4.0748 (4.0748)	Prec@1 13.281 (13.281)
 * Prec@1 13.720
alpha1 9.961085319519043
layer1.0.alpha1 9.961019515991211
layer1.0.alpha2 9.96909236907959
layer1.1.alpha1 9.960952758789062
layer1.1.alpha2 9.977738380432129
layer1.2.alpha1 9.960866928100586
layer1.2.alpha2 9.98956298828125
layer2.0.alpha1 9.961016654968262
layer2.0.alpha2 9.961173057556152
layer2.1.alpha1 9.961016654968262
layer2.1.alpha2 9.984134674072266
layer2.2.alpha1 9.961037635803223
layer2.2.alpha2 9.987861633300781
layer2.3.alpha1 9.960981369018555
layer2.3.alpha2 9.994668960571289
layer3.0.alpha1 9.961016654968262
layer3.0.alpha2 9.974977493286133
layer3.1.alpha1 9.960996627807617
layer3.1.alpha2 9.990006446838379
layer3.2.alpha1 9.961016654968262
layer3.2.alpha2 9.994083404541016
layer3.3.alpha1 9.961016654968262
layer3.3.alpha2 9.986119270324707
layer3.4.alpha1 

Test: [0/10]	Time 0.726 (0.726)	Loss 1.8869 (1.8869)	Prec@1 36.816 (36.816)
 * Prec@1 35.620
alpha1 9.723833084106445
layer1.0.alpha1 9.724095344543457
layer1.0.alpha2 9.741412162780762
layer1.1.alpha1 9.723432540893555
layer1.1.alpha2 9.783108711242676
layer1.2.alpha1 9.723172187805176
layer1.2.alpha2 9.918905258178711
layer2.0.alpha1 9.723562240600586
layer2.0.alpha2 9.721986770629883
layer2.1.alpha1 9.723562240600586
layer2.1.alpha2 9.842733383178711
layer2.2.alpha1 9.72361946105957
layer2.2.alpha2 9.904557228088379
layer2.3.alpha1 9.723472595214844
layer2.3.alpha2 9.971263885498047
layer3.0.alpha1 9.723562240600586
layer3.0.alpha2 9.7464599609375
layer3.1.alpha1 9.723505973815918
layer3.1.alpha2 9.80484390258789
layer3.2.alpha1 9.723562240600586
layer3.2.alpha2 9.808235168457031
layer3.3.alpha1 9.723562240600586
layer3.3.alpha2 9.846105575561523
layer3.4.alpha1 9.723562240600586
layer3.4.alpha2 9.937726974487305
layer3.5.alpha1 9.723562240600586
layer3.5.alpha2 9.99062442779541
lay

current lr 9.58054e-04
Epoch: [13][0/40]	Time 1.786 (1.786)	Data 1.419 (1.419)	Loss 1.5491 (1.5491)	Prec@1 45.410 (45.410)
Test: [0/10]	Time 0.700 (0.700)	Loss 1.4247 (1.4247)	Prec@1 50.684 (50.684)
 * Prec@1 49.520
alpha1 9.456375122070312
layer1.0.alpha1 9.456586837768555
layer1.0.alpha2 9.477579116821289
layer1.1.alpha1 9.455808639526367
layer1.1.alpha2 9.534090042114258
layer1.2.alpha1 9.45541000366211
layer1.2.alpha2 9.788155555725098
layer2.0.alpha1 9.455869674682617
layer2.0.alpha2 9.453957557678223
layer2.1.alpha1 9.455869674682617
layer2.1.alpha2 9.629633903503418
layer2.2.alpha1 9.455937385559082
layer2.2.alpha2 9.75904369354248
layer2.3.alpha1 9.45576286315918
layer2.3.alpha2 9.923057556152344
layer3.0.alpha1 9.455869674682617
layer3.0.alpha2 9.481542587280273
layer3.1.alpha1 9.455802917480469
layer3.1.alpha2 9.560569763183594
layer3.2.alpha1 9.455869674682617
layer3.2.alpha2 9.568305969238281
layer3.3.alpha1 9.455869674682617
layer3.3.alpha2 9.646570205688477
layer3.4.alpha

Test: [0/10]	Time 0.783 (0.783)	Loss 1.1395 (1.1395)	Prec@1 60.156 (60.156)
 * Prec@1 57.550
alpha1 9.238752365112305
layer1.0.alpha1 9.239325523376465
layer1.0.alpha2 9.262131690979004
layer1.1.alpha1 9.238302230834961
layer1.1.alpha2 9.327444076538086
layer1.2.alpha1 9.238059043884277
layer1.2.alpha2 9.649788856506348
layer2.0.alpha1 9.23854923248291
layer2.0.alpha2 9.236639022827148
layer2.1.alpha1 9.23854923248291
layer2.1.alpha2 9.440023422241211
layer2.2.alpha1 9.23862075805664
layer2.2.alpha2 9.611556053161621
layer2.3.alpha1 9.23843765258789
layer2.3.alpha2 9.873111724853516
layer3.0.alpha1 9.23854923248291
layer3.0.alpha2 9.265533447265625
layer3.1.alpha1 9.238479614257812
layer3.1.alpha2 9.356232643127441
layer3.2.alpha1 9.23854923248291
layer3.2.alpha2 9.3761568069458
layer3.3.alpha1 9.23854923248291
layer3.3.alpha2 9.483173370361328
layer3.4.alpha1 9.23854923248291
layer3.4.alpha2 9.75185489654541
layer3.5.alpha1 9.23854923248291
layer3.5.alpha2 9.946135520935059
layer4.0.a

 * Prec@1 61.890
alpha1 9.035934448242188
layer1.0.alpha1 9.037171363830566
layer1.0.alpha2 9.060583114624023
layer1.1.alpha1 9.035531044006348
layer1.1.alpha2 9.129440307617188
layer1.2.alpha1 9.035243034362793
layer1.2.alpha2 9.509223937988281
layer2.0.alpha1 9.035744667053223
layer2.0.alpha2 9.03388786315918
layer2.1.alpha1 9.035744667053223
layer2.1.alpha2 9.258112907409668
layer2.2.alpha1 9.03581714630127
layer2.2.alpha2 9.463691711425781
layer2.3.alpha1 9.035628318786621
layer2.3.alpha2 9.819113731384277
layer3.0.alpha1 9.035744667053223
layer3.0.alpha2 9.06332015991211
layer3.1.alpha1 9.035670280456543
layer3.1.alpha2 9.165250778198242
layer3.2.alpha1 9.035745620727539
layer3.2.alpha2 9.200020790100098
layer3.3.alpha1 9.035744667053223
layer3.3.alpha2 9.355809211730957
layer3.4.alpha1 9.035744667053223
layer3.4.alpha2 9.661199569702148
layer3.5.alpha1 9.035744667053223
layer3.5.alpha2 9.917827606201172
layer4.0.alpha1 9.035744667053223
layer4.0.alpha2 9.14964485168457
layer4.1.a

Epoch: [32][0/40]	Time 1.786 (1.786)	Data 1.422 (1.422)	Loss 1.0866 (1.0866)	Prec@1 63.281 (63.281)
Test: [0/10]	Time 0.807 (0.807)	Loss 0.9516 (0.9516)	Prec@1 66.309 (66.309)
 * Prec@1 64.620
alpha1 8.820572853088379
layer1.0.alpha1 8.821993827819824
layer1.0.alpha2 8.847508430480957
layer1.1.alpha1 8.820728302001953
layer1.1.alpha2 8.919515609741211
layer1.2.alpha1 8.820417404174805
layer1.2.alpha2 9.353217124938965
layer2.0.alpha1 8.820917129516602
layer2.0.alpha2 8.819995880126953
layer2.1.alpha1 8.820917129516602
layer2.1.alpha2 9.058176040649414
layer2.2.alpha1 8.820992469787598
layer2.2.alpha2 9.302513122558594
layer2.3.alpha1 8.82079792022705
layer2.3.alpha2 9.76559829711914
layer3.0.alpha1 8.820917129516602
layer3.0.alpha2 8.850096702575684
layer3.1.alpha1 8.820840835571289
layer3.1.alpha2 8.965059280395508
layer3.2.alpha1 8.820920944213867
layer3.2.alpha2 9.033005714416504
layer3.3.alpha1 8.820917129516602
layer3.3.alpha2 9.23423957824707
layer3.4.alpha1 8.820916175842285
lay

Test: [0/10]	Time 0.724 (0.724)	Loss 0.7350 (0.7350)	Prec@1 74.121 (74.121)
 * Prec@1 72.730
alpha1 8.656304359436035
layer1.0.alpha1 8.65949821472168
layer1.0.alpha2 8.682924270629883
layer1.1.alpha1 8.657502174377441
layer1.1.alpha2 8.764819145202637
layer1.2.alpha1 8.657195091247559
layer1.2.alpha2 9.230631828308105
layer2.0.alpha1 8.657716751098633
layer2.0.alpha2 8.656342506408691
layer2.1.alpha1 8.657716751098633
layer2.1.alpha2 8.903603553771973
layer2.2.alpha1 8.657793045043945
layer2.2.alpha2 9.173023223876953
layer2.3.alpha1 8.657597541809082
layer2.3.alpha2 9.710731506347656
layer3.0.alpha1 8.657716751098633
layer3.0.alpha2 8.68826675415039
layer3.1.alpha1 8.65764045715332
layer3.1.alpha2 8.819269180297852
layer3.2.alpha1 8.657731056213379
layer3.2.alpha2 8.913965225219727
layer3.3.alpha1 8.65771770477295
layer3.3.alpha2 9.142184257507324
layer3.4.alpha1 8.65771770477295
layer3.4.alpha2 9.486370086669922
layer3.5.alpha1 8.657716751098633
layer3.5.alpha2 9.845940589904785
lay

 * Prec@1 75.500
alpha1 8.513936042785645
layer1.0.alpha1 8.516472816467285
layer1.0.alpha2 8.539216041564941
layer1.1.alpha1 8.515142440795898
layer1.1.alpha2 8.625236511230469
layer1.2.alpha1 8.514762878417969
layer1.2.alpha2 9.119178771972656
layer2.0.alpha1 8.515270233154297
layer2.0.alpha2 8.513279914855957
layer2.1.alpha1 8.515270233154297
layer2.1.alpha2 8.773261070251465
layer2.2.alpha1 8.51534652709961
layer2.2.alpha2 9.062928199768066
layer2.3.alpha1 8.515148162841797
layer2.3.alpha2 9.660529136657715
layer3.0.alpha1 8.515270233154297
layer3.0.alpha2 8.546268463134766
layer3.1.alpha1 8.515192031860352
layer3.1.alpha2 8.689774513244629
layer3.2.alpha1 8.515290260314941
layer3.2.alpha2 8.801751136779785
layer3.3.alpha1 8.515271186828613
layer3.3.alpha2 9.052809715270996
layer3.4.alpha1 8.515273094177246
layer3.4.alpha2 9.418042182922363
layer3.5.alpha1 8.515270233154297
layer3.5.alpha2 9.815606117248535
layer4.0.alpha1 8.515270233154297
layer4.0.alpha2 8.637261390686035
layer4.

current lr 4.76209e-04
Epoch: [51][0/40]	Time 2.135 (2.135)	Data 1.773 (1.773)	Loss 0.8364 (0.8364)	Prec@1 72.559 (72.559)
Test: [0/10]	Time 0.784 (0.784)	Loss 0.5965 (0.5965)	Prec@1 79.688 (79.688)
 * Prec@1 77.350
alpha1 8.374527931213379
layer1.0.alpha1 8.377436637878418
layer1.0.alpha2 8.40195369720459
layer1.1.alpha1 8.376137733459473
layer1.1.alpha2 8.49499225616455
layer1.2.alpha1 8.375779151916504
layer1.2.alpha2 9.025489807128906
layer2.0.alpha1 8.376233100891113
layer2.0.alpha2 8.37518310546875
layer2.1.alpha1 8.376240730285645
layer2.1.alpha2 8.642398834228516
layer2.2.alpha1 8.376317977905273
layer2.2.alpha2 8.946380615234375
layer2.3.alpha1 8.376117706298828
layer2.3.alpha2 9.608595848083496
layer3.0.alpha1 8.376240730285645
layer3.0.alpha2 8.406838417053223
layer3.1.alpha1 8.3761625289917
layer3.1.alpha2 8.56046199798584
layer3.2.alpha1 8.376241683959961
layer3.2.alpha2 8.702919006347656
layer3.3.alpha1 8.376242637634277
layer3.3.alpha2 8.962244033813477
layer3.4.alpha1 8

Test: [0/10]	Time 0.775 (0.775)	Loss 0.6167 (0.6167)	Prec@1 78.027 (78.027)
 * Prec@1 78.850
alpha1 8.27940845489502
layer1.0.alpha1 8.281277656555176
layer1.0.alpha2 8.305685043334961
layer1.1.alpha1 8.280255317687988
layer1.1.alpha2 8.405306816101074
layer1.2.alpha1 8.279855728149414
layer1.2.alpha2 8.962409019470215
layer2.0.alpha1 8.280293464660645
layer2.0.alpha2 8.280725479125977
layer2.1.alpha1 8.280301094055176
layer2.1.alpha2 8.553447723388672
layer2.2.alpha1 8.280378341674805
layer2.2.alpha2 8.870667457580566
layer2.3.alpha1 8.28017807006836
layer2.3.alpha2 9.572729110717773
layer3.0.alpha1 8.280301094055176
layer3.0.alpha2 8.311898231506348
layer3.1.alpha1 8.28022289276123
layer3.1.alpha2 8.472973823547363
layer3.2.alpha1 8.280302047729492
layer3.2.alpha2 8.627876281738281
layer3.3.alpha1 8.280303001403809
layer3.3.alpha2 8.90617847442627
layer3.4.alpha1 8.280303955078125
layer3.4.alpha2 9.289481163024902
layer3.5.alpha1 8.280301094055176
layer3.5.alpha2 9.756551742553711
la

 * Prec@1 80.140
alpha1 8.20481014251709
layer1.0.alpha1 8.206510543823242
layer1.0.alpha2 8.23055362701416
layer1.1.alpha1 8.205025672912598
layer1.1.alpha2 8.338035583496094
layer1.2.alpha1 8.204612731933594
layer1.2.alpha2 8.912599563598633
layer2.0.alpha1 8.205101013183594
layer2.0.alpha2 8.204200744628906
layer2.1.alpha1 8.205121994018555
layer2.1.alpha2 8.481514930725098
layer2.2.alpha1 8.205199241638184
layer2.2.alpha2 8.803468704223633
layer2.3.alpha1 8.204998016357422
layer2.3.alpha2 9.538209915161133
layer3.0.alpha1 8.205117225646973
layer3.0.alpha2 8.237702369689941
layer3.1.alpha1 8.205044746398926
layer3.1.alpha2 8.401897430419922
layer3.2.alpha1 8.205127716064453
layer3.2.alpha2 8.565893173217773
layer3.3.alpha1 8.205121994018555
layer3.3.alpha2 8.860625267028809
layer3.4.alpha1 8.205120086669922
layer3.4.alpha2 9.24878978729248
layer3.5.alpha1 8.205121040344238
layer3.5.alpha2 9.735137939453125
layer4.0.alpha1 8.205122947692871
layer4.0.alpha2 8.329477310180664
layer4.1.

Epoch: [70][0/40]	Time 1.713 (1.713)	Data 1.348 (1.348)	Loss 0.6460 (0.6460)	Prec@1 79.980 (79.980)
Test: [0/10]	Time 0.707 (0.707)	Loss 0.5245 (0.5245)	Prec@1 81.836 (81.836)
 * Prec@1 81.660
alpha1 8.140951156616211
layer1.0.alpha1 8.142897605895996
layer1.0.alpha2 8.168256759643555
layer1.1.alpha1 8.14162826538086
layer1.1.alpha2 8.279196739196777
layer1.2.alpha1 8.141206741333008
layer1.2.alpha2 8.873555183410645
layer2.0.alpha1 8.141700744628906
layer2.0.alpha2 8.140949249267578
layer2.1.alpha1 8.1417236328125
layer2.1.alpha2 8.421192169189453
layer2.2.alpha1 8.141798973083496
layer2.2.alpha2 8.755189895629883
layer2.3.alpha1 8.141597747802734
layer2.3.alpha2 9.510889053344727
layer3.0.alpha1 8.141716957092285
layer3.0.alpha2 8.174134254455566
layer3.1.alpha1 8.141644477844238
layer3.1.alpha2 8.341907501220703
layer3.2.alpha1 8.141736030578613
layer3.2.alpha2 8.512125968933105
layer3.3.alpha1 8.141736030578613
layer3.3.alpha2 8.81582260131836
layer3.4.alpha1 8.141719818115234
laye

Test: [0/10]	Time 0.706 (0.706)	Loss 0.4569 (0.4569)	Prec@1 84.277 (84.277)
 * Prec@1 83.190
alpha1 8.104844093322754
layer1.0.alpha1 8.107172966003418
layer1.0.alpha2 8.133138656616211
layer1.1.alpha1 8.105741500854492
layer1.1.alpha2 8.245796203613281
layer1.2.alpha1 8.10537338256836
layer1.2.alpha2 8.853113174438477
layer2.0.alpha1 8.105838775634766
layer2.0.alpha2 8.106318473815918
layer2.1.alpha1 8.10587215423584
layer2.1.alpha2 8.388663291931152
layer2.2.alpha1 8.105940818786621
layer2.2.alpha2 8.726399421691895
layer2.3.alpha1 8.10573959350586
layer2.3.alpha2 9.496685981750488
layer3.0.alpha1 8.10585880279541
layer3.0.alpha2 8.138815879821777
layer3.1.alpha1 8.105786323547363
layer3.1.alpha2 8.308730125427246
layer3.2.alpha1 8.105877876281738
layer3.2.alpha2 8.481926918029785
layer3.3.alpha1 8.105875968933105
layer3.3.alpha2 8.78862476348877
layer3.4.alpha1 8.105854988098145
layer3.4.alpha2 9.184582710266113
layer3.5.alpha1 8.105862617492676
layer3.5.alpha2 9.704849243164062
lay

 * Prec@1 84.090
alpha1 8.083955764770508
layer1.0.alpha1 8.08630084991455
layer1.0.alpha2 8.111820220947266
layer1.1.alpha1 8.084121704101562
layer1.1.alpha2 8.226747512817383
layer1.2.alpha1 8.08376407623291
layer1.2.alpha2 8.840021133422852
layer2.0.alpha1 8.084218978881836
layer2.0.alpha2 8.08491325378418
layer2.1.alpha1 8.084250450134277
layer2.1.alpha2 8.369102478027344
layer2.2.alpha1 8.084321975708008
layer2.2.alpha2 8.709498405456543
layer2.3.alpha1 8.084120750427246
layer2.3.alpha2 9.487499237060547
layer3.0.alpha1 8.084240913391113
layer3.0.alpha2 8.117249488830566
layer3.1.alpha1 8.084159851074219
layer3.1.alpha2 8.287338256835938
layer3.2.alpha1 8.084259033203125
layer3.2.alpha2 8.463690757751465
layer3.3.alpha1 8.084256172180176
layer3.3.alpha2 8.771480560302734
layer3.4.alpha1 8.084243774414062
layer3.4.alpha2 9.168975830078125
layer3.5.alpha1 8.084243774414062
layer3.5.alpha2 9.697064399719238
layer4.0.alpha1 8.0842924118042
layer4.0.alpha2 8.209086418151855
layer4.1.al

Epoch: [89][0/40]	Time 1.640 (1.640)	Data 1.285 (1.285)	Loss 0.5028 (0.5028)	Prec@1 84.082 (84.082)
Test: [0/10]	Time 0.712 (0.712)	Loss 0.4157 (0.4157)	Prec@1 86.426 (86.426)
 * Prec@1 84.900
alpha1 8.07227897644043
layer1.0.alpha1 8.074546813964844
layer1.0.alpha2 8.100412368774414
layer1.1.alpha1 8.072442054748535
layer1.1.alpha2 8.216063499450684
layer1.2.alpha1 8.072068214416504
layer1.2.alpha2 8.832749366760254
layer2.0.alpha1 8.072513580322266
layer2.0.alpha2 8.073482513427734
layer2.1.alpha1 8.072539329528809
layer2.1.alpha2 8.358762741088867
layer2.2.alpha1 8.072610855102539
layer2.2.alpha2 8.701363563537598
layer2.3.alpha1 8.072409629821777
layer2.3.alpha2 9.481439590454102
layer3.0.alpha1 8.072529792785645
layer3.0.alpha2 8.105668067932129
layer3.1.alpha1 8.07244873046875
layer3.1.alpha2 8.276604652404785
layer3.2.alpha1 8.072547912597656
layer3.2.alpha2 8.453451156616211
layer3.3.alpha1 8.072545051574707
layer3.3.alpha2 8.761709213256836
layer3.4.alpha1 8.072543144226074
la

Test: [0/10]	Time 0.798 (0.798)	Loss 0.4274 (0.4274)	Prec@1 85.059 (85.059)
 * Prec@1 84.490
alpha1 8.069633483886719
layer1.0.alpha1 8.07192611694336
layer1.0.alpha2 8.097755432128906
layer1.1.alpha1 8.069808006286621
layer1.1.alpha2 8.213533401489258
layer1.2.alpha1 8.069436073303223
layer1.2.alpha2 8.831130027770996
layer2.0.alpha1 8.069881439208984
layer2.0.alpha2 8.0708589553833
layer2.1.alpha1 8.069907188415527
layer2.1.alpha2 8.356534957885742
layer2.2.alpha1 8.069978713989258
layer2.2.alpha2 8.69955062866211
layer2.3.alpha1 8.069777488708496
layer2.3.alpha2 9.480206489562988
layer3.0.alpha1 8.069897651672363
layer3.0.alpha2 8.103043556213379
layer3.1.alpha1 8.069816589355469
layer3.1.alpha2 8.274002075195312
layer3.2.alpha1 8.069915771484375
layer3.2.alpha2 8.450876235961914
layer3.3.alpha1 8.069912910461426
layer3.3.alpha2 8.759366989135742
layer3.4.alpha1 8.069911003112793
layer3.4.alpha2 9.158480644226074
layer3.5.alpha1 8.069900512695312
layer3.5.alpha2 9.692182540893555
la

VBox(children=(Label(value=' 0.99MB of 0.99MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,99.0
prec,84.73
train_loss,0.53282
val_loss,0.46221
epoch_time,17.92884
lr,0.0
_runtime,1808.0
_timestamp,1628744128.0
_step,99.0


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
prec,▁▃▃▄▄▅▄▅▅▆▆▆▆▆▆▇▆▇▇▇▇▇▇▇▇███████████████
train_loss,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_time,▁▂▅▄▅▄▅▂▁▃▂▇▆▁█▄▃▂▅▅▆▆▅▅▆▃▄▅▄▁▁▂▃▃▂▅▅▃▅▃
lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███


current lr 1.00000e-03
Epoch: [0][0/40]	Time 2.027 (2.027)	Data 1.660 (1.660)	Loss 6.7467 (6.7467)	Prec@1 8.301 (8.301)
Test: [0/10]	Time 0.801 (0.801)	Loss 2.7800 (2.7800)	Prec@1 21.094 (21.094)
 * Prec@1 22.790
alpha1 9.961015701293945
layer1.0.alpha1 9.96114730834961
layer1.0.alpha2 9.96588134765625
layer1.1.alpha1 9.961121559143066
layer1.1.alpha2 9.981550216674805
layer1.2.alpha1 9.960795402526855
layer1.2.alpha2 9.991941452026367
layer2.0.alpha1 9.961231231689453
layer2.0.alpha2 9.96427059173584
layer2.1.alpha1 9.960983276367188
layer2.1.alpha2 9.976947784423828
layer2.2.alpha1 9.960832595825195
layer2.2.alpha2 9.98831558227539
layer2.3.alpha1 9.961112022399902
layer2.3.alpha2 9.991268157958984
layer3.0.alpha1 9.961091041564941
layer3.0.alpha2 9.990835189819336
layer3.1.alpha1 9.96139144897461
layer3.1.alpha2 10.002418518066406
layer3.2.alpha1 9.96101188659668
layer3.2.alpha2 9.994424819946289
layer3.3.alpha1 9.961004257202148
layer3.3.alpha2 9.989961624145508
layer3.4.alpha1 9.9

Test: [0/10]	Time 0.920 (0.920)	Loss 1.6527 (1.6527)	Prec@1 41.211 (41.211)
 * Prec@1 43.070
alpha1 9.723554611206055
layer1.0.alpha1 9.723913192749023
layer1.0.alpha2 9.730278968811035
layer1.1.alpha1 9.72390365600586
layer1.1.alpha2 9.769729614257812
layer1.2.alpha1 9.722896575927734
layer1.2.alpha2 9.937151908874512
layer2.0.alpha1 9.724150657653809
layer2.0.alpha2 9.724981307983398
layer2.1.alpha1 9.7234468460083
layer2.1.alpha2 9.786700248718262
layer2.2.alpha1 9.722929000854492
layer2.2.alpha2 9.908984184265137
layer2.3.alpha1 9.72380256652832
layer2.3.alpha2 9.961801528930664
layer3.0.alpha1 9.723723411560059
layer3.0.alpha2 9.817399978637695
layer3.1.alpha1 9.724531173706055
layer3.1.alpha2 9.932223320007324
layer3.2.alpha1 9.723553657531738
layer3.2.alpha2 9.890010833740234
layer3.3.alpha1 9.723538398742676
layer3.3.alpha2 9.920958518981934
layer3.4.alpha1 9.723562240600586
layer3.4.alpha2 9.937725067138672
layer3.5.alpha1 9.723562240600586
layer3.5.alpha2 9.976652145385742
la

 * Prec@1 49.650
alpha1 9.493269920349121
layer1.0.alpha1 9.493714332580566
layer1.0.alpha2 9.499797821044922
layer1.1.alpha1 9.493696212768555
layer1.1.alpha2 9.550878524780273
layer1.2.alpha1 9.492443084716797
layer1.2.alpha2 9.845342636108398
layer2.0.alpha1 9.493968963623047
layer2.0.alpha2 9.49547004699707
layer2.1.alpha1 9.493146896362305
layer2.1.alpha2 9.576125144958496
layer2.2.alpha1 9.492539405822754
layer2.2.alpha2 9.785014152526855
layer2.3.alpha1 9.493571281433105
layer2.3.alpha2 9.912915229797363
layer3.0.alpha1 9.493474006652832
layer3.0.alpha2 9.612960815429688
layer3.1.alpha1 9.49441909790039
layer3.1.alpha2 9.823243141174316
layer3.2.alpha1 9.493277549743652
layer3.2.alpha2 9.739104270935059
layer3.3.alpha1 9.493258476257324
layer3.3.alpha2 9.81900405883789
layer3.4.alpha1 9.493287086486816
layer3.4.alpha2 9.85466194152832
layer3.5.alpha1 9.493287086486816
layer3.5.alpha2 9.94017219543457
layer4.0.alpha1 9.493287086486816
layer4.0.alpha2 9.667092323303223
layer4.1.al

current lr 9.11838e-04
Epoch: [19][0/40]	Time 1.527 (1.527)	Data 1.162 (1.162)	Loss 1.3234 (1.3234)	Prec@1 54.004 (54.004)
Test: [0/10]	Time 0.780 (0.780)	Loss 1.1520 (1.1520)	Prec@1 60.059 (60.059)
 * Prec@1 58.620
alpha1 9.238635063171387
layer1.0.alpha1 9.23902702331543
layer1.0.alpha2 9.24680233001709
layer1.1.alpha1 9.239175796508789
layer1.1.alpha2 9.30385684967041
layer1.2.alpha1 9.23762321472168
layer1.2.alpha2 9.713537216186523
layer2.0.alpha1 9.23949146270752
layer2.0.alpha2 9.240575790405273
layer2.1.alpha1 9.238383293151855
layer2.1.alpha2 9.330538749694824
layer2.2.alpha1 9.2377290725708
layer2.2.alpha2 9.62087631225586
layer2.3.alpha1 9.238855361938477
layer2.3.alpha2 9.847076416015625
layer3.0.alpha1 9.238748550415039
layer3.0.alpha2 9.37427806854248
layer3.1.alpha1 9.239771842956543
layer3.1.alpha2 9.672591209411621
layer3.2.alpha1 9.238539695739746
layer3.2.alpha2 9.547311782836914
layer3.3.alpha1 9.238519668579102
layer3.3.alpha2 9.684450149536133
layer3.4.alpha1 9.23

Test: [0/10]	Time 0.800 (0.800)	Loss 0.9585 (0.9585)	Prec@1 67.578 (67.578)
 * Prec@1 65.850
alpha1 9.035962104797363
layer1.0.alpha1 9.0357666015625
layer1.0.alpha2 9.045771598815918
layer1.1.alpha1 9.036423683166504
layer1.1.alpha2 9.105894088745117
layer1.2.alpha1 9.03487491607666
layer1.2.alpha2 9.594294548034668
layer2.0.alpha1 9.036930084228516
layer2.0.alpha2 9.039255142211914
layer2.1.alpha1 9.035591125488281
layer2.1.alpha2 9.13386344909668
layer2.2.alpha1 9.03488826751709
layer2.2.alpha2 9.476937294006348
layer2.3.alpha1 9.036057472229004
layer2.3.alpha2 9.793548583984375
layer3.0.alpha1 9.03594970703125
layer3.0.alpha2 9.17958927154541
layer3.1.alpha1 9.037005424499512
layer3.1.alpha2 9.534615516662598
layer3.2.alpha1 9.03573226928711
layer3.2.alpha2 9.396429061889648
layer3.3.alpha1 9.035714149475098
layer3.3.alpha2 9.571672439575195
layer3.4.alpha1 9.035744667053223
layer3.4.alpha2 9.662420272827148
layer3.5.alpha1 9.035744667053223
layer3.5.alpha2 9.833062171936035
layer4