### Package settings

In [1]:
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.autograd import Variable

import os
os.environ['CUDA_VISIBLE_DEVICES']='1'
import shutil
import argparse
import time
import logging
import math

import models.resnet as models
from dataset.data import *

import torchvision.models.utils as utils
# from tensorboardX import SummaryWriter 
import numpy as np
import easydict

In [2]:
args = easydict.EasyDict({
        'cmd': 'train',
        'data_dir': 'data/',
        'arch': 'multi_resnet18_kd',
        'dataset': 'cifar100',
        'workers': 8,
        'epoch': 10,
        'start_epoch': 0,
        'batch_size': 128,
        'lr': 0.1,
        'momentum': 0.9,
        'weight_decay': 5e-4,
        'print_freq': 100,
        'resume': '',
        'step_ratio': 0.1,
        'warm_up': None,
        'save_folder': 'save_checkpoints/',
        'summary_folder': 'rums_alpha01/',
        'eval_every': 1000,
        
        # kd parameters
        'temperature': 3,
        'alpha': 0.1,
        'beta': 1e-6
})

In [3]:
def main():
#     args = parse_args()
    save_path = args.save_path = os.path.join(args.save_folder, args.arch)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    args.logger_file = os.path.join(save_path, 'log_{}.txt'.format(args.cmd))
    handlers = [logging.FileHandler(args.logger_file, mode='w'),
                logging.StreamHandler()]
    logging.basicConfig(level=logging.INFO,
                        datefmt='%m-%d-%y %H:%M',
                        format='%(asctime)s:%(message)s',
                        handlers=handlers)
    
    if args.cmd == 'train':
        logging.info('start training {}'.format(args.arch))
        run_training(args)
    
    elif args.cmd == 'test':
        logging.info('start evaluating {} with checkpoints from {}'.format(
            args.arch, args.resume))
        run_test(args)

In [4]:
def run_test(args):
#     writer = SummaryWriter(args.summary_folder)
    if args.dataset == 'cifar100':
        model = models.__dict__[args.arch](num_classes=100)
    else:
        raise NotImplementedError
    model = torch.nn.DataParallel(model).cuda()

    # load checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            logging.info("=> loading checkpoint `{}`".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch'] + 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            logging.info('=> loaded checkpoint `{}` (epoch: {})'.format(
                args.resume, checkpoint['epoch']))
        else:
            logging.info('=> no checkpoint found at `{}`'.format(args.resume))
            exit()
    
    cudnn.benchmark = True

    #load dataset
    if args.dataset == 'cifar100':
        test_loader = prepare_cifar100_test_dataset(data_dir=args.data_dir, batch_size=args.batch_size, 
                                                        num_workers=args.workers)
    else:
        raise NotImplementedError
    criterion = nn.CrossEntropyLoss().cuda()
    validate(args, test_loader, model, criterion)

### Train function

In [5]:
def run_training(args):
#     writer = SummaryWriter(args.summary_folder)
    if args.dataset == 'cifar100':
        model = models.__dict__[args.arch](num_classes=100)
    else:
        raise NotImplementedError
    model = torch.nn.DataParallel(model).cuda()
    best_prec1 = 0

    if args.resume:
        if os.path.isfile(args.resume):
            logging.info("=> loading checkpoint `{}`".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            logging.info('=> loaded checkpoint `{}` (epoch: {})'.format(
                args.resume, checkpoint['epoch']))
        else:
            logging.info('=> no checkpoint found at `{}`'.format(args.resume))
    
    cudnn.benchmark = True
    if args.dataset == 'cifar100':
        train_loader = prepare_cifar100_train_dataset(data_dir=args.data_dir, batch_size=args.batch_size, 
                                                        num_workers=args.workers)
        test_loader = prepare_cifar100_test_dataset(data_dir=args.data_dir, batch_size=args.batch_size, 
                                                        num_workers=args.workers)
    else:
        raise NotImplementedError
   
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay = args.weight_decay)


    end = time.time()
    model.train()
    step = 0
    for current_epoch in range(args.start_epoch, args.epoch):
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        middle1_losses = AverageMeter()
        middle2_losses = AverageMeter()
        middle3_losses = AverageMeter()
        losses1_kd = AverageMeter()
        losses2_kd = AverageMeter()
        losses3_kd = AverageMeter()
        feature_losses_1 = AverageMeter()
        feature_losses_2 = AverageMeter()
        feature_losses_3 = AverageMeter()
        total_losses = AverageMeter()
        middle1_top1 = AverageMeter()
        middle2_top1 = AverageMeter()
        middle3_top1 = AverageMeter()

        adjust_learning_rate(args, optimizer, current_epoch)
        for i, (input, target) in enumerate(train_loader):
            data_time.update(time.time() - end)
            
            target = target.squeeze().long().cuda(non_blocking=True)
            input = Variable(input).cuda()
            
            # all outputs from a model
            output, middle_output1, middle_output2, middle_output3, \
            final_fea, middle1_fea, middle2_fea, middle3_fea = model(input)
            
            # cross entropy loss on a final classifier
            loss = criterion(output, target)
            losses.update(loss.item(), input.size(0))
            
            # cross entropy losses on shallow classifiers
            middle1_loss = criterion(middle_output1, target)
            middle1_losses.update(middle1_loss.item(), input.size(0))
            middle2_loss = criterion(middle_output2, target)
            middle2_losses.update(middle2_loss.item(), input.size(0))
            middle3_loss = criterion(middle_output3, target)
            middle3_losses.update(middle3_loss.item(), input.size(0))

            # output - logit
            temp4 = output / args.temperature
            temp4 = torch.softmax(temp4, dim=1)
            
            # KL loss between students and a teacher
            loss1by4 = kd_loss_function(middle_output1, temp4.detach(), args) * (args.temperature**2)
            losses1_kd.update(loss1by4, input.size(0))
            
            loss2by4 = kd_loss_function(middle_output2, temp4.detach(), args) * (args.temperature**2)
            losses2_kd.update(loss2by4, input.size(0))
            
            loss3by4 = kd_loss_function(middle_output3, temp4.detach(), args) * (args.temperature**2)
            losses3_kd.update(loss3by4, input.size(0))
            
            # Feature L2 loss
            feature_loss_1 = feature_loss_function(middle1_fea, final_fea.detach()) 
            feature_losses_1.update(feature_loss_1, input.size(0))
            feature_loss_2 = feature_loss_function(middle2_fea, final_fea.detach()) 
            feature_losses_2.update(feature_loss_2, input.size(0))
            feature_loss_3 = feature_loss_function(middle3_fea, final_fea.detach()) 
            feature_losses_3.update(feature_loss_3, input.size(0))

            # Total loss
            total_loss = (1 - args.alpha) * (loss + middle1_loss + middle2_loss + middle3_loss) + \
                        args.alpha * (loss1by4 + loss2by4 + loss3by4) + \
                        args.beta * (feature_loss_1 + feature_loss_2 + feature_loss_3)
            total_losses.update(total_loss.item(), input.size(0))
            
            prec1 = accuracy(output.data, target, topk=(1,))
            top1.update(prec1[0], input.size(0))

            middle1_prec1 = accuracy(middle_output1.data, target, topk=(1,))
            middle1_top1.update(middle1_prec1[0], input.size(0))
            middle2_prec1 = accuracy(middle_output2.data, target, topk=(1,))
            middle2_top1.update(middle2_prec1[0], input.size(0))
            middle3_prec1 = accuracy(middle_output3.data, target, topk=(1,))
            middle3_top1.update(middle3_prec1[0], input.size(0))
            
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.print_freq == 0:
                step += 1
                logging.info("Epoch: [{0}]\t"
                            "Iter: [{1}]\t"
                            "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                            "Data {data_time.val:.3f} ({data_time.avg:.3f})\t"
                            "Loss {loss.val:.3f} ({loss.avg:.3f})\t"
                            "Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t".format(
                                current_epoch,
                                i,
                                batch_time=batch_time,
                                data_time=data_time,
                                loss=total_losses,
                                top1=top1)
                ) 
        prec1 = validate(args, test_loader, model, criterion, None, current_epoch)
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        print("best: ", best_prec1)
        checkpoint_path = os.path.join(args.save_path, 'checkpoint_{:05d}.pth.tar'.format(current_epoch))
        save_checkpoint({
            'epoch': current_epoch,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            }, is_best, filename=checkpoint_path)
        shutil.copyfile(checkpoint_path, os.path.join(args.save_path, 'checkpoint_latest.pth.tar'))
        torch.cuda.empty_cache()

In [6]:
def validate(args, test_loader, model, criterion, writer=None, current_epoch=0):
    batch_time = AverageMeter()
    losses = AverageMeter()
    middle1_losses = AverageMeter()
    middle2_losses = AverageMeter()
    middle3_losses = AverageMeter()
    top1 = AverageMeter()
    middle1_top1 = AverageMeter()
    middle2_top1 = AverageMeter()
    middle3_top1 = AverageMeter()
    model.eval()
    end = time.time()
    for i, (input, target) in enumerate(test_loader):

        target = target.squeeze().long().cuda(non_blocking=True)
        input = Variable(input).cuda()

        output, middle_output1, middle_output2, middle_output3, \
        final_fea, middle1_fea, middle2_fea, middle3_fea = model(input)
            
        loss = criterion(output, target)
        losses.update(loss.item(), input.size(0))
        middle1_loss = criterion(middle_output1, target)
        middle1_losses.update(middle1_loss.item(), input.size(0))
        middle2_loss = criterion(middle_output2, target)
        middle2_losses.update(middle2_loss.item(), input.size(0))
        middle3_loss = criterion(middle_output3, target)
        middle3_losses.update(middle3_loss.item(), input.size(0))
            
        prec1 = accuracy(output.data, target, topk=(1,))
        top1.update(prec1[0], input.size(0))
        middle1_prec1 = accuracy(middle_output1.data, target, topk=(1,))
        middle1_top1.update(middle1_prec1[0], input.size(0))
        middle2_prec1 = accuracy(middle_output2.data, target, topk=(1,))
        middle2_top1.update(middle2_prec1[0], input.size(0))
        middle3_prec1 = accuracy(middle_output3.data, target, topk=(1,))
        middle3_top1.update(middle3_prec1[0], input.size(0))
        batch_time.update(time.time() - end)
        end = time.time()
    logging.info("Loss {loss.avg:.3f}\t"
                 "Prec@1 {top1.avg:.3f}\t"
                 "Middle1@1 {middle1_top1.avg:.3f}\t"
                 "Middle2@1 {middle2_top1.avg:.3f}\t"
                 "Middle3@1 {middle3_top1.avg:.3f}\t".format(
                    loss=losses,
                    top1=top1,
                    middle1_top1=middle1_top1,
                    middle2_top1=middle2_top1,
                    middle3_top1=middle3_top1))
    
    model.train()
    return top1.avg

### loss functions

In [7]:
def kd_loss_function(output, target_output,args):
    """Compute kd loss"""
    """
    para: output: middle ouptput logits.
    para: target_output: final output has divided by temperature and softmax.
    """

    output = output / args.temperature
    output_log_softmax = torch.log_softmax(output, dim=1)
    loss_kd = -torch.mean(torch.sum(output_log_softmax * target_output, dim=1))
    return loss_kd

def feature_loss_function(fea, target_fea):
    loss = (fea - target_fea)**2 * ((fea > 0) | (target_fea > 0)).float()
    return torch.abs(loss).sum()

In [8]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def adjust_learning_rate(args, optimizer, epoch):
    if args.warm_up and (epoch < 1):
        lr = 0.01
    elif 75 <= epoch < 130:
        lr = args.lr * (args.step_ratio ** 1)
    elif 130 <= epoch < 180:
        lr = args.lr * (args.step_ratio ** 2)
    elif epoch >=180:
        lr = args.lr * (args.step_ratio ** 3)
    else:
        lr = args.lr

    
    logging.info('Epoch [{}] learning rate = {}'.format(epoch, lr))
    
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def accuracy(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)  
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))  

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)
        res.append(correct_k.mul(100.0 / batch_size))
    
    return res

def save_checkpoint(state, is_best, filename):
    torch.save(state, filename)
    if is_best:
        save_path = os.path.dirname(filename)
        shutil.copyfile(filename, os.path.join(save_path, 'model_best.path.tar'))

In [9]:
if __name__ == '__main__':
    main()

11-26-20 17:55:start training multi_resnet18_kd


Files already downloaded and verified
Files already downloaded and verified


11-26-20 17:55:Epoch [0] learning rate = 0.1
11-26-20 17:55:Epoch: [0]	Iter: [0]	Time 0.794 (0.794)	Data 0.197 (0.197)	Loss 29.264 (29.264)	Prec@1 0.000 (0.000)	
11-26-20 17:55:Epoch: [0]	Iter: [100]	Time 0.050 (0.057)	Data 0.000 (0.002)	Loss 26.898 (28.019)	Prec@1 4.688 (4.525)	
11-26-20 17:55:Epoch: [0]	Iter: [200]	Time 0.050 (0.054)	Data 0.000 (0.001)	Loss 25.939 (27.246)	Prec@1 9.375 (6.394)	
11-26-20 17:55:Epoch: [0]	Iter: [300]	Time 0.051 (0.053)	Data 0.000 (0.001)	Loss 26.137 (26.781)	Prec@1 7.812 (7.722)	
11-26-20 17:55:Loss 3.550	Prec@1 14.500	Middle1@1 14.340	Middle2@1 14.240	Middle3@1 13.220	
11-26-20 17:55:Epoch [1] learning rate = 0.1


best:  tensor(14.5000, device='cuda:0')


11-26-20 17:55:Epoch: [1]	Iter: [0]	Time 1.969 (1.969)	Data 1.938 (1.938)	Loss 25.760 (25.760)	Prec@1 8.594 (8.594)	
11-26-20 17:56:Epoch: [1]	Iter: [100]	Time 0.051 (0.070)	Data 0.000 (0.020)	Loss 23.804 (24.744)	Prec@1 20.312 (15.377)	
11-26-20 17:56:Epoch: [1]	Iter: [200]	Time 0.051 (0.060)	Data 0.000 (0.010)	Loss 24.077 (24.425)	Prec@1 17.188 (16.787)	
11-26-20 17:56:Epoch: [1]	Iter: [300]	Time 0.051 (0.057)	Data 0.000 (0.007)	Loss 22.777 (24.115)	Prec@1 27.344 (18.376)	
11-26-20 17:56:Loss 3.136	Prec@1 23.330	Middle1@1 21.090	Middle2@1 22.550	Middle3@1 22.420	
11-26-20 17:56:Epoch [2] learning rate = 0.1


best:  tensor(23.3300, device='cuda:0')


11-26-20 17:56:Epoch: [2]	Iter: [0]	Time 1.896 (1.896)	Data 1.860 (1.860)	Loss 22.876 (22.876)	Prec@1 28.906 (28.906)	
11-26-20 17:56:Epoch: [2]	Iter: [100]	Time 0.051 (0.069)	Data 0.000 (0.019)	Loss 22.597 (22.243)	Prec@1 28.906 (27.970)	
11-26-20 17:56:Epoch: [2]	Iter: [200]	Time 0.052 (0.061)	Data 0.000 (0.010)	Loss 21.397 (22.027)	Prec@1 32.031 (29.101)	
11-26-20 17:56:Epoch: [2]	Iter: [300]	Time 0.052 (0.058)	Data 0.000 (0.007)	Loss 20.885 (21.856)	Prec@1 32.031 (29.939)	
11-26-20 17:56:Loss 2.710	Prec@1 31.420	Middle1@1 28.380	Middle2@1 27.200	Middle3@1 29.510	
11-26-20 17:56:Epoch [3] learning rate = 0.1


best:  tensor(31.4200, device='cuda:0')


11-26-20 17:56:Epoch: [3]	Iter: [0]	Time 1.887 (1.887)	Data 1.852 (1.852)	Loss 20.761 (20.761)	Prec@1 39.062 (39.062)	
11-26-20 17:56:Epoch: [3]	Iter: [100]	Time 0.051 (0.070)	Data 0.000 (0.019)	Loss 20.497 (20.536)	Prec@1 42.188 (37.106)	
11-26-20 17:56:Epoch: [3]	Iter: [200]	Time 0.052 (0.061)	Data 0.000 (0.010)	Loss 19.107 (20.392)	Prec@1 48.438 (37.865)	
11-26-20 17:56:Epoch: [3]	Iter: [300]	Time 0.052 (0.058)	Data 0.000 (0.006)	Loss 20.201 (20.240)	Prec@1 42.969 (38.772)	
11-26-20 17:57:Loss 2.586	Prec@1 34.970	Middle1@1 29.740	Middle2@1 29.630	Middle3@1 31.380	
11-26-20 17:57:Epoch [4] learning rate = 0.1


best:  tensor(34.9700, device='cuda:0')


11-26-20 17:57:Epoch: [4]	Iter: [0]	Time 1.907 (1.907)	Data 1.875 (1.875)	Loss 19.267 (19.267)	Prec@1 50.000 (50.000)	
11-26-20 17:57:Epoch: [4]	Iter: [100]	Time 0.052 (0.070)	Data 0.000 (0.019)	Loss 20.026 (19.312)	Prec@1 40.625 (44.787)	
11-26-20 17:57:Epoch: [4]	Iter: [200]	Time 0.052 (0.061)	Data 0.000 (0.010)	Loss 18.626 (19.205)	Prec@1 51.562 (45.099)	
11-26-20 17:57:Epoch: [4]	Iter: [300]	Time 0.052 (0.058)	Data 0.000 (0.007)	Loss 18.794 (19.169)	Prec@1 49.219 (45.473)	
11-26-20 17:57:Loss 1.984	Prec@1 45.360	Middle1@1 33.910	Middle2@1 37.720	Middle3@1 42.440	
11-26-20 17:57:Epoch [5] learning rate = 0.1


best:  tensor(45.3600, device='cuda:0')


11-26-20 17:57:Epoch: [5]	Iter: [0]	Time 1.885 (1.885)	Data 1.849 (1.849)	Loss 18.660 (18.660)	Prec@1 48.438 (48.438)	
11-26-20 17:57:Epoch: [5]	Iter: [100]	Time 0.052 (0.070)	Data 0.000 (0.019)	Loss 18.527 (18.369)	Prec@1 47.656 (49.907)	
11-26-20 17:57:Epoch: [5]	Iter: [200]	Time 0.054 (0.061)	Data 0.000 (0.010)	Loss 19.358 (18.461)	Prec@1 50.000 (49.436)	
11-26-20 17:57:Epoch: [5]	Iter: [300]	Time 0.053 (0.058)	Data 0.000 (0.006)	Loss 19.363 (18.443)	Prec@1 49.219 (49.538)	
11-26-20 17:57:Loss 2.070	Prec@1 44.990	Middle1@1 32.640	Middle2@1 37.780	Middle3@1 40.990	
11-26-20 17:57:Epoch [6] learning rate = 0.1


best:  tensor(45.3600, device='cuda:0')


11-26-20 17:57:Epoch: [6]	Iter: [0]	Time 1.885 (1.885)	Data 1.851 (1.851)	Loss 18.066 (18.066)	Prec@1 50.781 (50.781)	
11-26-20 17:57:Epoch: [6]	Iter: [100]	Time 0.054 (0.071)	Data 0.000 (0.019)	Loss 17.230 (17.933)	Prec@1 59.375 (53.063)	
11-26-20 17:57:Epoch: [6]	Iter: [200]	Time 0.053 (0.062)	Data 0.000 (0.010)	Loss 17.876 (17.961)	Prec@1 51.562 (52.542)	
11-26-20 17:58:Epoch: [6]	Iter: [300]	Time 0.054 (0.059)	Data 0.000 (0.006)	Loss 17.951 (17.924)	Prec@1 51.562 (52.785)	
11-26-20 17:58:Loss 2.015	Prec@1 47.540	Middle1@1 35.880	Middle2@1 39.860	Middle3@1 42.780	
11-26-20 17:58:Epoch [7] learning rate = 0.1


best:  tensor(47.5400, device='cuda:0')


11-26-20 17:58:Epoch: [7]	Iter: [0]	Time 1.924 (1.924)	Data 1.894 (1.894)	Loss 17.677 (17.677)	Prec@1 53.125 (53.125)	
11-26-20 17:58:Epoch: [7]	Iter: [100]	Time 0.052 (0.071)	Data 0.000 (0.019)	Loss 16.650 (17.494)	Prec@1 61.719 (56.165)	
11-26-20 17:58:Epoch: [7]	Iter: [200]	Time 0.053 (0.062)	Data 0.000 (0.010)	Loss 17.772 (17.511)	Prec@1 50.000 (55.826)	
11-26-20 17:58:Epoch: [7]	Iter: [300]	Time 0.052 (0.059)	Data 0.000 (0.007)	Loss 17.453 (17.499)	Prec@1 53.125 (55.666)	
11-26-20 17:58:Loss 1.935	Prec@1 48.120	Middle1@1 38.730	Middle2@1 39.660	Middle3@1 44.240	
11-26-20 17:58:Epoch [8] learning rate = 0.1


best:  tensor(48.1200, device='cuda:0')


11-26-20 17:58:Epoch: [8]	Iter: [0]	Time 1.935 (1.935)	Data 1.899 (1.899)	Loss 15.987 (15.987)	Prec@1 65.625 (65.625)	
11-26-20 17:58:Epoch: [8]	Iter: [100]	Time 0.054 (0.072)	Data 0.000 (0.019)	Loss 17.288 (17.150)	Prec@1 59.375 (57.704)	
11-26-20 17:58:Epoch: [8]	Iter: [200]	Time 0.052 (0.062)	Data 0.000 (0.010)	Loss 18.339 (17.119)	Prec@1 47.656 (57.676)	
11-26-20 17:58:Epoch: [8]	Iter: [300]	Time 0.054 (0.059)	Data 0.000 (0.007)	Loss 17.297 (17.170)	Prec@1 58.594 (57.345)	
11-26-20 17:58:Loss 1.891	Prec@1 48.940	Middle1@1 40.010	Middle2@1 43.980	Middle3@1 45.480	
11-26-20 17:58:Epoch [9] learning rate = 0.1


best:  tensor(48.9400, device='cuda:0')


11-26-20 17:58:Epoch: [9]	Iter: [0]	Time 1.915 (1.915)	Data 1.886 (1.886)	Loss 17.111 (17.111)	Prec@1 60.938 (60.938)	
11-26-20 17:58:Epoch: [9]	Iter: [100]	Time 0.054 (0.072)	Data 0.000 (0.019)	Loss 17.322 (16.775)	Prec@1 55.469 (59.700)	
11-26-20 17:59:Epoch: [9]	Iter: [200]	Time 0.054 (0.063)	Data 0.000 (0.010)	Loss 16.480 (16.818)	Prec@1 67.188 (59.449)	
11-26-20 17:59:Epoch: [9]	Iter: [300]	Time 0.052 (0.060)	Data 0.000 (0.007)	Loss 17.244 (16.850)	Prec@1 53.906 (59.526)	
11-26-20 17:59:Loss 2.112	Prec@1 45.820	Middle1@1 29.120	Middle2@1 34.810	Middle3@1 41.540	


best:  tensor(48.9400, device='cuda:0')
