In [1]:
import argparse
import logging
import math
import os
import random
import shutil
import time
from copy import deepcopy
from collections import OrderedDict

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from cifar import get_cifar10
from misc import AverageMeter, accuracy

logger = logging.getLogger(__name__)

best_acc = 0

# Hyper-Parameters 

In [0]:
class HyperParameters:

    def __init__(self):
        self.gpu_id      = 0        # id(s) for CUDA_VISIBLE_DEVICES
        self.num_workers = 4        # number of workers
        self.epochs      = 1024     # number of total epochs to run
        self.start_epoch = 54       # manual epoch number (useful on restarts)
        self.batch_size  = 64       # train batchsize
        self.lr          = 0.03     # initial learning rate
        self.warmup      = 0        # warmup epochs (unlabeled data based)
        self.wdecay      = 5e-4     # weight decay
        self.nesterov    = True     # use nesterov momentum
        self.use_ema     = True     # use EMA model
        self.ema_decay   = 0.999    # EMA decay rate
        self.mu          = 7        # coefficient of unlabeled batch size
        self.lambda_u    = 1        # coefficient of unlabeled loss
        self.threshold   = 0.95     # pseudo label threshold
        self.k_img       = 65536    # number of labeled examples to generate with augmentation techniques
        self.out         = 'result' # directory to output the result
        self.resume      = ''       # path to latest checkpoint (default: none)

args = HyperParameters()

# Train

In [0]:
def train_one_epoch(args, labeled_trainloader, unlabeled_trainloader, model, optimizer, ema_model, scheduler, epoch):
        
    batch_time = AverageMeter()
    data_time  = AverageMeter()
    losses     = AverageMeter()
    losses_x   = AverageMeter()
    losses_u   = AverageMeter()
    end = time.time()
    
    progress_bar = tqdm(range(args.iteration))
    
    train_loader = zip(labeled_trainloader, unlabeled_trainloader)
    model.train()
    for batch_idx, (data_x, data_u) in enumerate(train_loader):
        data_time.update(time.time() - end)
        
        # prepare data
        inputs_x, targets_x         = data_x
        (inputs_u_w, inputs_u_s), _ = data_u
        batch_size                  = inputs_x.shape[0]
        inputs                      = torch.cat((inputs_x, inputs_u_w, inputs_u_s)).to(args.device)

        # Supervised Loss
        targets_x                   = targets_x.to(args.device)
        logits                      = model(inputs)
        logits_x                    = logits[:batch_size]
        logits_u_w, logits_u_s      = logits[batch_size:].chunk(2)
        del logits
        Lx = F.cross_entropy(logits_x, targets_x, reduction='mean')

        # Unsupervised Loss
        pseudo_label         = torch.softmax(logits_u_w.detach_(), dim=-1)
        max_probs, targets_u = torch.max(pseudo_label, dim=-1)
        mask                 = max_probs.ge(args.threshold).float()
        Lu = (F.cross_entropy(logits_u_s, targets_u, reduction='none') * mask).mean()

        # Final Loss
        loss = Lx + args.lambda_u * Lu
        loss.backward()

        # update loss array
        losses.update(loss.item())
        losses_x.update(Lx.item())
        losses_u.update(Lu.item())

        # update optimizer
        optimizer.step()
        scheduler.step()
        if args.use_ema:
            ema_model.update(model)
        model.zero_grad()

        # print progress
        batch_time.update(time.time() - end)
        end       = time.time()
        mask_prob = mask.mean().item()
        progress_bar.set_description("Train Epoch: {epoch}/{epochs:4}. Iter: {batch:4}/{iter:4}. LR: {lr:.6f}. Data: {data:.3f}s. Batch: {bt:.3f}s. Loss: {loss:.4f}. Loss_x: {loss_x:.4f}. Loss_u: {loss_u:.4f}. Mask: {mask:.4f}. ".format(
                epoch=epoch + 1, epochs=args.epochs, batch=batch_idx + 1, iter=args.iteration,
                lr=scheduler.get_last_lr()[0], data=data_time.avg, bt=batch_time.avg,
                loss=losses.avg, loss_x=losses_x.avg, loss_u=losses_u.avg, mask=mask_prob))
        progress_bar.update()
    
    progress_bar.close()
    
    return losses.avg, losses_x.avg, losses_u.avg, mask_prob

# Test

In [0]:
def test(args, test_loader, model, epoch):
    
    batch_time = AverageMeter()
    data_time  = AverageMeter()
    losses     = AverageMeter()
    top1       = AverageMeter()
    top5       = AverageMeter()
    end        = time.time()

    test_loader = tqdm(test_loader)

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            data_time.update(time.time() - end)
            model.eval()

            inputs  = inputs.to(args.device)
            targets = targets.to(args.device)
            outputs = model(inputs)
            loss    = F.cross_entropy(outputs, targets)

            prec1, prec5 = accuracy(outputs, targets, topk=(1, 5))
            losses.update(loss.item(), inputs.shape[0])
            top1.update(prec1.item(), inputs.shape[0])
            top5.update(prec5.item(), inputs.shape[0])
            batch_time.update(time.time() - end)
            end = time.time()
            
            test_loader.set_description("Test Iter: {batch:4}/{iter:4}. Data: {data:.3f}s. Batch: {bt:.3f}s. Loss: {loss:.4f}. top1: {top1:.2f}. top5: {top5:.2f}. ".format(
                    batch=batch_idx + 1, iter=len(test_loader), data=data_time.avg, bt=batch_time.avg,
                    loss=losses.avg, top1=top1.avg, top5=top5.avg))
        
        test_loader.close()

    logger.info("top-1 acc: {:.2f}".format(top1.avg))
    logger.info("top-5 acc: {:.2f}".format(top5.avg))
    return losses.avg, top1.avg

# EMA (Exponential Moving Average)

In [0]:
class ModelEMA(object):
    def __init__(self, args, model, decay, device='', resume=''):
        self.ema = deepcopy(model)
        self.ema.eval()
        self.decay  = decay
        self.device = device
        self.wd     = args.lr * args.wdecay
        if device:
            self.ema.to(device=device)
        self.ema_has_module = hasattr(self.ema, 'module')
        if resume:
            self._load_checkpoint(resume)
        for p in self.ema.parameters():
            p.requires_grad_(False)

    def _load_checkpoint(self, checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        assert isinstance(checkpoint, dict)
        if 'ema_state_dict' in checkpoint:
            new_state_dict = OrderedDict()
            for k, v in checkpoint['ema_state_dict'].items():
                if self.ema_has_module:
                    name = 'module.' + k if not k.startswith('module') else k
                else:
                    name = k
                new_state_dict[name] = v
            self.ema.load_state_dict(new_state_dict)

    def update(self, model):
        needs_module = hasattr(model, 'module') and not self.ema_has_module
        with torch.no_grad():
            msd = model.state_dict()
            for k, ema_v in self.ema.state_dict().items():
                if needs_module:
                    k = 'module.' + k
                model_v = msd[k].detach()
                if self.device:
                    model_v = model_v.to(device=self.device)
                ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v)
                # weight decay
                if 'bn' not in k:
                    msd[k] = msd[k] * (1. - self.wd)

# Main

In [None]:
def save_checkpoint(state, is_best, checkpoint, filename='checkpoint.pth.tar'):
    filepath = os.path.join(checkpoint, filename)
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'model_best.pth.tar'))

def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=7./16., last_epoch=-1):
    
    def _lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        no_progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0., math.cos(math.pi * num_cycles * no_progress))

    return LambdaLR(optimizer, _lr_lambda, last_epoch)

## Set Config

In [0]:
cudnn.benchmark = True
global best_acc

args.device     = torch.device('cuda', args.gpu_id)
args.world_size = 1
args.n_gpu      = torch.cuda.device_count()

05/30/2020 23:13:06 - INFO - __main__ -   {'gpu_id': 0, 'num_workers': 4, 'epochs': 1024, 'start_epoch': 0, 'batch_size': 64, 'lr': 0.03, 'warmup': 0, 'wdecay': 0.0005, 'nesterov': True, 'use_ema': True, 'ema_decay': 0.999, 'mu': 7, 'lambda_u': 1, 'threshold': 0.95, 'k_img': 65536, 'out': 'result', 'resume': '', 'amp': True, 'opt_level': 'O1', 'local_rank': -1, 'device': device(type='cuda', index=0), 'world_size': 1, 'n_gpu': 1}


## Set seed

In [0]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

os.makedirs(args.out, exist_ok=True)
writer = SummaryWriter(args.out)

## Get CIFAR10 data (augmented)

In [0]:
num_labeled = 250
labeled_dataset, unlabeled_dataset, test_dataset = get_cifar10('./data', num_labeled, args.k_img, args.k_img * args.mu)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/cifar-10-python.tar.gz to ./data


05/30/2020 23:13:25 - INFO - cifar -   Dataset: CIFAR10
05/30/2020 23:13:25 - INFO - cifar -   Labeled examples: 65536 Unlabeled examples: 458752


## Create WideResNet model

In [0]:
from wideresnet import build_wideresnet
model = build_wideresnet(depth=28, widen_factor=2, dropout=0, num_classes=10)
logger.info("Total params: {:.2f}M".format(sum(p.numel() for p in model.parameters())/1e6))

model.to(args.device)

train_sampler = RandomSampler

05/30/2020 23:13:25 - INFO - wideresnet -   Model: WideResNet 28x2
05/30/2020 23:13:25 - INFO - __main__ -   Total params: 1.47M


## Data Loader

In [0]:
labeled_trainloader = DataLoader(
        labeled_dataset,
        sampler=train_sampler(labeled_dataset),
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        drop_last=True)

unlabeled_trainloader = DataLoader(
        unlabeled_dataset,
        sampler=train_sampler(unlabeled_dataset),
        batch_size=args.batch_size*args.mu,
        num_workers=args.num_workers,
        drop_last=True)

test_loader = DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=args.batch_size,
        num_workers=args.num_workers)

## Optimization

In [0]:
optimizer        = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, nesterov=args.nesterov)
args.iteration   = args.k_img // args.batch_size // args.world_size
args.total_steps = args.epochs * args.iteration
scheduler        = get_cosine_schedule_with_warmup(optimizer, args.warmup * args.iteration, args.total_steps)

if args.use_ema:
    ema_model = ModelEMA(args, model, args.ema_decay, args.device)

start_epoch = 0

## Checkpoint

In [0]:
if args.resume:
    logger.info("==> Resuming from checkpoint..")
    assert os.path.isfile(args.resume), "Error: no checkpoint directory found!"
    args.out    = os.path.dirname(args.resume)
    checkpoint  = torch.load(args.resume)
    best_acc    = checkpoint['best_acc']
    start_epoch = checkpoint['epoch']
    model.load_state_dict(checkpoint['state_dict'])
    if args.use_ema:
        ema_model.ema.load_state_dict(checkpoint['ema_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])

logger.info("***** Running training *****")
logger.info(f"  Task = {'cifar10'}@{'250'}")
logger.info(f"  Num Epochs = {args.epochs}")
logger.info(f"  Batch size per GPU = {args.batch_size}")
logger.info(f"  Total train batch size = {args.batch_size*args.world_size}")
logger.info(f"  Total optimization steps = {args.total_steps}")

05/30/2020 23:13:41 - INFO - __main__ -   ***** Running training *****
05/30/2020 23:13:41 - INFO - __main__ -     Task = cifar10@250
05/30/2020 23:13:41 - INFO - __main__ -     Num Epochs = 1024
05/30/2020 23:13:41 - INFO - __main__ -     Batch size per GPU = 64
05/30/2020 23:13:41 - INFO - __main__ -     Total train batch size = 64
05/30/2020 23:13:41 - INFO - __main__ -     Total optimization steps = 1048576


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


## Loop on Epoch

In [0]:
test_accs = []
model.zero_grad()
for epoch in range(start_epoch, args.epochs):

    train_loss, train_loss_x, train_loss_u, mask_prob = train_one_epoch(args, labeled_trainloader, unlabeled_trainloader, model, optimizer, ema_model, scheduler, epoch)

    logger.info("Epoch {}. train_loss: {:.4f}. train_loss_x: {:.4f}. train_loss_u: {:.4f}.".format(epoch+1, train_loss, train_loss_x, train_loss_u))

    if args.use_ema:
        test_model = ema_model.ema
    else:
        test_model = model

    test_loss, test_acc = test(args, test_loader, test_model, epoch)

    is_best = test_acc > best_acc
    best_acc = max(test_acc, best_acc)
    model_to_save = model.module if hasattr(model, "module") else model
    if args.use_ema:
        ema_to_save = ema_model.ema.module if hasattr(ema_model.ema, "module") else ema_model.ema
    save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model_to_save.state_dict(),
                'ema_state_dict': ema_to_save.state_dict() if args.use_ema else None,
                'acc': test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, is_best, args.out)

    test_accs.append(test_acc)
    logger.info('Best top-1 acc: {:.2f}'.format(best_acc))
    logger.info('Mean top-1 acc: {:.2f}\n'.format(np.mean(test_accs[-20:])))

writer.close()

Train Epoch: 1/1024. Iter: 1024/1024. LR: 0.030000. Data: 0.028s. Batch: 0.609s. Loss: 0.5294. Loss_x: 0.2882. Loss_u: 0.2411. Mask: 0.4509. : 100%|██████████| 1024/1024 [10:23<00:00,  1.64it/s]
05/30/2020 23:24:04 - INFO - __main__ -   Epoch 1. train_loss: 0.5294. train_loss_x: 0.2882. train_loss_u: 0.2411.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.039s. Loss: 9.3724. top1: 18.92. top5: 54.28. : 100%|██████████| 157/157 [00:06<00:00, 25.14it/s]
05/30/2020 23:24:11 - INFO - __main__ -   top-1 acc: 18.92
05/30/2020 23:24:11 - INFO - __main__ -   top-5 acc: 54.28
05/30/2020 23:24:11 - INFO - __main__ -   Best top-1 acc: 18.92
05/30/2020 23:24:11 - INFO - __main__ -   Mean top-1 acc: 18.92

Train Epoch: 2/1024. Iter: 1024/1024. LR: 0.030000. Data: 0.030s. Batch: 0.618s. Loss: 0.3234. Loss_x: 0.0194. Loss_u: 0.3040. Mask: 0.4732. : 100%|██████████| 1024/1024 [10:32<00:00,  1.62it/s]
05/30/2020 23:34:44 - INFO - __main__ -   Epoch 2. train_loss: 0.3234. train_loss_x: 0.0194. train_loss_u

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 131072.0


Train Epoch: 5/1024. Iter: 1024/1024. LR: 0.029999. Data: 0.028s. Batch: 0.601s. Loss: 0.2640. Loss_x: 0.0052. Loss_u: 0.2588. Mask: 0.6272. : 100%|██████████| 1024/1024 [10:15<00:00,  1.66it/s]
05/31/2020 00:05:59 - INFO - __main__ -   Epoch 5. train_loss: 0.2640. train_loss_x: 0.0052. train_loss_u: 0.2588.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.037s. Loss: 2.3510. top1: 62.13. top5: 94.45. : 100%|██████████| 157/157 [00:05<00:00, 26.57it/s]
05/31/2020 00:06:05 - INFO - __main__ -   top-1 acc: 62.13
05/31/2020 00:06:05 - INFO - __main__ -   top-5 acc: 94.45
05/31/2020 00:06:05 - INFO - __main__ -   Best top-1 acc: 62.13
05/31/2020 00:06:05 - INFO - __main__ -   Mean top-1 acc: 45.13

Train Epoch: 6/1024. Iter: 1024/1024. LR: 0.029999. Data: 0.028s. Batch: 0.603s. Loss: 0.2555. Loss_x: 0.0039. Loss_u: 0.2516. Mask: 0.6518. : 100%|██████████| 1024/1024 [10:17<00:00,  1.66it/s]
05/31/2020 00:16:23 - INFO - __main__ -   Epoch 6. train_loss: 0.2555. train_loss_x: 0.0039. train_loss_u

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


Train Epoch: 9/1024. Iter: 1024/1024. LR: 0.029998. Data: 0.032s. Batch: 0.645s. Loss: 0.2347. Loss_x: 0.0021. Loss_u: 0.2326. Mask: 0.7321. : 100%|██████████| 1024/1024 [11:01<00:00,  1.55it/s]
05/31/2020 00:48:32 - INFO - __main__ -   Epoch 9. train_loss: 0.2347. train_loss_x: 0.0021. train_loss_u: 0.2326.
Test Iter:  157/ 157. Data: 0.010s. Batch: 0.044s. Loss: 1.7659. top1: 72.29. top5: 96.88. : 100%|██████████| 157/157 [00:06<00:00, 22.59it/s]
05/31/2020 00:48:39 - INFO - __main__ -   top-1 acc: 72.29
05/31/2020 00:48:39 - INFO - __main__ -   top-5 acc: 96.88
05/31/2020 00:48:39 - INFO - __main__ -   Best top-1 acc: 72.29
05/31/2020 00:48:39 - INFO - __main__ -   Mean top-1 acc: 55.71

Train Epoch: 10/1024. Iter: 1024/1024. LR: 0.029997. Data: 0.033s. Batch: 0.658s. Loss: 0.2304. Loss_x: 0.0017. Loss_u: 0.2287. Mask: 0.7054. : 100%|██████████| 1024/1024 [11:13<00:00,  1.52it/s]
05/31/2020 00:59:53 - INFO - __main__ -   Epoch 10. train_loss: 0.2304. train_loss_x: 0.0017. train_loss

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


Train Epoch: 12/1024. Iter: 1024/1024. LR: 0.029996. Data: 0.029s. Batch: 0.619s. Loss: 0.2251. Loss_x: 0.0014. Loss_u: 0.2238. Mask: 0.7991. : 100%|██████████| 1024/1024 [10:34<00:00,  1.61it/s]
05/31/2020 01:21:32 - INFO - __main__ -   Epoch 12. train_loss: 0.2251. train_loss_x: 0.0014. train_loss_u: 0.2238.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.038s. Loss: 1.5304. top1: 75.52. top5: 97.49. : 100%|██████████| 157/157 [00:05<00:00, 26.27it/s]
05/31/2020 01:21:38 - INFO - __main__ -   top-1 acc: 75.52
05/31/2020 01:21:38 - INFO - __main__ -   top-5 acc: 97.49
05/31/2020 01:21:38 - INFO - __main__ -   Best top-1 acc: 75.52
05/31/2020 01:21:38 - INFO - __main__ -   Mean top-1 acc: 60.48

Train Epoch: 13/1024. Iter: 1024/1024. LR: 0.029995. Data: 0.029s. Batch: 0.608s. Loss: 0.2207. Loss_x: 0.0014. Loss_u: 0.2193. Mask: 0.7455. : 100%|██████████| 1024/1024 [10:22<00:00,  1.64it/s]
05/31/2020 01:32:00 - INFO - __main__ -   Epoch 13. train_loss: 0.2207. train_loss_x: 0.0014. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


Train Epoch: 14/1024. Iter: 1024/1024. LR: 0.029995. Data: 0.029s. Batch: 0.606s. Loss: 0.2195. Loss_x: 0.0011. Loss_u: 0.2185. Mask: 0.7656. : 100%|██████████| 1024/1024 [10:20<00:00,  1.65it/s]
05/31/2020 01:42:27 - INFO - __main__ -   Epoch 14. train_loss: 0.2195. train_loss_x: 0.0011. train_loss_u: 0.2185.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.037s. Loss: 1.3970. top1: 77.67. top5: 97.68. : 100%|██████████| 157/157 [00:05<00:00, 26.46it/s]
05/31/2020 01:42:33 - INFO - __main__ -   top-1 acc: 77.67
05/31/2020 01:42:33 - INFO - __main__ -   top-5 acc: 97.68
05/31/2020 01:42:33 - INFO - __main__ -   Best top-1 acc: 77.67
05/31/2020 01:42:33 - INFO - __main__ -   Mean top-1 acc: 62.87

Train Epoch: 15/1024. Iter: 1024/1024. LR: 0.029994. Data: 0.029s. Batch: 0.606s. Loss: 0.2151. Loss_x: 0.0011. Loss_u: 0.2140. Mask: 0.7500. : 100%|██████████| 1024/1024 [10:20<00:00,  1.65it/s]
05/31/2020 01:52:54 - INFO - __main__ -   Epoch 15. train_loss: 0.2151. train_loss_x: 0.0011. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 18/1024. Iter: 1024/1024. LR: 0.029991. Data: 0.031s. Batch: 0.640s. Loss: 0.2092. Loss_x: 0.0008. Loss_u: 0.2085. Mask: 0.7746. : 100%|██████████| 1024/1024 [10:55<00:00,  1.56it/s]
05/31/2020 02:25:15 - INFO - __main__ -   Epoch 18. train_loss: 0.2092. train_loss_x: 0.0008. train_loss_u: 0.2085.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.040s. Loss: 1.2806. top1: 79.57. top5: 98.12. : 100%|██████████| 157/157 [00:06<00:00, 24.74it/s]
05/31/2020 02:25:22 - INFO - __main__ -   top-1 acc: 79.57
05/31/2020 02:25:22 - INFO - __main__ -   top-5 acc: 98.12
05/31/2020 02:25:22 - INFO - __main__ -   Best top-1 acc: 79.57
05/31/2020 02:25:22 - INFO - __main__ -   Mean top-1 acc: 66.48

Train Epoch: 19/1024. Iter: 1024/1024. LR: 0.029990. Data: 0.031s. Batch: 0.643s. Loss: 0.2083. Loss_x: 0.0007. Loss_u: 0.2076. Mask: 0.7723. : 100%|██████████| 1024/1024 [10:58<00:00,  1.55it/s]
05/31/2020 02:36:20 - INFO - __main__ -   Epoch 19. train_loss: 0.2083. train_loss_x: 0.0007. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 21/1024. Iter: 1024/1024. LR: 0.029988. Data: 0.032s. Batch: 0.644s. Loss: 0.2049. Loss_x: 0.0005. Loss_u: 0.2044. Mask: 0.8304. : 100%|██████████| 1024/1024 [10:59<00:00,  1.55it/s]
05/31/2020 02:58:32 - INFO - __main__ -   Epoch 21. train_loss: 0.2049. train_loss_x: 0.0005. train_loss_u: 0.2044.
Test Iter:  157/ 157. Data: 0.007s. Batch: 0.039s. Loss: 1.0937. top1: 81.69. top5: 98.39. : 100%|██████████| 157/157 [00:06<00:00, 25.30it/s]
05/31/2020 02:58:39 - INFO - __main__ -   top-1 acc: 81.69
05/31/2020 02:58:39 - INFO - __main__ -   top-5 acc: 98.39
05/31/2020 02:58:39 - INFO - __main__ -   Best top-1 acc: 81.69
05/31/2020 02:58:39 - INFO - __main__ -   Mean top-1 acc: 71.06

Train Epoch: 22/1024. Iter: 1024/1024. LR: 0.029987. Data: 0.032s. Batch: 0.643s. Loss: 0.2047. Loss_x: 0.0006. Loss_u: 0.2041. Mask: 0.8125. : 100%|██████████| 1024/1024 [10:58<00:00,  1.56it/s]
05/31/2020 03:09:37 - INFO - __main__ -   Epoch 22. train_loss: 0.2047. train_loss_x: 0.0006. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 23/1024. Iter: 1024/1024. LR: 0.029986. Data: 0.030s. Batch: 0.638s. Loss: 0.2033. Loss_x: 0.0005. Loss_u: 0.2028. Mask: 0.8259. : 100%|██████████| 1024/1024 [10:53<00:00,  1.57it/s]
05/31/2020 03:20:37 - INFO - __main__ -   Epoch 23. train_loss: 0.2033. train_loss_x: 0.0005. train_loss_u: 0.2028.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.039s. Loss: 1.0131. top1: 82.76. top5: 98.59. : 100%|██████████| 157/157 [00:06<00:00, 25.21it/s]
05/31/2020 03:20:44 - INFO - __main__ -   top-1 acc: 82.76
05/31/2020 03:20:44 - INFO - __main__ -   top-5 acc: 98.59
05/31/2020 03:20:44 - INFO - __main__ -   Best top-1 acc: 82.76
05/31/2020 03:20:44 - INFO - __main__ -   Mean top-1 acc: 74.95

Train Epoch: 24/1024. Iter: 1024/1024. LR: 0.029984. Data: 0.031s. Batch: 0.640s. Loss: 0.2011. Loss_x: 0.0005. Loss_u: 0.2006. Mask: 0.8304. : 100%|██████████| 1024/1024 [10:55<00:00,  1.56it/s]
05/31/2020 03:31:40 - INFO - __main__ -   Epoch 24. train_loss: 0.2011. train_loss_x: 0.0005. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 25/1024. Iter: 1024/1024. LR: 0.029983. Data: 0.032s. Batch: 0.641s. Loss: 0.2016. Loss_x: 0.0005. Loss_u: 0.2011. Mask: 0.8259. : 100%|██████████| 1024/1024 [10:56<00:00,  1.56it/s]
05/31/2020 03:42:43 - INFO - __main__ -   Epoch 25. train_loss: 0.2016. train_loss_x: 0.0005. train_loss_u: 0.2011.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.040s. Loss: 0.9884. top1: 83.45. top5: 98.63. : 100%|██████████| 157/157 [00:06<00:00, 24.66it/s]
05/31/2020 03:42:49 - INFO - __main__ -   top-1 acc: 83.45
05/31/2020 03:42:49 - INFO - __main__ -   top-5 acc: 98.63
05/31/2020 03:42:49 - INFO - __main__ -   Best top-1 acc: 83.45
05/31/2020 03:42:49 - INFO - __main__ -   Mean top-1 acc: 77.31

Train Epoch: 26/1024. Iter: 1024/1024. LR: 0.029982. Data: 0.033s. Batch: 0.641s. Loss: 0.1999. Loss_x: 0.0004. Loss_u: 0.1995. Mask: 0.8237. : 100%|██████████| 1024/1024 [10:56<00:00,  1.56it/s]
05/31/2020 03:53:46 - INFO - __main__ -   Epoch 26. train_loss: 0.1999. train_loss_x: 0.0004. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 28/1024. Iter: 1024/1024. LR: 0.029979. Data: 0.032s. Batch: 0.642s. Loss: 0.1973. Loss_x: 0.0004. Loss_u: 0.1969. Mask: 0.8594. : 100%|██████████| 1024/1024 [10:57<00:00,  1.56it/s]
05/31/2020 04:15:53 - INFO - __main__ -   Epoch 28. train_loss: 0.1973. train_loss_x: 0.0004. train_loss_u: 0.1969.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.039s. Loss: 0.8845. top1: 84.82. top5: 98.97. : 100%|██████████| 157/157 [00:06<00:00, 25.39it/s]
05/31/2020 04:15:59 - INFO - __main__ -   top-1 acc: 84.82
05/31/2020 04:15:59 - INFO - __main__ -   top-5 acc: 98.97
05/31/2020 04:16:00 - INFO - __main__ -   Best top-1 acc: 84.82
05/31/2020 04:16:00 - INFO - __main__ -   Mean top-1 acc: 79.79

Train Epoch: 29/1024. Iter: 1024/1024. LR: 0.029977. Data: 0.030s. Batch: 0.623s. Loss: 0.1964. Loss_x: 0.0004. Loss_u: 0.1960. Mask: 0.8326. : 100%|██████████| 1024/1024 [10:38<00:00,  1.60it/s]
05/31/2020 04:26:38 - INFO - __main__ -   Epoch 29. train_loss: 0.1964. train_loss_x: 0.0004. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 30/1024. Iter: 1024/1024. LR: 0.029976. Data: 0.030s. Batch: 0.615s. Loss: 0.1951. Loss_x: 0.0003. Loss_u: 0.1948. Mask: 0.8549. : 100%|██████████| 1024/1024 [10:29<00:00,  1.63it/s]
05/31/2020 04:37:14 - INFO - __main__ -   Epoch 30. train_loss: 0.1951. train_loss_x: 0.0003. train_loss_u: 0.1948.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.037s. Loss: 0.8523. top1: 85.36. top5: 98.93. : 100%|██████████| 157/157 [00:05<00:00, 26.95it/s]
05/31/2020 04:37:20 - INFO - __main__ -   top-1 acc: 85.36
05/31/2020 04:37:20 - INFO - __main__ -   top-5 acc: 98.93
05/31/2020 04:37:20 - INFO - __main__ -   Best top-1 acc: 85.36
05/31/2020 04:37:20 - INFO - __main__ -   Mean top-1 acc: 81.01

Train Epoch: 31/1024. Iter: 1024/1024. LR: 0.029974. Data: 0.030s. Batch: 0.614s. Loss: 0.1943. Loss_x: 0.0003. Loss_u: 0.1940. Mask: 0.8594. : 100%|██████████| 1024/1024 [10:29<00:00,  1.63it/s]
05/31/2020 04:47:49 - INFO - __main__ -   Epoch 31. train_loss: 0.1943. train_loss_x: 0.0003. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 32/1024. Iter: 1024/1024. LR: 0.029972. Data: 0.029s. Batch: 0.614s. Loss: 0.1922. Loss_x: 0.0003. Loss_u: 0.1919. Mask: 0.8527. : 100%|██████████| 1024/1024 [10:28<00:00,  1.63it/s]
05/31/2020 04:58:24 - INFO - __main__ -   Epoch 32. train_loss: 0.1922. train_loss_x: 0.0003. train_loss_u: 0.1919.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.038s. Loss: 0.8218. top1: 85.98. top5: 99.00. : 100%|██████████| 157/157 [00:05<00:00, 26.26it/s]
05/31/2020 04:58:30 - INFO - __main__ -   top-1 acc: 85.98
05/31/2020 04:58:30 - INFO - __main__ -   top-5 acc: 99.00
05/31/2020 04:58:31 - INFO - __main__ -   Best top-1 acc: 85.98
05/31/2020 04:58:31 - INFO - __main__ -   Mean top-1 acc: 82.05

Train Epoch: 33/1024. Iter: 1024/1024. LR: 0.029971. Data: 0.030s. Batch: 0.613s. Loss: 0.1920. Loss_x: 0.0003. Loss_u: 0.1917. Mask: 0.8415. : 100%|██████████| 1024/1024 [10:27<00:00,  1.63it/s]
05/31/2020 05:08:58 - INFO - __main__ -   Epoch 33. train_loss: 0.1920. train_loss_x: 0.0003. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 35/1024. Iter: 1024/1024. LR: 0.029967. Data: 0.030s. Batch: 0.618s. Loss: 0.1900. Loss_x: 0.0003. Loss_u: 0.1898. Mask: 0.8549. : 100%|██████████| 1024/1024 [10:32<00:00,  1.62it/s]
05/31/2020 05:30:16 - INFO - __main__ -   Epoch 35. train_loss: 0.1900. train_loss_x: 0.0003. train_loss_u: 0.1898.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.039s. Loss: 0.7707. top1: 86.78. top5: 99.05. : 100%|██████████| 157/157 [00:06<00:00, 25.46it/s]
05/31/2020 05:30:22 - INFO - __main__ -   top-1 acc: 86.78
05/31/2020 05:30:22 - INFO - __main__ -   top-5 acc: 99.05
05/31/2020 05:30:22 - INFO - __main__ -   Best top-1 acc: 86.78
05/31/2020 05:30:22 - INFO - __main__ -   Mean top-1 acc: 83.39

Train Epoch: 36/1024. Iter: 1024/1024. LR: 0.029965. Data: 0.030s. Batch: 0.622s. Loss: 0.1889. Loss_x: 0.0003. Loss_u: 0.1886. Mask: 0.8795. : 100%|██████████| 1024/1024 [10:36<00:00,  1.61it/s]
05/31/2020 05:40:59 - INFO - __main__ -   Epoch 36. train_loss: 0.1889. train_loss_x: 0.0003. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 37/1024. Iter: 1024/1024. LR: 0.029963. Data: 0.030s. Batch: 0.620s. Loss: 0.1872. Loss_x: 0.0003. Loss_u: 0.1869. Mask: 0.8616. : 100%|██████████| 1024/1024 [10:34<00:00,  1.61it/s]
05/31/2020 05:51:40 - INFO - __main__ -   Epoch 37. train_loss: 0.1872. train_loss_x: 0.0003. train_loss_u: 0.1869.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.038s. Loss: 0.7603. top1: 87.17. top5: 99.07. : 100%|██████████| 157/157 [00:06<00:00, 25.81it/s]
05/31/2020 05:51:46 - INFO - __main__ -   top-1 acc: 87.17
05/31/2020 05:51:46 - INFO - __main__ -   top-5 acc: 99.07
05/31/2020 05:51:46 - INFO - __main__ -   Best top-1 acc: 87.17
05/31/2020 05:51:46 - INFO - __main__ -   Mean top-1 acc: 84.18

Train Epoch: 38/1024. Iter: 1024/1024. LR: 0.029961. Data: 0.029s. Batch: 0.619s. Loss: 0.1870. Loss_x: 0.0003. Loss_u: 0.1867. Mask: 0.8795. : 100%|██████████| 1024/1024 [10:34<00:00,  1.61it/s]
05/31/2020 06:02:21 - INFO - __main__ -   Epoch 38. train_loss: 0.1870. train_loss_x: 0.0003. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 39/1024. Iter: 1024/1024. LR: 0.029959. Data: 0.029s. Batch: 0.616s. Loss: 0.1860. Loss_x: 0.0003. Loss_u: 0.1857. Mask: 0.8705. : 100%|██████████| 1024/1024 [10:30<00:00,  1.62it/s]
05/31/2020 06:12:57 - INFO - __main__ -   Epoch 39. train_loss: 0.1860. train_loss_x: 0.0003. train_loss_u: 0.1857.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.038s. Loss: 0.7662. top1: 87.51. top5: 99.21. : 100%|██████████| 157/157 [00:06<00:00, 25.65it/s]
05/31/2020 06:13:03 - INFO - __main__ -   top-1 acc: 87.51
05/31/2020 06:13:03 - INFO - __main__ -   top-5 acc: 99.21
05/31/2020 06:13:03 - INFO - __main__ -   Best top-1 acc: 87.51
05/31/2020 06:13:03 - INFO - __main__ -   Mean top-1 acc: 84.91

Train Epoch: 40/1024. Iter: 1024/1024. LR: 0.029957. Data: 0.029s. Batch: 0.609s. Loss: 0.1856. Loss_x: 0.0003. Loss_u: 0.1853. Mask: 0.8795. : 100%|██████████| 1024/1024 [10:24<00:00,  1.64it/s]
05/31/2020 06:23:28 - INFO - __main__ -   Epoch 40. train_loss: 0.1856. train_loss_x: 0.0003. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 42/1024. Iter:  875/1024. LR: 0.029953. Data: 0.029s. Batch: 0.614s. Loss: 0.1845. Loss_x: 0.0003. Loss_u: 0.1842. Mask: 0.8661. :  85%|████████▌ | 875/1024 [08:56<01:26,  1.73it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


Train Epoch: 42/1024. Iter: 1024/1024. LR: 0.029952. Data: 0.029s. Batch: 0.611s. Loss: 0.1850. Loss_x: 0.0003. Loss_u: 0.1847. Mask: 0.8951. : 100%|██████████| 1024/1024 [10:25<00:00,  1.64it/s]
05/31/2020 06:44:33 - INFO - __main__ -   Epoch 42. train_loss: 0.1850. train_loss_x: 0.0003. train_loss_u: 0.1847.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.037s. Loss: 0.7066. top1: 87.72. top5: 99.24. : 100%|██████████| 157/157 [00:05<00:00, 26.92it/s]
05/31/2020 06:44:39 - INFO - __main__ -   top-1 acc: 87.72
05/31/2020 06:44:39 - INFO - __main__ -   top-5 acc: 99.24
05/31/2020 06:44:39 - INFO - __main__ -   Best top-1 acc: 87.72
05/31/2020 06:44:39 - INFO - __main__ -   Mean top-1 acc: 85.80

Train Epoch: 43/1024. Iter: 1024/1024. LR: 0.029950. Data: 0.030s. Batch: 0.609s. Loss: 0.1821. Loss_x: 0.0002. Loss_u: 0.1819. Mask: 0.8862. : 100%|██████████| 1024/1024 [10:24<00:00,  1.64it/s]
05/31/2020 06:55:03 - INFO - __main__ -   Epoch 43. train_loss: 0.1821. train_loss_x: 0.0002. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 47/1024. Iter: 1024/1024. LR: 0.029940. Data: 0.029s. Batch: 0.613s. Loss: 0.1804. Loss_x: 0.0002. Loss_u: 0.1802. Mask: 0.8951. : 100%|██████████| 1024/1024 [10:28<00:00,  1.63it/s]
05/31/2020 07:37:15 - INFO - __main__ -   Epoch 47. train_loss: 0.1804. train_loss_x: 0.0002. train_loss_u: 0.1802.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.037s. Loss: 0.6956. top1: 88.44. top5: 99.33. : 100%|██████████| 157/157 [00:05<00:00, 26.42it/s]
05/31/2020 07:37:21 - INFO - __main__ -   top-1 acc: 88.44
05/31/2020 07:37:21 - INFO - __main__ -   top-5 acc: 99.33
05/31/2020 07:37:21 - INFO - __main__ -   Best top-1 acc: 88.44
05/31/2020 07:37:21 - INFO - __main__ -   Mean top-1 acc: 86.97

Train Epoch: 48/1024. Iter: 1024/1024. LR: 0.029938. Data: 0.030s. Batch: 0.612s. Loss: 0.1801. Loss_x: 0.0002. Loss_u: 0.1799. Mask: 0.8906. : 100%|██████████| 1024/1024 [10:26<00:00,  1.63it/s]
05/31/2020 07:47:48 - INFO - __main__ -   Epoch 48. train_loss: 0.1801. train_loss_x: 0.0002. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 49/1024. Iter: 1024/1024. LR: 0.029935. Data: 0.029s. Batch: 0.610s. Loss: 0.1769. Loss_x: 0.0002. Loss_u: 0.1768. Mask: 0.8862. : 100%|██████████| 1024/1024 [10:24<00:00,  1.64it/s]
05/31/2020 07:58:19 - INFO - __main__ -   Epoch 49. train_loss: 0.1769. train_loss_x: 0.0002. train_loss_u: 0.1768.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.037s. Loss: 0.6999. top1: 88.62. top5: 99.34. : 100%|██████████| 157/157 [00:05<00:00, 26.67it/s]
05/31/2020 07:58:25 - INFO - __main__ -   top-1 acc: 88.62
05/31/2020 07:58:25 - INFO - __main__ -   top-5 acc: 99.34
05/31/2020 07:58:25 - INFO - __main__ -   Best top-1 acc: 88.62
05/31/2020 07:58:25 - INFO - __main__ -   Mean top-1 acc: 87.34

Train Epoch: 50/1024. Iter:  775/1024. LR: 0.029933. Data: 0.030s. Batch: 0.612s. Loss: 0.1780. Loss_x: 0.0002. Loss_u: 0.1778. Mask: 0.9063. :  76%|███████▌  | 775/1024 [07:54<02:27,  1.69it/s]

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 262144.0


Train Epoch: 50/1024. Iter: 1024/1024. LR: 0.029932. Data: 0.029s. Batch: 0.610s. Loss: 0.1783. Loss_x: 0.0002. Loss_u: 0.1782. Mask: 0.9129. : 100%|██████████| 1024/1024 [10:24<00:00,  1.64it/s]
05/31/2020 08:08:49 - INFO - __main__ -   Epoch 50. train_loss: 0.1783. train_loss_x: 0.0002. train_loss_u: 0.1782.
Test Iter:  157/ 157. Data: 0.008s. Batch: 0.037s. Loss: 0.6831. top1: 88.72. top5: 99.43. : 100%|██████████| 157/157 [00:05<00:00, 26.67it/s]
05/31/2020 08:08:55 - INFO - __main__ -   top-1 acc: 88.72
05/31/2020 08:08:55 - INFO - __main__ -   top-5 acc: 99.43
05/31/2020 08:08:55 - INFO - __main__ -   Best top-1 acc: 88.72
05/31/2020 08:08:55 - INFO - __main__ -   Mean top-1 acc: 87.50

Train Epoch: 51/1024. Iter: 1024/1024. LR: 0.029930. Data: 0.029s. Batch: 0.613s. Loss: 0.1753. Loss_x: 0.0002. Loss_u: 0.1751. Mask: 0.9174. : 100%|██████████| 1024/1024 [10:27<00:00,  1.63it/s]
05/31/2020 08:19:23 - INFO - __main__ -   Epoch 51. train_loss: 0.1753. train_loss_x: 0.0002. train_lo

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0


Train Epoch: 55/1024. Iter: 1024/1024. LR: 0.029918. Data: 0.029s. Batch: 0.607s. Loss: 0.1742. Loss_x: 0.0001. Loss_u: 0.1740. Mask: 0.9219. : 100%|██████████| 1024/1024 [10:21<00:00,  1.65it/s]
05/31/2020 09:01:23 - INFO - __main__ -   Epoch 55. train_loss: 0.1742. train_loss_x: 0.0001. train_loss_u: 0.1740.
Test Iter:  157/ 157. Data: 0.007s. Batch: 0.037s. Loss: 0.6997. top1: 88.88. top5: 99.33. : 100%|██████████| 157/157 [00:05<00:00, 26.76it/s]
05/31/2020 09:01:29 - INFO - __main__ -   top-1 acc: 88.88
05/31/2020 09:01:29 - INFO - __main__ -   top-5 acc: 99.33
05/31/2020 09:01:29 - INFO - __main__ -   Best top-1 acc: 89.00
05/31/2020 09:01:29 - INFO - __main__ -   Mean top-1 acc: 88.18

Train Epoch: 56/1024. Iter:  927/1024. LR: 0.029916. Data: 0.030s. Batch: 0.611s. Loss: 0.1734. Loss_x: 0.0002. Loss_u: 0.1732. Mask: 0.9286. :  91%|█████████ | 927/1024 [09:26<00:58,  1.66it/s]