In [29]:
import os
import argparse
import logging
logging.basicConfig(level=logging.INFO)
from common import data, fit
from common.util import download_file
import mxnet as mx
import numpy as np
from mxnet.symbol import *

In [30]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
def download_cifar100(data_dir="/efs/data/cifar-100-mxnet"):
    """Return the paths of the CIFAR-100 train and test record files.

    Despite its name, this function does not download anything: it only
    joins ``data_dir`` with the expected file names. It does not check that
    the files exist.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``train.rec`` and ``test.rec``. Defaults to the
        location this notebook has always used; pass another directory
        (e.g. ``"/data/cifar-100-mxnet"``) to run on a different machine.

    Returns
    -------
    tuple of str
        ``(train_path, test_path)``.
    """
    return (os.path.join(data_dir, "train.rec"),
            os.path.join(data_dir, "test.rec"))

In [37]:
(train_fname, val_fname) = download_cifar100()

# Build the argument parser from the shared helper modules; the experiment
# specific values are then installed as defaults below.
parser = argparse.ArgumentParser(description="train cifar100",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
fit.add_fit_args(parser)
data.add_data_args(parser)
data.add_data_aug_args(parser)
data.set_data_aug_level(parser, 2)

num_unsup_examples = 50000
subset_factor = 5

parser.set_defaults(
    # data
    data_train     = train_fname,
    data_val       = val_fname,
    num_classes    = 100,
    image_shape    = '3,28,28',
    prefix         = 'cifar_sup_subset2',
    pad_size       = 2,
    # train
    batch_size     = 80,  # todo currently has to divide 'num_validation_samples'
    num_epochs     = 120,
    lr_step_epochs = '90',
    gpus           = '4,5,6,7',
    loss_gpu       = 0,
    disp_batches   = 20,
    # Floor division keeps the sample count an int; true division produced
    # the float 10000.0 that showed up in the logged arguments.
    num_examples   = num_unsup_examples // subset_factor,
    wd             = 1e-4,
    lr             = .05,
    lr_factor      = .2,
    nembeddings    = 256,
    optimizer      = 'sgd'
)

# Parse an empty argument list so that only the defaults above apply and the
# kernel's own command line is never consulted.
args = parser.parse_args([])

# Number of 'dataUnsup<i>' inputs created per 'dataSup' input by the module
# builders; 0 disables the unsupervised inputs entirely.
unsup_multiplier = 0
# Integer count, for the same reason as num_examples above.
labeled_per_class = 500 // subset_factor

sample_seed = 47
# fit_model validates every `val_interval` epochs and, on validated epochs,
# saves a checkpoint every `save_interval` epochs.
val_interval = 1
save_interval = 5

In [38]:
from common.multi_iterator import Multi_iterator
from common.data import get_partial_rec_iter

# Iterator feeding the supervised input, requested together with the
# validation iterator and with devide_by=subset_factor.
train_sup, val = get_partial_rec_iter(args, shuffle=True, devide_by=subset_factor, get_val=True)

# Iterator feeding the unsupervised inputs, requested with devide_by=1; the
# second return value is not needed here.
train_unsup, _ = get_partial_rec_iter(args, shuffle=True, devide_by=1, get_val=False)

# Combine both iterators into the single training iterator used by the
# module builders and by fit_model.
num_sup_examples = args.num_classes * labeled_per_class
train = Multi_iterator(train_sup, train_unsup, unsup_multiplier,
                       num_unsup_examples, num_sup_examples)

0  times more unsup data than sup data


In [39]:
from symbols import inception_cifar as base_net
from common.lba import compute_semisup_loss, logit_loss
from common.wrapper_module import get_embedding_shapes

# Parameter initializer shared by all module builders below.
# (Alternative tried earlier:
#  mx.init.Xavier(rnd_type='gaussian', factor_type="avg", magnitude=2.34))
initializer = mx.init.Uniform(scale=0.01)

# Value handed to compute_semisup_loss by the loss-module builders.
t_nb = unsup_multiplier * args.batch_size

# Key-value store; also needed to derive the learning-rate schedule.
kv = mx.kvstore.create(name=args.kv_store)
(lr, lr_scheduler) = fit._get_lr_scheduler(args, kv)

def buildEmbeddingModule(arg_p=None, aux_p=None):
    """Build, bind and initialise the embedding network as an ``mx.mod.Module``.

    The input ``dataSup`` and the ``unsup_multiplier`` inputs ``dataUnsup<i>``
    are concatenated along axis 0, fed through ``base_net.build_embeddings``
    in a single pass, and split back into ``unsup_multiplier + 1`` outputs.

    Relies on the notebook globals ``args``, ``train``, ``initializer``,
    ``lr``, ``lr_scheduler``, ``unsup_multiplier`` and ``base_net``.

    Parameters
    ----------
    arg_p, aux_p : optional
        Forwarded positionally to ``Module.init_params`` after the
        initializer (its ``arg_params`` / ``aux_params`` slots). Pass the
        values loaded from a checkpoint to warm-start the network.

    Returns
    -------
    mx.mod.Module
        The module, already bound to ``train.provide_data`` with parameters
        and optimizer initialised.
    """
    data_names = ['dataSup'] + ['dataUnsup' + str(i) for i in range(unsup_multiplier)]

    # One symbolic input per data name, created in the same order as the names.
    inputs = [mx.symbol.Variable(name=input_name) for input_name in data_names]

    # Concatenate the inputs, run them through the network together, then
    # split the embeddings up again.
    stacked = mx.symbol.concat(*inputs, dim=0)
    embeddings = base_net.build_embeddings(stacked, nembeddings=args.nembeddings)
    splitted = mx.symbol.split(embeddings, num_outputs=(unsup_multiplier + 1),
                               axis=0, name='split')

    # Devices for training. Compare the string by value: `is ''` tests object
    # identity, which is not guaranteed to hold for equal strings.
    if args.gpus is None or args.gpus == '':
        devs = mx.cpu()
    else:
        devs = [mx.gpu(int(i)) for i in args.gpus.split(',')]

    model = mx.mod.Module(
        context     = devs,
        symbol      = splitted,
        data_names  = data_names,
        label_names = None)

    model.bind(data_shapes=train.provide_data)
    model.init_params(initializer, arg_p, aux_p)
    model.init_optimizer(optimizer=args.optimizer, optimizer_params=(
        ('learning_rate', lr),
        # Use the configured weight decay rather than a literal copy of it.
        ('wd', args.wd),
        ('momentum', args.mom),
        ('lr_scheduler', lr_scheduler)))

    return model


def buildSupLossModule(arg_p=None, aux_p=None):
    """Build the supervised-only loss module.

    The module takes ``embeddings_sup`` as data and ``labelSup`` as label and
    outputs only ``logit_loss``. The previous version also constructed the
    semi-supervised walker/visit losses when ``unsup_multiplier >= 1`` but
    never added them to the output; that unused construction was removed.
    Use ``buildLossModule`` when those losses are wanted.

    Relies on the notebook globals ``args``, ``train``, ``initializer``,
    ``lr``, ``lr_scheduler`` and ``unsup_multiplier``.

    Parameters
    ----------
    arg_p, aux_p : optional
        Forwarded positionally to ``Module.init_params`` after the
        initializer (its ``arg_params`` / ``aux_params`` slots).

    Returns
    -------
    mx.mod.Module
        The bound module with parameters and optimizer initialised. It is
        bound with ``inputs_need_grad=True``.
    """
    supEmbeddings = mx.symbol.Variable(name="embeddings_sup")
    labelSup = mx.symbol.Variable(name='labelSup')

    overall_loss = [logit_loss(supEmbeddings, labelSup, args.num_classes)]

    # todo maybe use gpu
    # `loss_gpu` may legitimately be 0 (GPU id 0), so test explicitly for
    # "unset" instead of relying on truthiness, and compare the empty string
    # by value rather than by identity.
    if args.loss_gpu is None or args.loss_gpu == '':
        devs = mx.cpu()
    else:
        devs = mx.gpu(args.loss_gpu)

    model = mx.mod.Module(
        context     = devs,
        symbol      = mx.symbol.Group(overall_loss),
        data_names  = ['embeddings_sup'],
        label_names = ['labelSup'])

    # Only the first shape returned by get_embedding_shapes is bound, matching
    # the single 'embeddings_sup' input of this module.
    sup_shape = get_embedding_shapes(args.batch_size, args.nembeddings, unsup_multiplier)[0]
    model.bind(data_shapes=[sup_shape],
               label_shapes=train.provide_label,
               inputs_need_grad=True)

    model.init_params(initializer, arg_p, aux_p)
    model.init_optimizer(optimizer=args.optimizer, optimizer_params=(
        ('learning_rate', lr),
        ('momentum', args.mom),
        # Use the configured weight decay rather than a literal copy of it.
        ('wd', args.wd),
        ('lr_scheduler', lr_scheduler)))

    return model


def buildLossModule(arg_p=None, aux_p=None):
    """Build the semi-supervised loss module.

    The module always outputs ``logit_loss`` on ``embeddings_sup``. When
    ``unsup_multiplier >= 1`` it additionally outputs the walker and visit
    losses returned by ``compute_semisup_loss`` (walker weight 0.8, visit
    weight 0.5), computed from the supervised embeddings and the
    concatenation of the ``embeddings_unsup<i>`` inputs.

    Relies on the notebook globals ``args``, ``train``, ``initializer``,
    ``lr``, ``lr_scheduler``, ``t_nb`` and ``unsup_multiplier``.

    Parameters
    ----------
    arg_p, aux_p : optional
        Forwarded positionally to ``Module.init_params`` after the
        initializer (its ``arg_params`` / ``aux_params`` slots).

    Returns
    -------
    mx.mod.Module
        The bound module with parameters and optimizer initialised. It is
        bound with ``inputs_need_grad=True``.
    """
    supEmbeddings = mx.symbol.Variable(name="embeddings_sup")
    labelSup = mx.symbol.Variable(name='labelSup')
    unsup_names = ['embeddings_unsup' + str(i) for i in range(unsup_multiplier)]

    semisup_losses = []
    if unsup_multiplier >= 1:
        unsup_inputs = [mx.symbol.Variable(name=input_name) for input_name in unsup_names]
        unsupEmbeddings = mx.symbol.concat(*unsup_inputs, dim=0)

        (walker_loss, visit_loss) = compute_semisup_loss(
            supEmbeddings, unsupEmbeddings, labelSup, t_nb,
            walker_weight=0.8, visit_weight=0.5)
        semisup_losses = [walker_loss, visit_loss]

    # The logit loss is built after the semi-supervised losses (as before)
    # but is placed first in the output group.
    overall_loss = [logit_loss(supEmbeddings, labelSup, args.num_classes)] + semisup_losses

    # todo maybe use gpu
    # `loss_gpu` may legitimately be 0 (GPU id 0), so test explicitly for
    # "unset" instead of relying on truthiness, and compare the empty string
    # by value rather than by identity.
    if args.loss_gpu is None or args.loss_gpu == '':
        devs = mx.cpu()
    else:
        devs = mx.gpu(args.loss_gpu)

    model = mx.mod.Module(
        context     = devs,
        symbol      = mx.symbol.Group(overall_loss),
        data_names  = ['embeddings_sup'] + unsup_names,
        label_names = ['labelSup'])

    # Allocate memory from the embedding and label shapes.
    model.bind(data_shapes=get_embedding_shapes(args.batch_size, args.nembeddings, unsup_multiplier),
               label_shapes=train.provide_label,
               inputs_need_grad=True)

    model.init_params(initializer, arg_p, aux_p)
    model.init_optimizer(optimizer=args.optimizer, optimizer_params=(
        ('learning_rate', lr),
        ('momentum', args.mom),
        # Use the configured weight decay rather than a literal copy of it.
        ('wd', args.wd),
        ('lr_scheduler', lr_scheduler)))

    return model

[11250.0]


In [None]:
from common.wrapper_module import WrapperModule
#eval_metrics = Multi_Accuracy(num= 3 if unsup_multiplier >= 1 else 1)
                    
def fit_model(args, embeddingModule, lossModule, data, **kwargs):
    """Train the combined embedding + loss model, validating after epochs.

    Relies on the notebook globals ``kv``, ``unsup_multiplier``,
    ``val_interval`` and ``save_interval``.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments. This function reads ``prefix``, ``batch_size``,
        ``disp_batches``, ``load_epoch`` and ``num_epochs``; the whole
        namespace is also passed to ``fit._load_model`` / ``fit._save_model``.
    embeddingModule, lossModule
        The two modules wrapped into a single ``WrapperModule``.
    data : tuple
        ``(train, val)`` data iterators. Note that inside this function the
        name shadows the imported ``data`` module.
    **kwargs
        If both ``arg_params`` and ``aux_params`` are given they are passed to
        ``model.fit``; otherwise the parameters come from ``fit._load_model``.

    Returns
    -------
    WrapperModule
        The trained wrapper model.

    Side effects
    ------------
    Writes one line per validated epoch to the file ``args.prefix + 'logs'``
    and saves a checkpoint under ``args.prefix`` on validated epochs that are
    multiples of ``save_interval``.
    """
    # logging
    # NOTE: logging.basicConfig does nothing if the root logger already has
    # handlers (the notebook's import cell configures it), so this format is
    # only applied when logging has not been set up before.
    head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    logging.info('start with arguments %s', args)

    # data iterators
    (train, val) = data

    # load model
    if 'arg_params' in kwargs and 'aux_params' in kwargs:
        arg_params = kwargs['arg_params']
        aux_params = kwargs['aux_params']
    else:
        # The first element of the returned triple is not needed here.
        _, arg_params, aux_params = fit._load_model(args, kv.rank)

    # The return value was never used; the call is kept in case the helper
    # has side effects. NOTE(review): confirm and drop if it has none.
    fit._save_model(args, kv.rank)

    model = WrapperModule(embeddingModule, lossModule, unsup_multiplier)

    # callbacks that run after each batch
    batch_end_callbacks = [mx.callback.Speedometer(args.batch_size * (unsup_multiplier + 1),
                                                   args.disp_batches)]

    # The context manager closes the log file even if training raises.
    with open(args.prefix + 'logs', 'w') as logf:

        def validate_model(epoch, *args_):
            """Epoch-end callback: score on `val`, log it, maybe checkpoint."""
            if epoch % val_interval != 0:
                return
            res = model.score(val)
            #TODO: pull this into default
            print('Epoch[%d] Validation-accuracy=%f' % (epoch, res))
            logf.write('Epoch[%d] Validation-accuracy=%f \n' % (epoch, res))  # python will convert \n to os.linesep
            logf.flush()

            if epoch % save_interval == 0:
                model.save_checkpoint(args.prefix, epoch)

        # run
        model.fit(train,
            begin_epoch        = args.load_epoch or 0,
            num_epoch          = args.num_epochs,
            kvstore            = kv,
            arg_params         = arg_params,
            aux_params         = aux_params,
            batch_end_callback = batch_end_callbacks,
            epoch_end_callback = validate_model,
            allow_missing      = True
            # To debug, create mx.mon.Monitor(interval=1000, pattern='.*')
            # and pass it here as `monitor=`.
                 )

    return model

In [None]:
# Rewind the combined training iterator before starting a run.
train.reset()

# ---- Alternative starting points (uncomment one to use it) ----------------
# (a) Warm-start the embedding network from a checkpoint that was trained
#     supervised-only and reached about 20% validation accuracy:
#(sym, arg_p, aux_p) = mx.model.load_checkpoint('embedding_val30',25)
#embeddingModule = buildEmbeddingModule(arg_p, aux_p)
#
# (b) Resume both modules from a previous iteration:
#(arg_p_emb, aux_p_emb, arg_p_loss, aux_p_loss) = WrapperModule.load_checkpoint('cifarsemisup', 200)
#embeddingModule = buildEmbeddingModule(arg_p_emb, aux_p_emb)
#lossModule = buildLossModule(arg_p_loss, aux_p_loss)
# ---------------------------------------------------------------------------

# Active configuration: embedding network trained from scratch ...
embeddingModule = buildEmbeddingModule(arg_p=None, aux_p=None)

# ... combined with the supervised-only loss module. For the semi-supervised
# loss use `lossModule = buildLossModule()` instead.
lossModule = buildSupLossModule(arg_p=None, aux_p=None)

m = fit_model(args, embeddingModule, lossModule, data=(train, val))

INFO:root:start with arguments Namespace(batch_size=80, benchmark=0, data_nthreads=4, data_train='/efs/data/cifar-100-mxnet/train.rec', data_val='/efs/data/cifar-100-mxnet/test.rec', disp_batches=20, dtype='float32', gpus='4,5,6,7', image_shape='3,28,28', kv_store='device', load_epoch=None, loss_gpu=0, lr=0.05, lr_factor=0.2, lr_step_epochs='90', max_random_aspect_ratio=0, max_random_h=36, max_random_l=50, max_random_rotate_angle=0, max_random_s=50, max_random_scale=1, max_random_shear_ratio=0, min_random_scale=1, model_prefix=None, mom=0.9, monitor=0, nembeddings=256, network=None, num_classes=100, num_epochs=120, num_examples=10000.0, num_layers=None, optimizer='sgd', pad_size=2, prefix='cifar_sup_subset2', random_crop=1, random_mirror=1, rgb_mean='123.68,116.779,103.939', test_io=0, top_k=0, wd=0.0001)
INFO:root:Epoch[0] Batch [20]	Speed: 385.12 samples/sec	accuracy=nan
INFO:root:Epoch[0] Batch [40]	Speed: 378.89 samples/sec	accuracy=nan
INFO:root:Epoch[0] Batch [60]	Speed: 379.85 s

Epoch[0] Validation-accuracy=0.023400


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0000.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0000.params"
INFO:root:Epoch[1] Batch [20]	Speed: 377.57 samples/sec	accuracy=nan
INFO:root:Epoch[1] Batch [40]	Speed: 378.49 samples/sec	accuracy=nan
INFO:root:Epoch[1] Batch [60]	Speed: 376.90 samples/sec	accuracy=nan
INFO:root:Epoch[1] Batch [80]	Speed: 379.19 samples/sec	accuracy=nan
INFO:root:Epoch[1] Batch [100]	Speed: 378.85 samples/sec	accuracy=nan
INFO:root:Epoch[1] Batch [120]	Speed: 379.10 samples/sec	accuracy=nan
INFO:root:Epoch[1] Train-accuracy=nan
INFO:root:Epoch[1] Time cost=26.473


Epoch[1] Validation-accuracy=0.041100


INFO:root:Epoch[2] Batch [20]	Speed: 378.71 samples/sec	accuracy=nan
INFO:root:Epoch[2] Batch [40]	Speed: 378.51 samples/sec	accuracy=nan
INFO:root:Epoch[2] Batch [60]	Speed: 378.14 samples/sec	accuracy=nan
INFO:root:Epoch[2] Batch [80]	Speed: 376.75 samples/sec	accuracy=nan
INFO:root:Epoch[2] Batch [100]	Speed: 377.37 samples/sec	accuracy=nan
INFO:root:Epoch[2] Batch [120]	Speed: 379.48 samples/sec	accuracy=nan
INFO:root:Epoch[2] Train-accuracy=nan
INFO:root:Epoch[2] Time cost=26.472


Epoch[2] Validation-accuracy=0.057300


INFO:root:Epoch[3] Batch [20]	Speed: 377.48 samples/sec	accuracy=nan
INFO:root:Epoch[3] Batch [40]	Speed: 378.14 samples/sec	accuracy=nan
INFO:root:Epoch[3] Batch [60]	Speed: 377.38 samples/sec	accuracy=nan
INFO:root:Epoch[3] Batch [80]	Speed: 376.73 samples/sec	accuracy=nan
INFO:root:Epoch[3] Batch [100]	Speed: 375.45 samples/sec	accuracy=nan
INFO:root:Epoch[3] Batch [120]	Speed: 377.09 samples/sec	accuracy=nan
INFO:root:Epoch[3] Train-accuracy=nan
INFO:root:Epoch[3] Time cost=26.514


Epoch[3] Validation-accuracy=0.075300


INFO:root:Epoch[4] Batch [20]	Speed: 378.93 samples/sec	accuracy=nan
INFO:root:Epoch[4] Batch [40]	Speed: 376.90 samples/sec	accuracy=nan
INFO:root:Epoch[4] Batch [60]	Speed: 376.53 samples/sec	accuracy=nan
INFO:root:Epoch[4] Batch [80]	Speed: 376.23 samples/sec	accuracy=nan
INFO:root:Epoch[4] Batch [100]	Speed: 375.92 samples/sec	accuracy=nan
INFO:root:Epoch[4] Batch [120]	Speed: 377.36 samples/sec	accuracy=nan
INFO:root:Epoch[4] Train-accuracy=nan
INFO:root:Epoch[4] Time cost=26.572


Epoch[4] Validation-accuracy=0.104800


INFO:root:Epoch[5] Batch [20]	Speed: 377.51 samples/sec	accuracy=nan
INFO:root:Epoch[5] Batch [40]	Speed: 376.96 samples/sec	accuracy=nan
INFO:root:Epoch[5] Batch [60]	Speed: 376.54 samples/sec	accuracy=nan
INFO:root:Epoch[5] Batch [80]	Speed: 378.16 samples/sec	accuracy=nan
INFO:root:Epoch[5] Batch [100]	Speed: 379.54 samples/sec	accuracy=nan
INFO:root:Epoch[5] Batch [120]	Speed: 379.47 samples/sec	accuracy=nan
INFO:root:Epoch[5] Train-accuracy=nan
INFO:root:Epoch[5] Time cost=26.446


Epoch[5] Validation-accuracy=0.125800


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0005.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0005.params"
INFO:root:Epoch[6] Batch [20]	Speed: 377.68 samples/sec	accuracy=nan
INFO:root:Epoch[6] Batch [40]	Speed: 377.02 samples/sec	accuracy=nan
INFO:root:Epoch[6] Batch [60]	Speed: 376.88 samples/sec	accuracy=nan
INFO:root:Epoch[6] Batch [80]	Speed: 376.05 samples/sec	accuracy=nan
INFO:root:Epoch[6] Batch [100]	Speed: 375.51 samples/sec	accuracy=nan
INFO:root:Epoch[6] Batch [120]	Speed: 377.21 samples/sec	accuracy=nan
INFO:root:Epoch[6] Train-accuracy=nan
INFO:root:Epoch[6] Time cost=26.583


Epoch[6] Validation-accuracy=0.094700


INFO:root:Epoch[7] Batch [20]	Speed: 376.56 samples/sec	accuracy=nan
INFO:root:Epoch[7] Batch [40]	Speed: 375.67 samples/sec	accuracy=nan
INFO:root:Epoch[7] Batch [60]	Speed: 376.81 samples/sec	accuracy=nan
INFO:root:Epoch[7] Batch [80]	Speed: 377.81 samples/sec	accuracy=nan
INFO:root:Epoch[7] Batch [100]	Speed: 377.88 samples/sec	accuracy=nan
INFO:root:Epoch[7] Batch [120]	Speed: 375.62 samples/sec	accuracy=nan
INFO:root:Epoch[7] Train-accuracy=nan
INFO:root:Epoch[7] Time cost=26.538


Epoch[7] Validation-accuracy=0.153800


INFO:root:Epoch[8] Batch [20]	Speed: 377.65 samples/sec	accuracy=nan
INFO:root:Epoch[8] Batch [40]	Speed: 378.64 samples/sec	accuracy=nan
INFO:root:Epoch[8] Batch [60]	Speed: 375.32 samples/sec	accuracy=nan
INFO:root:Epoch[8] Batch [80]	Speed: 382.41 samples/sec	accuracy=nan
INFO:root:Epoch[8] Batch [100]	Speed: 384.10 samples/sec	accuracy=nan
INFO:root:Epoch[8] Batch [120]	Speed: 383.92 samples/sec	accuracy=nan
INFO:root:Epoch[8] Train-accuracy=nan
INFO:root:Epoch[8] Time cost=26.326


Epoch[8] Validation-accuracy=0.161000


INFO:root:Epoch[9] Batch [20]	Speed: 378.62 samples/sec	accuracy=nan
INFO:root:Epoch[9] Batch [40]	Speed: 375.40 samples/sec	accuracy=nan
INFO:root:Epoch[9] Batch [60]	Speed: 372.58 samples/sec	accuracy=nan
INFO:root:Epoch[9] Batch [80]	Speed: 376.19 samples/sec	accuracy=nan
INFO:root:Epoch[9] Batch [100]	Speed: 374.49 samples/sec	accuracy=nan
INFO:root:Epoch[9] Batch [120]	Speed: 376.07 samples/sec	accuracy=nan
INFO:root:Epoch[9] Train-accuracy=nan
INFO:root:Epoch[9] Time cost=26.665


Epoch[9] Validation-accuracy=0.162500


INFO:root:Epoch[10] Batch [20]	Speed: 375.64 samples/sec	accuracy=nan
INFO:root:Epoch[10] Batch [40]	Speed: 376.68 samples/sec	accuracy=nan
INFO:root:Epoch[10] Batch [60]	Speed: 374.97 samples/sec	accuracy=nan
INFO:root:Epoch[10] Batch [80]	Speed: 377.37 samples/sec	accuracy=nan
INFO:root:Epoch[10] Batch [100]	Speed: 376.09 samples/sec	accuracy=nan
INFO:root:Epoch[10] Batch [120]	Speed: 375.82 samples/sec	accuracy=nan
INFO:root:Epoch[10] Train-accuracy=nan
INFO:root:Epoch[10] Time cost=26.611


Epoch[10] Validation-accuracy=0.208600


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0010.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0010.params"
INFO:root:Epoch[11] Batch [20]	Speed: 375.47 samples/sec	accuracy=nan
INFO:root:Epoch[11] Batch [40]	Speed: 374.78 samples/sec	accuracy=nan
INFO:root:Epoch[11] Batch [60]	Speed: 376.57 samples/sec	accuracy=nan
INFO:root:Epoch[11] Batch [80]	Speed: 375.54 samples/sec	accuracy=nan
INFO:root:Epoch[11] Batch [100]	Speed: 375.72 samples/sec	accuracy=nan
INFO:root:Epoch[11] Batch [120]	Speed: 378.50 samples/sec	accuracy=nan
INFO:root:Epoch[11] Train-accuracy=nan
INFO:root:Epoch[11] Time cost=26.616


Epoch[11] Validation-accuracy=0.202800


INFO:root:Epoch[12] Batch [20]	Speed: 375.41 samples/sec	accuracy=nan
INFO:root:Epoch[12] Batch [40]	Speed: 376.47 samples/sec	accuracy=nan
INFO:root:Epoch[12] Batch [60]	Speed: 375.70 samples/sec	accuracy=nan
INFO:root:Epoch[12] Batch [80]	Speed: 375.03 samples/sec	accuracy=nan
INFO:root:Epoch[12] Batch [100]	Speed: 375.33 samples/sec	accuracy=nan
INFO:root:Epoch[12] Batch [120]	Speed: 376.43 samples/sec	accuracy=nan
INFO:root:Epoch[12] Train-accuracy=nan
INFO:root:Epoch[12] Time cost=26.668


Epoch[12] Validation-accuracy=0.205000


INFO:root:Epoch[13] Batch [20]	Speed: 378.57 samples/sec	accuracy=nan
INFO:root:Epoch[13] Batch [40]	Speed: 377.38 samples/sec	accuracy=nan
INFO:root:Epoch[13] Batch [60]	Speed: 375.91 samples/sec	accuracy=nan
INFO:root:Epoch[13] Batch [80]	Speed: 375.55 samples/sec	accuracy=nan
INFO:root:Epoch[13] Batch [100]	Speed: 378.60 samples/sec	accuracy=nan
INFO:root:Epoch[13] Batch [120]	Speed: 375.46 samples/sec	accuracy=nan
INFO:root:Epoch[13] Train-accuracy=nan
INFO:root:Epoch[13] Time cost=26.564


Epoch[13] Validation-accuracy=0.231400


INFO:root:Epoch[14] Batch [20]	Speed: 375.85 samples/sec	accuracy=nan
INFO:root:Epoch[14] Batch [40]	Speed: 376.55 samples/sec	accuracy=nan
INFO:root:Epoch[14] Batch [60]	Speed: 374.37 samples/sec	accuracy=nan
INFO:root:Epoch[14] Batch [80]	Speed: 375.61 samples/sec	accuracy=nan
INFO:root:Epoch[14] Batch [100]	Speed: 376.73 samples/sec	accuracy=nan
INFO:root:Epoch[14] Batch [120]	Speed: 375.69 samples/sec	accuracy=nan
INFO:root:Epoch[14] Train-accuracy=nan
INFO:root:Epoch[14] Time cost=26.609


Epoch[14] Validation-accuracy=0.252000


INFO:root:Epoch[15] Batch [20]	Speed: 377.31 samples/sec	accuracy=nan
INFO:root:Epoch[15] Batch [40]	Speed: 375.22 samples/sec	accuracy=nan
INFO:root:Epoch[15] Batch [60]	Speed: 377.83 samples/sec	accuracy=nan
INFO:root:Epoch[15] Batch [80]	Speed: 375.47 samples/sec	accuracy=nan
INFO:root:Epoch[15] Batch [100]	Speed: 376.79 samples/sec	accuracy=nan
INFO:root:Epoch[15] Batch [120]	Speed: 375.95 samples/sec	accuracy=nan
INFO:root:Epoch[15] Train-accuracy=nan
INFO:root:Epoch[15] Time cost=26.589


Epoch[15] Validation-accuracy=0.253700


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0015.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0015.params"
INFO:root:Epoch[16] Batch [20]	Speed: 377.40 samples/sec	accuracy=nan
INFO:root:Epoch[16] Batch [40]	Speed: 375.25 samples/sec	accuracy=nan
INFO:root:Epoch[16] Batch [60]	Speed: 374.55 samples/sec	accuracy=nan
INFO:root:Epoch[16] Batch [80]	Speed: 376.74 samples/sec	accuracy=nan
INFO:root:Epoch[16] Batch [100]	Speed: 376.29 samples/sec	accuracy=nan
INFO:root:Epoch[16] Batch [120]	Speed: 373.73 samples/sec	accuracy=nan
INFO:root:Epoch[16] Train-accuracy=nan
INFO:root:Epoch[16] Time cost=26.638


Epoch[16] Validation-accuracy=0.278300


INFO:root:Epoch[17] Batch [20]	Speed: 378.42 samples/sec	accuracy=nan
INFO:root:Epoch[17] Batch [40]	Speed: 375.24 samples/sec	accuracy=nan
INFO:root:Epoch[17] Batch [60]	Speed: 375.95 samples/sec	accuracy=nan
INFO:root:Epoch[17] Batch [80]	Speed: 373.19 samples/sec	accuracy=nan
INFO:root:Epoch[17] Batch [100]	Speed: 376.55 samples/sec	accuracy=nan
INFO:root:Epoch[17] Batch [120]	Speed: 373.82 samples/sec	accuracy=nan
INFO:root:Epoch[17] Train-accuracy=nan
INFO:root:Epoch[17] Time cost=26.656


Epoch[17] Validation-accuracy=0.265700


INFO:root:Epoch[18] Batch [20]	Speed: 377.17 samples/sec	accuracy=nan
INFO:root:Epoch[18] Batch [40]	Speed: 375.73 samples/sec	accuracy=nan
INFO:root:Epoch[18] Batch [60]	Speed: 375.08 samples/sec	accuracy=nan
INFO:root:Epoch[18] Batch [80]	Speed: 376.43 samples/sec	accuracy=nan
INFO:root:Epoch[18] Batch [100]	Speed: 375.51 samples/sec	accuracy=nan
INFO:root:Epoch[18] Batch [120]	Speed: 377.51 samples/sec	accuracy=nan
INFO:root:Epoch[18] Train-accuracy=nan
INFO:root:Epoch[18] Time cost=26.626


Epoch[18] Validation-accuracy=0.289200


INFO:root:Epoch[19] Batch [20]	Speed: 374.76 samples/sec	accuracy=nan
INFO:root:Epoch[19] Batch [40]	Speed: 376.31 samples/sec	accuracy=nan
INFO:root:Epoch[19] Batch [60]	Speed: 376.44 samples/sec	accuracy=nan
INFO:root:Epoch[19] Batch [80]	Speed: 376.13 samples/sec	accuracy=nan
INFO:root:Epoch[19] Batch [100]	Speed: 375.40 samples/sec	accuracy=nan
INFO:root:Epoch[19] Batch [120]	Speed: 377.81 samples/sec	accuracy=nan
INFO:root:Epoch[19] Train-accuracy=nan
INFO:root:Epoch[19] Time cost=26.642


Epoch[19] Validation-accuracy=0.295600


INFO:root:Epoch[20] Batch [20]	Speed: 377.33 samples/sec	accuracy=nan
INFO:root:Epoch[20] Batch [40]	Speed: 376.36 samples/sec	accuracy=nan
INFO:root:Epoch[20] Batch [60]	Speed: 375.42 samples/sec	accuracy=nan
INFO:root:Epoch[20] Batch [80]	Speed: 376.82 samples/sec	accuracy=nan
INFO:root:Epoch[20] Batch [100]	Speed: 376.13 samples/sec	accuracy=nan
INFO:root:Epoch[20] Batch [120]	Speed: 375.16 samples/sec	accuracy=nan
INFO:root:Epoch[20] Train-accuracy=nan
INFO:root:Epoch[20] Time cost=26.638


Epoch[20] Validation-accuracy=0.309300


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0020.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0020.params"
INFO:root:Epoch[21] Batch [20]	Speed: 380.05 samples/sec	accuracy=nan
INFO:root:Epoch[21] Batch [40]	Speed: 377.48 samples/sec	accuracy=nan
INFO:root:Epoch[21] Batch [60]	Speed: 376.28 samples/sec	accuracy=nan
INFO:root:Epoch[21] Batch [80]	Speed: 374.38 samples/sec	accuracy=nan
INFO:root:Epoch[21] Batch [100]	Speed: 375.24 samples/sec	accuracy=nan
INFO:root:Epoch[21] Batch [120]	Speed: 377.47 samples/sec	accuracy=nan
INFO:root:Epoch[21] Train-accuracy=nan
INFO:root:Epoch[21] Time cost=26.582


Epoch[21] Validation-accuracy=0.303100


INFO:root:Epoch[22] Batch [20]	Speed: 376.30 samples/sec	accuracy=nan
INFO:root:Epoch[22] Batch [40]	Speed: 375.90 samples/sec	accuracy=nan
INFO:root:Epoch[22] Batch [60]	Speed: 375.40 samples/sec	accuracy=nan
INFO:root:Epoch[22] Batch [80]	Speed: 378.66 samples/sec	accuracy=nan
INFO:root:Epoch[22] Batch [100]	Speed: 376.17 samples/sec	accuracy=nan
INFO:root:Epoch[22] Batch [120]	Speed: 374.77 samples/sec	accuracy=nan
INFO:root:Epoch[22] Train-accuracy=nan
INFO:root:Epoch[22] Time cost=26.597


Epoch[22] Validation-accuracy=0.316700


INFO:root:Epoch[23] Batch [20]	Speed: 376.47 samples/sec	accuracy=nan
INFO:root:Epoch[23] Batch [40]	Speed: 375.75 samples/sec	accuracy=nan
INFO:root:Epoch[23] Batch [60]	Speed: 375.11 samples/sec	accuracy=nan
INFO:root:Epoch[23] Batch [80]	Speed: 377.83 samples/sec	accuracy=nan
INFO:root:Epoch[23] Batch [100]	Speed: 374.17 samples/sec	accuracy=nan
INFO:root:Epoch[23] Batch [120]	Speed: 376.34 samples/sec	accuracy=nan
INFO:root:Epoch[23] Train-accuracy=nan
INFO:root:Epoch[23] Time cost=26.651


Epoch[23] Validation-accuracy=0.342600


INFO:root:Epoch[24] Batch [20]	Speed: 373.61 samples/sec	accuracy=nan
INFO:root:Epoch[24] Batch [40]	Speed: 375.12 samples/sec	accuracy=nan
INFO:root:Epoch[24] Batch [60]	Speed: 374.73 samples/sec	accuracy=nan
INFO:root:Epoch[24] Batch [80]	Speed: 376.24 samples/sec	accuracy=nan
INFO:root:Epoch[24] Batch [100]	Speed: 373.17 samples/sec	accuracy=nan
INFO:root:Epoch[24] Batch [120]	Speed: 375.93 samples/sec	accuracy=nan
INFO:root:Epoch[24] Train-accuracy=nan
INFO:root:Epoch[24] Time cost=26.715


Epoch[24] Validation-accuracy=0.296400


INFO:root:Epoch[25] Batch [20]	Speed: 380.57 samples/sec	accuracy=nan
INFO:root:Epoch[25] Batch [40]	Speed: 378.72 samples/sec	accuracy=nan
INFO:root:Epoch[25] Batch [60]	Speed: 375.80 samples/sec	accuracy=nan
INFO:root:Epoch[25] Batch [80]	Speed: 377.02 samples/sec	accuracy=nan
INFO:root:Epoch[25] Batch [100]	Speed: 376.12 samples/sec	accuracy=nan
INFO:root:Epoch[25] Batch [120]	Speed: 377.44 samples/sec	accuracy=nan
INFO:root:Epoch[25] Train-accuracy=nan
INFO:root:Epoch[25] Time cost=26.511


Epoch[25] Validation-accuracy=0.333500


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0025.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0025.params"
INFO:root:Epoch[26] Batch [20]	Speed: 373.37 samples/sec	accuracy=nan
INFO:root:Epoch[26] Batch [40]	Speed: 375.70 samples/sec	accuracy=nan
INFO:root:Epoch[26] Batch [60]	Speed: 373.59 samples/sec	accuracy=nan
INFO:root:Epoch[26] Batch [80]	Speed: 375.84 samples/sec	accuracy=nan
INFO:root:Epoch[26] Batch [100]	Speed: 375.89 samples/sec	accuracy=nan
INFO:root:Epoch[26] Batch [120]	Speed: 377.97 samples/sec	accuracy=nan
INFO:root:Epoch[26] Train-accuracy=nan
INFO:root:Epoch[26] Time cost=26.655


Epoch[26] Validation-accuracy=0.366000


INFO:root:Epoch[27] Batch [20]	Speed: 373.65 samples/sec	accuracy=nan
INFO:root:Epoch[27] Batch [40]	Speed: 377.67 samples/sec	accuracy=nan
INFO:root:Epoch[27] Batch [60]	Speed: 376.30 samples/sec	accuracy=nan
INFO:root:Epoch[27] Batch [80]	Speed: 375.55 samples/sec	accuracy=nan
INFO:root:Epoch[27] Batch [100]	Speed: 376.59 samples/sec	accuracy=nan
INFO:root:Epoch[27] Batch [120]	Speed: 373.52 samples/sec	accuracy=nan
INFO:root:Epoch[27] Train-accuracy=nan
INFO:root:Epoch[27] Time cost=26.665


Epoch[27] Validation-accuracy=0.345900


INFO:root:Epoch[28] Batch [20]	Speed: 376.15 samples/sec	accuracy=nan
INFO:root:Epoch[28] Batch [40]	Speed: 374.13 samples/sec	accuracy=nan
INFO:root:Epoch[28] Batch [60]	Speed: 377.14 samples/sec	accuracy=nan
INFO:root:Epoch[28] Batch [80]	Speed: 374.56 samples/sec	accuracy=nan
INFO:root:Epoch[28] Batch [100]	Speed: 375.22 samples/sec	accuracy=nan
INFO:root:Epoch[28] Batch [120]	Speed: 377.24 samples/sec	accuracy=nan
INFO:root:Epoch[28] Train-accuracy=nan
INFO:root:Epoch[28] Time cost=26.655


Epoch[28] Validation-accuracy=0.347500


INFO:root:Epoch[29] Batch [20]	Speed: 377.76 samples/sec	accuracy=nan
INFO:root:Epoch[29] Batch [40]	Speed: 378.57 samples/sec	accuracy=nan
INFO:root:Epoch[29] Batch [60]	Speed: 379.69 samples/sec	accuracy=nan
INFO:root:Epoch[29] Batch [80]	Speed: 375.19 samples/sec	accuracy=nan
INFO:root:Epoch[29] Batch [100]	Speed: 374.97 samples/sec	accuracy=nan
INFO:root:Epoch[29] Batch [120]	Speed: 374.33 samples/sec	accuracy=nan
INFO:root:Epoch[29] Train-accuracy=nan
INFO:root:Epoch[29] Time cost=26.567


Epoch[29] Validation-accuracy=0.373300


INFO:root:Epoch[30] Batch [20]	Speed: 377.51 samples/sec	accuracy=nan
INFO:root:Epoch[30] Batch [40]	Speed: 375.89 samples/sec	accuracy=nan
INFO:root:Epoch[30] Batch [60]	Speed: 376.90 samples/sec	accuracy=nan
INFO:root:Epoch[30] Batch [80]	Speed: 374.87 samples/sec	accuracy=nan
INFO:root:Epoch[30] Batch [100]	Speed: 377.53 samples/sec	accuracy=nan
INFO:root:Epoch[30] Batch [120]	Speed: 376.06 samples/sec	accuracy=nan
INFO:root:Epoch[30] Train-accuracy=nan
INFO:root:Epoch[30] Time cost=26.601


Epoch[30] Validation-accuracy=0.372200


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0030.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0030.params"
INFO:root:Epoch[31] Batch [20]	Speed: 375.20 samples/sec	accuracy=nan
INFO:root:Epoch[31] Batch [40]	Speed: 376.24 samples/sec	accuracy=nan
INFO:root:Epoch[31] Batch [60]	Speed: 377.30 samples/sec	accuracy=nan
INFO:root:Epoch[31] Batch [80]	Speed: 376.08 samples/sec	accuracy=nan
INFO:root:Epoch[31] Batch [100]	Speed: 376.74 samples/sec	accuracy=nan
INFO:root:Epoch[31] Batch [120]	Speed: 374.95 samples/sec	accuracy=nan
INFO:root:Epoch[31] Train-accuracy=nan
INFO:root:Epoch[31] Time cost=26.596


Epoch[31] Validation-accuracy=0.377900


INFO:root:Epoch[32] Batch [20]	Speed: 375.69 samples/sec	accuracy=nan
INFO:root:Epoch[32] Batch [40]	Speed: 376.13 samples/sec	accuracy=nan
INFO:root:Epoch[32] Batch [60]	Speed: 373.28 samples/sec	accuracy=nan
INFO:root:Epoch[32] Batch [80]	Speed: 374.60 samples/sec	accuracy=nan
INFO:root:Epoch[32] Batch [100]	Speed: 375.25 samples/sec	accuracy=nan
INFO:root:Epoch[32] Batch [120]	Speed: 374.82 samples/sec	accuracy=nan
INFO:root:Epoch[32] Train-accuracy=nan
INFO:root:Epoch[32] Time cost=26.676


Epoch[32] Validation-accuracy=0.372400


INFO:root:Epoch[33] Batch [20]	Speed: 375.55 samples/sec	accuracy=nan
INFO:root:Epoch[33] Batch [40]	Speed: 379.96 samples/sec	accuracy=nan
INFO:root:Epoch[33] Batch [60]	Speed: 382.07 samples/sec	accuracy=nan
INFO:root:Epoch[33] Batch [80]	Speed: 377.84 samples/sec	accuracy=nan
INFO:root:Epoch[33] Batch [100]	Speed: 373.94 samples/sec	accuracy=nan
INFO:root:Epoch[33] Batch [120]	Speed: 377.13 samples/sec	accuracy=nan
INFO:root:Epoch[33] Train-accuracy=nan
INFO:root:Epoch[33] Time cost=26.516


Epoch[33] Validation-accuracy=0.376700


INFO:root:Epoch[34] Batch [20]	Speed: 373.48 samples/sec	accuracy=nan
INFO:root:Epoch[34] Batch [40]	Speed: 374.85 samples/sec	accuracy=nan
INFO:root:Epoch[34] Batch [60]	Speed: 375.96 samples/sec	accuracy=nan
INFO:root:Epoch[34] Batch [80]	Speed: 374.42 samples/sec	accuracy=nan
INFO:root:Epoch[34] Batch [100]	Speed: 376.08 samples/sec	accuracy=nan
INFO:root:Epoch[34] Batch [120]	Speed: 375.61 samples/sec	accuracy=nan
INFO:root:Epoch[34] Train-accuracy=nan
INFO:root:Epoch[34] Time cost=26.703


Epoch[34] Validation-accuracy=0.366400


INFO:root:Epoch[35] Batch [20]	Speed: 376.41 samples/sec	accuracy=nan
INFO:root:Epoch[35] Batch [40]	Speed: 375.03 samples/sec	accuracy=nan
INFO:root:Epoch[35] Batch [60]	Speed: 377.09 samples/sec	accuracy=nan
INFO:root:Epoch[35] Batch [80]	Speed: 375.16 samples/sec	accuracy=nan
INFO:root:Epoch[35] Batch [100]	Speed: 377.53 samples/sec	accuracy=nan
INFO:root:Epoch[35] Batch [120]	Speed: 377.46 samples/sec	accuracy=nan
INFO:root:Epoch[35] Train-accuracy=nan
INFO:root:Epoch[35] Time cost=26.621


Epoch[35] Validation-accuracy=0.383300


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0035.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0035.params"
INFO:root:Epoch[36] Batch [20]	Speed: 375.56 samples/sec	accuracy=nan
INFO:root:Epoch[36] Batch [40]	Speed: 377.00 samples/sec	accuracy=nan
INFO:root:Epoch[36] Batch [60]	Speed: 376.13 samples/sec	accuracy=nan
INFO:root:Epoch[36] Batch [80]	Speed: 376.54 samples/sec	accuracy=nan
INFO:root:Epoch[36] Batch [100]	Speed: 375.57 samples/sec	accuracy=nan
INFO:root:Epoch[36] Batch [120]	Speed: 375.93 samples/sec	accuracy=nan
INFO:root:Epoch[36] Train-accuracy=nan
INFO:root:Epoch[36] Time cost=26.618


Epoch[36] Validation-accuracy=0.382200


INFO:root:Epoch[37] Batch [20]	Speed: 374.61 samples/sec	accuracy=nan
INFO:root:Epoch[37] Batch [40]	Speed: 375.45 samples/sec	accuracy=nan
INFO:root:Epoch[37] Batch [60]	Speed: 381.06 samples/sec	accuracy=nan
INFO:root:Epoch[37] Batch [80]	Speed: 377.60 samples/sec	accuracy=nan
INFO:root:Epoch[37] Batch [100]	Speed: 376.30 samples/sec	accuracy=nan
INFO:root:Epoch[37] Batch [120]	Speed: 373.50 samples/sec	accuracy=nan
INFO:root:Epoch[37] Train-accuracy=nan
INFO:root:Epoch[37] Time cost=26.593


Epoch[37] Validation-accuracy=0.404700


INFO:root:Epoch[38] Batch [20]	Speed: 376.59 samples/sec	accuracy=nan
INFO:root:Epoch[38] Batch [40]	Speed: 377.11 samples/sec	accuracy=nan
INFO:root:Epoch[38] Batch [60]	Speed: 376.98 samples/sec	accuracy=nan
INFO:root:Epoch[38] Batch [80]	Speed: 376.08 samples/sec	accuracy=nan
INFO:root:Epoch[38] Batch [100]	Speed: 375.53 samples/sec	accuracy=nan
INFO:root:Epoch[38] Batch [120]	Speed: 377.43 samples/sec	accuracy=nan
INFO:root:Epoch[38] Train-accuracy=nan
INFO:root:Epoch[38] Time cost=26.592


Epoch[38] Validation-accuracy=0.368700


INFO:root:Epoch[39] Batch [20]	Speed: 377.30 samples/sec	accuracy=nan
INFO:root:Epoch[39] Batch [40]	Speed: 374.30 samples/sec	accuracy=nan
INFO:root:Epoch[39] Batch [60]	Speed: 376.30 samples/sec	accuracy=nan
INFO:root:Epoch[39] Batch [80]	Speed: 375.25 samples/sec	accuracy=nan
INFO:root:Epoch[39] Batch [100]	Speed: 375.49 samples/sec	accuracy=nan
INFO:root:Epoch[39] Batch [120]	Speed: 375.18 samples/sec	accuracy=nan
INFO:root:Epoch[39] Train-accuracy=nan
INFO:root:Epoch[39] Time cost=26.658


Epoch[39] Validation-accuracy=0.397300


INFO:root:Epoch[40] Batch [20]	Speed: 373.40 samples/sec	accuracy=nan
INFO:root:Epoch[40] Batch [40]	Speed: 376.13 samples/sec	accuracy=nan
INFO:root:Epoch[40] Batch [60]	Speed: 375.05 samples/sec	accuracy=nan
INFO:root:Epoch[40] Batch [80]	Speed: 375.06 samples/sec	accuracy=nan
INFO:root:Epoch[40] Batch [100]	Speed: 375.29 samples/sec	accuracy=nan
INFO:root:Epoch[40] Batch [120]	Speed: 375.29 samples/sec	accuracy=nan
INFO:root:Epoch[40] Train-accuracy=nan
INFO:root:Epoch[40] Time cost=26.673


Epoch[40] Validation-accuracy=0.376500


INFO:root:Saved checkpoint to "emb_cifar_sup_subset2-0040.params"
INFO:root:Saved checkpoint to "loss_cifar_sup_subset2-0040.params"
INFO:root:Epoch[41] Batch [20]	Speed: 375.44 samples/sec	accuracy=nan
INFO:root:Epoch[41] Batch [40]	Speed: 374.74 samples/sec	accuracy=nan
INFO:root:Epoch[41] Batch [60]	Speed: 378.98 samples/sec	accuracy=nan
INFO:root:Epoch[41] Batch [80]	Speed: 379.60 samples/sec	accuracy=nan
INFO:root:Epoch[41] Batch [100]	Speed: 375.69 samples/sec	accuracy=nan
INFO:root:Epoch[41] Batch [120]	Speed: 374.91 samples/sec	accuracy=nan
INFO:root:Epoch[41] Train-accuracy=nan
INFO:root:Epoch[41] Time cost=26.555


In [None]:
# `aux_p` is only ever assigned in commented-out code, so on a fresh kernel a
# bare `aux_p` raises NameError. Read the current parameters from the
# embedding module instead and display the auxiliary ones.
arg_p, aux_p = embeddingModule.get_params()
aux_p

In [None]:
# Write the embedding module's checkpoint with prefix 'cifarsemisup200', epoch 200.
embeddingModule.save_checkpoint(prefix='cifarsemisup200', epoch=200)

In [None]:
# Write the loss module's checkpoint with prefix 'cifarsemisup200_loss', epoch 200.
lossModule.save_checkpoint(prefix='cifarsemisup200_loss', epoch=200)

In [None]:
# Rewind the validation iterator, fetch its first batch and display the first
# dimension of that batch's first label array (presumably the validation
# batch size — confirm). Note that this consumes one batch from `val`.
val.reset()
val.next().label[0].shape[0]

In [None]:
class Multi_Accuracy(mx.metric.EvalMetric):
    """Accuracy metric for a multi-output model that scores only output 0.

    ``num`` is forwarded to ``EvalMetric`` as its second positional argument.
    Only the first prediction array is compared against the first label
    array; any further outputs are ignored by this metric.
    """

    def __init__(self, num=None):
        super(Multi_Accuracy, self).__init__('multi-accuracy', num)

    def update(self, labels, preds):
        """Accumulate correct / total counts for the first output.

        Parameters
        ----------
        labels : list of NDArray
            Only ``labels[0]`` is used.
        preds : list of NDArray
            Only ``preds[0]`` is used; the predicted class is taken with
            ``mx.nd.argmax_channel``.
        """
        pred_label = mx.nd.argmax_channel(preds[0]).asnumpy().astype('int32')
        label = labels[0].asnumpy().astype('int32')

        n_correct = (pred_label.flat == label.flat).sum()
        n_total = len(pred_label.flat)

        # The accumulators are indexed when they are lists (one slot per
        # output) and updated directly otherwise, so the default `num=None`
        # no longer fails on indexing a scalar.
        # NOTE(review): assumes the base class keeps scalar accumulators when
        # `num` is None — confirm against the installed MXNet version.
        if isinstance(self.sum_metric, list):
            self.sum_metric[0] += n_correct
            self.num_inst[0] += n_total
        else:
            self.sum_metric += n_correct
            self.num_inst += n_total

In [None]:
# batch 10 train 0.389, valid: 0.40