In [1]:
# MarginBased_model.py
from mxnet import gluon
from mxnet.gluon import Block, HybridBlock
import numpy as np

class L2Normalization(HybridBlock):
    r"""Applies L2 Normalization to input.
    Parameters
    ----------
    mode : str
        Mode of normalization.
        See :func:`~mxnet.ndarray.L2Normalization` for available choices.
    Inputs:
        - **data**: input tensor with arbitrary shape.
    Outputs:
        - **out**: output tensor with the same shape as `data`.
    """
    def __init__(self, mode, **kwargs):
        self._mode = mode
        super(L2Normalization, self).__init__(**kwargs)

    def hybrid_forward(self, F, x):
        return F.L2Normalization(x, mode=self._mode, name='l2_norm')

    def __repr__(self):
        s = '{name}({_mode})'
        return s.format(name=self.__class__.__name__,
                        **self.__dict__)


def get_distance(F, x):
    """Helper function for margin-based loss. Return a distance matrix given a matrix."""
    n = x.shape[0]

    square = F.sum(x ** 2.0, axis=1, keepdims=True)
    distance_square = square + square.transpose() - (2.0 * F.dot(x, x.transpose()))

    # Adding identity to make sqrt work.
    return F.sqrt(distance_square + F.array(np.identity(n)))

class DistanceWeightedSampling(HybridBlock):
    r"""Distance weighted sampling. See "sampling matters in deep embedding learning"
    paper for details.
    Parameters
    ----------
    batch_k : int
        Number of images per class.
    Inputs:
        - **data**: input tensor with shape (batch_size, embed_dim).
        Here we assume the consecutive batch_k examples are of the same class.
        For example, if batch_k = 5, the first 5 examples belong to the same class,
        6th-10th examples belong to another class, etc.
    Outputs:
        - a_indices: indices of anchors.
        - x[a_indices]: sampled anchor embeddings.
        - x[p_indices]: sampled positive embeddings.
        - x[n_indices]: sampled negative embeddings.
        - x: embeddings of the input batch.
    """
    def __init__(self, batch_k, cutoff=0.5, nonzero_loss_cutoff=1.4, **kwargs):
        self.batch_k = batch_k
        self.cutoff = cutoff

        # We sample only from negatives that induce a non-zero loss.
        # These are negatives with a distance < nonzero_loss_cutoff.
        # With a margin-based loss, nonzero_loss_cutoff == margin + beta.
        self.nonzero_loss_cutoff = nonzero_loss_cutoff
        super(DistanceWeightedSampling, self).__init__(**kwargs)

    def hybrid_forward(self, F, x):
        k = self.batch_k
        n, d = x.shape

        distance = get_distance(F, x)
        # Cut off to avoid high variance.
        distance = F.maximum(distance, self.cutoff)

        # Subtract max(log(distance)) for stability.
        log_weights = ((2.0 - float(d)) * F.log(distance)
                       - (float(d - 3) / 2) * F.log(1.0 - 0.25 * (distance ** 2.0)))
        weights = F.exp(log_weights - F.max(log_weights))

        # Sample only negative examples by setting weights of
        # the same-class examples to 0.
        mask = np.ones(weights.shape)
        for i in range(0, n, k):
            mask[i:i+k, i:i+k] = 0

        weights = weights * F.array(mask) * (distance < self.nonzero_loss_cutoff)
        weights = weights / F.sum(weights, axis=1, keepdims=True)

        a_indices = []
        p_indices = []
        n_indices = []

        np_weights = weights.asnumpy()
        for i in range(n):
            block_idx = i // k

            try:
                n_indices += np.random.choice(n, k-1, p=np_weights[i]).tolist()
            except:
                n_indices += np.random.choice(n, k-1).tolist()
            for j in range(block_idx * k, (block_idx + 1) * k):
                if j != i:
                    a_indices.append(i)
                    p_indices.append(j)

        return a_indices, x[a_indices], x[p_indices], x[n_indices], x

    def __repr__(self):
        s = '{name}({batch_k})'
        return s.format(name=self.__class__.__name__,
                        **self.__dict__)


class MarginNet(Block):
    r"""Embedding network with distance weighted sampling.
    It takes a base CNN and adds an embedding layer and a
    sampling layer at the end.
    Parameters
    ----------
    base_net : Block
        Base network.
    emb_dim : int
        Dimensionality of the embedding.
    batch_k : int
        Number of images per class in a batch. Used in sampling.
    Inputs:
        - **data**: input tensor with shape (batch_size, channels, width, height).
        Here we assume the consecutive batch_k images are of the same class.
        For example, if batch_k = 5, the first 5 images belong to the same class,
        6th-10th images belong to another class, etc.
    Outputs:
        - The output of DistanceWeightedSampling.
    """
    def __init__(self, base_net, emb_dim, batch_k, **kwargs):
        super(MarginNet, self).__init__(**kwargs)
        with self.name_scope():
            self.base_net = base_net
            self.dense = gluon.nn.Dense(emb_dim)
            self.normalize = L2Normalization(mode='instance')
            self.sampled = DistanceWeightedSampling(batch_k=batch_k)

    def forward(self, x):
        z = self.base_net(x)
        z = self.dense(z)
        z = self.normalize(z)
        z = self.sampled(z)
        return z


class MarginLoss(gluon.loss.Loss):
    r"""Margin based loss.
    Parameters
    ----------
    margin : float
        Margin between positive and negative pairs.
    nu : float
        Regularization parameter for beta.
    Inputs:
        - anchors: sampled anchor embeddings.
        - positives: sampled positive embeddings.
        - negatives: sampled negative embeddings.
        - beta_in: class-specific betas.
        - a_indices: indices of anchors. Used to get class-specific beta.
    Outputs:
        - Loss.
    """
    def __init__(self, margin=0.2, nu=0.0, weight=None, batch_axis=0, **kwargs):
        super(MarginLoss, self).__init__(weight, batch_axis, **kwargs)
        self._margin = margin
        self._nu = nu

    def hybrid_forward(self, F, anchors, positives, negatives, beta_in, a_indices=None):
        if a_indices is not None:
            # Jointly train class-specific beta.
            beta = beta_in.data()[a_indices]
            beta_reg_loss = F.sum(beta) * self._nu
        else:
            # Use a constant beta.
            beta = beta_in
            beta_reg_loss = 0.0

        d_ap = F.sqrt(F.sum(F.square(positives - anchors), axis=1) + 1e-8)
        d_an = F.sqrt(F.sum(F.square(negatives - anchors), axis=1) + 1e-8)

        pos_loss = F.maximum(d_ap - beta + self._margin, 0.0)
        neg_loss = F.maximum(beta - d_an + self._margin, 0.0)

        pair_cnt = float(F.sum((pos_loss > 0.0) + (neg_loss > 0.0)).asscalar())

        # Normalize based on the number of pairs.
        loss = (F.sum(pos_loss + neg_loss) + beta_reg_loss) / pair_cnt
        print('loss',loss)
        return gluon.loss._apply_weighting(F, loss, self._weight, None)

In [9]:
#!pip install mxnet

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/96/98/c9877e100c3d1ac92263bfaba7bb8a49294e099046592040a2ff8620ac61/mxnet-1.1.0.post0-py2.py3-none-manylinux1_x86_64.whl (23.8MB)
[K    100% |################################| 23.8MB 869kB/s eta 0:00:01  7% |##                              | 1.9MB 17.6MB/s eta 0:00:02    22% |#######                         | 5.3MB 27.5MB/s eta 0:00:01    44% |##############                  | 10.6MB 14.0MB/s eta 0:00:01    51% |################                | 12.2MB 16.1MB/s eta 0:00:01    54% |#################               | 13.1MB 16.7MB/s eta 0:00:01    62% |####################            | 14.9MB 14.2MB/s eta 0:00:01    70% |######################          | 16.9MB 22.8MB/s eta 0:00:01    94% |##############################  | 22.4MB 25.6MB/s eta 0:00:01
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading https://files.pythonhosted.org/packages/84/44/21a7fdd50841aaaef224b943f7d10df87e476e181bb926ccf859bcb53d

In [2]:
from __future__ import division

import argparse
import logging
import time
import os
import numpy as np
from tqdm import tqdm
from bottleneck import argpartition
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.autograd import Variable
from torch.autograd import Function
import torch.backends.cudnn as cudnn
from LFWDataset import LFWDataset
import mxnet as mx
#from data import cub200_iterator
from mxnet import gluon
from mxnet.gluon.model_zoo import vision as models
from mxnet import autograd as ag, nd
#from model import MarginNet, MarginLoss

In [3]:


logging.basicConfig(level=logging.INFO)

# CLI
parser = argparse.ArgumentParser(description='train a model for image classification.')
parser.add_argument('--data-path', type=str, default='/scratch/ys3225/deeplearningdataset', #/scratch/hb1500/deeplearningdataset/train
                    help='path of data.')

###########
parser.add_argument('--data_train-path', type=str, default='/scratch/ys3225/deeplearningdataset/train', 
                    help='path of data.')
parser.add_argument('--data_test-path', type=str, default='/scratch/ys3225/deeplearningdataset/test', 
                    help='path of data.')
###########

parser.add_argument('--embed-dim', type=int, default=128,
                    help='dimensionality of image embedding. default is 128.')
parser.add_argument('--batch-size', type=int, default=70,
                    help='training batch size per device (CPU/GPU). default is 70.')
parser.add_argument('--batch-k', type=int, default=5,
                    help='number of images per class in a batch. default is 5.')
parser.add_argument('--gpus', type=str, default='',
                    help='list of gpus to use, e.g. 0 or 0,2,5. empty means using cpu.')
parser.add_argument('--epochs', type=int, default=20,
                    help='number of training epochs. default is 20.')
parser.add_argument('--optimizer', type=str, default='adam',
                    help='optimizer. default is adam.')
parser.add_argument('--lr', type=float, default=0.0001,
                    help='learning rate. default is 0.0001.')
parser.add_argument('--lr-beta', type=float, default=0.1,
                    help='learning rate for the beta in margin based loss. default is 0.1.')
parser.add_argument('--margin', type=float, default=0.2,
                    help='margin for the margin based loss. default is 0.2.')
parser.add_argument('--beta', type=float, default=1.2,
                    help='initial value for beta. default is 1.2.')
parser.add_argument('--nu', type=float, default=0.0,
                    help='regularization parameter for beta. default is 0.0.')
parser.add_argument('--factor', type=float, default=0.5,
                    help='learning rate schedule factor. default is 0.5.')
parser.add_argument('--steps', type=str, default='12,14,16,18',
                    help='epochs to update learning rate. default is 12,14,16,18.')
parser.add_argument('--wd', type=float, default=0.0001,
                    help='weight decay rate. default is 0.0001.')
parser.add_argument('--seed', type=int, default=123,
                    help='random seed to use. default=123.')
parser.add_argument('--model', type=str, default='resnet18_v1',
                    help='type of model to use. see vision_model for options.resnet50_v2')
parser.add_argument('--save-model-prefix', type=str, default='margin_loss_model',
                    help='prefix of models to be saved.')
parser.add_argument('--use-pretrained', action='store_true',
                    help='enable using pretrained model from gluon.')
parser.add_argument('--kvstore', type=str, default='device',
                    help='kvstore to use for trainer.')
parser.add_argument('--log-interval', type=int, default=20,
                    help='number of batches to wait before logging.')

###################################
parser.add_argument('--dataroot', type=str, default='/scratch/ys3225/deeplearningdataset/train',#default='/scratch/hb1500/deeplearningdataset/train'
                    help='path to dataset')
parser.add_argument('--testdataroot', type=str, default='/scratch/ys3225/deeplearningdataset/test',#default='/media/lior/LinuxHDD/datasets/vgg_face_dataset/aligned'
                    help='path to dataset')
parser.add_argument('--lfw-dir', type=str, default='/scratch/ys3225/lfw',
                    help='path to dataset')
parser.add_argument('--lfw-pairs-path', type=str, default='lfw_pairs.txt',
                    help='path to pairs file')

parser.add_argument('--log-dir', default='/scratch/ys3225/logdir_triplet_loss',
                    help='folder to output model checkpoints')
parser.add_argument('--resume',
                    default='/scratch/ys3225/resume/run-optim_adam-lr0.001-wd0.0-embeddings512-center0.5-MSCeleb/checkpoint_11.pth',
                    type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
# parser.add_argument('--epochs', type=int, default=50, metavar='E',
#                     help='number of epochs to train (default: 10)')
# Training options
# parser.add_argument('--embedding-size', type=int, default=256, metavar='ES',
#                     help='Dimensionality of the embedding')

parser.add_argument('--center_loss_weight', type=float, default=0.5, help='weight for center loss')
parser.add_argument('--alpha', type=float, default=0.5, help='learning rate of the centers')
parser.add_argument('--embedding-size', type=int, default=512, metavar='ES',
                    help='Dimensionality of the embedding')

# parser.add_argument('--batch-size', type=int, default=64, metavar='BS',
#                     help='input batch size for training (default: 128)')
parser.add_argument('--test-batch-size', type=int, default=64, metavar='BST',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--n-triplets', type=int, default=1000000, metavar='N',
                    help='how many triplets will generate from the dataset,default=1000000')
# parser.add_argument('--margin', type=float, default=1.0, metavar='MARGIN',
#                     help='the margin value for the triplet loss function (default: 1.0')
# parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
#                     help='learning rate (default: 0.001)')
parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')

parser.add_argument('--lr-decay', default=1e-4, type=float, metavar='LRD',
                    help='learning rate decay ratio (default: 1e-4')
# parser.add_argument('--wd', default=0.0, type=float,
#                     metavar='W', help='weight decay (default: 0.0)')
# parser.add_argument('--optimizer', default='adam', type=str,
#                     metavar='OPT', help='The optimizer to use (default: Adagrad)')
# Device options
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='enables CUDA training')
parser.add_argument('--gpu-id', default='0', type=str,
                    help='id(s) for CUDA_VISIBLE_DEVICES')
# parser.add_argument('--seed', type=int, default=0, metavar='S',
#                     help='random seed (default: 0)')
# parser.add_argument('--log-interval', type=int, default=10, metavar='LI',
#                     help='how many batches to wait before logging training status')
###################################
opt = parser.parse_args(args = [])

logging.info(opt)

INFO:root:Namespace(alpha=0.5, batch_k=5, batch_size=70, beta=1.2, beta1=0.5, center_loss_weight=0.5, data_path='/scratch/ys3225/deeplearningdataset', data_test_path='/scratch/ys3225/deeplearningdataset/test', data_train_path='/scratch/ys3225/deeplearningdataset/train', dataroot='/scratch/ys3225/deeplearningdataset/train', embed_dim=128, embedding_size=512, epochs=20, factor=0.5, gpu_id='0', gpus='', kvstore='device', lfw_dir='/scratch/ys3225/lfw', lfw_pairs_path='lfw_pairs.txt', log_dir='/scratch/ys3225/logdir_triplet_loss', log_interval=20, lr=0.0001, lr_beta=0.1, lr_decay=0.0001, margin=0.2, model='resnet18_v1', n_triplets=1000000, no_cuda=False, nu=0.0, optimizer='adam', resume='/scratch/ys3225/resume/run-optim_adam-lr0.001-wd0.0-embeddings512-center0.5-MSCeleb/checkpoint_11.pth', save_model_prefix='margin_loss_model', seed=123, start_epoch=0, steps='12,14,16,18', test_batch_size=64, testdataroot='/scratch/ys3225/deeplearningdataset/test', use_pretrained=False, wd=0.0001)


In [4]:
# Settings.
mx.random.seed(opt.seed)
np.random.seed(opt.seed)

batch_size = opt.batch_size

gpus = [] if opt.gpus is None or opt.gpus is '' else [
    int(gpu) for gpu in opt.gpus.split(',')]
num_gpus = len(gpus)

batch_size *= max(1, num_gpus)
context = [mx.gpu(i) for i in gpus] if num_gpus > 0 else [mx.cpu()]
steps = [int(step) for step in opt.steps.split(',')]

# Construct model.
kwargs = {'ctx': context, 'pretrained': opt.use_pretrained}
net = models.get_model(opt.model, **kwargs)

if opt.use_pretrained:
    # Use a smaller learning rate for pre-trained convolutional layers.
    for v in net.collect_params().values():
        if 'conv' in v.name:
            setattr(v, 'lr_mult', 0.01)

net.hybridize()
net = MarginNet(net.features, opt.embed_dim, opt.batch_k)
beta = mx.gluon.Parameter('beta', shape=(100,))

In [5]:
args = opt
# set the device to use by setting CUDA_VISIBLE_DEVICES env variable in
# order to prevent any memory allocation on unused GPUs
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id

args.cuda = not args.no_cuda and torch.cuda.is_available()
np.random.seed(args.seed)

if not os.path.exists(args.log_dir):
    os.makedirs(args.log_dir)

if args.cuda:
    cudnn.benchmark = True

#LOG_DIR = args.log_dir + '/run-optim_{}-lr{}-wd{}-embeddings{}-center_loss{}-MSCeleb'.format(args.optimizer, args.lr, args.wd,args.embedding_size,args.center_loss_weight)
LOG_DIR = args.log_dir + '/run-optim_{}-n{}-lr{}-wd{}-m{}-embeddings{}-msceleb-alpha10'\
    .format(args.optimizer, args.n_triplets, args.lr, args.wd,
            args.margin,args.embedding_size)

# create logger
#logger = Logger(LOG_DIR)

kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {}
#l2_dist = PairwiseDistance(2)


transform = transforms.Compose([
                         transforms.Resize(96),
                         transforms.RandomHorizontalFlip(),
                         transforms.ToTensor(),
                         transforms.Normalize(mean = [ 0.5, 0.5, 0.5 ],
                                               std = [ 0.5, 0.5, 0.5 ])
                     ])

#train_dir = TripletFaceDataset(args.dataroot,transform=transform)
#train_dir = TripletFaceDataset(dir=args.dataroot,n_triplets=args.n_triplets,transform=transform)
#train_dir = MarginDataset(dir=args.dataroot,n_triplets=args.n_triplets,transform=transform)
train_dir = ImageFolder(args.dataroot,transform=transform)
train_loader = torch.utils.data.DataLoader(train_dir,batch_size=args.batch_size, shuffle=False, **kwargs)
testacc_dir = ImageFolder(args.testdataroot,transform=transform)
test_loader = torch.utils.data.DataLoader(
    LFWDataset(dir=args.lfw_dir,pairs_path=args.lfw_pairs_path,
                     transform=transform),
    batch_size=args.batch_size, shuffle=False, **kwargs)
testaccuracy_loader = torch.utils.data.DataLoader(testacc_dir,
    batch_size=args.batch_size, shuffle=True, **kwargs)



100%|██████████| 6000/6000 [00:05<00:00, 1099.02it/s]


In [6]:

# Get iterators.
#train_data, val_data = cub200_iterator(opt.data_path, opt.batch_k, batch_size, (3, 224, 224))


def get_distance_matrix(x):
    """Get distance matrix given a matrix. Used in testing."""
    square = nd.sum(x ** 2.0, axis=1, keepdims=True)
    distance_square = square + square.transpose() - (2.0 * nd.dot(x, x.transpose()))
    return nd.sqrt(distance_square)


def evaluate_emb(emb, labels):
    """Evaluate embeddings based on Recall@k."""
    d_mat = get_distance_matrix(emb)
    d_mat = d_mat.asnumpy()
    labels = labels.asnumpy()

    names = []
    accs = []
    for k in [1, 2, 4, 8, 16]:
        names.append('Recall@%d' % k)
        correct, cnt = 0.0, 0.0
        for i in range(emb.shape[0]):
            d_mat[i, i] = 1e10
            nns = argpartition(d_mat[i], k)[:k]
            if any(labels[i] == labels[nn] for nn in nns):
                correct += 1
            cnt += 1
        accs.append(correct/cnt)
    return names, accs


def test(ctx):
    """Test a model."""
    val_data.reset()
    outputs = []
    labels = []
    for batch in val_data:
        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
        for x in data:
            outputs.append(net(x)[-1])
        labels += label

    outputs = nd.concatenate(outputs, axis=0)[:val_data.n_test]
    labels = nd.concatenate(labels, axis=0)[:val_data.n_test]
    return evaluate_emb(outputs, labels)


def get_lr(lr, epoch, steps, factor):
    """Get learning rate based on schedule."""
    for s in steps:
        if epoch >= s:
            lr *= factor
    return lr


def train(epochs, ctx):
    """Training function."""
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)

    opt_options = {'learning_rate': opt.lr, 'wd': opt.wd}
    if opt.optimizer == 'sgd':
        opt_options['momentum'] = 0.9
    if opt.optimizer == 'adam':
        opt_options['epsilon'] = 1e-7
    trainer = gluon.Trainer(net.collect_params(), opt.optimizer,
                            opt_options,
                            kvstore=opt.kvstore)
    if opt.lr_beta > 0.0:
        # Jointly train class-specific beta.
        # See "sampling matters in deep embedding learning" paper for details.
        beta.initialize(mx.init.Constant(opt.beta), ctx=ctx)
        trainer_beta = gluon.Trainer([beta], 'sgd',
                                     {'learning_rate': opt.lr_beta, 'momentum': 0.9},
                                     kvstore=opt.kvstore)

    loss = MarginLoss(margin=opt.margin, nu=opt.nu)

    best_val = 0.0
    for epoch in range(epochs):
        tic = time.time()
        prev_loss, cumulative_loss = 0.0, 0.0

        # Learning rate schedule.
        trainer.set_learning_rate(get_lr(opt.lr, epoch, steps, opt.factor))
        logging.info('Epoch %d learning rate=%f', epoch, trainer.learning_rate)
        if opt.lr_beta > 0.0:
            trainer_beta.set_learning_rate(get_lr(opt.lr_beta, epoch, steps, opt.factor))
            logging.info('Epoch %d beta learning rate=%f', epoch, trainer_beta.learning_rate)
            
            
        pbar = tqdm(enumerate(train_loader))
        labels, distances = [], []
            
            
        # Inner training loop.
        #for i in range(200):
        for batch_idx, (data,label) in pbar:
#             batch = train_data.next()
#             data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
#             label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)



#         data_v = Variable(data.cuda())
#         target_var = Variable(label)
            #Ls = []
            with ag.record():
                x = mx.ndarray.array(data.numpy())
                y = mx.ndarray.array(label.numpy())
                a_indices, anchors, positives, negatives, _ = net(x)

                if opt.lr_beta > 0.0:
                    L = loss(anchors, positives, negatives, beta, y[a_indices])
                else:
                    L = loss(anchors, positives, negatives, opt.beta, None)

                # Store the loss and do backward after we have done forward
                # on all GPUs for better speed on multiple GPUs.
                #Ls.append(L)
                cumulative_loss += nd.mean(L).asscalar()

            #for L in Ls:
                L.backward()

            # Update.
            trainer.step(x.shape[0])
            if opt.lr_beta > 0.0:
                trainer_beta.step(x.shape[0])

            if (batch_idx) % opt.log_interval == 0:
                #print(cumulative_loss)
                logging.info('[Epoch %d, Iter %d] training loss=%f' % (
                    epoch, batch_idx, cumulative_loss - prev_loss))
                prev_loss = cumulative_loss

        logging.info('[Epoch %d] training loss=%f'%(epoch, cumulative_loss))
        logging.info('[Epoch %d] time cost: %f'%(epoch, time.time()-tic))

        names, val_accs = test(ctx)
        for name, val_acc in zip(names, val_accs):
            logging.info('[Epoch %d] validation: %s=%f'%(epoch, name, val_acc))

        if val_accs[0] > best_val:
            best_val = val_accs[0]
            logging.info('Saving %s.' % opt.save_model_prefix)
            net.collect_params().save('%s.params' % opt.save_model_prefix)
    return best_val

In [17]:
if __name__ == '__main__':
    best_val_recall = train(opt.epochs, context)
    print('Best validation Recall@1: %.2f.' % best_val_recall)

  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set force_reinit=True to re-initialize."%self.name)
  "Set for

INFO:root:Epoch 0 beta learning rate=0.100000




0it [00:00, ?it/s][A[A[A[AINFO:root:[Epoch 0, Iter 0] training loss=0.885826




1it [00:03,  3.77s/it][A[A[A[A



2it [00:11,  5.73s/it][A[A[A[A



3it [00:19,  6.44s/it][A[A[A[A



4it [00:27,  6.77s/it][A[A[A[A



5it [00:34,  6.95s/it][A[A[A[A



6it [00:42,  7.08s/it][A[A[A[A



7it [00:51,  7.31s/it][A[A[A[A



8it [00:58,  7.36s/it][A[A[A[A



9it [01:06,  7.40s/it][A[A[A[A



10it [01:14,  7.43s/it][A[A[A[A



11it [01:22,  7.46s/it][A[A[A[A



12it [01:29,  7.48s/it][A[A[A[A



13it [01:37,  7.49s/it][A[A[A[A



14it [01:45,  7.51s/it][A[A[A[A



15it [01:52,  7.52s/it][A[A[A[A



16it [02:00,  7.53s/it][A[A[A[A



17it [02:08,  7.54s/it][A[A[A[A



18it [02:15,  7.55s/it][A[A[A[A



19it [02:23,  7.56s/it][A[A[A[AProcess Process-9:
Process Process-10:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/share/apps/python3/

KeyboardInterrupt: 

In [None]:
ctx =context
epochs = 1
"""Training function."""
if isinstance(ctx, mx.Context):
    ctx = [ctx]
net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)

opt_options = {'learning_rate': opt.lr, 'wd': opt.wd}
if opt.optimizer == 'sgd':
    opt_options['momentum'] = 0.9
if opt.optimizer == 'adam':
    opt_options['epsilon'] = 1e-7
trainer = gluon.Trainer(net.collect_params(), opt.optimizer,
                        opt_options,
                        kvstore=opt.kvstore)
if opt.lr_beta > 0.0:
    # Jointly train class-specific beta.
    # See "sampling matters in deep embedding learning" paper for details.
    beta.initialize(mx.init.Constant(opt.beta), ctx=ctx)
    trainer_beta = gluon.Trainer([beta], 'sgd',
                                 {'learning_rate': opt.lr_beta, 'momentum': 0.9},
                                 kvstore=opt.kvstore)

loss = MarginLoss(margin=opt.margin, nu=opt.nu)

best_val = 0.0
for epoch in range(epochs):
    tic = time.time()
    prev_loss, cumulative_loss = 0.0, 0.0

    # Learning rate schedule.
    trainer.set_learning_rate(get_lr(opt.lr, epoch, steps, opt.factor))
    logging.info('Epoch %d learning rate=%f', epoch, trainer.learning_rate)
    if opt.lr_beta > 0.0:
        trainer_beta.set_learning_rate(get_lr(opt.lr_beta, epoch, steps, opt.factor))
        logging.info('Epoch %d beta learning rate=%f', epoch, trainer_beta.learning_rate)


    pbar = tqdm(enumerate(train_loader))
    labels, distances = [], []


    # Inner training loop.
    #for i in range(200):
    for batch_idx, (data,label) in pbar:
#             batch = train_data.next()
#             data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
#             label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)



#         data_v = Variable(data.cuda())
#         target_var = Variable(label)
        #Ls = []
        with ag.record():
            x = mx.ndarray.array(data.numpy())
            y = mx.ndarray.array(label.numpy())
            a_indices, anchors, positives, negatives, _ = net(x)

            if opt.lr_beta > 0.0:
                L = loss(anchors, positives, negatives, beta, y[a_indices])
            else:
                L = loss(anchors, positives, negatives, opt.beta, None)
            #print('L',L)
            # Store the loss and do backward after we have done forward
            # on all GPUs for better speed on multiple GPUs.
            #Ls.append(L)
            cumulative_loss += nd.mean(L).asscalar()

        #for L in Ls:
            L.backward()

        # Update.
        trainer.step(x.shape[0])
        if opt.lr_beta > 0.0:
            trainer_beta.step(x.shape[0])

        if (batch_idx) % opt.log_interval == 0:
            #print('cumulative_loss',cumulative_loss)
            logging.info('[Epoch %d, Iter %d] training loss=%f' % (
                epoch, batch_idx, cumulative_loss - prev_loss))
            prev_loss = cumulative_loss

    logging.info('[Epoch %d] training loss=%f'%(epoch, cumulative_loss))
    logging.info('[Epoch %d] time cost: %f'%(epoch, time.time()-tic))

    names, val_accs = test(ctx)
    for name, val_acc in zip(names, val_accs):
        logging.info('[Epoch %d] validation: %s=%f'%(epoch, name, val_acc))

    if val_accs[0] > best_val:
        best_val = val_accs[0]
        logging.info('Saving %s.' % opt.save_model_prefix)
        net.collect_params().save('%s.params' % opt.save_model_prefix)


INFO:root:Epoch 0 learning rate=0.000100
INFO:root:Epoch 0 beta learning rate=0.100000
0it [00:00, ?it/s]INFO:root:[Epoch 0, Iter 0] training loss=0.930200
1it [00:11, 11.17s/it]

loss 
[ 0.93019962]
<NDArray 1 @cpu(0)>
L 
[ 0.93019962]
<NDArray 1 @cpu(0)>
cumulative_loss 0.930199623108


2it [00:24, 12.08s/it]

loss 
[ 0.91740507]
<NDArray 1 @cpu(0)>
L 
[ 0.91740507]
<NDArray 1 @cpu(0)>


3it [00:31, 10.59s/it]

loss 
[ 0.90673321]
<NDArray 1 @cpu(0)>
L 
[ 0.90673321]
<NDArray 1 @cpu(0)>


4it [00:39,  9.83s/it]

loss 
[ 0.88220185]
<NDArray 1 @cpu(0)>
L 
[ 0.88220185]
<NDArray 1 @cpu(0)>


5it [00:46,  9.39s/it]

loss 
[ 0.83398068]
<NDArray 1 @cpu(0)>
L 
[ 0.83398068]
<NDArray 1 @cpu(0)>


6it [00:54,  9.08s/it]

loss 
[ 0.77277905]
<NDArray 1 @cpu(0)>
L 
[ 0.77277905]
<NDArray 1 @cpu(0)>


7it [01:02,  8.86s/it]

loss 
[ 0.71019232]
<NDArray 1 @cpu(0)>
L 
[ 0.71019232]
<NDArray 1 @cpu(0)>


8it [01:09,  8.70s/it]

loss 
[ 0.62252474]
<NDArray 1 @cpu(0)>
L 
[ 0.62252474]
<NDArray 1 @cpu(0)>


9it [01:17,  8.57s/it]

loss 
[ 0.48916167]
<NDArray 1 @cpu(0)>
L 
[ 0.48916167]
<NDArray 1 @cpu(0)>


10it [01:24,  8.48s/it]

loss 
[ 0.47051683]
<NDArray 1 @cpu(0)>
L 
[ 0.47051683]
<NDArray 1 @cpu(0)>


11it [01:32,  8.41s/it]

loss 
[ 0.45091084]
<NDArray 1 @cpu(0)>
L 
[ 0.45091084]
<NDArray 1 @cpu(0)>


12it [01:40,  8.35s/it]

loss 
[ 0.42601067]
<NDArray 1 @cpu(0)>
L 
[ 0.42601067]
<NDArray 1 @cpu(0)>


13it [01:47,  8.30s/it]

loss 
[ 0.46469131]
<NDArray 1 @cpu(0)>
L 
[ 0.46469131]
<NDArray 1 @cpu(0)>


14it [01:55,  8.25s/it]

loss 
[ 0.51586175]
<NDArray 1 @cpu(0)>
L 
[ 0.51586175]
<NDArray 1 @cpu(0)>


15it [02:03,  8.21s/it]

loss 
[ 1.16150987]
<NDArray 1 @cpu(0)>
L 
[ 1.16150987]
<NDArray 1 @cpu(0)>


16it [02:10,  8.18s/it]

loss 
[ 1.17238867]
<NDArray 1 @cpu(0)>
L 
[ 1.17238867]
<NDArray 1 @cpu(0)>


17it [02:18,  8.15s/it]

loss 
[ 1.15615427]
<NDArray 1 @cpu(0)>
L 
[ 1.15615427]
<NDArray 1 @cpu(0)>


18it [02:26,  8.13s/it]

loss 
[ 1.11623287]
<NDArray 1 @cpu(0)>
L 
[ 1.11623287]
<NDArray 1 @cpu(0)>


19it [02:33,  8.10s/it]

loss 
[ 1.07199419]
<NDArray 1 @cpu(0)>
L 
[ 1.07199419]
<NDArray 1 @cpu(0)>


20it [02:41,  8.08s/it]

loss 
[ 0.9871161]
<NDArray 1 @cpu(0)>
L 
[ 0.9871161]
<NDArray 1 @cpu(0)>


INFO:root:[Epoch 0, Iter 20] training loss=16.120860
21it [02:49,  8.06s/it]

loss 
[ 0.99249452]
<NDArray 1 @cpu(0)>
L 
[ 0.99249452]
<NDArray 1 @cpu(0)>
cumulative_loss 17.0510601103


22it [02:56,  8.04s/it]

loss 
[ 0.95787352]
<NDArray 1 @cpu(0)>
L 
[ 0.95787352]
<NDArray 1 @cpu(0)>


23it [03:04,  8.02s/it]

loss 
[ 0.88952804]
<NDArray 1 @cpu(0)>
L 
[ 0.88952804]
<NDArray 1 @cpu(0)>


24it [03:12,  8.01s/it]

loss 
[ 0.83331943]
<NDArray 1 @cpu(0)>
L 
[ 0.83331943]
<NDArray 1 @cpu(0)>


25it [03:19,  7.99s/it]

loss 
[ 0.79198456]
<NDArray 1 @cpu(0)>
L 
[ 0.79198456]
<NDArray 1 @cpu(0)>


26it [03:27,  7.98s/it]

loss 
[ 0.73027498]
<NDArray 1 @cpu(0)>
L 
[ 0.73027498]
<NDArray 1 @cpu(0)>


27it [03:35,  7.97s/it]

loss 
[ 0.68877345]
<NDArray 1 @cpu(0)>
L 
[ 0.68877345]
<NDArray 1 @cpu(0)>


28it [03:42,  7.96s/it]

loss 
[ 0.65354192]
<NDArray 1 @cpu(0)>
L 
[ 0.65354192]
<NDArray 1 @cpu(0)>


29it [03:50,  7.95s/it]

loss 
[ 0.72846258]
<NDArray 1 @cpu(0)>
L 
[ 0.72846258]
<NDArray 1 @cpu(0)>


30it [03:58,  7.93s/it]

loss 
[ 0.72060961]
<NDArray 1 @cpu(0)>
L 
[ 0.72060961]
<NDArray 1 @cpu(0)>


31it [04:05,  7.93s/it]

loss 
[ 0.69318932]
<NDArray 1 @cpu(0)>
L 
[ 0.69318932]
<NDArray 1 @cpu(0)>


32it [04:13,  7.92s/it]

loss 
[ 0.65480667]
<NDArray 1 @cpu(0)>
L 
[ 0.65480667]
<NDArray 1 @cpu(0)>


33it [04:21,  7.91s/it]

loss 
[ 0.71406746]
<NDArray 1 @cpu(0)>
L 
[ 0.71406746]
<NDArray 1 @cpu(0)>


34it [04:28,  7.91s/it]

loss 
[ 0.63331634]
<NDArray 1 @cpu(0)>
L 
[ 0.63331634]
<NDArray 1 @cpu(0)>


35it [04:36,  7.90s/it]

loss 
[ 0.60383046]
<NDArray 1 @cpu(0)>
L 
[ 0.60383046]
<NDArray 1 @cpu(0)>


36it [04:44,  7.90s/it]

loss 
[ 0.58724391]
<NDArray 1 @cpu(0)>
L 
[ 0.58724391]
<NDArray 1 @cpu(0)>


37it [04:52,  7.89s/it]

loss 
[ 0.57116336]
<NDArray 1 @cpu(0)>
L 
[ 0.57116336]
<NDArray 1 @cpu(0)>


38it [04:59,  7.89s/it]

loss 
[ 0.5626362]
<NDArray 1 @cpu(0)>
L 
[ 0.5626362]
<NDArray 1 @cpu(0)>


39it [05:07,  7.88s/it]

loss 
[ 0.54934257]
<NDArray 1 @cpu(0)>
L 
[ 0.54934257]
<NDArray 1 @cpu(0)>


40it [05:15,  7.88s/it]

loss 
[ 0.54553598]
<NDArray 1 @cpu(0)>
L 
[ 0.54553598]
<NDArray 1 @cpu(0)>


INFO:root:[Epoch 0, Iter 40] training loss=13.661322
41it [05:22,  7.87s/it]

loss 
[ 0.55182171]
<NDArray 1 @cpu(0)>
L 
[ 0.55182171]
<NDArray 1 @cpu(0)>
cumulative_loss 30.7123821676


42it [05:30,  7.87s/it]

loss 
[ 0.40666464]
<NDArray 1 @cpu(0)>
L 
[ 0.40666464]
<NDArray 1 @cpu(0)>


43it [05:38,  7.87s/it]

loss 
[ 0.35743579]
<NDArray 1 @cpu(0)>
L 
[ 0.35743579]
<NDArray 1 @cpu(0)>


44it [05:46,  7.86s/it]

loss 
[ 0.51043445]
<NDArray 1 @cpu(0)>
L 
[ 0.51043445]
<NDArray 1 @cpu(0)>


45it [05:53,  7.86s/it]

loss 
[ 0.46853647]
<NDArray 1 @cpu(0)>
L 
[ 0.46853647]
<NDArray 1 @cpu(0)>


46it [06:01,  7.86s/it]

loss 
[ 0.42070645]
<NDArray 1 @cpu(0)>
L 
[ 0.42070645]
<NDArray 1 @cpu(0)>


47it [06:09,  7.85s/it]

loss 
[ 0.38056162]
<NDArray 1 @cpu(0)>
L 
[ 0.38056162]
<NDArray 1 @cpu(0)>


48it [06:16,  7.85s/it]

loss 
[ 0.2944499]
<NDArray 1 @cpu(0)>
L 
[ 0.2944499]
<NDArray 1 @cpu(0)>


49it [06:24,  7.85s/it]

loss 
[ 0.49778351]
<NDArray 1 @cpu(0)>
L 
[ 0.49778351]
<NDArray 1 @cpu(0)>


50it [06:32,  7.85s/it]

loss 
[ 0.26283753]
<NDArray 1 @cpu(0)>
L 
[ 0.26283753]
<NDArray 1 @cpu(0)>


51it [06:39,  7.84s/it]

loss 
[ 0.31132311]
<NDArray 1 @cpu(0)>
L 
[ 0.31132311]
<NDArray 1 @cpu(0)>


52it [06:47,  7.84s/it]

loss 
[ 0.35376164]
<NDArray 1 @cpu(0)>
L 
[ 0.35376164]
<NDArray 1 @cpu(0)>


53it [06:55,  7.84s/it]

loss 
[ 0.46350756]
<NDArray 1 @cpu(0)>
L 
[ 0.46350756]
<NDArray 1 @cpu(0)>


54it [07:02,  7.83s/it]

loss 
[ 0.39023423]
<NDArray 1 @cpu(0)>
L 
[ 0.39023423]
<NDArray 1 @cpu(0)>


55it [07:10,  7.83s/it]

loss 
[ 0.39793965]
<NDArray 1 @cpu(0)>
L 
[ 0.39793965]
<NDArray 1 @cpu(0)>


56it [07:18,  7.83s/it]

loss 
[ 0.32669663]
<NDArray 1 @cpu(0)>
L 
[ 0.32669663]
<NDArray 1 @cpu(0)>


57it [07:26,  7.82s/it]

loss 
[ 0.32369828]
<NDArray 1 @cpu(0)>
L 
[ 0.32369828]
<NDArray 1 @cpu(0)>


58it [07:33,  7.82s/it]

loss 
[ 0.3076607]
<NDArray 1 @cpu(0)>
L 
[ 0.3076607]
<NDArray 1 @cpu(0)>


59it [07:41,  7.82s/it]

loss 
[ 0.37766787]
<NDArray 1 @cpu(0)>
L 
[ 0.37766787]
<NDArray 1 @cpu(0)>


60it [07:49,  7.82s/it]

loss 
[ 0.35369188]
<NDArray 1 @cpu(0)>
L 
[ 0.35369188]
<NDArray 1 @cpu(0)>


INFO:root:[Epoch 0, Iter 60] training loss=7.451401
61it [07:56,  7.82s/it]

loss 
[ 0.24580872]
<NDArray 1 @cpu(0)>
L 
[ 0.24580872]
<NDArray 1 @cpu(0)>
cumulative_loss 38.1637828052


62it [08:04,  7.81s/it]

loss 
[ 0.25410056]
<NDArray 1 @cpu(0)>
L 
[ 0.25410056]
<NDArray 1 @cpu(0)>


63it [08:12,  7.81s/it]

loss 
[ 0.31967944]
<NDArray 1 @cpu(0)>
L 
[ 0.31967944]
<NDArray 1 @cpu(0)>


64it [08:19,  7.81s/it]

loss 
[ 0.26422262]
<NDArray 1 @cpu(0)>
L 
[ 0.26422262]
<NDArray 1 @cpu(0)>


65it [08:27,  7.81s/it]

loss 
[ 0.20363738]
<NDArray 1 @cpu(0)>
L 
[ 0.20363738]
<NDArray 1 @cpu(0)>


66it [08:35,  7.81s/it]

loss 
[ 0.21358664]
<NDArray 1 @cpu(0)>
L 
[ 0.21358664]
<NDArray 1 @cpu(0)>


67it [08:43,  7.81s/it]

loss 
[ 0.29319295]
<NDArray 1 @cpu(0)>
L 
[ 0.29319295]
<NDArray 1 @cpu(0)>


68it [08:50,  7.80s/it]

loss 
[ 0.40762755]
<NDArray 1 @cpu(0)>
L 
[ 0.40762755]
<NDArray 1 @cpu(0)>


69it [08:58,  7.80s/it]

loss 
[ 0.29034105]
<NDArray 1 @cpu(0)>
L 
[ 0.29034105]
<NDArray 1 @cpu(0)>


70it [09:06,  7.80s/it]

loss 
[ 0.28230765]
<NDArray 1 @cpu(0)>
L 
[ 0.28230765]
<NDArray 1 @cpu(0)>


71it [09:13,  7.80s/it]

loss 
[ 0.22582756]
<NDArray 1 @cpu(0)>
L 
[ 0.22582756]
<NDArray 1 @cpu(0)>


72it [09:21,  7.80s/it]

loss 
[ 0.21078564]
<NDArray 1 @cpu(0)>
L 
[ 0.21078564]
<NDArray 1 @cpu(0)>


73it [09:29,  7.80s/it]

loss 
[ 0.19969577]
<NDArray 1 @cpu(0)>
L 
[ 0.19969577]
<NDArray 1 @cpu(0)>


74it [09:37,  7.80s/it]

loss 
[ 0.19925973]
<NDArray 1 @cpu(0)>
L 
[ 0.19925973]
<NDArray 1 @cpu(0)>


75it [09:45,  7.80s/it]

loss 
[ 0.19943261]
<NDArray 1 @cpu(0)>
L 
[ 0.19943261]
<NDArray 1 @cpu(0)>


76it [09:52,  7.80s/it]

loss 
[ 0.19932304]
<NDArray 1 @cpu(0)>
L 
[ 0.19932304]
<NDArray 1 @cpu(0)>


77it [10:00,  7.80s/it]

loss 
[ 0.19992433]
<NDArray 1 @cpu(0)>
L 
[ 0.19992433]
<NDArray 1 @cpu(0)>


78it [10:08,  7.80s/it]

loss 
[ 0.19931647]
<NDArray 1 @cpu(0)>
L 
[ 0.19931647]
<NDArray 1 @cpu(0)>


79it [10:16,  7.80s/it]

loss 
[ 0.19965608]
<NDArray 1 @cpu(0)>
L 
[ 0.19965608]
<NDArray 1 @cpu(0)>


80it [10:23,  7.80s/it]

loss 
[ 0.19983438]
<NDArray 1 @cpu(0)>
L 
[ 0.19983438]
<NDArray 1 @cpu(0)>


INFO:root:[Epoch 0, Iter 80] training loss=4.761119
81it [10:31,  7.80s/it]

loss 
[ 0.19936787]
<NDArray 1 @cpu(0)>
L 
[ 0.19936787]
<NDArray 1 @cpu(0)>
cumulative_loss 42.9249021411


82it [10:39,  7.79s/it]

loss 
[ 0.19984844]
<NDArray 1 @cpu(0)>
L 
[ 0.19984844]
<NDArray 1 @cpu(0)>


83it [10:46,  7.80s/it]

loss 
[ 0.19936515]
<NDArray 1 @cpu(0)>
L 
[ 0.19936515]
<NDArray 1 @cpu(0)>


84it [10:54,  7.80s/it]

loss 
[ 0.1985952]
<NDArray 1 @cpu(0)>
L 
[ 0.1985952]
<NDArray 1 @cpu(0)>


85it [11:02,  7.80s/it]

loss 
[ 0.19936571]
<NDArray 1 @cpu(0)>
L 
[ 0.19936571]
<NDArray 1 @cpu(0)>


86it [11:10,  7.79s/it]

loss 
[ 0.19877286]
<NDArray 1 @cpu(0)>
L 
[ 0.19877286]
<NDArray 1 @cpu(0)>


87it [11:17,  7.79s/it]

loss 
[ 0.19966914]
<NDArray 1 @cpu(0)>
L 
[ 0.19966914]
<NDArray 1 @cpu(0)>


88it [11:25,  7.79s/it]

loss 
[ 0.19904248]
<NDArray 1 @cpu(0)>
L 
[ 0.19904248]
<NDArray 1 @cpu(0)>


89it [11:33,  7.79s/it]

loss 
[ 0.1995123]
<NDArray 1 @cpu(0)>
L 
[ 0.1995123]
<NDArray 1 @cpu(0)>


90it [11:41,  7.79s/it]

loss 
[ 0.19970463]
<NDArray 1 @cpu(0)>
L 
[ 0.19970463]
<NDArray 1 @cpu(0)>


91it [11:48,  7.79s/it]

loss 
[ 0.19948688]
<NDArray 1 @cpu(0)>
L 
[ 0.19948688]
<NDArray 1 @cpu(0)>


92it [11:56,  7.79s/it]

loss 
[ 0.19881736]
<NDArray 1 @cpu(0)>
L 
[ 0.19881736]
<NDArray 1 @cpu(0)>


93it [12:04,  7.79s/it]

loss 
[ 0.19972959]
<NDArray 1 @cpu(0)>
L 
[ 0.19972959]
<NDArray 1 @cpu(0)>


94it [12:12,  7.79s/it]

loss 
[ 0.19984749]
<NDArray 1 @cpu(0)>
L 
[ 0.19984749]
<NDArray 1 @cpu(0)>


95it [12:20,  7.79s/it]

loss 
[ 0.19863568]
<NDArray 1 @cpu(0)>
L 
[ 0.19863568]
<NDArray 1 @cpu(0)>


96it [12:27,  7.79s/it]

loss 
[ 0.19985077]
<NDArray 1 @cpu(0)>
L 
[ 0.19985077]
<NDArray 1 @cpu(0)>


97it [12:35,  7.79s/it]

loss 
[ 0.22420447]
<NDArray 1 @cpu(0)>
L 
[ 0.22420447]
<NDArray 1 @cpu(0)>


98it [12:43,  7.79s/it]

loss 
[ 0.19966483]
<NDArray 1 @cpu(0)>
L 
[ 0.19966483]
<NDArray 1 @cpu(0)>


99it [12:50,  7.79s/it]

loss 
[ 0.21588317]
<NDArray 1 @cpu(0)>
L 
[ 0.21588317]
<NDArray 1 @cpu(0)>


100it [12:58,  7.79s/it]

loss 
[ 0.21884115]
<NDArray 1 @cpu(0)>
L 
[ 0.21884115]
<NDArray 1 @cpu(0)>


INFO:root:[Epoch 0, Iter 100] training loss=4.046547
101it [13:06,  7.79s/it]

loss 
[ 0.19770955]
<NDArray 1 @cpu(0)>
L 
[ 0.19770955]
<NDArray 1 @cpu(0)>
cumulative_loss 46.9714489877


102it [13:13,  7.78s/it]

loss 
[ 0.21364801]
<NDArray 1 @cpu(0)>
L 
[ 0.21364801]
<NDArray 1 @cpu(0)>


103it [13:21,  7.78s/it]

loss 
[ 0.19925553]
<NDArray 1 @cpu(0)>
L 
[ 0.19925553]
<NDArray 1 @cpu(0)>


104it [13:29,  7.78s/it]

loss 
[ 0.20009357]
<NDArray 1 @cpu(0)>
L 
[ 0.20009357]
<NDArray 1 @cpu(0)>


105it [13:36,  7.78s/it]

loss 
[ 0.2057824]
<NDArray 1 @cpu(0)>
L 
[ 0.2057824]
<NDArray 1 @cpu(0)>


106it [13:44,  7.78s/it]

loss 
[ 0.24411938]
<NDArray 1 @cpu(0)>
L 
[ 0.24411938]
<NDArray 1 @cpu(0)>


107it [13:52,  7.78s/it]

loss 
[ 0.20065646]
<NDArray 1 @cpu(0)>
L 
[ 0.20065646]
<NDArray 1 @cpu(0)>


108it [14:00,  7.78s/it]

loss 
[ 0.22489665]
<NDArray 1 @cpu(0)>
L 
[ 0.22489665]
<NDArray 1 @cpu(0)>


109it [14:07,  7.78s/it]

loss 
[ 0.26044676]
<NDArray 1 @cpu(0)>
L 
[ 0.26044676]
<NDArray 1 @cpu(0)>


110it [14:15,  7.78s/it]

loss 
[ 0.20777147]
<NDArray 1 @cpu(0)>
L 
[ 0.20777147]
<NDArray 1 @cpu(0)>


111it [14:23,  7.78s/it]

loss 
[ 0.20863782]
<NDArray 1 @cpu(0)>
L 
[ 0.20863782]
<NDArray 1 @cpu(0)>


112it [14:31,  7.78s/it]

loss 
[ 0.21230014]
<NDArray 1 @cpu(0)>
L 
[ 0.21230014]
<NDArray 1 @cpu(0)>


113it [14:38,  7.78s/it]

loss 
[ 0.19798717]
<NDArray 1 @cpu(0)>
L 
[ 0.19798717]
<NDArray 1 @cpu(0)>


114it [14:46,  7.78s/it]

loss 
[ 0.21652737]
<NDArray 1 @cpu(0)>
L 
[ 0.21652737]
<NDArray 1 @cpu(0)>


115it [14:54,  7.78s/it]

loss 
[ 0.2178912]
<NDArray 1 @cpu(0)>
L 
[ 0.2178912]
<NDArray 1 @cpu(0)>


116it [15:02,  7.78s/it]

loss 
[ 0.23770939]
<NDArray 1 @cpu(0)>
L 
[ 0.23770939]
<NDArray 1 @cpu(0)>


117it [15:09,  7.78s/it]

loss 
[ 0.20031689]
<NDArray 1 @cpu(0)>
L 
[ 0.20031689]
<NDArray 1 @cpu(0)>


118it [15:17,  7.78s/it]

loss 
[ 0.20539396]
<NDArray 1 @cpu(0)>
L 
[ 0.20539396]
<NDArray 1 @cpu(0)>


119it [15:25,  7.78s/it]

loss 
[ 0.23265149]
<NDArray 1 @cpu(0)>
L 
[ 0.23265149]
<NDArray 1 @cpu(0)>


120it [15:32,  7.77s/it]

loss 
[ 0.22567245]
<NDArray 1 @cpu(0)>
L 
[ 0.22567245]
<NDArray 1 @cpu(0)>


INFO:root:[Epoch 0, Iter 120] training loss=4.307848
121it [15:40,  7.77s/it]

loss 
[ 0.19608991]
<NDArray 1 @cpu(0)>
L 
[ 0.19608991]
<NDArray 1 @cpu(0)>
cumulative_loss 51.2792969793


122it [15:48,  7.77s/it]

loss 
[ 0.19886924]
<NDArray 1 @cpu(0)>
L 
[ 0.19886924]
<NDArray 1 @cpu(0)>


123it [15:56,  7.77s/it]

loss 
[ 0.23855884]
<NDArray 1 @cpu(0)>
L 
[ 0.23855884]
<NDArray 1 @cpu(0)>


124it [16:03,  7.77s/it]

loss 
[ 0.25491846]
<NDArray 1 @cpu(0)>
L 
[ 0.25491846]
<NDArray 1 @cpu(0)>


125it [16:11,  7.77s/it]

loss 
[ 0.2227286]
<NDArray 1 @cpu(0)>
L 
[ 0.2227286]
<NDArray 1 @cpu(0)>


126it [16:19,  7.77s/it]

loss 
[ 0.1985635]
<NDArray 1 @cpu(0)>
L 
[ 0.1985635]
<NDArray 1 @cpu(0)>


127it [16:27,  7.77s/it]

loss 
[ 0.19835052]
<NDArray 1 @cpu(0)>
L 
[ 0.19835052]
<NDArray 1 @cpu(0)>


128it [16:35,  7.77s/it]

loss 
[ 0.25035492]
<NDArray 1 @cpu(0)>
L 
[ 0.25035492]
<NDArray 1 @cpu(0)>


129it [16:42,  7.77s/it]

loss 
[ 0.25866833]
<NDArray 1 @cpu(0)>
L 
[ 0.25866833]
<NDArray 1 @cpu(0)>


130it [16:50,  7.77s/it]

loss 
[ 0.21738748]
<NDArray 1 @cpu(0)>
L 
[ 0.21738748]
<NDArray 1 @cpu(0)>


131it [16:58,  7.77s/it]

loss 
[ 0.19922917]
<NDArray 1 @cpu(0)>
L 
[ 0.19922917]
<NDArray 1 @cpu(0)>


132it [17:05,  7.77s/it]

loss 
[ 0.20233974]
<NDArray 1 @cpu(0)>
L 
[ 0.20233974]
<NDArray 1 @cpu(0)>


133it [17:13,  7.77s/it]

loss 
[ 0.23033763]
<NDArray 1 @cpu(0)>
L 
[ 0.23033763]
<NDArray 1 @cpu(0)>


134it [17:21,  7.77s/it]

loss 
[ 0.24815291]
<NDArray 1 @cpu(0)>
L 
[ 0.24815291]
<NDArray 1 @cpu(0)>


135it [17:28,  7.77s/it]

loss 
[ 0.22015187]
<NDArray 1 @cpu(0)>
L 
[ 0.22015187]
<NDArray 1 @cpu(0)>


136it [17:36,  7.77s/it]

loss 
[ 0.24502853]
<NDArray 1 @cpu(0)>
L 
[ 0.24502853]
<NDArray 1 @cpu(0)>


137it [17:44,  7.77s/it]

loss 
[ 0.24900842]
<NDArray 1 @cpu(0)>
L 
[ 0.24900842]
<NDArray 1 @cpu(0)>


138it [17:52,  7.77s/it]

loss 
[ 0.24296364]
<NDArray 1 @cpu(0)>
L 
[ 0.24296364]
<NDArray 1 @cpu(0)>


139it [18:00,  7.77s/it]

loss 
[ 0.19890828]
<NDArray 1 @cpu(0)>
L 
[ 0.19890828]
<NDArray 1 @cpu(0)>


140it [18:07,  7.77s/it]

loss 
[ 0.20485149]
<NDArray 1 @cpu(0)>
L 
[ 0.20485149]
<NDArray 1 @cpu(0)>
