In [1]:
import time
import os

import numpy as np

import matplotlib.pyplot as plt

import mxnet as mx
from mxnet import autograd, gluon

import gluoncv as gcv
from gluoncv.utils import download, viz

In [2]:
# Detection datasets read from the project's .rec record files.
original_train_data = gcv.data.RecordFileDetection('Deep-Waldo/data/train.rec')
combined_data = gcv.data.RecordFileDetection('Deep-Waldo/data/combined.rec')

classes = ['Waldo']  # only one foreground class here

# mx.gpu(0) merely constructs a Context object; it does not touch the device
# and therefore never raises, so a try/except around it can never fall back
# to the CPU. Ask MXNet how many GPUs are actually visible instead.
if mx.context.num_gpus() > 0:
    ctx = [mx.gpu(0)]
    print('GPU Loaded')
else:
    ctx = [mx.cpu()]

GPU Loaded


In [3]:
# Start from the VOC-pretrained SSD-512 / MobileNet 1.0 detector, then replace
# its output classes with our own class list.
net = gcv.model_zoo.get_model(
    'ssd_512_mobilenet1.0_voc',
    pretrained=True,
)
net.reset_class(classes)

# Metrics that track the two loss terms logged by the training loop.
smoothl1_metric = mx.metric.Loss('SmoothL1')
ce_metric = mx.metric.Loss('CrossEntropy')

# Loss object the training loop calls on the class/box predictions and targets.
mbox_loss = gcv.loss.SSDMultiBoxLoss()

In [4]:
def get_dataloader(net, train_dataset, data_shape, batch_size, num_workers):
    """Build a shuffled training ``DataLoader`` for an SSD network.

    Parameters
    ----------
    net
        Network that is run once on an all-zeros image (in train mode) to
        obtain its anchors, which are the third value it returns.
    train_dataset
        Dataset exposing ``.transform(...)``; each sample is passed through
        ``SSDDefaultTrainTransform``.
    data_shape : int
        Side length used for both the width and the height of the input.
    batch_size : int
        Number of samples per batch.
    num_workers : int
        Forwarded to ``gluon.data.DataLoader``.

    Returns
    -------
    gluon.data.DataLoader
        Loader yielding stacked (image, cls_targets, box_targets) batches.
    """
    from gluoncv.data.batchify import Stack, Tuple
    from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform

    # Square input: the same value serves as width and height.
    height = width = data_shape

    # A fake all-zeros image is enough to generate the fixed anchors that the
    # transform needs for target generation.
    with autograd.train_mode():
        dummy_image = mx.nd.zeros((1, 3, height, width))
        _, _, anchors = net(dummy_image)

    # Stack image, cls_targets and box_targets along the batch axis.
    stack_all = Tuple(Stack(), Stack(), Stack())

    transformed = train_dataset.transform(
        SSDDefaultTrainTransform(width, height, anchors))

    return gluon.data.DataLoader(
        transformed,
        batch_size=batch_size,
        shuffle=True,
        batchify_fn=stack_all,
        last_batch='rollover',
        num_workers=num_workers,
    )

# Resolution / batch-size trade-offs, presumably written as (data_shape, batch_size):
#   low res    - large batch  (128, 32)
#   high res   - small batch  (1024, 2)
#   medium res - medium batch (512, 8)   <- used below

# TODO: look into modifying data_shape -- what is the actual shape of our data?
train_loader = get_dataloader(
    net, original_train_data, data_shape=512, batch_size=8, num_workers=0)
combined_loader = get_dataloader(
    net, combined_data, data_shape=512, batch_size=8, num_workers=0)

In [5]:
# Move every parameter of the network onto the selected device(s).
net.collect_params().reset_ctx(ctx)

########################################################
#             Tune the Hyperparameters Here            #
########################################################

learning_rate = 1e-3
# This used to be set to 10 here and then silently overridden with 100 just
# before the loop; the value that actually took effect (100) is now defined
# once, in the hyperparameter section.
num_epochs = 100

trainer = gluon.Trainer(
    net.collect_params(), 'adam',
    {'learning_rate': learning_rate, 'wd': 0.0005})

########################################################
#             Tune the Hyperparameters Here            #
########################################################

# try:
#   net.load_parameters('./model/waldo.params')
#   print('Model Loaded')
# except:
#   pass

# Hybridize once up front instead of on every epoch.
# Look this up if you want to know more:
# https://mxnet.incubator.apache.org/api/python/gluon/gluon.html?highlight=hybridize#mxnet.gluon.Block.hybridize
net.hybridize(static_alloc=True, static_shape=True)

print('Training...')

for epoch in range(num_epochs):

    # Start each epoch with fresh loss metrics.
    ce_metric.reset()
    smoothl1_metric.reset()

    # Timestamp marking the start of the current batch (reset after each log).
    btic = time.time()

    for i, batch in enumerate(train_loader):

        batch_size = batch[0].shape[0]

        # Split each component of the batch across the devices in ctx.
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)

        with autograd.record():

            cls_preds = []
            box_preds = []

            for x in data:

                cls_pred, box_pred, _ = net(x)
                cls_preds.append(cls_pred)
                box_preds.append(box_pred)

            sum_loss, cls_loss, box_loss = mbox_loss(
                cls_preds, box_preds, cls_targets, box_targets)

            autograd.backward(sum_loss)
        # since we have already normalized the loss, we don't want to normalize
        # by batch-size anymore
        trainer.step(1)
        ce_metric.update(0, [l * batch_size for l in cls_loss])
        smoothl1_metric.update(0, [l * batch_size for l in box_loss])
        name1, loss1 = ce_metric.get()
        name2, loss2 = smoothl1_metric.get()

        print('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
            epoch, i, batch_size/(time.time()-btic), name1, loss1, name2, loss2))

        # Restart the per-batch timer; without this the reported speed divides
        # a single batch by the time elapsed since the start of the epoch.
        btic = time.time()

    # Make sure the checkpoint directory exists at the end of every epoch.
    os.makedirs('./model', exist_ok=True)

    # Saving is currently disabled; re-enable the line below to persist the
    # weights to the path the inference cell loads from.
#     net.save_parameters('./model/ssd_waldo_hd.params')


Training...
[Epoch 0][Batch 0], Speed: 0.417 samples/sec, CrossEntropy=8.433, SmoothL1=13.759
[Epoch 0][Batch 1], Speed: 0.355 samples/sec, CrossEntropy=8.387, SmoothL1=13.323


KeyboardInterrupt: 

In [7]:
# Rebuild the detector with the custom-class SSD architecture and load the
# saved weights onto the selected device(s).
# NOTE(review): the recorded run of this cell failed with a CUDA
# out-of-memory error; presumably GPU memory from the training cell was still
# in use -- confirm before relying on this cell in the same kernel session.
net = gcv.model_zoo.get_model(
    'ssd_512_mobilenet1.0_custom',
    classes=classes,
    pretrained_base=False,
)
net.load_parameters('./model/ssd_waldo_hd.params', ctx=ctx)

# Load one test image and run it through the network on the first device.
x, image = gcv.data.transforms.presets.ssd.load_test(
    './Deep-Waldo/data/original/30.jpg', 512)
cid, score, bbox = net(x.as_in_context(ctx[0]))

# Draw the predicted boxes for the single image in the batch.
fig, ax = plt.subplots(figsize=(22, 20))
viz.plot_bbox(image, bbox[0], score[0], cid[0], class_names=classes, ax=ax)
plt.show()

MXNetError: [21:54:52] src/storage/./pooled_storage_manager.h:143: cudaMalloc failed: out of memory

Stack trace returned 10 entries:
[bt] (0) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x3f23c2) [0x7f30ad05b3c2]
[bt] (1) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x3f2988) [0x7f30ad05b988]
[bt] (2) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x38c598f) [0x7f30b052e98f]
[bt] (3) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x38c9b82) [0x7f30b0532b82]
[bt] (4) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(void mxnet::CopyFromToDnsImpl<mshadow::cpu, mshadow::gpu>(mxnet::NDArray const&, mxnet::NDArray const&, mxnet::RunContext)+0x2e9) [0x7f30aff21089]
[bt] (5) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(void mxnet::CopyFromToImpl<mshadow::cpu, mshadow::gpu>(mxnet::NDArray const&, mxnet::NDArray const&, mxnet::RunContext, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&)+0x44b) [0x7f30aff3a62b]
[bt] (6) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x32d172b) [0x7f30aff3a72b]
[bt] (7) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x30acde4) [0x7f30afd15de4]
[bt] (8) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x30b3bfb) [0x7f30afd1cbfb]
[bt] (9) /home/exampletestaccount1/anaconda3/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x30b3e1e) [0x7f30afd1ce1e]

