# Simple Toy TS Training for Deep3D+

In [1]:
"""
Simple tester for the deep3d
"""
import tensorflow as tf
import Deep3D_branched as deep3d
import utils
import numpy as np
import os
import os.path
import h5py

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

## Importing Data from H5 Format for fast loading
- Will eventually unit test dynamic CPU data loading pipeline here

In [2]:
# Pre-packed Inria dataset stored as HDF5 for fast loading.
inria_file = '/a/data/deep3d_data/inria_data.h5'
# inria_file = 'data/inria_data.h5'

# Shards 0..6 (datasets X_0..X_6 / Y_0..Y_6) are used for training;
# shard 7 is held out for validation.
train_shards = range(7)
val_shard = 7


def load_cropped(h5_file, key):
    """Read dataset `key` from an open HDF5 file, cropped on axes 1 and 2.

    The crop (10:170 on axis 1, 16:304 on axis 2) yields 160 x 288 frames,
    which is the spatial size of the placeholders used by the training cell.
    """
    return h5_file[key][:, 10:170, 16:304, :]


# The context manager closes the file even when a dataset is missing or a read
# fails. Building the shard lists inline means the per-shard arrays are released
# right after concatenation instead of staying alive next to the merged copy.
with h5py.File(inria_file, 'r') as h5f:
    X_train = np.concatenate([load_cropped(h5f, 'X_%d' % shard) for shard in train_shards])
    Y_train = np.concatenate([load_cropped(h5f, 'Y_%d' % shard) for shard in train_shards])

    X_val = load_cropped(h5f, 'X_%d' % val_shard)
    Y_val = load_cropped(h5f, 'Y_%d' % val_shard)

# ------------------------------------------#
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Training Size:" + str(X_train.shape))
print("Validation Size:" + str(X_val.shape))

Training Size:(3500, 160, 288, 3)
Validation Size:(443, 160, 288, 3)


## Training Loop

In [3]:
batchsize = 64
num_epochs = 3
# Floor division keeps the step count an integer on both Python 2 and Python 3.
num_batches = (X_train.shape[0] // batchsize) * num_epochs
print_step = 1
viz_step = 10

# NOTE(review): the recorded run of this cell died at step 35 with an OOM while
# allocating a [64, 160, 288, 3, 33] float tensor (~1.1 GiB) in the gradient of
# the selection layer. If that recurs, lowering `batchsize` is the first knob to try.

# Define config for GPU memory debugging
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # Allocate GPU memory on demand instead of reserving it all up front
config.gpu_options.per_process_gpu_memory_fraction = 1  # Cap TF mem usage (1 = no cap below the full device)
config.allow_soft_placement = True
with tf.device('/gpu:0'):
    # Session
    sess = tf.Session(config=config)

    # Placeholders
    images = tf.placeholder(tf.float32, [None, 160, 288, 3], name='input_batch')
    true_out = tf.placeholder(tf.float32, [None, 160, 288, 3], name='ground_truth')
    train_mode = tf.placeholder(tf.bool, name='train_mode')

    # Building Net based on VGG weights
    net = deep3d.Deep3Dnet('./vgg19.npy', dropout = 0.5)
    net.build(images, train_mode)

    # Print number of variables used.
    # Earlier note: 143667240 variables, i.e. ideal size = 548MB
    # (the recorded run of this cell printed 139063583).
    print('Variable count:')
    print(net.get_var_count())

    # Define Training Objectives: summed absolute error, averaged over the batch
    with tf.variable_scope("Loss"):
        cost = tf.reduce_sum(tf.abs(net.prob - true_out))/batchsize

    # Run any ops registered under UPDATE_OPS together with every training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train = tf.train.GradientDescentOptimizer(0.002).minimize(cost)

    # Track Cost
    tf.summary.scalar('cost', cost)
    # tensorboard operations to compile summary and then write into logs
    merged = tf.summary.merge_all()

    # Run initializer only after the whole graph (optimizer included) exists, so
    # that any variables an optimizer might add are initialized as well.
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('./tensorboard_logs/', graph = sess.graph)

    # Training Loop
    print("")
    print("== Start training ==")
    try:
        for i in range(num_batches):
            # Creating Batch (indices are drawn at random, with replacement)
            image_mask = np.random.choice(X_train.shape[0], batchsize)
            images_in = X_train[image_mask,:,:,:]
            labels_in = Y_train[image_mask,:,:,:]

            # Training Step
            _, cost_val, summary = sess.run(
                [train, cost, merged],
                feed_dict={images: images_in, true_out: labels_in, train_mode: True})
            writer.add_summary(summary, i)

            # No longer needed: cost_hist.append(cost_val)
            if i%print_step == 0:
                print ("({}/{})".format(i, num_batches).ljust(10) + ' | Cost: ' + str(cost_val))
    finally:
        # Flush and close the event file even if a step fails (e.g. GPU OOM),
        # so the summaries written so far are not lost.
        writer.close()

    print("")
    print("Training Completed, storing weights")
    # Store Training Output
    net.save_npy(sess)
    

Variable count:
139063583

== Start training ==
(0/162)    | Cost: 12277.1
(1/162)    | Cost: 9202.23
(2/162)    | Cost: 8189.46
(3/162)    | Cost: 8614.69
(4/162)    | Cost: 8098.2
(5/162)    | Cost: 8203.87
(6/162)    | Cost: 7643.63
(7/162)    | Cost: 8911.97
(8/162)    | Cost: 7444.71
(9/162)    | Cost: 7882.08
(10/162)   | Cost: 7743.14
(11/162)   | Cost: 7741.81
(12/162)   | Cost: 8068.62
(13/162)   | Cost: 8914.06
(14/162)   | Cost: 8087.85
(15/162)   | Cost: 7135.56
(16/162)   | Cost: 7467.24
(17/162)   | Cost: 8208.4
(18/162)   | Cost: 7888.25
(19/162)   | Cost: 6983.28
(20/162)   | Cost: 7840.61
(21/162)   | Cost: 8272.17
(22/162)   | Cost: 8900.15
(23/162)   | Cost: 8055.81
(24/162)   | Cost: 8751.43
(25/162)   | Cost: 8451.68
(26/162)   | Cost: 7665.87
(27/162)   | Cost: 8102.41
(28/162)   | Cost: 7906.55
(29/162)   | Cost: 8806.08
(30/162)   | Cost: 8369.04
(31/162)   | Cost: 7368.95
(32/162)   | Cost: 8144.27
(33/162)   | Cost: 7734.42
(34/162)   | Cost: 7909.18
(35/162) 

ResourceExhaustedError: OOM when allocating tensor with shape[64,160,288,3,33]
	 [[Node: gradients/select/Mul_grad/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](gradients/select/Sum_grad/Tile, select/ExpandDims)]]

Caused by op u'gradients/select/Mul_grad/mul', defined at:
  File "/usr/lib64/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib64/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-d2a1efbf94c5>", line 39, in <module>
    train = tf.train.GradientDescentOptimizer(0.002).minimize(cost)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 315, in minimize
    grad_loss=grad_loss)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 386, in compute_gradients
    colocate_gradients_with_ops=colocate_gradients_with_ops)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 560, in gradients
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 368, in _MaybeCompile
    return grad_fn()  # Exit early
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/gradients_impl.py", line 560, in <lambda>
    grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/math_grad.py", line 625, in _MulGrad
    return (array_ops.reshape(math_ops.reduce_sum(grad * y, rx), sx),
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 821, in binary_op_wrapper
    return func(x, y, name=name)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 1044, in _mul_dispatch
    return gen_math_ops._mul(x, y, name=name)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1434, in _mul
    result = _op_def_lib.apply_op("Mul", x=x, y=y, name=name)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

...which was originally created as op u'select/Mul', defined at:
  File "/usr/lib64/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
[elided 18 identical lines from previous traceback]
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-d2a1efbf94c5>", line 23, in <module>
    net.build(images, train_mode)
  File "Deep3D_branched.py", line 147, in build
    self.prob  = selection.select(self.mask, rgb)
  File "selection.py", line 30, in select
    disparity_image = tf.multiply(slices, tf.expand_dims(masks, axis=3))
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 278, in multiply
    return gen_math_ops._mul(x, y, name)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1434, in _mul
    result = _op_def_lib.apply_op("Mul", x=x, y=y, name=name)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/a/h/tlee05/Envs/deep-venv/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[64,160,288,3,33]
	 [[Node: gradients/select/Mul_grad/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](gradients/select/Sum_grad/Tile, select/ExpandDims)]]


## Testing Output

In [None]:
# Run the network in inference mode on a single validation sample.
sample_idx = 170
test_img = X_val[sample_idx][np.newaxis]  # add a leading batch axis of size 1
test_ans = Y_val[sample_idx]

fetches = [net.prob, net.mask, net.up_conv]
inference_feed = {images: test_img, train_mode: False}
with tf.device("/gpu:0"):
    res, mask, up_conv = sess.run(fetches, feed_dict=inference_feed)

In [None]:
import matplotlib.pyplot as plt

# Display the input frame, the ground-truth target and the network output in turn.
panels = [
    ("--- Input ---", test_img[0]),
    ("--- GT ---", test_ans),
    ("--- Our result ---", res[0]),
]
for caption, frame in panels:
    print(caption)
    plt.imshow(frame)
    plt.show()

#pyplot.imsave('1.jpeg', test_img[0])
#pyplot.imsave('2.jpeg', res[0])

## Looking at Disparity Maps

In [None]:
# A 6 x 6 grid is the smallest square layout that fits the 33 mask channels;
# the last three axes stay empty.
f, axs = plt.subplots(6, 6, sharex='col', sharey='row')

for channel in range(33):
    grid_row, grid_col = divmod(channel, 6)
    axs[grid_row][grid_col].imshow(mask[0, :, :, channel], cmap="gray", vmin=0.0, vmax=1.0)
plt.show()

In [None]:
# Index of the largest mask value along the last (channel) axis, per position.
max_shift_channel = mask.argmax(axis=3)
max_shift_channel[0]

In [None]:
# Average each mask channel of the first sample over both spatial axes.
first_mask = mask[0]
channel_mean = first_mask.mean(axis=(0, 1))
channel_mean

In [None]:
# Line plot of the per-channel mask means computed above.
fig, ax = plt.subplots()
ax.plot(channel_mean)
plt.show()

In [None]:
# Average each up_conv channel of the first sample over both spatial axes, then plot.
first_up_conv = up_conv[0]
channel_act_mean = first_up_conv.mean(axis=(0, 1))
plt.plot(channel_act_mean)
plt.show()

In [None]:
# Distribution of the up_conv values in channel 16 of the first sample.
channel_16_values = up_conv[0, :, :, 16].reshape(-1)
plt.hist(channel_16_values, bins=100)
plt.show()

In [None]:
# Show every up_conv channel of the first sample as its own grayscale image,
# scaling each one from 0 up to that channel's own maximum.
for channel_idx in range(33):
    activation = up_conv[0, :, :, channel_idx]
    plt.imshow(activation, cmap="gray", vmin=0, vmax=activation.max())
    plt.show()