## Training (Not using TFRecords)

### What you will see
Basically how to train a network on tensorflow.
1. Model definition 
2. Loss definition
3. Session build
4. Training loop

In [1]:
import tensorflow as tf
import sys
sys.path.append('../')
import model
import os
import subprocess
import glob
from driving_data import HandleData


# Regularization value
L2NormConst = 0.1
start_lr = 1.0
batch_size = 64
epochs = 1
input_train_hdf5 = '/home/ubuntu/datasets/Dataset_LMDB_Balanced'
input_val_hdf5 = ''
logs_path = '../logs'
save_dir = '../save'
iter_disp = 10
gpu_fraction = 0.3

os.environ["CUDA_VISIBLE_DEVICES"] = str(0)

  from ._conv import register_converters as _register_converters


### Define the model

In [2]:
# Open Model
driving_model = model.DrivingModel(training_mode=True)
    
# Get placeholders from model
model_in = driving_model.input
model_out = driving_model.output
labels_in = driving_model.label_in
model_drop = driving_model.dropout_control
train_mode = driving_model.train_mode

### Define Loss function

In [3]:
train_vars = tf.trainable_variables()
with tf.name_scope("MSE_Loss_L2Reg"):
    loss = tf.reduce_mean(tf.square(tf.subtract(labels_in, model_out))) + tf.add_n(
        [tf.nn.l2_loss(v) for v in train_vars]) * L2NormConst

# Add model accuracy
with tf.name_scope("Loss_Validation"):
    loss_val = tf.reduce_mean(tf.square(tf.subtract(labels_in, model_out)))

### Define the solver
We want to use the Adam solver to minimize or loss function.

In [4]:
# Solver configuration
# Get ops to update moving_mean and moving_variance from batch_norm
# Reference: https://www.tensorflow.org/api_docs/python/tf/contrib/layers/batch_norm
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.name_scope("Solver"):
    global_step = tf.Variable(0, trainable=False)
    starter_learning_rate = start_lr
    # decay every 1000 steps with a base of 0.9
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                               1000, 0.9, staircase=True)

    # Basically update the batch_norm moving averages before the training step
    # http://ruishu.io/2016/12/27/batchnorm/
    with tf.control_dependencies(update_ops):
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)

### Create the Session
We need a tf.Session() so TensorFlow can use our computer resources and run the graph

In [5]:
# Avoid allocating the whole memory
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
sess = tf.InteractiveSession(config=tf.ConfigProto(gpu_options=gpu_options))

### Initialize the values (Random values of weights)

In [6]:
# Initialize all random variables (Weights/Bias)
sess.run(tf.global_variables_initializer())

### Define saver object to save checkpoints

In [7]:
# Define saver object to save all the variables of the drivingModel graphph
saver = tf.train.Saver()

### Add some variables to be observed on Tensorboard

In [8]:
# Create histogram for labels
tf.summary.histogram("steer_angle", labels_in)
# Add input image/steering angle on summary
tf.summary.image("input_image", model_in, 10)

# Monitor loss, learning_rate, global_step, etc...
tf.summary.scalar("loss_train", loss)
tf.summary.scalar("learning_rate", learning_rate)
tf.summary.scalar("global_step", global_step)
# merge all summaries into a single op
merged_summary_op = tf.summary.merge_all()

# Configure where to save the logs for tensorboard
summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

### Load the dataset

In [9]:
data = HandleData(path=input_train_hdf5, path_val=input_val_hdf5)
num_images_epoch = int(data.get_num_images() / batch_size)
print('Num samples',data.get_num_images(), 'Iterations per epoch:', num_images_epoch, 'batch size:', batch_size)

Loading training data
LMDB file
Spliting training and validation
Number training images: 38081
Number validation images: 9520
Num samples 47602 Iterations per epoch: 743 batch size: 64


### Do training

In [10]:
# For each epoch
for epoch in range(epochs):
    for i in range(int(data.get_num_images() / batch_size)):
        # Get training batch
        xs_train, ys_train = data.LoadTrainBatch(batch_size, should_augment=True)

        # Send training batch to tensorflow graph (Dropout enabled)
        train_step.run(feed_dict={model_in: xs_train, labels_in: ys_train, model_drop: 0.2, train_mode:True})

        # Display some information each x iterations
        if i % iter_disp == 0:
            # Get validation batch
            xs, ys = data.LoadValBatch(batch_size)
            # Send validation batch to tensorflow graph (Dropout disabled)
            loss_value = loss_val.eval(feed_dict={model_in: xs, labels_in: ys, model_drop: 0.0, train_mode:False})
            print("Epoch: %d, Step: %d, Loss(Val): %g" % (epoch, epoch * num_images_epoch + i, loss_value))

        # write logs at every iteration
        summary = merged_summary_op.eval(feed_dict={model_in: xs_train, labels_in: ys_train, model_drop: 0.0, train_mode:False})
        summary_writer.add_summary(summary, epoch * num_images_epoch + i)

    # Save checkpoint after each epoch
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    checkpoint_path = os.path.join(save_dir, "model")
    filename = saver.save(sess, checkpoint_path, global_step=epoch)
    print("Model saved in file: %s" % filename)

    # Shuffle data at each epoch end
    print("Shuffle data")
    data.shuffleData()

Epoch: 0, Step: 0, Loss(Val): 2.16449e+08
Epoch: 0, Step: 10, Loss(Val): nan
Epoch: 0, Step: 20, Loss(Val): nan
Epoch: 0, Step: 30, Loss(Val): nan
Epoch: 0, Step: 40, Loss(Val): nan
Epoch: 0, Step: 50, Loss(Val): nan
Epoch: 0, Step: 60, Loss(Val): nan
Epoch: 0, Step: 70, Loss(Val): nan
Epoch: 0, Step: 80, Loss(Val): nan
Epoch: 0, Step: 90, Loss(Val): nan
Epoch: 0, Step: 100, Loss(Val): nan
Epoch: 0, Step: 110, Loss(Val): nan
Epoch: 0, Step: 120, Loss(Val): nan
Epoch: 0, Step: 130, Loss(Val): nan
Epoch: 0, Step: 140, Loss(Val): nan
Epoch: 0, Step: 150, Loss(Val): nan
Epoch: 0, Step: 160, Loss(Val): nan
Epoch: 0, Step: 170, Loss(Val): nan
Epoch: 0, Step: 180, Loss(Val): nan
Epoch: 0, Step: 190, Loss(Val): nan
Epoch: 0, Step: 200, Loss(Val): nan
Epoch: 0, Step: 210, Loss(Val): nan
Epoch: 0, Step: 220, Loss(Val): nan
Epoch: 0, Step: 230, Loss(Val): nan
Epoch: 0, Step: 240, Loss(Val): nan
Epoch: 0, Step: 250, Loss(Val): nan
Epoch: 0, Step: 260, Loss(Val): nan
Epoch: 0, Step: 270, Loss(Val):