In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [2]:
import IPython.display as display
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import time
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
# Meaning of the TF_CPP_MIN_LOG_LEVEL values (set here, before the cell that imports TensorFlow):
# 0 = all messages are logged (default behavior)
# 1 = INFO messages are not printed
# 2 = INFO and WARNING messages are not printed
# 3 = INFO, WARNING, and ERROR messages are not printed

# On macOS you may encounter an error related to OpenMP (OMP). Setting this variable is a
# workaround, but it slows down the code.
os.environ['KMP_DUPLICATE_LIB_OK']='True' #https://github.com/dmlc/xgboost/issues/1715

In [3]:
import tensorflow as tf

In [4]:
from openbot import dataloader, data_augmentation, tfrecord_utils, utils

In [5]:
# Shared tf.data tuning constant; used below for num_parallel_reads,
# num_parallel_calls and prefetch in the TFRecord input pipeline.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [6]:
# Show the installed TensorFlow version so the recorded outputs can be tied to it.
tf.__version__

'2.5.0'

## Set train and test dirs

Define the dataset directory and give it a name.

In [7]:
# Root directory of the dataset. Set the OPENBOT_DATASET_DIR environment
# variable to point at your own copy without editing the notebook; the original
# location is kept as the fallback so existing setups keep working.
dataset_dir = os.environ.get(
    "OPENBOT_DATASET_DIR", "/home/marcelsantos/Documents/OpenBotData(copy)"
)
# Short identifier for the dataset; it becomes the prefix of the model name.
dataset_name = "my_openbot"

In [8]:
# True: read the pre-built TFRecord files; False: read the raw train/test folders.
load_from_tf_record = True

# Pick the pair of locations (relative to dataset_dir) that matches the chosen mode.
train_subpath, test_subpath = (
    ("tfrecords/train.tfrec", "tfrecords/test.tfrec")
    if load_from_tf_record
    else ("train_data", "test_data")
)
train_data_dir = os.path.join(dataset_dir, train_subpath)
test_data_dir = os.path.join(dataset_dir, test_subpath)

## Hyperparameters

You may have to tune the learning rate and batch size depending on your available compute resources and dataset. As a general rule of thumb, if you increase the batch size by a factor of n, you can increase the learning rate by a factor of sqrt(n). For debugging and hyperparameter tuning, you can set the number of epochs to a small value like 10. If you want to train a model that achieves good performance, you should set it to 50 or more. In our paper we used 100.

In [9]:
# Training hyperparameters. The value in the trailing comment on each line is
# an alternative setting left by the author (presumably the full-scale
# configuration — confirm before relying on it).
TRAIN_BATCH_SIZE = 16 #128   # samples per training batch
TEST_BATCH_SIZE = 16 #128    # samples per validation/test batch
LR = 0.0001 #0.0003          # learning rate passed to the Adam optimizer
NUM_EPOCHS = 10 #100         # number of epochs passed to model.fit

Don't change these unless you know what you are doing

In [10]:
# BN is forwarded to models.cil_mobile and adds the "_bn" suffix to the model name.
BN = True
# When True, training samples are passed through data_augmentation.flip_sample.
FLIP_AUG = False
# When True, training commands are passed through data_augmentation.augment_cmd.
CMD_AUG = False

## Load using `tf.data`

If training with tfrecord data, this cell loads the training and test records and creates the corresponding `tf.data` datasets.

In [11]:
if load_from_tf_record:

    def process_train_sample(features):
        """Turn one parsed TFRecord example into an (inputs, label) training sample.

        Args:
            features: mapping produced by `tfrecord_utils.parse_tfrecord_fn`;
                the keys "image", "cmd", "left" and "right" are read here.

        Returns:
            `((image, cmd), label)` where `label` is `[left, right]`. The image
            always goes through `data_augmentation.augment_img`; flipping and
            command augmentation are applied only when FLIP_AUG / CMD_AUG are set.
        """
        image = features["image"]
        cmd = features["cmd"]
        label = [features["left"], features["right"]]
        image = data_augmentation.augment_img(image)
        if FLIP_AUG:
            # Operate on `image`, the tensor that is returned below; any other
            # name here would be undefined and the flip would never reach the
            # returned sample.
            image, cmd, label = data_augmentation.flip_sample(image, cmd, label)
        if CMD_AUG:
            cmd = data_augmentation.augment_cmd(cmd)

        return (image, cmd), label

    def process_test_sample(features):
        """Turn one parsed TFRecord example into an (inputs, label) test sample.

        Same structure as `process_train_sample`, but without any augmentation.
        """
        image = features["image"]
        cmd = features["cmd"]
        label = [features["left"], features["right"]]
        return (image, cmd), label

    train_dataset = (
        tf.data.TFRecordDataset(train_data_dir, num_parallel_reads=AUTOTUNE)
        .map(tfrecord_utils.parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
        .map(process_train_sample, num_parallel_calls=AUTOTUNE)
    )

    # Read a single record to obtain the image shape stored in the .tfrecords
    # file; the network input size is derived from it.
    for (image, cmd), label in train_dataset.take(1):
        shape = image.numpy().shape
        NETWORK_IMG_HEIGHT = shape[0]
        NETWORK_IMG_WIDTH = shape[1]
        print("Image shape: ", shape)
        print("Command: ", cmd.numpy())
        print("Label: ", label.numpy())

    test_dataset = (
        tf.data.TFRecordDataset(test_data_dir, num_parallel_reads=AUTOTUNE)
        .map(tfrecord_utils.parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
        .map(process_test_sample, num_parallel_calls=AUTOTUNE)
    )

    # Obtains the total number of records from the .tfrecords files by iterating
    # over each dataset once.
    # https://stackoverflow.com/questions/40472139/obtaining-total-number-of-records-from-tfrecords-file-in-tensorflow
    image_count_train = sum(1 for _ in train_dataset)
    print ("Number of training instances: ", image_count_train)

    image_count_test = sum(1 for _ in test_dataset)
    print ("Number of test instances: ", image_count_test)

    # Prepare train and test datasets for training. The training stream repeats
    # indefinitely, so the number of steps per epoch has to be given to fit().
    train_ds = (
        train_dataset.shuffle(TRAIN_BATCH_SIZE * 10)
        .repeat()
        .batch(TRAIN_BATCH_SIZE)
        .prefetch(AUTOTUNE)
    )

    test_ds = test_dataset.batch(TEST_BATCH_SIZE).prefetch(AUTOTUNE)

Image shape:  (96, 256, 3)
Command:  0.0
Label:  [0.29411766 0.29411766]
Number of training instances:  15268
Number of test instances:  14315


If not using tfrecord

Running this for the first time will take some time. This code will match image frames to the controls (labels) and indicator signals (commands).  By default, data samples where the vehicle was stationary will be removed. If this is not desired, you need to pass `remove_zeros=False`. If you have made any changes to the sensor files, changed `remove_zeros` or moved your dataset to a new directory, you need to pass `redo_matching=True`. 

In [12]:
from openbot import associate_frames

In [13]:
if not load_from_tf_record:

    def process_train_path(file_path):
        """Build one training sample ((image, cmd), label) from an image path.

        The command and label are looked up in `train_data` using the path with
        backslashes replaced by forward slashes. The image is always passed
        through `augment_img`; the other augmentations depend on the flags.
        """
        lookup_key = tf.strings.regex_replace(file_path, "[/\\\\]", "/")
        cmd, label = train_data.get_label(lookup_key)
        frame = data_augmentation.augment_img(utils.load_img(file_path))
        if FLIP_AUG:
            frame, cmd, label = data_augmentation.flip_sample(frame, cmd, label)
        if CMD_AUG:
            cmd = data_augmentation.augment_cmd(cmd)
        return (frame, cmd), label

    def process_test_path(file_path):
        """Build one un-augmented test sample ((image, cmd), label) from an image path."""
        lookup_key = tf.strings.regex_replace(file_path, "[/\\\\]", "/")
        cmd, label = test_data.get_label(lookup_key)
        return (utils.load_img(file_path), cmd), label

    train_datasets = utils.list_dirs(train_data_dir)
    test_datasets = utils.list_dirs(test_data_dir)

    # Match image frames to controls and commands with identical settings for
    # both splits.
    max_offset = 1e3
    matching_options = dict(redo_matching=False, remove_zeros=True)
    train_frames = associate_frames.match_frame_ctrl_cmd(
        train_data_dir, train_datasets, max_offset, **matching_options
    )
    test_frames = associate_frames.match_frame_ctrl_cmd(
        test_data_dir, test_datasets, max_offset, **matching_options
    )

    image_count_train = len(train_frames)
    image_count_test = len(test_frames)

    # The file paths are turned into a `tf.data.Dataset` via glob patterns.
    # Depending on dataset size this may take some time. If you encounter
    # issues, the two commented lines below can be used instead, but they take
    # **much** longer.
    # list_train_ds = tf.data.Dataset.list_files(train_frames)
    # list_test_ds = tf.data.Dataset.list_files(test_frames)
    train_patterns = [
        str(train_data_dir + "/" + ds + "/*/images/*") for ds in train_datasets
    ]
    test_patterns = [
        str(test_data_dir + "/" + ds + "/*/images/*") for ds in test_datasets
    ]
    list_train_ds = tf.data.Dataset.list_files(train_patterns)
    list_test_ds = tf.data.Dataset.list_files(test_patterns)

    # Label lookups used by the two process_*_path functions above.
    train_data = dataloader.dataloader(train_data_dir, train_datasets)
    test_data = dataloader.dataloader(test_data_dir, test_datasets)

    # `num_parallel_calls` lets several images be loaded/processed in parallel.
    labeled_ds = list_train_ds.map(process_train_path, num_parallel_calls=4)

    # Peek at one sample to derive the network input size.
    for (image, cmd), label in labeled_ds.take(1):
        shape = image.numpy().shape
        NETWORK_IMG_HEIGHT, NETWORK_IMG_WIDTH = shape[0], shape[1]
        print("Image shape: ", shape)
        print("Command: ", cmd.numpy())
        print("Label: ", label.numpy())

    train_ds = utils.prepare_for_training(
        ds=labeled_ds,
        batch_sz=TRAIN_BATCH_SIZE,
        shuffle_buffer_sz=100 * TRAIN_BATCH_SIZE,
        prefetch_buffer_sz=TRAIN_BATCH_SIZE,
    )

    test_ds = (
        list_test_ds.map(process_test_path, num_parallel_calls=4)
        .batch(TEST_BATCH_SIZE)
        .prefetch(buffer_size=10 * TRAIN_BATCH_SIZE)
    )

## Training

In [14]:
from openbot import callbacks, losses, metrics, models

# Build the network for the image size detected while loading the data.
model = models.cil_mobile(NETWORK_IMG_WIDTH,NETWORK_IMG_HEIGHT,BN)
loss_fn = losses.sq_weighted_mse_angle
metric_list = ['MeanAbsoluteError', metrics.direction_metric, metrics.angle_metric]
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)

model.compile(optimizer=optimizer,
          loss=loss_fn,
          metrics=metric_list)
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output).
model.summary()

Model: "cil_mobile"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
img_input (InputLayer)          [(None, 96, 256, 3)] 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 48, 128, 32)  2432        img_input[0][0]                  
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 48, 128, 32)  128         conv2d[0][0]                     
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 48, 128, 32)  0           batch_normalization[0][0]        
_________________________________________________________________________________________



In [15]:
# Suffixes that record which optional settings were enabled for this run.
enabled_suffixes = "".join(
    suffix
    for flag, suffix in ((BN, "_bn"), (FLIP_AUG, "_flip"), (CMD_AUG, "_cmd"))
    if flag
)
# <dataset>_<model>_lr<learning rate>_bz<train batch size>[_bn][_flip][_cmd]
MODEL_NAME = (
    "{}_{}_lr{}_bz{}".format(dataset_name, model.name, str(LR), str(TRAIN_BATCH_SIZE))
    + enabled_suffixes
)

# Checkpoints and logs live side by side under models/<MODEL_NAME>/.
model_dir = os.path.join('models', MODEL_NAME)
checkpoint_path = os.path.join(model_dir, 'checkpoints')
log_path = os.path.join(model_dir, 'logs')
print(MODEL_NAME)

my_openbot_cil_mobile_lr0.0001_bz16_bn


In [16]:
# The training dataset repeats indefinitely, so Keras needs to be told how many
# batches make up one epoch. np.ceil returns a float; steps_per_epoch is an
# integer count, so convert explicitly.
STEPS_PER_EPOCH = int(np.ceil(image_count_train / TRAIN_BATCH_SIZE))
history = model.fit(train_ds,
                    epochs=NUM_EPOCHS,
                    steps_per_epoch=STEPS_PER_EPOCH,
                    validation_data=test_ds,
                    callbacks=[callbacks.checkpoint_cb(checkpoint_path),
                               callbacks.tensorboard_cb(log_path),
                               callbacks.logger_cb(log_path)])

2021-07-17 10:13:55.351718: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1661] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_NOT_INITIALIZED


Epoch 1/10
  2/955 [..............................] - ETA: 2:55 - loss: 0.1912 - mean_absolute_error: 1.7412 - direction_metric: 0.0625 - angle_metric: 0.0000e+00 

2021-07-17 10:13:57.207600: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1661] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI_ERROR_NOT_INITIALIZED


 91/955 [=>............................] - ETA: 1:48 - loss: 0.4595 - mean_absolute_error: 1.8052 - direction_metric: 0.1717 - angle_metric: 0.0330

KeyboardInterrupt: 

## Evaluation

Plot metrics and loss

In [None]:
# The 'MeanAbsoluteError' metric passed to compile() is logged under the
# snake_case name 'mean_absolute_error' (see the training progress output), so
# the history must be indexed with that name and its 'val_' counterpart.
plt.plot(history.history['mean_absolute_error'], label='mean_absolute_error')
plt.plot(history.history['val_mean_absolute_error'], label = 'val_mean_absolute_error')
plt.xlabel('Epoch')
plt.ylabel('Mean Absolute Error')
plt.legend(loc='lower right')
plt.savefig(os.path.join(log_path,'error.png'))

In [None]:
# Training and validation direction metric per epoch, saved next to the logs.
for curve_name in ('direction_metric', 'val_direction_metric'):
    plt.plot(history.history[curve_name], label=curve_name)
plt.xlabel('Epoch')
plt.ylabel('Direction Metric')
plt.legend(loc='lower right')
direction_png = os.path.join(log_path, 'direction.png')
plt.savefig(direction_png)

In [None]:
# Training and validation angle metric per epoch, saved next to the logs.
for curve_name in ('angle_metric', 'val_angle_metric'):
    plt.plot(history.history[curve_name], label=curve_name)
plt.xlabel('Epoch')
plt.ylabel('Angle Metric')
plt.legend(loc='lower right')
angle_png = os.path.join(log_path, 'angle.png')
plt.savefig(angle_png)

In [None]:
# Training and validation loss per epoch, saved next to the logs.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name], label=curve_name)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='lower right')
loss_png = os.path.join(log_path, 'loss.png')
plt.savefig(loss_png)

Save tf lite models for best and last checkpoint

In [None]:
# The "best" epoch maximizes the sum of the two validation metrics.
val_angle_history = history.history['val_angle_metric']
val_direction_history = history.history['val_direction_metric']
combined_val_score = np.array(val_angle_history) + np.array(val_direction_history)
best_index = np.argmax(combined_val_score)

# The checkpoint name uses the epoch number, i.e. the zero-based index plus one.
best_checkpoint = "cp-%04d.ckpt" % (best_index + 1)
best_tflite = utils.generate_tflite(checkpoint_path, best_checkpoint)
utils.save_tflite(best_tflite, checkpoint_path, "best")

best_summary = (
    val_angle_history[best_index],
    val_direction_history[best_index],
    best_checkpoint,
)
print("Best Checkpoint (val_angle: %s, val_direction: %s): %s" % best_summary)

In [None]:
# Checkpoints are stored as sub-directories; the last one in sorted order is
# taken as the most recent.
checkpoint_dirs = [
    entry
    for entry in os.listdir(checkpoint_path)
    if os.path.isdir(os.path.join(checkpoint_path, entry))
]
checkpoint_dirs.sort()
last_checkpoint = checkpoint_dirs[-1]

last_tflite = utils.generate_tflite(checkpoint_path, last_checkpoint)
utils.save_tflite(last_tflite, checkpoint_path, "last")

last_summary = (
    history.history['val_angle_metric'][-1],
    history.history['val_direction_metric'][-1],
    last_checkpoint,
)
print("Last Checkpoint (val_angle: %s, val_direction: %s): %s" % last_summary)

Evaluate the best model

In [None]:
# Reload the best checkpoint with the same loss and metrics used for training.
best_model = utils.load_model(os.path.join(checkpoint_path,best_checkpoint),loss_fn,metric_list)
# `steps` is a batch count and must be an integer; round up so the final
# partial batch is included. evaluate() returns the loss followed by the
# metrics in the order of `metric_list` (so `test_acc` holds the mean absolute error).
test_steps = int(np.ceil(image_count_test / TEST_BATCH_SIZE))
test_loss, test_acc, test_dir, test_ang = best_model.evaluate(test_ds, steps=test_steps, verbose=2)

In [None]:
# Number of samples from the first test batch that are fed to the model.
NUM_SAMPLES = 15
(image_batch, cmd_batch), label_batch = next(iter(test_ds))

# Keep the first NUM_SAMPLES entries along the batch axis; -1 keeps the full
# extent of every remaining image dimension.
sample_images = tf.slice(image_batch, [0, 0, 0, 0], [NUM_SAMPLES, -1, -1, -1])
sample_cmds = tf.slice(cmd_batch, [0], [NUM_SAMPLES])
pred_batch = best_model.predict((sample_images, sample_cmds))

utils.show_test_batch(
    image_batch.numpy(), cmd_batch.numpy(), label_batch.numpy(), pred_batch
)

In [None]:
# Compare the restored Keras model with the TFLite model generated from the same
# best checkpoint (presumably a check that their outputs agree — see
# utils.compare_tf_tflite for what exactly is compared).
utils.compare_tf_tflite(best_model,best_tflite)

## Save the notebook as HTML

In [None]:
# Save the notebook first so that the exported HTML reflects the latest state
# (presumably what utils.save_notebook does — confirm in openbot.utils).
utils.save_notebook()
# NOTE(review): the notebook file name is hard-coded; it has to match the name
# of this notebook on disk.
current_file = 'policy_learning.ipynb'
# The HTML copy is written next to the training logs of this run.
output_file = os.path.join(log_path,'notebook.html')
utils.output_HTML(current_file, output_file)