# Multimodal Classification Training

This notebook creates the Multimodal Network Architecture and trains it for grasp testset1. After training the network weights will be stored in the folder `./dataset/grasp_testset1_logs`

## Dependencies

`Python 3.5.4` is used for development and following packages are required to run the code provided in the notebook:

`pip install googledrivedownloader`<br>
`pip install matplotlib`<br>
`pip install tensorflow-gpu`<br>
`pip install keras`<br>
`pip install numpy`

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
#!pip install tensorflow==1.15.0
#!pip install keras==2.2.4

In [3]:
import os, csv, time, shutil
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# path="/content/drive/MyDrive/Deep-Multi-Sensory-Object-Categorization"
path = "/home/alex/Deep-Multi-Sensory-Object-Categorization"

In [4]:
def print_image(image, title):
    """Print the image

    :param image: image pixels in list
    :param title: title as string to be printed on top of the image
    """
    plt.imshow(image)
    plt.title(title)
    plt.colorbar()
    plt.show()

def time_taken(start, end):
    """Human readable time between `start` and `end`

    :param start: time.time()
    :param end: time.time()
    :returns: day:hour:minute:second
    """
    time = end-start
    day = time // (24 * 3600)
    time = time % (24 * 3600)
    hour = time // 3600
    time %= 3600
    minutes = time // 60
    time %= 60
    seconds = time
    day_hour_min_sec = str('%02d' % int(day))+":"+str('%02d' % int(hour))+":"+str('%02d' % int(minutes))+":"+str('%02d' % int(seconds))
    
    return day_hour_min_sec

## Video Dataset

In [5]:
file_0 = path+"/dataset/EMILver1_preprocessed/EMILver1_vi_vgg16fc2_pca19/arr_0.npy"
file_1 = path+"/dataset/EMILver1_preprocessed/EMILver1_vi_vgg16fc2_pca19/arr_1.npy"
file_2 = path+"/dataset/EMILver1_preprocessed/EMILver1_vi_vgg16fc2_pca19/arr_2.npy"
video_frames = np.load(file_0, allow_pickle=True)
action_label = np.load(file_1, allow_pickle=True)
object_label = np.load(file_2, allow_pickle=True)
for i in range(len(video_frames)):
    a01 = video_frames[i]
    while len(a01) < 658:
        a01 = np.concatenate((a01, np.zeros((1, a01.shape[1]))))
    video_frames[i] = a01
video_frames = np.array(list(video_frames))

## Sound Dataset

In [6]:
file_0 = path+"/dataset/EMILver1_preprocessed/EMILver1_au_features/arr_0.npy"
file_1 = path+"/dataset/EMILver1_preprocessed/EMILver1_au_features/arr_1.npy"
file_2 = path+"/dataset/EMILver1_preprocessed/EMILver1_au_features/arr_2.npy"
audio_frames = np.load(file_0, allow_pickle=True)
action_label = np.load(file_1, allow_pickle=True)
object_label = np.load(file_2, allow_pickle=True)
for i in range(len(audio_frames)):
    a01 = audio_frames[i]
    while len(a01) < 658:
        a01 = np.concatenate((a01, np.zeros((1, a01.shape[1]))))
    audio_frames[i] = a01
audio_frames = np.array(list(audio_frames))

## Haptic Dataset

In [7]:
file_0 = path+"/dataset/EMILver1_preprocessed/EMILver1_sm_features/arr_0.npy"
file_1 = path+"/dataset/EMILver1_preprocessed/EMILver1_sm_features/arr_1.npy"
file_2 = path+"/dataset/EMILver1_preprocessed/EMILver1_sm_features/arr_2.npy"
haptic_frames = np.load(file_0, allow_pickle=True)
action_label = np.load(file_1, allow_pickle=True)
object_label = np.load(file_2, allow_pickle=True)
for i in range(len(haptic_frames)):
    a01 = haptic_frames[i]
    while len(a01) < 658:
        a01 = np.concatenate((a01, np.zeros((1, a01.shape[1]))))
    haptic_frames[i] = a01
haptic_frames = np.array(list(haptic_frames))

In [8]:
# one-hot encoding
num_classes = np.nanmax(action_label)+1
action_label_one_hot = np.zeros((len(action_label), num_classes)).astype(int)
for i in range(len(action_label)):
    action_label_one_hot[i, action_label[i]] = 1

num_classes = np.nanmax(object_label)+1
object_label_one_hot = np.zeros((len(object_label), num_classes)).astype(int)
for i in range(len(object_label)):
    object_label_one_hot[i, object_label[i]] = 1

# train-test-split
num_data = len(object_label)
train_id, test_id = train_test_split(np.array(range(num_data)), random_state=0)

## Multimodal Network Hyper-parameters

This network was trained for 300 epochs using Adam optimization with learning rate 1 x $10^{-4}$.

In [9]:
# Network hyper-parameters
batch = 5
training_epochs = 300
display_step = 1

behavior = "grasp"
testset = "testset1"
folder_name = behavior+'_'+testset
model_path = path+"/dataset/"+folder_name+"_logs/model.ckpt"
logs_path = path+"/dataset/"+folder_name+"_logs/"

# num_classes = category_label_train_one_hot.shape[1]
num_classes = object_label_one_hot.shape[1]

Y = tf.placeholder('float', [None, num_classes], name='LabelData')
print("Y: ", Y)

video_frames_max = 658
video_size = video_frames.shape[2]
video_X = tf.placeholder('float', [None, video_frames_max, video_size], name='InputData')

audio_frames_max = 658
audio_size = audio_frames.shape[2]
audio_keep_prob = tf.placeholder_with_default(1.0, shape=(), name='audio_keep')

haptic_frames_max = 658
haptic_size = haptic_frames.shape[2]
haptic_keep_prob = tf.placeholder_with_default(1.0, shape=(), name='haptic_keep')


Y:  Tensor("LabelData:0", shape=(?, 30), dtype=float32)



In [10]:
"""
Functions used to define models
"""

haptic_skip_2nd_maxpool = ["grasp", "hold", "low"]

def model(video_data_placeholder):
    with tf.name_scope("Model"):
        # Video
        net = tf.layers.flatten(video_data_placeholder)
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        net = tf.layers.dropout(inputs=net)
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        net = tf.layers.dropout(inputs=net)
        video_logits = tf.layers.dense(inputs=net, units=num_classes, activation=tf.nn.relu)
        
        # Audio
        audio_data_placeholder = tf.placeholder('float', [None, audio_frames_max, audio_size], name='audio_InputData')
        net = tf.layers.flatten(audio_data_placeholder)
        # Dense Layer
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        net = tf.layers.dropout(inputs=net, rate=audio_keep_prob)
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        net = tf.layers.dropout(inputs=net, rate=audio_keep_prob)
        audio_logits = tf.layers.dense(inputs=net, units=num_classes, activation=tf.nn.relu)
        
        # Haptic
        haptic_data_placeholder = tf.placeholder('float', [None, haptic_frames_max, haptic_size], name='haptic_InputData')
        net = tf.layers.flatten(haptic_data_placeholder)
        # Dense Layer
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        net = tf.layers.dropout(inputs=net, rate=haptic_keep_prob)
        haptic_logits = tf.layers.dense(inputs=net, units=num_classes, activation=tf.nn.relu)
        
        # Concatenate 
        logits = tf.concat([video_logits, audio_logits, haptic_logits], axis=1)
        logits = tf.nn.relu(logits)
        logits = tf.layers.dense(inputs=logits, units=num_classes)
        
    return logits


def loss(prediction, label_placeholder):
    with tf.name_scope('Loss'):
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=label_placeholder))
        # Create a summary to monitor cost tensor
        cost_scalar = tf.summary.scalar("loss", cost)
    return cost, cost_scalar

def training(prediction, label_placeholder):
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
        train_op = optimizer.minimize(cost)
    return train_op

def evaluate(prediction, Y):
    with tf.name_scope('Accuracy'):
        # Test model
        correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1))
        # Calculate accuracy
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
        # Create a summary to monitor accuracy tensor
        accuracy_scalar = tf.summary.scalar("accuracy", accuracy)
    return accuracy, accuracy_scalar

In [11]:
"""
Creating the Neural Network
"""

model_dict = {}
prediction = model(video_X)
model_dict["Model"] = prediction

cost, cost_scalar = loss(prediction, Y)
model_dict["Loss"] = cost
model_dict["Loss_scalar"] = cost_scalar

train_op = training(prediction, Y)
model_dict["Optimizer"] = train_op

eval_op, accuracy_scalar = evaluate(prediction, Y)
model_dict["Accuracy"] = eval_op
model_dict["Accuracy_scalar"] = accuracy_scalar

print("model_dict: ", model_dict)

# Initializing the variables
init = tf.global_variables_initializer()

# 'Saver' op to save and restore all the variables
saver = tf.train.Saver(max_to_keep=1)

Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Use keras.layers.dropout instead.


model_dict:  {'Model': <tf.Tensor 'Model/dense_8/BiasAdd:0' shape=(?, 30) dtype=float32>, 'Loss': <tf.Tensor 'Loss/Mean:0' shape=() dtype=float32>, 'Loss_scalar': <tf.Tensor 'Loss/loss:0' shape=() dtype=string>, 'Optimizer': <tf.Operation 'Optimizer/Adam' type=NoOp>, 'Accuracy': <tf.Tensor 'Accuracy/Mean:0' shape=() dtype=float32>, 'Accuracy_scalar': <tf.Tensor 'Accuracy/accuracy:0' shape=() dtype=string>}




In [12]:
if os.path.exists(logs_path):
    shutil.rmtree(logs_path)
    os.makedirs(logs_path)
else:
    os.makedirs(logs_path)

In [13]:
"""
Writing 'Time', 'Epoch', 'Cost', 'Accuracy' in CSV file
"""

epoch_cost_accuracy = []
epoch_cost_accuracy.append("Time")
epoch_cost_accuracy.append("Epoch")
epoch_cost_accuracy.append("Cost")
epoch_cost_accuracy.append("Accuracy")

with open(logs_path+folder_name+"_data.csv",'w') as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(epoch_cost_accuracy)

## Training

In [14]:
"""## Training"""

# Start Training

start_time = time.time()

with tf.Session() as sess:
    # Initialize variables
    sess.run(init)
    
    # op to write logs to Tensorboard
    summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
    
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost_list = 0.0
        total_batch = int(len(train_id)/batch)
        
        # Shuffle data
        np.random.shuffle(train_id)
        
        i = 0
        # Loop over all batches
        for start, end in zip(range(0, len(train_id), batch), range(batch, len(train_id)+1, batch)):
            video_input_data, label_data = video_frames[train_id][start:end], object_label_one_hot[train_id][start:end]

            audio_input_data = audio_frames[train_id][start:end]
            audio_X = tf.get_default_graph().get_tensor_by_name("Model/audio_InputData:0")

            haptic_input_data = haptic_frames[train_id][start:end]
            haptic_X = tf.get_default_graph().get_tensor_by_name("Model/haptic_InputData:0")

            _, new_cost, cost_scalar = sess.run(
                [model_dict["Optimizer"], model_dict["Loss"], model_dict["Loss_scalar"]], 
                feed_dict={
                    video_X: video_input_data, 
                    audio_X: audio_input_data, 
                    haptic_X: haptic_input_data, 
                    Y: label_data, 
                    audio_keep_prob: 0.5, 
                    haptic_keep_prob: 0.5
                    }
                )
            # Compute average loss
            avg_cost_list += new_cost/total_batch

            summary_writer.add_summary(cost_scalar, epoch * total_batch + i)
            i += 1
        save_path = saver.save(sess, model_path, epoch)
         
        # Calculate Accuracy
        avg_accuracy_list = 0.0
        total_batch = int(len(test_id)/batch)
        i = 0
        for start, end in zip(range(0, len(test_id), batch), range(batch, len(test_id)+1, batch)):
            video_input_data, label_data = video_frames[test_id][start:end], object_label_one_hot[test_id][start:end]

            audio_input_data = audio_frames[test_id][start:end]
            audio_X = tf.get_default_graph().get_tensor_by_name("Model/audio_InputData:0")

            haptic_input_data = haptic_frames[test_id][start:end]
            haptic_X = tf.get_default_graph().get_tensor_by_name("Model/haptic_InputData:0")

            accuracy, accuracy_scalar = sess.run([model_dict["Accuracy"], model_dict["Accuracy_scalar"]], feed_dict={video_X: video_input_data, audio_X: audio_input_data, haptic_X: haptic_input_data, Y: label_data, audio_keep_prob: 1.0, haptic_keep_prob: 1.0})
            # Compute average accuracy
            avg_accuracy_list += accuracy/total_batch
            summary_writer.add_summary(accuracy_scalar, epoch * total_batch + i)
            i += 1
        
        # Printing current epoch accuracy
        epoch_cost_accuracy = []
        epoch_cost_accuracy.append(time_taken(start_time, time.time()))
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), ", Time: ", time_taken(start_time, time.time()))
            a_string = "Cost - "
            epoch_cost_accuracy.append(epoch+1)
            
            a_string += str(avg_cost_list)
            epoch_cost_accuracy.append(str(avg_cost_list))
            
            a_string = a_string[0:-2]+" --> Accuracy - "
            a_string += str(avg_accuracy_list)
            epoch_cost_accuracy.append(str(avg_accuracy_list))
            
            print(a_string)
        
        # Writing current epoch data
        with open(logs_path+folder_name+"_data.csv", 'a') as f: # append to the file created
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow(epoch_cost_accuracy)
    
    print("Optimization Finished!")
    end_time = time.time()
    print("Time taken: day, hour, minutes, seconds->", time_taken(start_time, end_time))




2021-12-02 17:41:32.250730: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-02 17:41:32.279556: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2394450000 Hz
2021-12-02 17:41:32.279928: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x560124708090 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-12-02 17:41:32.279945: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
OMP: Info #155: KMP_AFFINITY: Initial OS proc set respected: 0-7
OMP: Info #216: KMP_AFFINITY: decoding x2APIC ids.
OMP: Info #157: KMP_AFFINITY: 8 available OS procs
OMP: Info #158: KMP_AFFINITY: Uniform topology
OMP: Info 





OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21315 thread 1 bound to OS proc set 2
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21314 thread 2 bound to OS proc set 4
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21319 thread 3 bound to OS proc set 6
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21320 thread 4 bound to OS proc set 1
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21321 thread 5 bound to OS proc set 3
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21322 thread 6 bound to OS proc set 5
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21324 thread 8 bound to OS proc set 0
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21323 thread 7 bound to OS proc set 7
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21326 thread 10 bound to OS proc set 4
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21327 thread 11 bound to OS proc set 6
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21328 thread 12 bound to OS proc set 1
OMP: Info #254: KMP_AFFINITY: pid 21269 tid 21325 thread 9 bound to OS proc set 2
OMP: Info #25

Epoch: 0001 , Time:  00:00:00:05
Cost - 5.8638319240676 --> Accuracy - 0.016666666915019352
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Epoch: 0002 , Time:  00:00:00:09
Cost - 2.95379562510384 --> Accuracy - 0.016666666915019352
Epoch: 0003 , Time:  00:00:00:13
Cost - 2.33797301517592 --> Accuracy - 0.1000000014901161
Epoch: 0004 , Time:  00:00:00:17
Cost - 1.86116835971673 --> Accuracy - 0.08333333457509676
Epoch: 0005 , Time:  00:00:00:22
Cost - 1.4803913599914 --> Accuracy - 0.15000000223517415
Epoch: 0006 , Time:  00:00:00:26
Cost - 1.1714843370848 --> Accuracy - 0.13333333532015482
Epoch: 0007 , Time:  00:00:00:30
Cost - 0.8165861972504 --> Accuracy - 0.13333333532015482
Epoch: 0008 , Time:  00:00:00:34
Cost - 0.60441390714711 --> Accuracy - 0.13333333532015482
Epoch: 0009 , Time:  00:00:00:38
Cost - 0.496376942067096 --> Accuracy - 0.13333333532015482
Epoch: 0010 , Time:  00:00:00:42
Cost - 0.3392734201624 --> Accuracy - 0.13333333532015482

Epoch: 0085 , Time:  00:00:06:10
Cost - 0.00089364455197937 --> Accuracy - 0.20000000298023224
Epoch: 0086 , Time:  00:00:06:14
Cost - 0.00086461078899446 --> Accuracy - 0.20000000298023224
Epoch: 0087 , Time:  00:00:06:18
Cost - 0.00084332484564381 --> Accuracy - 0.20000000298023224
Epoch: 0088 , Time:  00:00:06:22
Cost - 0.00081978233275650 --> Accuracy - 0.20000000298023224
Epoch: 0089 , Time:  00:00:06:26
Cost - 0.00079672044699287 --> Accuracy - 0.20000000298023224
Epoch: 0090 , Time:  00:00:06:31
Cost - 0.00077628210743164 --> Accuracy - 0.20000000298023224
Epoch: 0091 , Time:  00:00:06:35
Cost - 0.00075362839076357 --> Accuracy - 0.20000000298023224
Epoch: 0092 , Time:  00:00:06:39
Cost - 0.00073224128330669 --> Accuracy - 0.20000000298023224
Epoch: 0093 , Time:  00:00:06:43
Cost - 0.00071350790353284 --> Accuracy - 0.20000000298023224
Epoch: 0094 , Time:  00:00:06:47
Cost - 0.00069618676489982 --> Accuracy - 0.2166666698952516
Epoch: 0095 , Time:  00:00:06:51
Cost - 0.000677477

Epoch: 0172 , Time:  00:00:12:27
Cost - 0.000108970131704053 --> Accuracy - 0.2166666698952516
Epoch: 0173 , Time:  00:00:12:32
Cost - 0.000106953195326140 --> Accuracy - 0.2166666698952516
Epoch: 0174 , Time:  00:00:12:37
Cost - 0.000104468718139590 --> Accuracy - 0.2166666698952516
Epoch: 0175 , Time:  00:00:12:41
Cost - 0.000102192889850509 --> Accuracy - 0.2166666698952516
Epoch: 0176 , Time:  00:00:12:46
Cost - 9.997259813745687e- --> Accuracy - 0.2166666698952516
Epoch: 0177 , Time:  00:00:12:50
Cost - 9.774703165829933e- --> Accuracy - 0.2166666698952516
Epoch: 0178 , Time:  00:00:12:54
Cost - 9.582277016306763e- --> Accuracy - 0.2166666698952516
Epoch: 0179 , Time:  00:00:12:59
Cost - 9.385674527746031e- --> Accuracy - 0.2166666698952516
Epoch: 0180 , Time:  00:00:13:04
Cost - 9.147488951081567e- --> Accuracy - 0.2166666698952516
Epoch: 0181 , Time:  00:00:13:08
Cost - 8.935261282911395e- --> Accuracy - 0.2166666698952516
Epoch: 0182 , Time:  00:00:13:13
Cost - 8.79653031612785

Epoch: 0259 , Time:  00:00:18:55
Cost - 1.7418239723257204e- --> Accuracy - 0.21666666989525157
Epoch: 0260 , Time:  00:00:18:59
Cost - 1.7070554122256e- --> Accuracy - 0.21666666989525157
Epoch: 0261 , Time:  00:00:19:03
Cost - 1.67520081251698e- --> Accuracy - 0.21666666989525157
Epoch: 0262 , Time:  00:00:19:07
Cost - 1.6416907353939477e- --> Accuracy - 0.21666666989525157
Epoch: 0263 , Time:  00:00:19:11
Cost - 1.602220159687325e- --> Accuracy - 0.21666666989525157
Epoch: 0264 , Time:  00:00:19:15
Cost - 1.5765906987831433e- --> Accuracy - 0.21666666989525157
Epoch: 0265 , Time:  00:00:19:20
Cost - 1.5438750867562907e- --> Accuracy - 0.21666666989525157
Epoch: 0266 , Time:  00:00:19:24
Cost - 1.513543638035723e- --> Accuracy - 0.21666666989525157
Epoch: 0267 , Time:  00:00:19:28
Cost - 1.4759935488149897e- --> Accuracy - 0.21666666989525157
Epoch: 0268 , Time:  00:00:19:32
Cost - 1.4504303256520261e- --> Accuracy - 0.21666666989525157
Epoch: 0269 , Time:  00:00:19:36
Cost - 1.41970

In [15]:
print("Results are saved in the file " + logs_path+folder_name+"_data.csv")

Results are saved in the file /home/alex/Deep-Multi-Sensory-Object-Categorization/dataset/grasp_testset1_logs/grasp_testset1_data.csv
