# Multimodal Classification Testing

This notebook restores the multimodal network trained for grasp testset1, stored in `./dataset/grasp_testset1_logs`, and tests it.

## Dependencies

`Python 3.5.4` was used for development, and the following packages are required to run the code provided in this notebook:

`pip install googledrivedownloader`<br>
`pip install matplotlib`<br>
`pip install tensorflow-gpu`<br>
`pip install keras`<br>
`pip install numpy`

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and checkpoint paths
# used below all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install tensorflow==1.15.0
!pip install keras==2.2.4

In [None]:
import pickle, os, csv, time, shutil
import json
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# Project root on the mounted Drive; every dataset and log path in this
# notebook is built by appending to this string.
path="/content/drive/MyDrive/Deep-Multi-Sensory-Object-Categorization"

In [None]:
def print_image(image, title):
    """Show an image with a title and a colour bar.

    :param image: image pixels (anything ``plt.imshow`` accepts)
    :param title: string drawn above the image
    """
    drawn = plt.imshow(image)
    plt.title(title)
    # Attach the colour scale to the image that was just drawn.
    plt.colorbar(drawn)
    plt.show()

def time_taken(start, end):
    """Format the time elapsed between `start` and `end` for humans.

    :param start: time.time() taken at the beginning
    :param end: time.time() taken at the end
    :returns: string formatted as day:hour:minute:second, each field
        zero-padded to at least two digits (fractions of a second are dropped)
    """
    # `elapsed` rather than `time`, so the imported `time` module is not
    # shadowed inside this function.
    elapsed = end - start
    days, elapsed = divmod(elapsed, 24 * 3600)
    hours, elapsed = divmod(elapsed, 3600)
    minutes, seconds = divmod(elapsed, 60)

    return "%02d:%02d:%02d:%02d" % (int(days), int(hours), int(minutes), int(seconds))

## Video Dataset

In [None]:
# Video features (arr_0) plus the action (arr_1) and object (arr_2) labels
# stored next to them.
# NOTE: allow_pickle=True un-pickles arbitrary Python objects, so only load
# files from a trusted source.
file_0 = path+"/dataset/EMILver1_preprocessed/EMILver1_vi_vgg16fc2_pca19/arr_0.npy"
file_1 = path+"/dataset/EMILver1_preprocessed/EMILver1_vi_vgg16fc2_pca19/arr_1.npy"
file_2 = path+"/dataset/EMILver1_preprocessed/EMILver1_vi_vgg16fc2_pca19/arr_2.npy"
video_frames = np.load(file_0, allow_pickle=True)
action_label = np.load(file_1, allow_pickle=True)
object_label = np.load(file_2, allow_pickle=True)

# Zero-pad every clip at the end to a common length so the clips can be stacked
# into one array. 658 has to match `video_frames_max` in the hyper-parameter cell.
# NOTE(review): clips longer than 658 frames are left untouched, which would make
# the stack below ragged -- presumably 658 is the dataset maximum; confirm.
max_frames = 658
for i in range(len(video_frames)):
    clip = video_frames[i]
    missing = max_frames - len(clip)
    if missing > 0:
        # One concatenate per clip instead of one per missing frame.
        video_frames[i] = np.concatenate((clip, np.zeros((missing, clip.shape[1]))))
video_frames = np.array(list(video_frames))

In [None]:
# Quick sanity check of one clip's shape after the padding done in the cell above.
video_frames[0].shape

## Sound Dataset

In [None]:
# Audio features (arr_0) plus the action (arr_1) and object (arr_2) labels
# stored next to them.
# NOTE: allow_pickle=True un-pickles arbitrary Python objects, so only load
# files from a trusted source.
file_0 = path+"/dataset/EMILver1_preprocessed/EMILver1_au_features/arr_0.npy"
file_1 = path+"/dataset/EMILver1_preprocessed/EMILver1_au_features/arr_1.npy"
file_2 = path+"/dataset/EMILver1_preprocessed/EMILver1_au_features/arr_2.npy"
audio_frames = np.load(file_0, allow_pickle=True)
action_label = np.load(file_1, allow_pickle=True)
object_label = np.load(file_2, allow_pickle=True)

# Zero-pad every clip at the end to a common length so the clips can be stacked
# into one array. 658 has to match `audio_frames_max` in the hyper-parameter cell.
# NOTE(review): clips longer than 658 frames are left untouched, which would make
# the stack below ragged -- presumably 658 is the dataset maximum; confirm.
max_frames = 658
for i in range(len(audio_frames)):
    clip = audio_frames[i]
    missing = max_frames - len(clip)
    if missing > 0:
        # One concatenate per clip instead of one per missing frame.
        audio_frames[i] = np.concatenate((clip, np.zeros((missing, clip.shape[1]))))
audio_frames = np.array(list(audio_frames))

## Haptic Dataset

In [None]:
# Haptic features (arr_0) plus the action (arr_1) and object (arr_2) labels
# stored next to them. These label arrays are the ones the one-hot encoding
# cell below ends up using, because this cell assigns them last.
# NOTE: allow_pickle=True un-pickles arbitrary Python objects, so only load
# files from a trusted source.
file_0 = path+"/dataset/EMILver1_preprocessed/EMILver1_sm_features/arr_0.npy"
file_1 = path+"/dataset/EMILver1_preprocessed/EMILver1_sm_features/arr_1.npy"
file_2 = path+"/dataset/EMILver1_preprocessed/EMILver1_sm_features/arr_2.npy"
haptic_frames = np.load(file_0, allow_pickle=True)
action_label = np.load(file_1, allow_pickle=True)
object_label = np.load(file_2, allow_pickle=True)

# Zero-pad every clip at the end to a common length so the clips can be stacked
# into one array. 658 has to match `haptic_frames_max` in the hyper-parameter cell.
# NOTE(review): clips longer than 658 frames are left untouched, which would make
# the stack below ragged -- presumably 658 is the dataset maximum; confirm.
max_frames = 658
for i in range(len(haptic_frames)):
    clip = haptic_frames[i]
    missing = max_frames - len(clip)
    if missing > 0:
        # One concatenate per clip instead of one per missing frame.
        haptic_frames[i] = np.concatenate((clip, np.zeros((missing, clip.shape[1]))))
haptic_frames = np.array(list(haptic_frames))

In [None]:
# One-hot encode the action and object labels.
def _one_hot_encode(labels):
    """Return ``(one_hot_matrix, class_count)`` for labels usable as column indices.

    The class count is the largest label plus one; row ``r`` of the matrix has a
    single 1 in the column given by ``labels[r]``.
    """
    class_count = np.nanmax(labels)+1
    encoded = np.zeros((len(labels), class_count)).astype(int)
    for row, label in enumerate(labels):
        encoded[row, label] = 1
    return encoded, class_count

action_label_one_hot, num_classes = _one_hot_encode(action_label)
# `num_classes` ends up holding the number of object classes.
object_label_one_hot, num_classes = _one_hot_encode(object_label)

# Split sample indices (not the data itself) into train and test parts;
# the fixed random_state keeps the split reproducible.
num_data = len(object_label)
train_id, test_id = train_test_split(np.arange(num_data), random_state=0)

## Building the Multimodal Network Architecture

<img src="pics/Multimodal.png" alt="drawing" width="400px"/>

## Multimodal Network Hyper-parameters

This network was trained for 300 epochs using the Adam optimizer with a learning rate of $1 \times 10^{-4}$.

In [None]:
# Network hyper-parameters
batch = 1
training_epochs = 300
display_step = 1

behavior = "grasp"
testset = "testset1"
folder_name = behavior+'_'+testset
model_path = path+"/dataset/"+folder_name+"_logs/model.ckpt"
logs_path = path+"/dataset/"+folder_name+"_logs/"

# num_classes = category_label_train_one_hot.shape[1]
num_classes = object_label_one_hot.shape[1]

Y = tf.placeholder('float', [None, num_classes], name='LabelData')
print("Y: ", Y)

video_frames_max = 658
video_size = video_frames.shape[2]
video_X = tf.placeholder('float', [None, video_frames_max, video_size], name='InputData')

audio_frames_max = 658
audio_size = audio_frames.shape[2]
audio_keep_prob = tf.placeholder_with_default(1.0, shape=(), name='audio_keep')

haptic_frames_max = 658
haptic_size = haptic_frames.shape[2]
haptic_keep_prob = tf.placeholder_with_default(1.0, shape=(), name='haptic_keep')

In [None]:
"""
Functions used to define models
"""

# NOTE(review): not referenced anywhere else in the visible notebook; presumably a
# leftover from another network variant -- confirm before removing.
haptic_skip_2nd_maxpool = ["grasp", "hold", "low"]

def model(video_data_placeholder):
    """Build the three-branch (video, audio, haptic) classification graph.

    Each modality is flattened and passed through its own dense layers to
    produce ``num_classes`` values; the three outputs are concatenated and
    mapped to the final logits by one more dense layer.

    The audio and haptic input placeholders are created inside this function,
    under the "Model" name scope, so the test cells fetch them by name
    ("Model/audio_InputData:0" and "Model/haptic_InputData:0").

    NOTE(review): the layers are created without explicit names, so their
    variable names presumably depend on creation order; keep the order of the
    ``tf.layers.dense`` calls unchanged or the stored checkpoint may no longer
    restore -- confirm against the training notebook.

    :param video_data_placeholder: placeholder that receives the video features
    :returns: fused logits tensor (no softmax applied)
    """
    with tf.name_scope("Model"):
        # Video
        net = tf.layers.flatten(video_data_placeholder)
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        # No `rate` or `training` argument is given here, so the library defaults apply.
        net = tf.layers.dropout(inputs=net)
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        net = tf.layers.dropout(inputs=net)
        # The per-modality output also goes through ReLU before fusion.
        video_logits = tf.layers.dense(inputs=net, units=num_classes, activation=tf.nn.relu)
        
        # Audio
        audio_data_placeholder = tf.placeholder('float', [None, audio_frames_max, audio_size], name='audio_InputData')
        net = tf.layers.flatten(audio_data_placeholder)
        # Dense Layer
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        # NOTE(review): a placeholder named "keep prob" is passed as dropout `rate`;
        # `rate` is normally the fraction dropped, not kept -- confirm the intent.
        # `training` is not set, so presumably dropout is inactive in this graph.
        net = tf.layers.dropout(inputs=net, rate=audio_keep_prob)
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        net = tf.layers.dropout(inputs=net, rate=audio_keep_prob)
        audio_logits = tf.layers.dense(inputs=net, units=num_classes, activation=tf.nn.relu)
        
        # Haptic (one hidden layer only, unlike the video and audio branches)
        haptic_data_placeholder = tf.placeholder('float', [None, haptic_frames_max, haptic_size], name='haptic_InputData')
        net = tf.layers.flatten(haptic_data_placeholder)
        # Dense Layer
        net = tf.layers.dense(inputs=net, units=256, activation=tf.nn.relu)
        # Same keep-prob-as-rate caveat as in the audio branch.
        net = tf.layers.dropout(inputs=net, rate=haptic_keep_prob)
        haptic_logits = tf.layers.dense(inputs=net, units=num_classes, activation=tf.nn.relu)
        
        # Concatenate 
        logits = tf.concat([video_logits, audio_logits, haptic_logits], axis=1)
        # The three inputs are already ReLU outputs, so this ReLU leaves the values unchanged.
        logits = tf.nn.relu(logits)
        # Final linear layer: no activation, produces the class logits.
        logits = tf.layers.dense(inputs=logits, units=num_classes)
        
    return logits


def loss(prediction, label_placeholder):
    """Mean softmax cross-entropy between the logits and the labels.

    :param prediction: logits tensor produced by the network
    :param label_placeholder: placeholder holding the one-hot labels
    :returns: (scalar cost tensor, summary op that logs it as "loss")
    """
    with tf.name_scope('Loss'):
        per_example = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=prediction,
            labels=label_placeholder,
        )
        cost = tf.reduce_mean(per_example)
        # Summary so the cost can be tracked in TensorBoard
        cost_scalar = tf.summary.scalar("loss", cost)
        return cost, cost_scalar

def training(prediction, label_placeholder):
    """Create the Adam training op for the given logits and labels.

    The objective is the mean softmax cross-entropy of `prediction` against
    `label_placeholder`. It is computed from the arguments here, so the
    function no longer depends on a global `cost` tensor having been created
    by an earlier cell. No summary op is added.

    :param prediction: logits tensor produced by the network
    :param label_placeholder: placeholder holding the one-hot labels
    :returns: op that performs one Adam step (learning rate 1e-4)
    """
    with tf.name_scope('Optimizer'):
        objective = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=prediction, labels=label_placeholder))
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
        train_op = optimizer.minimize(objective)
    return train_op

def evaluate(prediction, Y):
    """Fraction of examples whose top-scoring class matches the label.

    :param prediction: logits tensor produced by the network
    :param Y: placeholder holding the one-hot labels
    :returns: (scalar accuracy tensor, summary op that logs it as "accuracy")
    """
    with tf.name_scope('Accuracy'):
        predicted_class = tf.argmax(prediction, 1)
        true_class = tf.argmax(Y, 1)
        # 1.0 where the prediction is right, 0.0 otherwise; the mean is the accuracy
        hits = tf.cast(tf.equal(predicted_class, true_class), 'float')
        accuracy = tf.reduce_mean(hits)
        # Summary so the accuracy can be tracked in TensorBoard
        accuracy_scalar = tf.summary.scalar("accuracy", accuracy)
        return accuracy, accuracy_scalar

In [None]:
"""
Creating the Neural Network
"""

# Build the graph pieces in dependency order: network, loss, optimizer, metric.
prediction = model(video_X)
cost, cost_scalar = loss(prediction, Y)
train_op = training(prediction, Y)
eval_op, accuracy_scalar = evaluate(prediction, Y)

# Single lookup table for the tensors/ops the test cells run.
model_dict = {
    "Model": prediction,
    "Loss": cost,
    "Loss_scalar": cost_scalar,
    "Optimizer": train_op,
    "Accuracy": eval_op,
    "Accuracy_scalar": accuracy_scalar,
}

print("model_dict: ", model_dict)

# Op that initialises every variable created above
init = tf.global_variables_initializer()

# Saver used to restore the trained variables from the checkpoint
saver = tf.train.Saver(max_to_keep=1)

## Testing for the whole clip

In [None]:
# Evaluate on test set: feed every complete test clip and report the overall accuracy.

with tf.Session() as sess:
    sess.run(init)
    checkpoint = tf.train.latest_checkpoint(logs_path)
    if checkpoint is None:
        # latest_checkpoint returns None when the folder holds no checkpoint.
        raise FileNotFoundError("No checkpoint found in " + logs_path)
    saver.restore(sess, checkpoint)

    # The audio/haptic placeholders are created inside model(); look them up once.
    graph = tf.get_default_graph()
    audio_X = graph.get_tensor_by_name("Model/audio_InputData:0")
    haptic_X = graph.get_tensor_by_name("Model/haptic_InputData:0")

    correct_total = 0.0
    for start in range(0, len(test_id), batch):
        # Index through test_id: slicing the arrays directly with start:end would
        # pick the first rows of the full dataset, not the held-out samples.
        batch_ids = test_id[start:start+batch]

        accuracy = sess.run(model_dict["Accuracy"],
                            feed_dict={
                                video_X: video_frames[batch_ids],
                                audio_X: audio_frames[batch_ids],
                                haptic_X: haptic_frames[batch_ids],
                                Y: object_label_one_hot[batch_ids],
                                audio_keep_prob: 1.0,
                                haptic_keep_prob: 1.0
                            }
                           )

        # Weight by batch size so a smaller final batch is counted correctly.
        correct_total += accuracy * len(batch_ids)

    avg_accuracy_list = correct_total / len(test_id) if len(test_id) else 0.0

    print("Overall Accuracy: ", avg_accuracy_list)

## Testing for every frame in video and for every 5th frame in audio and haptic

In [None]:
# Predict after every frame in video and for every 5th frame in audio and haptic

# Per-video-frame accumulators, keyed by the 1-based frame number.
frame_numbers = range(1, video_frames_max+1)

# [correct, wrong, percentage] for each frame count
all_frames_acc = {a_frame: [0, 0, 0] for a_frame in frame_numbers}

# Predicted labels and raw network outputs collected for each frame count
y_pred = {a_frame: [] for a_frame in frame_numbers}
y_score = {a_frame: [] for a_frame in frame_numbers}
    
def fill_example(example, fill_frames, a_frame):
    """Extend a clip by repeating one of its frames at the end.

    :param example: sequence/array of frames (frames along the first axis)
    :param fill_frames: number of frames to append; if zero or negative the
        clip is returned unchanged
    :param a_frame: 1-based index of the frame that gets repeated
    :returns: array with `fill_frames` copies of frame `a_frame` appended, or
        `example` itself when nothing has to be appended
    :raises IndexError: if `fill_frames` is positive and `a_frame` is outside the clip
    """
    if fill_frames <= 0:
        # Nothing to append; also avoids indexing past the end of a clip that
        # is already complete.
        return example
    # Build the whole padding in one go instead of one concatenate per frame.
    padding = np.repeat([example[a_frame-1]], fill_frames, axis=0)
    return np.concatenate((example, padding))

def give_me_complete_clip(num_of_frame, example, frame_list):
    """Bring a partial clip up to the full clip length.

    :param num_of_frame: number of frames currently considered; also the
        1-based index of the frame used for filling
    :param example: the partial clip
    :param frame_list: target number of frames
    :returns: `example` unchanged when the two counts are equal, otherwise the
        result of `fill_example`
    """
    missing = frame_list - num_of_frame
    if missing == 0:
        return example
    return fill_example(example, missing, num_of_frame)

# Category names in label order; the label of a name is its 1-based position.
_category_names = [
    "soft-blue-ball", "soft-red-ball", "soft-orange-ball", "hard-pink-ball",
    "light-red-car", "blue-car", "yellow-car", "heavy-green-car",
    "light-red-sponge", "light-blue-sponge", "blue-tissues", "pink-tissues",
    "soft-banana", "light-hard-banana", "heavy-hard-banana", "hard-small-banana",
    "green-frog", "purple-duck", "orange-fish", "yellow-seal",
    "small-yellow-die", "big-yellow-die",
    "light-soft-apple", "light-hard-apple", "heavy-soft-apple", "heavy-hard-apple",
    "heavy-blue-toy", "red-toy", "yellow-toy", "heavy-green-toy",
]
category_labels = {name: label for label, name in enumerate(_category_names, start=1)}

# 1-based label -> category name, built once so each lookup is a dict access
# instead of a scan over the label dictionary.
label_to_category = {label: name for name, label in category_labels.items()}

with tf.Session() as sess:
    sess.run(init)
    checkpoint = tf.train.latest_checkpoint(logs_path)
    if checkpoint is None:
        # latest_checkpoint returns None when the folder holds no checkpoint.
        raise FileNotFoundError("No checkpoint found in " + logs_path)
    saver.restore(sess, checkpoint)

    # The audio/haptic placeholders are created inside model(); look them up
    # once instead of once per frame.
    graph = tf.get_default_graph()
    audio_X = graph.get_tensor_by_name("Model/audio_InputData:0")
    haptic_X = graph.get_tensor_by_name("Model/haptic_InputData:0")

    for i_example in range(len(test_id)):
        print("Example Number: ", i_example+1)
        # Index the single sample directly; `array[test_id][i]` would copy the
        # whole test subset for every example.
        sample_id = test_id[i_example]
        video_input_data, label_data = video_frames[sample_id], object_label_one_hot[sample_id]
        audio_input_data = audio_frames[sample_id]
        haptic_input_data = haptic_frames[sample_id]

        truth = np.argmax(label_data, 0)+1
        truth_cate = label_to_category[int(truth)]
        print("True Category: ", truth_cate)

        for a_frame in range(1, video_frames_max+1):
            # For every frame in video there are 5 frames in audio and haptic.
            # Clamped to the clip length: without the clamp a_frame*5 runs past
            # the end of the audio/haptic clip and the padding helper raises
            # IndexError, which used to be swallowed silently.
            # NOTE(review): all three modalities are padded to the same length in
            # this notebook, so the 5:1 ratio saturates early -- confirm it is
            # still the intended ratio for this dataset.
            frame5Hz = min(a_frame*5, audio_frames_max, haptic_frames_max)

            try:
                video_example = give_me_complete_clip(a_frame, video_input_data[0:a_frame], video_frames_max)
                audio_example = give_me_complete_clip(frame5Hz, audio_input_data[0:frame5Hz], audio_frames_max)
                haptic_example = give_me_complete_clip(frame5Hz, haptic_input_data[0:frame5Hz], haptic_frames_max)
            except IndexError as err:
                # Keep going, but say which frame was skipped and why.
                print("Skipping video frame", a_frame, "- could not build the partial clip:", err)
                continue

            predic = sess.run(model_dict["Model"],
                              feed_dict={
                                  video_X: [video_example],
                                  audio_X: [audio_example],
                                  haptic_X: [haptic_example],
                                  audio_keep_prob: 1.0,
                                  haptic_keep_prob: 1.0
                              })
            # NumPy argmax: calling tf.argmax here would add a new op to the
            # graph on every iteration.
            argmax = np.argmax(predic, 1)+1
            predicted_label = int(argmax[0])
            cate = label_to_category.get(predicted_label, "unknown-%d" % predicted_label)
            print("Video Frame No.: ", a_frame, "Audio/Haptic Frame No.: ", frame5Hz, "Prediction: ", cate)

            y_pred[a_frame].extend(argmax)
            y_score[a_frame].extend(predic)

            if predicted_label == int(truth):
                all_frames_acc[a_frame][0] += 1
            else:
                all_frames_acc[a_frame][1] += 1
            all_frames_acc[a_frame][2] = all_frames_acc[a_frame][0]/(all_frames_acc[a_frame][0]+all_frames_acc[a_frame][1])

        print("X"*50)

In [None]:
"""
Writing a CSV file that records accuracy for each frame
"""

# One CSV row per video frame number: frame, correct count, wrong count, accuracy.
with open(logs_path+folder_name+"_acc_VEveryFrame_AH5Frame.csv",'w') as f:
    writer = csv.writer(f, lineterminator="\n")

    writer.writerow(["Frame_No", "Correct", "Wrong", "Accuracy"])
    writer.writerows(
        [frame_no] + list(all_frames_acc[frame_no][:3])
        for frame_no in range(1, video_frames_max+1)
    )