In [None]:
#######################################################################################################################
# Project: Deep Virtual Rapport Agent (head gesture detector)
#
#     Jan Ondras (jo951030@gmail.com)
#     Institute for Creative Technologies, University of Southern California
#     April-October 2019
#
#######################################################################################################################
#
#     Test each model trained on one of the vra1, hatice2010, sewa, and nvb datasets on each of these datasets
#     (cross-dataset evaluation)
#
#     So far, 4 x 3 = 12 nod-only cross-dataset tests have been performed.
#
#######################################################################################################################

In [1]:
###########################################################
# Reproducibility: fix the NumPy and TensorFlow random seeds.
# This is done before Keras is imported further below.
import numpy as np
random_seed = 37
np.random.seed(random_seed)
from tensorflow import set_random_seed
set_random_seed(random_seed)
###########################################################

# Make only CUDA device 0 visible to this process (set before Keras is imported below).
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from keras.models import Sequential, load_model
# from keras_tqdm import TQDMNotebookCallback
# Project-local helpers (utils.py); only arch_to_str and evaluate_custom_metrics are used in the cells visible here.
from utils import load_train_history, save_train_history, plot_loss_history, arch_to_str, evaluate_custom_metrics
from collections import defaultdict
import pickle
from keras import backend as K
import glob
import time
from matplotlib import pyplot as plt

Using TensorFlow backend.


In [2]:
#######################################################################################################################
# For a given head gesture: perform all possible cross-dataset evaluations and save the results as one .pkl file
#
# Every checkpoint named {train_dataset}_{head_gesture}_{window_size}ws_{number_of_features}f_{model_architecture}u.hdf5
# whose train dataset is listed in dataset_names is evaluated on the test split of every dataset in dataset_names.
#
# Saved to the folder ./test_results/test_results_{S,nonS}
# Filenames naming convention: test_results_{S,nonS}_{head_gesture}_{window_size}ws_{number_of_features}f_{model_architecture}u.pkl
#######################################################################################################################

#############################################
# Smooth the predicted labels?
#     True  -> name infix *_S_*
#     False -> name infix *_nonS_*
smooth_labels = True
#############################################

HEAD_GESTURE = 'nod'  # alternatives: 'shake', 'tilt'
WINDOW_SIZE = 32
N_FEATURES = 12
GRU_ARCH = [16]       # alternatives: [4], [8], [32], [128]

# Batch size used only for inference (model.predict_classes)
PREDICT_BATCH_SIZE = 10000

dataset_type = f'{WINDOW_SIZE}ws_{N_FEATURES}f'
model_type = f'{dataset_type}_{arch_to_str(GRU_ARCH)}u'

metrics_names = ['bacc', 'f1', 'precision', 'recall']
voting_strategies = ['last', 'majority']
dataset_names = ['vra1', 'sewa', 'hatice2010', 'nvb']

# NOTE: machine-specific absolute paths; adjust when running elsewhere
datasets_path_prefix = '/home/ICT2000/jondras/dvra_datasets'
checkpoints_path_prefix = '/home/ICT2000/jondras/deep-virtual-rapport-agent/head_gesture_detector/checkpoints'

test_results_type = 'test_results_S' if smooth_labels else 'test_results_nonS'
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
os.makedirs(f'./test_results/{test_results_type}', exist_ok=True)

# Test results are saved as dictionary:
#     test_results[train_dataset_name][test_dataset_name][voting_strategy][metric_name]
# where each leaf is a list holding one value (the summary cell reads element [0]).
test_results = dict()

start_time = time.time()
# Iterate over models trained on various datasets
for model_path in sorted(glob.glob(f'{checkpoints_path_prefix}/*_{HEAD_GESTURE}_{model_type}.hdf5')):

    # Checkpoint filenames start with the name of the dataset the model was trained on
    model_filename = os.path.basename(model_path)
    train_dataset_name = model_filename.split('_')[0]
    if train_dataset_name not in dataset_names:
        continue
    test_results[train_dataset_name] = dict()

    # Load best model for this dataset (the Keras session is cleared first, before each model is loaded)
    K.clear_session()
    best_model = load_model(model_path)
    print(f'Loading model:\n\t{model_filename} \t #params: {best_model.count_params()}\n')

    # Iterate over datasets to test the model
    for test_dataset_name in dataset_names:

        test_results[train_dataset_name][test_dataset_name] = dict()
        for vs in voting_strategies:
            test_results[train_dataset_name][test_dataset_name][vs] = defaultdict(list)

        # Load testing data; the context manager closes the .npz archive once the arrays are read,
        # so no file handle is leaked per (model, dataset) pair
        print(f'\tTesting on {test_dataset_name}')
        dataset_path = (f'{datasets_path_prefix}/{test_dataset_name}/segmented_datasets/'
                        f'{test_dataset_name}_{HEAD_GESTURE}_{dataset_type}.npz')
        with np.load(dataset_path) as data:
            X_test, Y_test = data['X_test'], data['Y_test']
            test_chunk_lens = data['test_len']

        # Predict classes for all test inputs and compute the metrics for each voting strategy
        Y_pred = best_model.predict_classes(X_test, batch_size=PREDICT_BATCH_SIZE)
        test_metrics = evaluate_custom_metrics(Y_true=Y_test,
                                               Y_pred=Y_pred,
                                               chunk_lens=test_chunk_lens, window_size=WINDOW_SIZE,
                                               smooth=smooth_labels
                                              )
        for vs in voting_strategies:
            for mn in metrics_names:
                test_results[train_dataset_name][test_dataset_name][vs][mn].append( test_metrics[vs][mn] )
                print(f'\t\t[{vs}] {mn}: \t\t {test_metrics[vs][mn]:.4f}')
            print()
        print()

    # Cumulative time since the start of the whole evaluation (not per model)
    print(f'\t\t Total time taken: {time.time() - start_time} s\n')

# Refuse to write (and possibly overwrite earlier results with) an empty dictionary
if not test_results:
    raise RuntimeError(f'No checkpoint matching *_{HEAD_GESTURE}_{model_type}.hdf5 for datasets {dataset_names} '
                       f'found in {checkpoints_path_prefix}; nothing to save.')

# Save test results (of all evaluated models on all datasets)
save_results_path = f'./test_results/{test_results_type}/{test_results_type}_{HEAD_GESTURE}_{model_type}.pkl'
with open(save_results_path, 'wb') as pickle_filehandler:
    pickle.dump(test_results, pickle_filehandler)
print(f'Saved to {save_results_path}\n')


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Loading model:
	hatice2010_nod_32ws_12f_16u.hdf5 	 #params: 1409

	Testing on vra1
		[last] bacc: 		 0.6177
		[last] f1: 		 0.3651
		[last] precision: 		 0.6260
		[last] recall: 		 0.2577

		[majority] bacc: 		 0.6403
		[majority] f1: 		 0.4189
		[majority] precision: 		 0.6952
		[majority] recall: 		 0.2997


	Testing on sewa
		[last] bacc: 		 0.6470
		[last] f1: 		 0.2664
		[last] precision: 		 0.1857
		[last] recall: 		 0.4711

		[majority] bacc: 		 0.6582
		[majority] f1: 		 0.2744
		[majority] precision: 		 0.1890
		[majority] recall: 		 0.5007


	Testing on hatice2010
		[last] bacc: 		 0.9322
		[last] f1: 		 0.9256
		[last] precision: 		 0.9599
		[last] recall: 		 0.8936

		[majority] bacc: 		 0.9309
		[majority] f1: 		 0.9240
		[majority] precision

In [5]:
#######################################################################################################################
# Show cross-dataset testing summary in table
#
# For every GRU architecture in gru_archs, load the pickled test results
#     ./test_results/test_results_{S,nonS}/test_results_{S,nonS}_{head_gesture}_{model_type}.pkl
# and print one (train dataset x test dataset) table per voting strategy and shown metric.
#######################################################################################################################
# Imports are repeated in this cell, so it does not depend on the imports cell having been run
import pickle
from tabulate import tabulate
from utils import arch_to_str
import numpy as np

#############################################
# Smooth the predicted labels?
#     True  -> name infix *_S_*
#     False -> name infix *_nonS_*
smooth_labels = True
#############################################

HEAD_GESTURE = 'nod'
WINDOW_SIZE = 32
N_FEATURES = 12
gru_archs = [
    [4], [8], [16], [32]
]

metrics_names = ['bacc', 'f1', 'precision', 'recall']
voting_strategies = ['last', 'majority']
dataset_names = ['vra1', 'sewa', 'hatice2010', 'nvb']

# Metrics to tabulate; set to metrics_names to show all of them
shown_metrics = ['bacc']
# tabulate format of the printed tables; use 'latex_booktabs' to get a table for LaTeX
table_format = 'fancy_grid'

dataset_type = f'{WINDOW_SIZE}ws_{N_FEATURES}f'
test_results_type = 'test_results_S' if smooth_labels else 'test_results_nonS'

# Rows are the training datasets, columns are the testing datasets
headers = [
    'Trained on',
    *dataset_names
]

for GRU_ARCH in gru_archs:

    model_type = f'{dataset_type}_{arch_to_str(GRU_ARCH)}u'

    save_results_path = f'./test_results/{test_results_type}/{test_results_type}_{HEAD_GESTURE}_{model_type}.pkl'
    print(f'{test_results_type}_{HEAD_GESTURE}_{model_type}.pkl\n')

    # NOTE: pickle.load can execute arbitrary code; only load result files produced by this project.
    # The file is closed as soon as the dictionary is loaded, before any table is printed.
    with open(save_results_path, 'rb') as pickle_filehandler:
        d = pickle.load(pickle_filehandler)

    ########################################################################################
    # Print tables

    for vs in voting_strategies:
        for mn in shown_metrics:
            print(f'[{vs}] {mn}')
            tab_data = np.empty((len(dataset_names), len(dataset_names)))

            # d[train][test][voting_strategy][metric] is a list; its first element is tabulated
            for i, train_dataset_name in enumerate(dataset_names):
                for j, test_dataset_name in enumerate(dataset_names):
                    tab_data[i, j] = d[train_dataset_name][test_dataset_name][vs][mn][0]

            print(tabulate(tab_data, headers=headers,
                           tablefmt=table_format,
                           showindex=dataset_names,
                           numalign='center'
                          ))
        print()
            

test_results_S_nod_32ws_12f_4u.pkl

[last] bacc
╒══════════════╤══════════╤══════════╤══════════════╤══════════╕
│ Trained on   │   vra1   │   sewa   │  hatice2010  │   nvb    │
╞══════════════╪══════════╪══════════╪══════════════╪══════════╡
│ vra1         │ 0.873037 │ 0.697702 │   0.787067   │ 0.711335 │
├──────────────┼──────────┼──────────┼──────────────┼──────────┤
│ sewa         │  0.7526  │ 0.804502 │   0.869937   │ 0.782271 │
├──────────────┼──────────┼──────────┼──────────────┼──────────┤
│ hatice2010   │ 0.649203 │ 0.724418 │   0.933496   │ 0.740443 │
├──────────────┼──────────┼──────────┼──────────────┼──────────┤
│ nvb          │ 0.731682 │ 0.745261 │   0.887106   │ 0.825396 │
╘══════════════╧══════════╧══════════╧══════════════╧══════════╛

[majority] bacc
╒══════════════╤══════════╤══════════╤══════════════╤══════════╕
│ Trained on   │   vra1   │   sewa   │  hatice2010  │   nvb    │
╞══════════════╪══════════╪══════════╪══════════════╪══════════╡
│ vra1         │ 0.872333