In [None]:
#######################################################################################################################
# Project: Deep Virtual Rapport Agent (head gesture detector)
#
#     Jan Ondras (jo951030@gmail.com)
#     Institute for Creative Technologies, University of Southern California
#     April-October 2019
#
#######################################################################################################################
################################################ Head Gesture Detector ################################################
#######################################################################################################################
#
#     Annotates frames of given csv files of vision features (by OpenFace) with nod, shake, and tilt head gestures, 
#     using the developed Head Gesture Detector.
#
#     Uses the best HGD models from ./checkpoints/final_4comb/best/final_4comb_*.hdf5
#     trained on the whole 4comb dataset (training setting TS3) using the script train_final_4comb_hgd.py
#
#     For each input recording (as csv)
#         Resample feature dataframe to 30Hz as used by the Head Gesture Detector
#         Add first- and second-order differences of selected features
#         Annotate frames using the Head Gesture Detector
#         Save as new annotated csv dataframe
#
#     Each output csv file differs from the corresponding input csv file in the following:
#
#         All columns are resampled to 30 FPS
#
#         12 new columns of difference features:
#             Prefix "diff_" denotes first-order differences
#             Prefix "diff2_" denotes second-order differences
#
#         11 new columns of predictions from 3 independent binary head gesture classifiers (nod/shake/tilt):
#             Suffix "_probab" denotes the probability of the positive (head gesture occurred) class
#             Binary output 1 denotes the positive (head gesture occurred) class
#                 These binary predictions are smoothed with a median filter with a kernel size of 9 frames (because of this smoothing, the probability of a frame may disagree with its binary label)
#                 Columns with suffix "_NS" denote non-smoothed binary outputs
#             Columns "none" and "none_NS" are 1 when none of the three (smoothed / non-smoothed) binary outputs is positive
#
#         4 new columns of fused predictions from the 3 independent binary classifiers, so that only one unique head gesture is predicted at a time:
#             Labels 0/1/2/3 denote none/nod/shake/tilt head gesture classes respectively
#             Columns with suffix "_NS" denote fused predictions of non-smoothed binary outputs (other columns are based on smoothed binary outputs)
#             
#             Columns starting with "head_gesture_max_probab" contain labels fused using the max probability fusion:
#                 If exactly one classifier detects a head gesture, assign this head gesture class      
#                 If multiple classifiers detect a head gesture, assign the highest probability class
#                     Tie resolution: if there are multiple max probabilities, assign none class
#                 Otherwise, assign none class
#
#             Columns starting with "head_gesture_unique" contain labels fused using the unique fusion:
#                 If exactly one classifier detects a head gesture, assign this head gesture class          
#                 Otherwise, assign none class
#######################################################################################################################

In [1]:
#######################################################################################################################
# So far the Head Gesture Detector (HGD) was run on
# - Mimicry DB (for the main project to develop rapport models)
# - CCDb (to evaluate and compare the performance of the HGD with previous works)
# - IPD data from Qintian Li (summer intern, 2019)
#
# The only changes between different datasets are the DATASET_NAME, paths and the main loop condition
#######################################################################################################################

###########################################################
# Seed the NumPy and TensorFlow random number generators with the same fixed value
import numpy as np
random_seed = 37
np.random.seed(random_seed)
from tensorflow import set_random_seed
set_random_seed(random_seed)
###########################################################

import os
# Make only CUDA device 0 visible to this process
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from keras.models import Sequential, load_model
# Kernel size of the median filter applied to the binary predictions
from utils import SMOOTHING_KERNEL_SIZE
from collections import defaultdict
import pandas as pd
from keras import backend as K
import glob
import time
# from matplotlib import pyplot as plt
import scipy.signal
from scipy import interpolate

# Number of consecutive frames in each sliding-window segment
WINDOW_SIZE = 32
# Filler value used to pre-pad the segments that start before the first frame
MASK_VALUE = 7777777.7777777

# Unified frame rate (Hz) that every recording is brought to
FRAME_RATE = 30.

# OpenFace pose columns whose first- and second-order differences are computed.
# NOTE: the column names really do start with a space.
# TODO (from the original author): add landmarks?
diff_selected_features = [
    f' pose_{kind}{axis}'
    for kind in 'TR'
    for axis in 'xyz'
]

# Model input columns, in the exact order the models expect:
# for each of T and R -> first differences (x, y, z), then second differences (x, y, z)
selected_features = [
    f'{prefix} pose_{kind}{axis}'
    for kind in 'TR'
    for prefix in ('diff_', 'diff2_')
    for axis in 'xyz'
]

head_gestures = ['nod', 'shake', 'tilt']

# Checkpoint file of each binary classifier: models trained on the whole 4comb dataset.
# The models NOT trained on the whole 4comb dataset (checkpoints are in the folder
# ./checkpoints/4comb/best) use the same file names without the "final_" prefix:
#     '4comb_nod_32ws_12f_16u.hdf5', '4comb_shake_32ws_12f_8u.hdf5', '4comb_tilt_32ws_12f_16u.hdf5'
model_names = dict(
    nod='final_4comb_nod_32ws_12f_16u.hdf5',
    shake='final_4comb_shake_32ws_12f_8u.hdf5',
    tilt='final_4comb_tilt_32ws_12f_16u.hdf5',
)

# Dataset selection: keep exactly one DATASET_NAME / input_features_dir / output_dir
# triple active. input_features_dir holds the OpenFace csv files to annotate,
# output_dir receives one annotated csv per input file.

# DATASET_NAME = 'QintianLi_IPD'
# input_features_dir = f'/media/DataDrive/{DATASET_NAME}_openface'
# output_dir = f'/media/DataDrive/{DATASET_NAME}_hgd_annotated_features'

# DATASET_NAME = 'ccdb'
# input_features_dir = f'/home/ICT2000/jondras/dvra_datasets/{DATASET_NAME}/openface_features'
# output_dir = f'/home/ICT2000/jondras/dvra_datasets/{DATASET_NAME}/hgd_annotated_features'

DATASET_NAME = 'mimicry'
input_features_dir = f'/home/ICT2000/jondras/dvra_datasets/{DATASET_NAME}/vision_features/original_openface_features'
output_dir = f'/home/ICT2000/jondras/dvra_datasets/{DATASET_NAME}/vision_features/annotated_features'

# Folder with the model checkpoints listed in model_names
models_path_prefix = '/home/ICT2000/jondras/deep-virtual-rapport-agent/head_gesture_detector/checkpoints/final_4comb/best'
# models_path_prefix = '/home/ICT2000/jondras/deep-virtual-rapport-agent/head_gesture_detector/checkpoints/4comb/best'

print(f'Dataset: {DATASET_NAME}')
# exist_ok=True avoids the check-then-create race of "if not exists: makedirs"
os.makedirs(output_dir, exist_ok=True)
    
    
def get_X_segments(df, window_size=None, mask_value=None):
    """Generate segments (X (features)) from the dataframe.

    One segment is produced per row of ``df``: segment ``i`` holds the
    ``window_size`` consecutive rows that end at row ``i``. Segments that would
    start before the first row are pre-padded with ``mask_value``.

    Args:
        df (DataFrame): Pandas DataFrame that already contains the selected features.
        window_size (int, optional): Number of rows per segment.
            Defaults to the module-level WINDOW_SIZE.
        mask_value (float, optional): Value used for the pre-padding.
            Defaults to the module-level MASK_VALUE.

    Returns:
        (3D array): Array of segments/sequences of features with shape
        (len(df), window_size, number of columns). An empty dataframe yields
        an empty array that still has these three dimensions.

    Raises:
        ValueError: If window_size is smaller than 1.
    """

    if window_size is None:
        window_size = WINDOW_SIZE
    if mask_value is None:
        mask_value = MASK_VALUE
    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')

    n_rows = len(df)

    # Pre-pad all features with (window_size - 1) mask_value-s
    padded_features = np.pad(df.values, ((window_size - 1, 0), (0, 0)),
                             mode='constant', constant_values=(mask_value, mask_value))
    assert len(padded_features) - window_size + 1 == n_rows, 'Padding failed!'

    # Row i of window_rows lists the padded-row indices i .. i + window_size - 1,
    # so a single fancy-indexing gather replaces the per-row Python loop and
    # keeps the 3D shape even when there are no rows.
    window_rows = np.arange(n_rows)[:, np.newaxis] + np.arange(window_size)
    return padded_features[window_rows]


def max_probab_fusion(row, smooth_flag):
    """Fuse the nod/shake/tilt binary labels of one frame into a single label,
    preferring the most probable gesture when several classifiers fire.

    Rules:
        - At most one classifier fires: return that gesture (or none).
        - Several classifiers fire: return the gesture with the highest probability;
          if that highest probability is shared by more than one gesture, return none.

    Args:
        row (dict): DataFrame row with nod, shake and tilt binary labels and probabilities
        smooth_flag (str): Suffix selecting the binary labels to use:
            '' for smoothed labels, '_NS' for non-smoothed labels

    Returns:
        (int): Fused label (0/1/2/3 denote none/nod/shake/tilt head gesture classes
        respectively)
    """

    nod = row[f'nod{smooth_flag}']
    shake = row[f'shake{smooth_flag}']
    tilt = row[f'tilt{smooth_flag}']

    # Zero or one detection: the position of the (only) 1 is the label, 0 if there is none
    if not (nod + shake + tilt) > 1:
        return np.argmax([0, nod, shake, tilt])

    # Several detections: rank the gestures by probability
    p_nod = row['nod_probab']
    p_shake = row['shake_probab']
    p_tilt = row['tilt_probab']
    winner = 1 + np.argmax([p_nod, p_shake, p_tilt])

    # argmax returns the first maximum, so a tie can only involve a later gesture
    if winner == 1 and (p_nod == p_shake or p_nod == p_tilt):
        print('\t\tTie 1')
        return 0
    if winner == 2 and p_shake == p_tilt:
        print('\t\tTie 2')
        return 0
    return winner
           
def unique_fusion(row, smooth_flag):
    """Apply unique fusion to the given row of nod, shake and tilt labels, 
    so that only one unique head gesture is predicted at a time.
    
    If exactly one classifier detects a head gesture, assign this head gesture class. 
    Otherwise (no detection, or two or three detections), assign none class.
        
    Args:
        row (dict): DataFrame row with nod, shake and tilt binary labels
        smooth_flag (str): Flag indicating whether to use smoothed labels ('') or non-smoothed ('_NS')
    
    Returns:
        (int): Fused label (0/1/2/3 denote none/nod/shake/tilt head gesture classes respectively)
    """
    
    labels = [row[f'{gesture}{smooth_flag}'] for gesture in ('nod', 'shake', 'tilt')]
    if sum(labels) == 1:
        # Exactly one label is 1, so its position identifies the gesture
        return 1 + np.argmax(labels)
    return 0


# The processing loop calls this function as `unique_fusion`; the previous
# placeholder name is kept as an alias so that existing references keep working.
XXX = unique_fusion

    
start_time = time.time()
cnt = 0
# Take only the first 8 sessions: CCDb only
if DATASET_NAME == 'ccdb':
    input_features_path = input_features_dir + '/*_*_*_*.csv'
else:
    input_features_path = input_features_dir + '/*.csv'

# Load the nod, shake, and tilt models once; they are the same for every recording
K.clear_session()
models = {
    head_gesture: load_model(f'{models_path_prefix}/{model_names[head_gesture]}')
    for head_gesture in head_gestures
}

# The only columns the fusion functions read. Applying them row-wise to this narrow
# view instead of the full feature dataframe gives the same labels much faster.
fusion_columns = [f'{head_gesture}{suffix}'
                  for head_gesture in head_gestures
                  for suffix in ('_probab', '_NS', '')]

for feature_file in sorted(glob.glob(input_features_path)):

    # Session id = file name without directory and without the final extension
    # (splitting on the first '.' would truncate names that contain dots)
    sessid = os.path.splitext(os.path.basename(feature_file))[0]
    print(f'Processing {sessid}')

    org_df = pd.read_csv(feature_file)
    print(f'\tOriginal dataframe length: {len(org_df)}')

    # Resample feature dataframe to common frame rate, if needed
    timestamps = org_df[' timestamp'].values
    csv_frame_rate = (len(timestamps) - 1) / np.sum(np.diff(timestamps))
    print(f'\tcsv frame rate: {csv_frame_rate}')
    if round(csv_frame_rate) == FRAME_RATE:
        print(f'\tNOT resampling')
        new_df = org_df.copy()
    else:
        print(f'\tREsampling from {csv_frame_rate} to {FRAME_RATE}')
        # interp1d refuses to evaluate outside the recorded time range, so the grid
        # starts at 0 only if the recording does; otherwise at the first timestamp.
        resample_start = max(0., timestamps[0])
        timestamps_resampled = np.arange(resample_start, timestamps[-1], step=1. / FRAME_RATE)
        resampled_columns = []
        for col_name in org_df.columns:
            # Linear interpolation of this column over time
            f = interpolate.interp1d(x=timestamps, y=org_df[col_name], kind='linear')
            resampled_columns.append( f(timestamps_resampled) )
        new_df = pd.DataFrame(np.array(resampled_columns).T, columns=org_df.columns)

    # Add first and second derivatives of selected features
    # (prepending the first value makes the first difference 0 and keeps the length)
    diff_features = dict()
    for feature_name in diff_selected_features:
        feature_values = new_df[feature_name].values
        first_diff = np.diff(feature_values, prepend=feature_values[0])
        diff_features['diff_' + feature_name] = first_diff
        diff_features['diff2_' + feature_name] = np.diff(first_diff, prepend=first_diff[0])
    new_df = new_df.assign(**diff_features)
    print(f'\tNew dataframe length: {len(new_df)}')

    # Generate segments from selected features
    X_pred = get_X_segments(new_df[selected_features])

    for head_gesture in head_gestures:

        # Predict and use last voting strategy: keep the output at the last time step
        y_pred_probab = models[head_gesture].predict_proba(X_pred, batch_size=10000)[:, -1, 0]
        # Same labels as predict_classes (which thresholds a single-output model
        # at 0.5), without running a second forward pass
        y_pred_NS = (y_pred_probab > 0.5).astype('int32')

        # Smooth predictions
        y_pred = scipy.signal.medfilt(y_pred_NS, kernel_size=SMOOTHING_KERNEL_SIZE).astype(int)

        # Add new annotation columns
        new_df[f'{head_gesture}_probab'] = y_pred_probab
        new_df[f'{head_gesture}_NS'] = y_pred_NS
        new_df[head_gesture] = y_pred

    # Add columns for None class
    new_df['none_NS'] = np.where((new_df['nod_NS'] == 0) & (new_df['shake_NS'] == 0) & (new_df['tilt_NS'] == 0), 1, 0)
    new_df['none'] = np.where((new_df['nod'] == 0) & (new_df['shake'] == 0) & (new_df['tilt'] == 0), 1, 0)

    # Perform fusion of predictions from 3 binary classifiers
    prediction_df = new_df[fusion_columns]
    # 1.) Fusion scheme max_probability (choose head gesture with maximum probability prediction)
    new_df['head_gesture_max_probab_NS'] = prediction_df.apply(max_probab_fusion, args=('_NS',), axis=1)
    new_df['head_gesture_max_probab'] = prediction_df.apply(max_probab_fusion, args=('',), axis=1)

    # 2.) Fusion scheme unique (if 2 or 3 classifiers detect a head gesture, then assign none/other class)
    new_df['head_gesture_unique_NS'] = prediction_df.apply(unique_fusion, args=('_NS',), axis=1)
    new_df['head_gesture_unique'] = prediction_df.apply(unique_fusion, args=('',), axis=1)

    # Save as new annotated dataframe
    new_df.to_csv(os.path.join(output_dir, f'{sessid}.csv'), index=False)
    cnt += 1
    print(f'Time taken: {time.time() - start_time} s\n')

print(f'\nGenerated {cnt} annotated feature files, using HGD.')

Using TensorFlow backend.


Dataset: mimicry
Processing sessid_01_P1_sid_09
	Original dataframe length: 35248
	csv frame rate: 58.04774977890571
	REsampling from 58.04774977890571 to 30.0
	New dataframe length: 18217
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Time taken: 56.33326029777527 s

Processing sessid_01_P2_sid_02
	Original dataframe length: 35248
	csv frame rate: 58.04774977890571
	REsampling from 58.04774977890571 to 30.0
	New dataframe length: 18217
Time taken: 110.34635281562805 s

Processing sessid_02_P1_sid_09
	Original dataframe length: 57269
	csv frame rate: 58.04775549962648
	REsampling from 58.04775549962648 to 30.0
	New dataframe length: 29598
Time taken: 196.27830505371094 s

Processing sessid_02_P2_sid_17
	Original dataframe length: 57269
	csv frame rate: 58.04775549962648
	REsampling from 58.04775549962

	Original dataframe length: 32813
	csv frame rate: 58.04772679426599
	REsampling from 58.04772679426599 to 30.0
	New dataframe length: 16958
Time taken: 2599.2124400138855 s

Processing sessid_19_P1_sid_09
	Original dataframe length: 52669
	csv frame rate: 58.04774931060858
	REsampling from 58.04774931060858 to 30.0
	New dataframe length: 27220
Time taken: 2672.972610473633 s

Processing sessid_19_P2_sid_22
	Original dataframe length: 52669
	csv frame rate: 58.04774931060858
	REsampling from 58.04774931060858 to 30.0
	New dataframe length: 27220
Time taken: 2748.9933512210846 s

Processing sessid_20_P1_sid_09
	Original dataframe length: 48250
	csv frame rate: 58.04775052785447
	REsampling from 58.04775052785447 to 30.0
	New dataframe length: 24936
Time taken: 2818.7101485729218 s

Processing sessid_20_P2_sid_50
	Original dataframe length: 48250
	csv frame rate: 58.04775052785447
	REsampling from 58.04775052785447 to 30.0
	New dataframe length: 24936
Time taken: 2888.3692269325256 s

Pr

	Original dataframe length: 64726
	csv frame rate: 58.04771347164339
	REsampling from 58.04771347164339 to 30.0
	New dataframe length: 33451
Time taken: 5255.163265943527 s

Processing sessid_40_P1_sid_56
	Original dataframe length: 50725
	csv frame rate: 58.04777119629402
	REsampling from 58.04777119629402 to 30.0
	New dataframe length: 26215
Time taken: 5330.280802488327 s

Processing sessid_40_P2_sid_51
	Original dataframe length: 50725
	csv frame rate: 58.04777119629402
	REsampling from 58.04777119629402 to 30.0
	New dataframe length: 26215
Time taken: 5404.6622014045715 s

Processing sessid_41_P1_sid_57
	Original dataframe length: 38977
	csv frame rate: 58.047768476141826
	REsampling from 58.047768476141826 to 30.0
	New dataframe length: 20144
Time taken: 5464.2672646045685 s

Processing sessid_41_P2_sid_51
	Original dataframe length: 38977
	csv frame rate: 58.047768476141826
	REsampling from 58.047768476141826 to 30.0
	New dataframe length: 20144
Time taken: 5523.161546707153 s



In [1]:
# Log from running on CCDb

Using TensorFlow backend.


Dataset: ccdb
Processing P1_P2_1402_C1
	Original dataframe length: 8821
	csv frame rate: 30.0
	NOT resampling
	New dataframe length: 8821
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Time taken: 24.030054330825806 s

Processing P1_P2_1402_C2
	Original dataframe length: 8821
	csv frame rate: 30.0
	NOT resampling
	New dataframe length: 8821
Time taken: 44.7995080947876 s

Processing P1_P3_1502_C1
	Original dataframe length: 9805
	csv frame rate: 30.0
	NOT resampling
	New dataframe length: 9805
Time taken: 66.95227527618408 s

Processing P1_P3_1502_C2
	Original dataframe length: 8915
	csv frame rate: 30.000033654962593
	NOT resampling
	New dataframe length: 8915
Time taken: 87.89469242095947 s

Processing P3_P4_1502_C1
	Original dataframe length: 8901
	csv frame rate: 29.9999662921727
	NOT resampling
	