In [1]:
#######################################################################################################################
# Project: Deep Virtual Rapport Agent (rapport model)
#
#     Jan Ondras (jo951030@gmail.com)
#     Institute for Creative Technologies, University of Southern California
#     April-October 2019
#
#######################################################################################################################
# Generate sequences/segments of speaker's audio and vision features and listener's labels ready for PyTorch.
#
#     Sequences of features are saved as npy files.
#     Sequence metadata and labels are saved in a csv file (dataset file).
#
#     Set the OUTPUT_DATASET_VERSION, COMMON_DATA_RATE and vision/audio_features_names below.
#     (for explanations on the dataset versions see the section Generating sequences/segments in the 
#     Report_Rapport_model.odt)
#
#     Run after the smile_gaze_va_turn_annotate_frames.ipynb script was run.
#
#     Input audio features: dvra_datasets/mimicry/audio_features/{emobase/mfcc}
#     Input vision features: dvra_datasets/mimicry/vision_features/annotated_features
#     Input voice activity: dvra_datasets/mimicry/voice_activity_detection/voice_activity_ibm_watson
#     Output dataset: dvra_datasets/mimicry/segmented_datasets
#
#     The generated dataset was used for the development of the Rapport Model.
#######################################################################################################################


import os
import glob
import time
import numpy as np
import pandas as pd

# Common rate (in Hz) shared by the features and labels of all modalities.
# The rate is kept at 30 Hz, i.e. no information is discarded by downsampling to 5 Hz
# (earlier setting: COMMON_DATA_RATE = 5.)
COMMON_DATA_RATE = 30.
# Length of one averaging interval, expressed as a pandas frequency string
common_data_period = f'{int(1000. / COMMON_DATA_RATE)}ms'

# Vision features are first upsampled by padding (duplicating) frames and only then grouped and averaged;
# the upsampling avoids empty groups.
# Audio features are sampled at a much higher rate, so they need no upsampling.
vision_upsample_period = f'{int(1000. / (2 * COMMON_DATA_RATE))}ms'

# Dataset version (alternatives: 'v0', 'v1', 'v2')
OUTPUT_DATASET_VERSION = 'v3'

# Audio feature type (alternative: 'mfcc')
AUDIO_FEATURE_TYPE = 'emobase'

# Window size in frames (alternatives: 4, 8, 16, 64)
WINDOW_SIZE = 32

# Value used to fill the padded part of a sequence
MASK_VALUE = 0.

# Whether to pad sequences to the same length WINDOW_SIZE
# (padding may be needed only at the beginning of recordings).
# If False, only the sequences that don't require padding are generated
# (this discards only a small fraction (118/238077) of sequences
#  and eliminates the need for padding and masking within PyTorch).
pad_sequences = False

# Speaker's vision features (column names of the annotated vision feature files).
# Alternatives to the full set of AU intensities: binary 'smile', or only the raw
# smile AU intensities ' AU06_r' and ' AU12_r'.
vision_features_names = (
    # Head translations (first-order differences)
    [f'diff_ pose_T{axis}' for axis in 'xyz']
    # Head rotations (first-order differences)
    + [f'diff_ pose_R{axis}' for axis in 'xyz']
    # Head rotations (raw) as a proxy for gaze - need to be normalized (mean normalization per recording)
    + ['unorm_ pose_Rx', 'unorm_ pose_Ry']
    # Gaze angles - need to be normalized (mean normalization per recording)
    + ['unorm_ gaze_angle_x', 'unorm_ gaze_angle_y']
    # All AU intensities
    + [f' AU{au:02d}_r' for au in (1, 2, 4, 5, 6, 7, 9, 10, 12, 14, 15, 17, 20, 23, 25, 26, 45)]
)

# Speaker's audio features, keyed by audio feature type.
# In both lists the first entry, 'speaking_time' (in seconds), is not read from the audio feature files:
# it is calculated below, after synchronization of the features to the common data rate.
audio_features_names = {
    # 'speaking_time' + 52 emobase audio features: 26 smoothed descriptors followed by their deltas
    'emobase': ['speaking_time'] + [
        name
        for suffix in ('_sma', '_sma_de')
        for name in (
            [f'pcm_intensity{suffix}', f'pcm_loudness{suffix}']
            + [f'mfcc{suffix}[{k}]' for k in range(1, 13)]
            + [f'lspFreq{suffix}[{k}]' for k in range(8)]
            + [f'pcm_zcr{suffix}', f'voiceProb{suffix}', f'F0{suffix}', f'F0env{suffix}']
        )
    ],
    # 'speaking_time' + 57 mfcc (extended) features: 19 descriptors, their deltas and delta-deltas
    'mfcc': ['speaking_time'] + [
        name
        for suffix in ('', '_de', '_de_de')
        for name in (
            [f'{base}{suffix}' for base in ('voiceProb', 'F0', 'F0env',
                                            'pcm_intensity', 'pcm_loudness', 'pcm_LOGenergy')]
            + [f'pcm_fftMag_mfcc{suffix}[{k}]' for k in range(13)]
        )
    ],
}

# Listener's labels.
# 'voice_active' can be used for paraverbal prediction
# (at inference time, one paraverbal can be chosen randomly from a predefined set of paraverbals).
# In addition to these, the 'take_turn' labels are taken from the speaker's vision data.
labels_names = ['nod', 'shake', 'tilt', 'smile', 'gaze_away', 'voice_active']

# Root directory of the DVRA datasets. The original location is the default;
# set the DVRA_DATASETS_DIR environment variable to run on another machine.
datasets_root_dir = os.environ.get('DVRA_DATASETS_DIR', '/home/ICT2000/jondras/dvra_datasets')

# Input directories
vision_features_dir = f'{datasets_root_dir}/mimicry/vision_features/annotated_features'
audio_features_dir = f'{datasets_root_dir}/mimicry/audio_features/opensmile_{AUDIO_FEATURE_TYPE}'

# Output locations (one directory of npy sequences per modality, one csv dataset file)
dataset_output_dir_prefix = f'{datasets_root_dir}/mimicry/segmented_datasets/segmented_datasets_{OUTPUT_DATASET_VERSION}'
vision_features_output_dir = f'{dataset_output_dir_prefix}/vision_features/{WINDOW_SIZE}ws'
audio_features_output_dir = f'{dataset_output_dir_prefix}/audio_features/{AUDIO_FEATURE_TYPE}/{WINDOW_SIZE}ws'
metadata_labels_output_file = f'{dataset_output_dir_prefix}/metadata_labels_{WINDOW_SIZE}ws.csv'

# exist_ok=True makes the creation idempotent without a separate (racy) existence check
for dir_path in [vision_features_output_dir, audio_features_output_dir]:
    os.makedirs(dir_path, exist_ok=True)

print(f'Dimensionality of vision features: {len(vision_features_names)}')
print(f'Dimensionality of audio features: {len(audio_features_names[AUDIO_FEATURE_TYPE])}')
# + 1 accounts for the speaker's 'take_turn' label that is stored next to the listener's labels
print(f'Dimensionality of target labels: {len(labels_names) + 1}\n')
print(f'Window size (sequence length): {WINDOW_SIZE}')
print(f'Audio feature type: {AUDIO_FEATURE_TYPE}\n')

def _average_to_common_rate(frames_df, timestamp_column, bin_period, upsample_period=None):
    """Average the frames of a recording over consecutive intervals of length bin_period.

    frames_df:         dataframe with one row per frame
    timestamp_column:  name of the column holding the frame timestamps in seconds
    bin_period:        pandas frequency string giving the length of the averaging intervals
    upsample_period:   if given, the frames are first upsampled to this period by forward-filling
                       (duplicating) frames, which avoids empty averaging intervals

    Returns a dataframe of per-interval means with a default integer index.
    Raises AssertionError if any averaging interval is empty.
    """
    frames_df = frames_df.set_index(pd.DatetimeIndex(pd.to_datetime(frames_df[timestamp_column], unit='s')))
    if upsample_period is not None:
        # ffill() is the supported name of the (removed) Resampler.pad()
        frames_df = frames_df.resample(upsample_period).ffill()
    groups = frames_df.groupby(pd.Grouper(freq=bin_period))
    assert (groups.size() > 0).all(), 'Empty group when averaging to the common data rate'
    # numeric_only=True: average only numeric columns (recent pandas raises on non-numeric ones)
    return groups.mean(numeric_only=True).reset_index(drop=True)


def _mark_take_turn(voice_active, take_turn, window_size):
    """Return a copy of take_turn in which the frames following each voice-active interval are set to 1.

    After every voice-active frame, the following non-voice-active frames are marked, up to
    (window_size - 1) frames or fewer if another voice-active region (or the end of recording) comes first.
    Values already present in take_turn are kept. Frames before the first voice-active frame are not marked.
    """
    marked = np.array(take_turn, copy=True)
    silence_len = None  # number of frames since the last voice-active frame (None before the first one)
    for idx, active in enumerate(voice_active):
        if active == 1:
            silence_len = 0
        elif silence_len is not None:
            silence_len += 1
            if silence_len < window_size:
                marked[idx] = 1
    return marked


def _speaking_time(voice_active, data_rate):
    """Return, per frame, the duration (in seconds) of the contiguous voice-active run up to that frame.

    Non-voice-active frames get 0. A run that starts at the very first frame is counted as well.
    """
    seconds = np.zeros(len(voice_active), dtype=float)
    run_len = 0
    for idx, active in enumerate(voice_active):
        if active == 1:
            run_len += 1
            seconds[idx] = run_len / data_rate
        else:
            run_len = 0
    return seconds


# Load vision feature files and annotations
file_cnt = 0
total_sequence_cnt = 0
metadata_labels = []
start_time = time.time()

# The sorted vision feature files are consumed in consecutive pairs (the two files of one session)
vision_features_files = sorted(glob.glob(f'{vision_features_dir}/*.csv'))
if len(vision_features_files) % 2 != 0:
    raise ValueError(f'Expected an even number of vision feature files, found {len(vision_features_files)}')

# Iterate over sessions, considering each person as a speaker and as a listener
for vision_features_file_1, vision_features_file_2 in zip(vision_features_files[0::2], vision_features_files[1::2]):
    for speaker_vision_features_file, listener_vision_features_file in [[vision_features_file_1, vision_features_file_2],
                                                                        [vision_features_file_2, vision_features_file_1]]:

        # IDs are parsed from fixed character positions of the file name (without extension)
        speaker_id = os.path.splitext(os.path.basename(speaker_vision_features_file))[0]
        speaker_sid = speaker_id[17:19]
        listener_id = os.path.splitext(os.path.basename(listener_vision_features_file))[0]
        listener_sid = listener_id[17:19]
        # Keep track whose speaker features and whose listener labels are used
        # (speaker is the first, listener is the second)
        sequence_id_prefix = f'{speaker_id}_{listener_id[10:]}'
        session_id = sequence_id_prefix[7:9]
        print(sequence_id_prefix)

        ############################################################################################################
        # Load speaker's and listener's vision data
        speaker_vision_df = pd.read_csv(speaker_vision_features_file)
        listener_vision_df = pd.read_csv(listener_vision_features_file)

        # Mean-normalize speaker's rotation angles and gaze angles (per recording)
        for raw_column in [' pose_Rx', ' pose_Ry', ' gaze_angle_x', ' gaze_angle_y']:
            speaker_vision_df[f'unorm_{raw_column}'] = speaker_vision_df[raw_column] - speaker_vision_df[raw_column].mean()

        # Upsample by duplicating frames, then average to COMMON_DATA_RATE; the result has a default
        # integer index (needed for the positional processing below)
        speaker_vision_df = _average_to_common_rate(speaker_vision_df, ' timestamp',
                                                    common_data_period, vision_upsample_period)
        listener_vision_df = _average_to_common_rate(listener_vision_df, ' timestamp',
                                                     common_data_period, vision_upsample_period)

        # Fix binary annotations (averaging may have produced fractional values)
        for annotation_column in ['nod', 'shake', 'tilt', 'smile', 'gaze_away', 'voice_active', 'take_turn']:
            speaker_vision_df[annotation_column] = np.where((speaker_vision_df[annotation_column] >= 0.5), 1, 0)
            listener_vision_df[annotation_column] = np.where((listener_vision_df[annotation_column] >= 0.5), 1, 0)

        # Regenerate the speaker's take-turn annotations even if no averaging was applied, since the original
        # take-turn annotations annotate just one frame following a voice-active interval:
        # set take_turn in the first (WINDOW_SIZE - 1) frames after each voice-active interval
        # or fewer if another voice-active region starts there (or end of recording).
        # Listener's turn-taking is not needed.
        speaker_vision_df['take_turn'] = _mark_take_turn(speaker_vision_df['voice_active'].values,
                                                         speaker_vision_df['take_turn'].values,
                                                         WINDOW_SIZE)
        assert len(speaker_vision_df) == len(listener_vision_df)

        ############################################################################################################
        # Load speaker's audio data and average it to COMMON_DATA_RATE
        # (no upsampling: audio features are sampled at a much higher rate)
        speaker_audio_df = pd.read_csv(f'{audio_features_dir}/{speaker_id}.csv', delimiter=';')
        speaker_audio_df = _average_to_common_rate(speaker_audio_df, 'frameTime', common_data_period)

        # Consolidate (match) the numbers of vision frames and audio frames (skipping the last excessive frame).
        # The truncated frames are copied so that later column assignments do not write to a slice.
        if len(speaker_vision_df) == len(speaker_audio_df) - 1:
            print(f'\t Skipping last audio frame (# video_frames: {len(speaker_vision_df)}, # audio_frames: {len(speaker_audio_df)})')
            speaker_audio_df = speaker_audio_df.iloc[:len(speaker_vision_df)].copy()
        elif len(speaker_vision_df) - 1 == len(speaker_audio_df):
            print(f'\t Skipping last video frame (# video_frames: {len(speaker_vision_df)}, # audio_frames: {len(speaker_audio_df)})')
            speaker_vision_df = speaker_vision_df.iloc[:len(speaker_audio_df)].copy()
            listener_vision_df = listener_vision_df.iloc[:len(speaker_audio_df)].copy()
        assert len(speaker_vision_df) == len(speaker_audio_df), f'Vision: {len(speaker_vision_df)}\t Audio: {len(speaker_audio_df)}'

        # Add 'speaking_time' audio feature (in seconds), after synchronization of audio and visual features
        # to common data rate
        speaker_audio_df['speaking_time'] = _speaking_time(speaker_vision_df['voice_active'].values,
                                                           COMMON_DATA_RATE)

        ############################################################################################################
        # Generate sequences only from regions where speaker talks or just stopped talking.
        # In particular, we extracted a sequence of audio and video features only if the target prediction frame
        # of the sequence was within the voice-active region, or at least one frame of the sequence was within the
        # voice-active region and the prediction frame of the sequence was not within the next voice-active region.
        # Also, record labels and associated metadata.

        bool_extract = (speaker_vision_df['voice_active'] == 1) | (speaker_vision_df['take_turn'] == 1)

        # Extract the feature matrices once per recording; the sequences are row slices of them
        speaker_vision_features = speaker_vision_df[vision_features_names].values
        speaker_audio_features = speaker_audio_df[audio_features_names[AUDIO_FEATURE_TYPE]].values

        sequence_cnt = 0
        for i, do_extract in enumerate(bool_extract):
            if not do_extract:
                continue

            pad_len = WINDOW_SIZE - 1 - i
            pad_needed = (0 < pad_len)
            # Don't save sequences if padding is not desired and is needed
            if pad_needed and not pad_sequences:
                continue

            # One training sample: 2D array (timesteps x features)
            # Last (and also the prediction) frame is at index i.
            # The start is clamped to 0: a negative start would index from the end of the recording.
            first_idx = max(0, i + 1 - WINDOW_SIZE)
            speaker_vision_sequence = speaker_vision_features[first_idx:i + 1]
            speaker_audio_sequence = speaker_audio_features[first_idx:i + 1]

            if pad_needed:
                # Prepend pad_len rows of MASK_VALUE so that the sequence has WINDOW_SIZE timesteps
                speaker_vision_sequence = np.pad(speaker_vision_sequence, ((pad_len, 0), (0, 0)),
                                                 mode='constant', constant_values=(MASK_VALUE, MASK_VALUE))
                speaker_audio_sequence = np.pad(speaker_audio_sequence, ((pad_len, 0), (0, 0)),
                                                mode='constant', constant_values=(MASK_VALUE, MASK_VALUE))
                print(f'\t Padded with {pad_len} mask values {MASK_VALUE}')

            # Save speaker vision features sequence and speaker audio features sequence
            sequence_id = f'{sequence_id_prefix}_seq_{i:08}'
            np.save(f'{vision_features_output_dir}/{sequence_id}.npy',
                    speaker_vision_sequence)
            np.save(f'{audio_features_output_dir}/{sequence_id}.npy',
                    speaker_audio_sequence)

            # Add row to metadata+labels file
            # Besides listener's annotations, also add speaker's 'take_turn' labels
            metadata_labels.append([sequence_id, session_id, speaker_sid, listener_sid]
                                   + listener_vision_df.iloc[i][labels_names].tolist()
                                   + [speaker_vision_df.iloc[i]['take_turn']])
            sequence_cnt += 1
        total_sequence_cnt += sequence_cnt
        file_cnt += 1
        print(f'\t Sequence count: {sequence_cnt}')
        print(f'\t Time taken: {time.time() - start_time} s\n')
        
#         break
#     break

############################################################################################################
# Write the dataset file: one row per saved sequence, holding its metadata and all its labels
n_label_rows = len(metadata_labels)
assert n_label_rows == total_sequence_cnt, f'Labels count: {n_label_rows}\t Total sequence count: {total_sequence_cnt}'

# Column order matches the rows appended above: metadata, listener's labels, speaker's 'take_turn'
metadata_columns = ['sequence_id', 'session_id', 'speaker_sid', 'listener_sid']
label_columns = [*labels_names, 'take_turn']
metadata_labels_df = pd.DataFrame(metadata_labels, columns=metadata_columns + label_columns)
metadata_labels_df.to_csv(metadata_labels_output_file, index=False)

print(f'\n Processed {file_cnt} feature files.')
print(f'\n Total sequence count: {total_sequence_cnt}')

Dimensionality of vision features: 27
Dimensionality of audio features: 53
Dimensionality of target labels: 7

Window size (sequence length): 32
Audio feature type: emobase

sessid_01_P1_sid_09_P2_sid_02




	 Skipping last video frame (# video_frames: 18401, # audio_frames: 18400)
	 Sequence count: 11336
	 Time taken: 362.20870542526245 s

sessid_01_P2_sid_02_P1_sid_09
	 Skipping last video frame (# video_frames: 18401, # audio_frames: 18400)
	 Sequence count: 8726
	 Time taken: 651.6289031505585 s

sessid_02_P1_sid_09_P2_sid_17
	 Sequence count: 14783
	 Time taken: 1113.0946781635284 s

sessid_02_P2_sid_17_P1_sid_09
	 Sequence count: 15788
	 Time taken: 1601.8737754821777 s

sessid_03_P1_sid_17_P2_sid_02
	 Skipping last audio frame (# video_frames: 13470, # audio_frames: 13471)
	 Sequence count: 9705
	 Time taken: 1767.3841366767883 s

sessid_03_P2_sid_02_P1_sid_17
	 Skipping last audio frame (# video_frames: 13470, # audio_frames: 13471)
	 Sequence count: 5652
	 Time taken: 1908.0728766918182 s

sessid_04_P1_sid_12_P2_sid_23
	 Skipping last audio frame (# video_frames: 28919, # audio_frames: 28920)
	 Sequence count: 7719
	 Time taken: 2136.0258531570435 s

sessid_04_P2_sid_23_P1_sid_12


	 Sequence count: 21642
	 Time taken: 17504.542641162872 s

sessid_34_P1_sid_09_P2_sid_36
	 Sequence count: 11481
	 Time taken: 17748.160537481308 s

sessid_34_P2_sid_36_P1_sid_09
	 Sequence count: 17165
	 Time taken: 18065.5277967453 s

sessid_35_P1_sid_33_P2_sid_36
	 Sequence count: 16665
	 Time taken: 18391.749469280243 s

sessid_35_P2_sid_36_P1_sid_33
	 Sequence count: 16757
	 Time taken: 18726.99099421501 s

sessid_36_P1_sid_13_P2_sid_40
	 Skipping last audio frame (# video_frames: 27413, # audio_frames: 27414)
	 Sequence count: 10782
	 Time taken: 18966.261962652206 s

sessid_36_P2_sid_40_P1_sid_13
	 Skipping last audio frame (# video_frames: 27413, # audio_frames: 27414)
	 Sequence count: 18696
	 Time taken: 19303.085735797882 s

sessid_37_P1_sid_13_P2_sid_40
	 Sequence count: 12255
	 Time taken: 19549.96816635132 s

sessid_37_P2_sid_40_P1_sid_13
	 Sequence count: 16783
	 Time taken: 19852.74633526802 s

sessid_38_P1_sid_56_P2_sid_30
	 Skipping last audio frame (# video_frames: 