In [None]:
#############################################################################################################
# Audio-driven upper-body motion synthesis on a humanoid robot
# Computer Science Tripos Part III Project
# Jan Ondras (jo356@cam.ac.uk), Trinity College, University of Cambridge
# 2017/18
#############################################################################################################
# Automatically create video clips from the recordings of the robot - for web-surveys.
# 1.) Add audio to video
# 2.) Create side-by-side videos
# and ensure synchronisation between audio and videos.
# For both NATURAL and SYNTHETIC speech. 
# Save randomisations of videos in the surveys (i.e. ground truths).
#######################################################################################################

In [None]:
#################################
# Get duration of video/audio
import subprocess
from datetime import datetime
def getDuration(filename):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Parameters
    ----------
    filename : str
        Path of the audio/video file passed to the ``ffprobe`` executable.

    Returns
    -------
    float
        Duration in seconds, parsed from the ``Duration: HH:MM:SS.ff`` line
        of the ffprobe report.

    Raises
    ------
    ValueError
        If the ffprobe output contains no ``Duration:`` line, or if the
        reported value does not have the ``HH:MM:SS.ff`` form.
    """
    # ffprobe's report is captured by merging stderr into the stdout pipe.
    proc = subprocess.Popen(["ffprobe", filename],
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # communicate() drains the pipe, waits for ffprobe to exit and closes the
    # pipe, so no child process or file descriptor is left dangling.
    output = proc.communicate()[0]
    # The pipe yields bytes on Python 3; decode so text matching works on 2 and 3.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')

    d = None
    for line in output.splitlines():
        stripped = line.strip()
        # Only accept the line that *starts* with the "Duration:" field, so a
        # file name or metadata tag containing the word cannot be mistaken for it.
        if stripped.startswith("Duration:"):
            d = stripped.split("Duration:", 1)[1].split(',', 1)[0].strip()
            break
    if d is None:
        raise ValueError('ffprobe reported no duration for ' + repr(filename))

    dt = datetime.strptime(d, "%H:%M:%S.%f")
    # strptime fills in a default date; subtracting midnight of that same date
    # leaves only the time-of-day part, i.e. the duration.
    midnight = dt.replace(hour=0, minute=0, second=0, microsecond=0)
    return (dt - midnight).total_seconds()

In [None]:
# For natural speech, need to take only last LAST_N_PRED predictions on the original recordings 
# because only they correspond to the test partition
# => create endings of all .wav files
# DONE

import numpy as np
import os

# Sorted video IDs of the training-example set
TE_folder = 'TrainingExamples_16kHz'
unique_srt_VIDs = np.load('./../Dataset/' + TE_folder + '/te_unique_srt_VIDs.npz')['unique_srt_VIDs']

dt = 1./100. # 1/FPS

# For every recording, keep only the tail of the audio whose length matches
# the number of smoothed predictions stored for that recording.
for v, VID in enumerate(unique_srt_VIDs):
    SID = VID[:5]
    predictions_path = './../Dataset/%s/Results/MLP_SD/XXXMSBMtest_%s_1_35_AF26.npz' % (TE_folder, SID)
    dd = np.load(predictions_path)
    # Even positions use entry 0 of 'Y_smooth_list', odd positions entry 1
    LAST_N_PRED = len(dd['Y_smooth_list'][v % 2])

    IN_AUDIO_PATH = './../Dataset/AudioWav_16kHz/' + VID + '.wav'
    OUT_AUDIO_PATH = './../Dataset/AudioWav_16kHz_Endings/' + VID + '.wav'

    cut_len = LAST_N_PRED*dt
    print("Cut length: " + str(cut_len))

    # Start of the cut = total length minus the length to keep
    org_len = getDuration(IN_AUDIO_PATH)
    startCut = org_len - cut_len

    cmd = ' '.join(['ffmpeg', '-i', IN_AUDIO_PATH, '-ss', str(startCut), '-c', 'copy', OUT_AUDIO_PATH])
    print(cmd)
    if os.system(cmd):
        raise ValueError('Command failed!')
    

In [None]:
#######################################################################################################
# For NATURAL speech based survey: 
# (1) sync videos with speech
# (2) create SBS videos with speech
#######################################################################################################

#######################################################################################################
# (1) Add audio to robot video and synchronise 
# DONE
#######################################################################################################
import os
import numpy as np

TE_folder = 'TrainingExamples_16kHz'
te_dir = './../Dataset/' + TE_folder + '/'
unique_srt_VIDs = np.load(te_dir + 'te_unique_srt_VIDs.npz')['unique_srt_VIDs'] # sorted VIDs
all_srt_VIDs = np.load(te_dir + 'te_VIDs.npz')['VIDs']
# First 5 characters of every second VID, e.g. ['PID02', 'PID05', ..
unique_srt_SIDs = np.array([x[:5] for x in unique_srt_VIDs[::2]])

in_path_prefix = '/home/janciovec/Desktop/' # robot videos, without audio

model_types = ['MLP_SD', 'LSTM_SD', 'MLP_SI', 'LSTM_SI']

# subjects -> the two task videos of each subject -> the 4 models
for SID in unique_srt_SIDs:
    for task in ('Task2', 'Task3'):
        VID = SID + task

        for model_type in model_types:
            recording_name = model_type + "_" + VID
            print(recording_name)

            IN_VIDEO_PATH = '%sRR_videos/%s.mp4' % (in_path_prefix, recording_name)
            SY_VIDEO_PATH = '%sSY_videos/SY_%s.mp4' % (in_path_prefix, recording_name)    # sync
            AV_VIDEO_PATH = '%sAV_videos/AV_%s.mp4' % (in_path_prefix, recording_name)    # audio-visual

            IN_AUDIO_PATH = './../Dataset/AudioWav_16kHz_Endings/' + VID + '.wav'

            # Rescale the video timestamps by audio length / video length
            SPEEDUP = getDuration(IN_AUDIO_PATH) / getDuration(IN_VIDEO_PATH)
            sync_cmd = 'ffmpeg -i %s -filter:v "setpts=%s*PTS" %s' % (IN_VIDEO_PATH, str(SPEEDUP), SY_VIDEO_PATH)
            # Then attach the audio track to the retimed video
            mux_cmd = 'ffmpeg -i %s -i %s -vcodec libx264 -acodec libmp3lame %s' % (SY_VIDEO_PATH, IN_AUDIO_PATH, AV_VIDEO_PATH)

            for cmd in (sync_cmd, mux_cmd):
                print(cmd)
                if os.system(cmd) != 0:
                    raise ValueError('Command failed!')


In [None]:
# (2)
# DONE
#######################################################################################################
# Create side-by-side video
def createSideBySideVideo(VIDEO_FILE_A, VIDEO_FILE_B, SBS_OUTPUT): # input is already synchronised
    """Join two videos side by side and add the natural-speech audio track.

    Two ffmpeg commands are run through the shell: the first pads video A to
    double width and overlays video B on the right half, writing a silent
    intermediate file into the MUTE folder; the second muxes that file with
    the audio ending of the recording into SBS_OUTPUT.

    Parameters
    ----------
    VIDEO_FILE_A : str
        Path of the video shown on the left.
    VIDEO_FILE_B : str
        Path of the video shown on the right.
    SBS_OUTPUT : str
        Path of the final video. The part of its file name after the last
        underscore (without extension) is taken as the VID that selects the
        audio file, e.g. '.../SBS_SI_PID02Task2.mp4' -> 'PID02Task2'.

    Raises
    ------
    ValueError
        If either ffmpeg command exits with a non-zero status.
    """
    SBS_MUTE_path_prefix = '/home/janciovec/Desktop/SBS_videos/MUTE/'
    # File name without directory and extension (works for any extension length)
    clip_name = os.path.splitext(os.path.basename(SBS_OUTPUT))[0]
    SBS_OUTPUT_TMP = SBS_MUTE_path_prefix + clip_name + '_MUTE.mp4'

    # VID = last underscore-separated field of the clip name; no fixed length
    # is assumed, so IDs longer than 10 characters are not truncated.
    VID = clip_name.split('_')[-1]
    IN_AUDIO_PATH = './../Dataset/AudioWav_16kHz_Endings/' + VID + '.wav'

    commands = [
        # "[vid]" is quoted so the shell cannot glob-expand it to a file name
        'ffmpeg -i '+VIDEO_FILE_A+' -i '+VIDEO_FILE_B+' -filter_complex "[0:v]pad=iw*2:ih[int];[int][1:v]overlay=W/2:0[vid]" -map "[vid]" -c:v libx264 ' + SBS_OUTPUT_TMP,
        # Add audio to video
        'ffmpeg -i ' + SBS_OUTPUT_TMP + ' -i ' + IN_AUDIO_PATH + ' -vcodec libx264 -acodec libmp3lame ' + SBS_OUTPUT,
    ]
    for cmd in commands:
        print(cmd)
        if os.system(cmd) != 0:
            raise ValueError('Command failed!')
        
import os
import numpy as np

# Build the side-by-side (SBS) clips for the NATURAL speech survey and save
# which model ended up on which side of every clip.
# NOTE: the saved ground truth depends on the fixed seed below AND on the exact
# order of the np.random calls in this cell (per VID: SI draw, then SD draw;
# afterwards: SI shuffle, then SD shuffle). Do not reorder these statements.
TE_folder = 'TrainingExamples_16kHz'
# NOTE(review): the chained assignment binds the same name twice; redundant but harmless.
unique_srt_VIDs = unique_srt_VIDs = np.load('./../Dataset/'+TE_folder+'/te_unique_srt_VIDs.npz')['unique_srt_VIDs'] # sorted VIDs
# NOTE(review): all_srt_VIDs and unique_srt_SIDs are not read anywhere else in this cell.
all_srt_VIDs = np.load('./../Dataset/'+TE_folder+'/te_VIDs.npz')['VIDs']
unique_srt_SIDs = np.array([x[:5] for i, x in enumerate(unique_srt_VIDs) if i % 2 == 0]) # ['PID02', 'PID05', ..

# Fixed seed so the left/right assignment and the clip order are reproducible
np.random.seed(37)
AV_path_prefix = '/home/janciovec/Desktop/AV_videos/AV_'
SBS_path_prefix = '/home/janciovec/Desktop/SBS_videos/SBS_'

# One [VID, side] pair per clip; side is '0' (MLP left) or '1' (MLP right)
rand_SI_list = []
rand_SD_list = []
    
for v, VID in enumerate(unique_srt_VIDs):
        
    ######
    # SI
    ######
    # Randomly choose which model is on left/right side
    # randint(0, 2) returns 0 or 1 (upper bound is exclusive)
    rand_SI = np.random.randint(0, 2)
    rand_SI_list.append( [VID, str(rand_SI)] )
    
    if rand_SI == 0:                 # MLP on left:      MLP-LSTM
        VIDEO_FILE_A = AV_path_prefix + 'MLP_SI_' + VID + '.mp4'
        VIDEO_FILE_B = AV_path_prefix + 'LSTM_SI_' + VID + '.mp4'   
    else:                            # MLP on right:     LSTM-MLP
        VIDEO_FILE_A = AV_path_prefix + 'LSTM_SI_' + VID + '.mp4'
        VIDEO_FILE_B = AV_path_prefix + 'MLP_SI_' + VID + '.mp4'
        
    SBS_OUTPUT = SBS_path_prefix + 'SI_' + VID + '.mp4'
    # Three-argument call: relies on the createSideBySideVideo defined earlier in this cell
    createSideBySideVideo(VIDEO_FILE_A, VIDEO_FILE_B, SBS_OUTPUT)
        
    ######
    # SD
    ######
    # Randomly choose which model is on left/right side
    rand_SD = np.random.randint(0, 2)
    rand_SD_list.append( [VID, str(rand_SD)] )
    
    if rand_SD == 0:                 # MLP on left:      MLP-LSTM
        VIDEO_FILE_A = AV_path_prefix + 'MLP_SD_' + VID + '.mp4'
        VIDEO_FILE_B = AV_path_prefix + 'LSTM_SD_' + VID + '.mp4'   
    else:                            # MLP on right:     LSTM-MLP
        VIDEO_FILE_A = AV_path_prefix + 'LSTM_SD_' + VID + '.mp4'
        VIDEO_FILE_B = AV_path_prefix + 'MLP_SD_' + VID + '.mp4'
        
    SBS_OUTPUT = SBS_path_prefix + 'SD_' + VID + '.mp4'
    createSideBySideVideo(VIDEO_FILE_A, VIDEO_FILE_B, SBS_OUTPUT)  
        
        
# Save the randomisation
# Each list becomes a 2-column string array; np.random.shuffle permutes its rows in place.
        
rand_SI_list = np.array(rand_SI_list)
np.random.shuffle(rand_SI_list)         # randomise order of videos
print rand_SI_list
rand_SD_list = np.array(rand_SD_list)
np.random.shuffle(rand_SD_list)         # randomise order of videos
print rand_SD_list
# Stored under the keys 'SI' and 'SD' of the .npz archive
np.savez('./../Dataset/Survey/survey_groundTruths.npz', SI=rand_SI_list, SD=rand_SD_list)

In [None]:
#######################################################################################################
# For SYNTHETIC speech based survey: 
# (1) sync videos with speech
# (2) create SBS videos with speech
#######################################################################################################

# (1)
import os
import numpy as np

in_path_prefix = '/home/janciovec/Desktop/' # robot videos, without audio

# MaryTTS voices: obadiah, spike, prudence, poppy
TTS_methods = ['MOB', 'MSP', 'MPR', 'MPO']
model_types = ['LSTM_SI', 'MLP_SI']
# stories Banana, Picnic, Army, Glasses: http://docs.autismresearchcentre.com/papers/1999_Jolliffe_BC_Stories.pdf
IDs = ['6', '7', '8', '9']

# All (model, story, voice) combinations: models outermost, voices innermost
combinations = ((m, story, voice) for m in model_types for story in IDs for voice in TTS_methods)

for model_type, ID, TTS_method in combinations:
    recording_name = model_type + "_SYNTHETIC" + ID + TTS_method
    print(recording_name)

    IN_VIDEO_PATH = '%sRR_videos/%s.mp4' % (in_path_prefix, recording_name)
    SY_VIDEO_PATH = '%sSY_videos/SY_%s.mp4' % (in_path_prefix, recording_name)    # sync
    AV_VIDEO_PATH = '%sAV_videos/AV_%s.mp4' % (in_path_prefix, recording_name)    # audio-visual

    IN_AUDIO_PATH = './../Dataset/Synthetic_TTS/SYNTHETIC_audio_%s_%s.wav' % (ID, TTS_method)

    # Rescale the video timestamps by audio length / video length
    SPEEDUP = getDuration(IN_AUDIO_PATH) / getDuration(IN_VIDEO_PATH)
    sync_cmd = 'ffmpeg -i %s -filter:v "setpts=%s*PTS" %s' % (IN_VIDEO_PATH, str(SPEEDUP), SY_VIDEO_PATH)
    # Then attach the synthetic speech to the retimed video
    mux_cmd = 'ffmpeg -i %s -i %s -vcodec libx264 -acodec libmp3lame %s' % (SY_VIDEO_PATH, IN_AUDIO_PATH, AV_VIDEO_PATH)

    for cmd in (sync_cmd, mux_cmd):
        print(cmd)
        if os.system(cmd) != 0:
            raise ValueError('Command failed!')

In [None]:
import os
import numpy as np

# (2)

#######################################################################################################
# Create side-by-side video
def createSideBySideVideo(VIDEO_FILE_A, VIDEO_FILE_B, SBS_OUTPUT, ID, TTS_method): # input is already synchronised
    """Join two videos side by side and add the synthetic-speech audio track.

    Two ffmpeg commands are run through the shell: the first pads video A to
    double width and overlays video B on the right half, writing a silent
    intermediate file into the MUTE folder; the second muxes that file with
    the synthetic audio selected by ID and TTS_method into SBS_OUTPUT.

    Parameters
    ----------
    VIDEO_FILE_A : str
        Path of the video shown on the left.
    VIDEO_FILE_B : str
        Path of the video shown on the right.
    SBS_OUTPUT : str
        Path of the final video; its file name (without extension) also names
        the silent intermediate file.
    ID : str
        Story identifier used in the audio file name.
    TTS_method : str
        Voice identifier used in the audio file name.

    Raises
    ------
    ValueError
        If either ffmpeg command exits with a non-zero status.
    """
    SBS_MUTE_path_prefix = '/home/janciovec/Desktop/SBS_videos/MUTE/'
    # File name without directory and extension (works for any extension length)
    clip_name = os.path.splitext(os.path.basename(SBS_OUTPUT))[0]
    SBS_OUTPUT_TMP = SBS_MUTE_path_prefix + clip_name + '_MUTE.mp4'

    IN_AUDIO_PATH = './../Dataset/Synthetic_TTS/SYNTHETIC_audio_' + ID + '_' + TTS_method + '.wav'

    commands = [
        # "[vid]" is quoted so the shell cannot glob-expand it to a file name
        'ffmpeg -i '+VIDEO_FILE_A+' -i '+VIDEO_FILE_B+' -filter_complex "[0:v]pad=iw*2:ih[int];[int][1:v]overlay=W/2:0[vid]" -map "[vid]" -c:v libx264 ' + SBS_OUTPUT_TMP,
        # Add audio to video
        'ffmpeg -i ' + SBS_OUTPUT_TMP + ' -i ' + IN_AUDIO_PATH + ' -vcodec libx264 -acodec libmp3lame ' + SBS_OUTPUT,
    ]
    for cmd in commands:
        print(cmd)
        if os.system(cmd) != 0:
            raise ValueError('Command failed!')
        
# Build the side-by-side (SBS) clips for the SYNTHETIC speech survey and save
# which model ended up on which side of every clip.
# NOTE: the saved ground truth depends on this fixed seed AND on the exact order
# of the np.random calls below (one draw per clip, then one shuffle).
# Do not reorder these statements.
np.random.seed(37)

AV_path_prefix = '/home/janciovec/Desktop/AV_videos/AV_'
SBS_path_prefix = '/home/janciovec/Desktop/SBS_videos/SBS_'

# For each clip, randomly choose which model is shown on the left/right side

TTS_methods = [
    'MOB', # MaryTTS, voice obadiah
    'MSP', # MaryTTS, voice spike
    'MPR', # MaryTTS, voice prudence
    'MPO'  # MaryTTS, voice poppy
]
# NOTE(review): model_types is assigned here but not read in the rest of this cell.
model_types = ['LSTM_SI', 'MLP_SI']
IDs = ['6', '7', '8', '9'] # stories Banana, Picnic, Army, Glasses: http://docs.autismresearchcentre.com/papers/1999_Jolliffe_BC_Stories.pdf

rand_list = []    # triples <story ID, character ID, position of models (0=>MLPfirst)>

# Iterate over 4 stories
for ID in IDs:
    # Iterate over 4 voices
    for TTS_method in TTS_methods:

        ######
        # SI
        ######
        # Randomly choose which model is on left/right side
        # randint(0, 2) returns 0 or 1 (upper bound is exclusive)
        rand_i = np.random.randint(0, 2)
        rand_list.append( [ID, TTS_method, str(rand_i)] )

        VID = 'SYNTHETIC' + ID + TTS_method

        if rand_i == 0:                 # MLP on left:      MLP-LSTM
            VIDEO_FILE_A = AV_path_prefix + 'MLP_SI_' + VID + '.mp4'
            VIDEO_FILE_B = AV_path_prefix + 'LSTM_SI_' + VID + '.mp4'   
        else:                            # MLP on right:     LSTM-MLP
            VIDEO_FILE_A = AV_path_prefix + 'LSTM_SI_' + VID + '.mp4'
            VIDEO_FILE_B = AV_path_prefix + 'MLP_SI_' + VID + '.mp4'

        SBS_OUTPUT = SBS_path_prefix + VID + '.mp4'
        # Five-argument call: relies on the createSideBySideVideo defined earlier in this cell
        createSideBySideVideo(VIDEO_FILE_A, VIDEO_FILE_B, SBS_OUTPUT, ID, TTS_method)
        
##########################################################################
# Save the randomisation (which side & overall ordering of all 16 clips)
# rand_list becomes a 3-column string array; it is printed before and after
# np.random.shuffle permutes its rows in place.
rand_list = np.array(rand_list)
print rand_list
np.random.shuffle(rand_list)         # randomise order of videos
print rand_list
# Stored under the key 'gt' of the .npz archive
np.savez('./../Dataset/Survey/survey_synthSpeech_groundTruths.npz', gt=rand_list)
print "Saved to:"
print './../Dataset/Survey/survey_synthSpeech_groundTruths.npz'