In [None]:
#####################################################################################
# Audio-driven upper-body motion synthesis on a humanoid robot
# Computer Science Tripos Part III Project
# Jan Ondras (jo356@cam.ac.uk), Trinity College, University of Cambridge
# 2017/18
#####################################################################################
# Cell 1) Given an audio file, use a pose regression model to predict (and save) movements
# Cell 2) Perform OFFLINE synthesis using these predictions

# Before starting the Jupyter Notebook, you may need to run the following command in the directory from which you start it:
# export PYTHONPATH=${PYTHONPATH}:~/Desktop/pynaoqi-python2.7-2.5.5.5-linux64/lib/python2.7/site-packages

In [None]:
#######################################################################################################
# 1.) Make pose predictions
#     - Extract audio features
#     - z-normalise
#     - Load model
#     - Predict
#######################################################################################################

#######################################
# BEGIN SETTINGS

# Path of the audio recording that drives the motion synthesis
AUDIO_INPUT = './Demo/audio.wav'

# Pose regression model to use: keep exactly one of the options below uncommented
# model_type = 'MLP_SI'
model_type = 'LSTM_SI'

# Where to save predictions (to be used by cell 2 for synthesis);
# given without file extension, .npz is added automatically
predictions_path = './Demo/predictions'

# END SETTINGS
######################################

from sklearn.preprocessing import StandardScaler
from postprocessingutils import save_predictions_and_eval_wo_truth_TTS
from keras.models import load_model
from python_speech_features import mfcc, delta, logfbank
import scipy.io.wavfile as wav
import numpy as np

# Expected sampling rate of the input audio, in Hz
AUDIO_RATE = 16000
# Number of audio frames per input segment (equivalent to N_tau)
SEGMENT_LEN = 300

# Model file location: <prefix><model_type>/<model_type>_model.hdf5
model_path_prefix = './../Models/'
test_model_name = '{0}{1}/{1}_model.hdf5'.format(model_path_prefix, model_type)

##########################################################################################
# Load audio data
print "Using audio: ", AUDIO_INPUT
(rate, audio_data) = wav.read(AUDIO_INPUT)
if rate != AUDIO_RATE:
    raise ValueError('Check audio rate!')

##############################
# Extract audio features
audio_features = logfbank(audio_data,samplerate=AUDIO_RATE,winlen=0.025,winstep=0.01,
                             nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97)
print "Shape of audio_features: ", audio_features.shape

##############################
# Z-normalisation
# Z-norm just this set: i.e. subject standardisation, as it was done for original dataset
audio_features = StandardScaler().fit_transform(audio_features)

##############################
# Create segments: audio_features->X_test
# for realtime testing: zero-pad segments at the beginning: #segments=#frames
if model_type == 'LSTM_SI':
    N_audio_frames = len(audio_features)
    N_features = audio_features.shape[1]
    X_test = np.zeros((N_audio_frames, SEGMENT_LEN, N_features)) # #segments=#frames
    for j in range(N_audio_frames):
        # Do zero-padding at the beginning
        if j < SEGMENT_LEN - 1:
            X_test[j, SEGMENT_LEN - j - 1:] = audio_features[:j+1]
        # Otherwise: as in the above section
        else:
            X_test[j] = audio_features[j-SEGMENT_LEN+1:j+1]
else:
    X_test = audio_features
print X_test.shape

##############################
# Load model
print "Model:", test_model_name
model = load_model(test_model_name)
print model.summary()

##############################
# Predict
test_batch_size = X_test.shape[0]
Y_pred = model.predict(X_test, batch_size=test_batch_size, verbose=1)
if model_type == 'LSTM_SI':
    Y_pred = Y_pred[:, -1, :] # last item from each segment is the (ONLINE) final prediction

print "Shape of predictions: ", Y_pred.shape

###############################################################################################################
# Save results: predictions will be saved in radians; for generation on robot
# Raw and smoothed (LPBF_4)
# also Evaluate predictions: local CCA, jerkiness, & plots
save_predictions_and_eval_wo_truth_TTS(predictions_path, 
                                   audio_features, Y_pred, SEGMENT_LEN
                                  )
print "Predictions saved, smoothed, and evaluated (local CCA, jerkiness)."

In [None]:
#######################################################################################################
# 2.) Simulate pose on robot (using above predictions)

# Before starting
# 1. run NaoQi framework
# 2. connect to the real robot OR start Choregraphe simulator and connect to the virtual robot
#######################################################################################################

#######################################
# BEGIN SETTINGS

# Which movements to simulate: True = post-smoothed, False = raw
smoothPredictions = True

# True:  only audio playback is run
# False: audio-visual playback is run, if video file is provided
audioOnly = True

AUDIO_PLAYBACK_FILE = './Demo/audio.wav'
AUDIOVISUAL_PLAYBACK_FILE = './Demo/video.mp4'

# Path to predictions obtained in cell 1
predictions_path = './Demo/predictions.npz'

# IP and port used to connect to the robot: keep exactly one pair uncommented
# Local address (presumably the virtual robot in the simulator - confirm the port before use):
# IP, port = "127.0.0.1", 36571
# Real robot settings:
IP, port = 'pepper.local', 9559

# END SETTINGS
######################################

from naoqi import ALProxy
import numpy as np
import time
from subprocess import Popen
import os

# Frame rate of the pose features and the corresponding time per frame
FR = 100.
dt = 1. / FR

# Joints driven by the predictions, grouped by body part; the position of a name
# in angles_names is the index of its angle within each predicted pose vector
head_joints = ["HeadPitch", "HeadYaw"]
left_arm_joints = ["LShoulderRoll", "LShoulderPitch", "LElbowRoll", "LElbowYaw"]
right_arm_joints = ["RShoulderRoll", "RShoulderPitch", "RElbowRoll", "RElbowYaw"]
hip_joints = ["HipRoll"]
angles_names = head_joints + left_arm_joints + right_arm_joints + hip_joints

N_targets = len(angles_names)
# Indices of the joints actually sent to the robot (here: all of them)
angles_used_i = np.arange(N_targets)

# Connect to the robot
motionProxy = ALProxy("ALMotion", IP, port)

# Load predictions
print "Using predictions from: ", predictions_path
if smoothPredictions:
    print "Synthesising smoothed movements."
    Y_pred = np.load(predictions_path)['Y_smooth']
else:
    print "Synthesising non-smoothed movements."
    Y_pred = np.load(predictions_path)['Y_raw']
        
# Reset robot to neutral pose: every joint is sent to 0, except the two
# shoulder pitch joints, which are sent to pi/2
shoulder_pitch_joints = ('LShoulderPitch', 'RShoulderPitch')
for joint_name in angles_names:
    neutral_angle = np.pi/2 if joint_name in shoulder_pitch_joints else 0.
    motionProxy.setAngles(joint_name, neutral_angle, 1.)

# Run audio/video simultaneously
if audioOnly:
    print "Audio playback started."
    Popen(["vlc", "--play-and-exit", AUDIO_PLAYBACK_FILE])  # non-blocking
else:
    print "Audio-visual playback started."
    Popen(["vlc", "--play-and-exit", AUDIOVISUAL_PLAYBACK_FILE])  # non-blocking
        
# Synthesise
st = time.time()
for frame_i, angles_vector in enumerate(Y_pred):
    for angle_i in angles_used_i:
        motionProxy.setAngles(angles_names[angle_i], angles_vector[angle_i], 1.)
    
    # Adaptive time synchronisation between robot pose generation and audio/video
    adaptive_dt = ((frame_i+1.) * dt - time.time() + st)
    if adaptive_dt > 0.:
        time.sleep(0.95*adaptive_dt)
        
et = time.time()
print "\tTotal simulation time: ", et - st, " s = ", (et - st)/60., " min"
print "===================================================================================\n"

Using predictions from:  ./Demo/predictions.npz
Synthesising smoothed movements.
Audio playback started.
