In [None]:
#####################################################################################
# Audio-driven upper-body motion synthesis on a humanoid robot
# Computer Science Tripos Part III Project
# Jan Ondras (jo356@cam.ac.uk), Trinity College, University of Cambridge
# 2017/18
#####################################################################################
# This performs online synthesis on the robot and involves the following steps:
#         Load model
#         In loop do:
#             Read audio frames
#             Extract audio features
#             Z-norm
#             Predict
#             Convert to radians (inversion)
#             Apply Kalman filter
#             Send commands to the robot
#         Save recorded audio
#         + Report latency measurements

# Before starting
# 1.) run NaoQi framework
# 2.) connect to the real robot OR start Choregraphe simulator and connect to the virtual robot

# Before launching the Jupyter Notebook, you may need to run the following command in the directory from which you launch it:
# export PYTHONPATH=${PYTHONPATH}:~/Desktop/pynaoqi-python2.7-2.5.5.5-linux64/lib/python2.7/site-packages

In [None]:
#######################################
# BEGIN SETTINGS

# Output path of the WAV file that receives the audio captured during the session
AUDIO_OUTPUT_FILENAME = './Demo/online_audio.wav'

# Pose regression model driving the synthesis; the commented-out line is the
# alternative choice
model_type = 'MLP_SI'
# model_type = 'LSTM_SI'

# Robot endpoint (IP and port). Two candidates are listed and the LAST
# assignment wins: comment out the second line to use the 127.0.0.1 endpoint
# instead (presumably the Choregraphe virtual robot - confirm the port locally).
IP, port = "127.0.0.1", 44743
IP, port = 'pepper.local', 9559 # real robot settings

# Length of the simultaneous recording + synthesis session, in seconds
MAX_RECORD_SECONDS = 100.

# END SETTINGS
######################################

import wave
from naoqi import ALProxy
import numpy as np
import time
from evalutils import inv_norm_Y_vec
from python_speech_features import mfcc, delta, logfbank
from KFClass import KFOnline

# Number of most recent audio feature vectors handed to the non-MLP model
# in one prediction
SEGMENT_LEN = 300

# Z-normalisation statistics, applied on-the-fly to each extracted audio
# feature vector (calculated over the whole original dataset)
znorm_data = np.load('./../SourceCode/Znorm/znorm_params.npz')
znorm_mean, znorm_std = znorm_data['mean'], znorm_data['std']

# Generated pose features come at FR frames per second, i.e. one every dt seconds
FR = 100.
dt = 1./FR

# Robot joints that are predicted and commanded; the position in this list is
# the index used for every per-joint vector below
angles_names = (
    ["HeadPitch", "HeadYaw"]
    + ["LShoulderRoll", "LShoulderPitch", "LElbowRoll", "LElbowYaw"]
    + ["RShoulderRoll", "RShoulderPitch", "RElbowRoll", "RElbowYaw"]
    + ["HipRoll"]
)
N_targets = len(angles_names)         # one regression target per joint
N_features = 26                       # length of one audio feature vector
angles_used_i = np.arange(N_targets)  # indices of the joints that get commanded (all of them)

# Open a proxy to the ALMotion module at the configured IP/port
motionProxy = ALProxy("ALMotion", IP, port)

# Drive every joint to the neutral pose: pi/2 for both shoulder pitches,
# 0 for all other joints. The last argument (1.) is passed through to
# setAngles unchanged (presumably the max-speed fraction - confirm in the
# ALMotion API reference).
for joint_name in angles_names:
    if joint_name in ('LShoulderPitch', 'RShoulderPitch'):
        neutral_angle = np.pi/2
    else:
        neutral_angle = 0.
    motionProxy.setAngles(joint_name, neutral_angle, 1.)
    
##################################################################
# Setup Kalman filter, for each angle independently
# One KFOnline instance per joint (no constraints), all sharing the time step
# dt and the same q / r settings.
KF_list = [] # list Kalman Filter objects for each angle
for i in range(N_targets):
    # Start each filter at the neutral pose the robot was reset to:
    # pi/2 for the two shoulder pitches, 0 for every other joint.
    if angles_names[i] in ('LShoulderPitch', 'RShoulderPitch'):
        init_angle = np.pi/2
    else:
        init_angle = 0.
    # The 3x1 initial state is built numerically instead of through
    # np.matrix(str(init_angle) + '; 0; 0'): under Python 2, str() keeps only
    # 12 significant digits, so pi/2 would be truncated on the way in.
    # (State layout is presumably [angle; velocity; acceleration] - confirm in KFClass.)
    init_state = np.matrix([[init_angle], [0.], [0.]])
    KF_list.append(
        KFOnline(dt=dt, q=0.5, r=0.01, init_P_post=np.matrix('1 0 0; 0 1 0; 0 0 1'),
                 init_x_est_post=init_state)
    )

##################################################################
# Load model
from keras.models import load_model
model_path_prefix = './../Models/'
test_model_name = model_path_prefix + model_type + '/' + model_type + '_model.hdf5'
model = load_model(test_model_name)
print "MODEL = ", model_type, "\n"
print model.summary()


##################################################################
# Per-frame latency logs
latencies_all = []       # all per-frame operations
latencies_inference = [] # model inference
latencies_ops = []       # all other per-frame operations, except model inference

# Audio logs
raw_audio_frames = [] # raw chunks as read from the stream; required to store audio afterwards
whole_audio_data = [] # stores int16 audio data, incrementally appended

# Pose logs
pose_frames = []
pose_frames_filt = []
angles_vector_filt = np.zeros(N_targets) # temporary vector of filtered angles (updated each time-step)

# Log of audio feature vectors, pre-filled with SEGMENT_LEN zero vectors so a
# full-length segment is available from the very first frame
# (avoids zero-padding in real-time)
audio_frames = [np.zeros(N_features) for pad_i in range(SEGMENT_LEN)]
    
# First prediction - just to test (and the first one takes longer => avoids real-time delay)
if model_type == 'MLP_SI':
    audio_frames_SEGMENT = np.reshape(audio_frames[-1], (1, N_features)) # 1x26 matrix to make prediction on
else:
    audio_frames_SEGMENT = np.reshape(audio_frames, (1, SEGMENT_LEN, N_features)) # 1x300x26 matrix to make prediction on

model.predict(audio_frames_SEGMENT, batch_size=1, verbose=1) #[0,-1] 
print "First test prediction done. "

##################################################################
# Setup & start audio capture
# http://people.csail.mit.edu/hubert/pyaudio/
# OR: https://python-sounddevice.readthedocs.io/en/0.3.10/examples.html
import pyaudio
AUDIO_CHANNELS = 1
AUDIO_RATE = 16000 #44100
AUDIO_CHUNK = int(float(AUDIO_RATE) / FR) # number of audio samples read in one iteration; AUDIO_RATE / FR is maximum
AUDIO_FORMAT = pyaudio.paInt16 
N_audio_chunks = int(MAX_RECORD_SECONDS * FR) # int(MAX_RECORD_SECONDS * AUDIO_RATE / AUDIO_CHUNK)
LAST_N_AUDIO_DATAPOINTS_FOR_AF_EXTRACTION = int(0.025*AUDIO_RATE) #= 400 # to extract features from one last window
print "AUDIO_CHUNK = ", AUDIO_CHUNK, "\t N_audio_chunks = ", N_audio_chunks

print("************* Recording *************")
p = pyaudio.PyAudio()
stream = p.open(format=AUDIO_FORMAT, #pyaudio.paInt16, #p.get_format_from_width(WIDTH),
                channels=AUDIO_CHANNELS,
                rate=AUDIO_RATE,
                input=True, #output=True,
                frames_per_buffer=AUDIO_CHUNK)
    
st = time.time()
for i in range(N_audio_chunks):
    
    ##############################
    # Start latency measurement - for latencies_all
    lm_start = time.time() 

    ##############################
    # Read AUDIO_CHUNK audio samples
    raw_audio_data = stream.read(AUDIO_CHUNK)
    raw_audio_frames.append( raw_audio_data ) # required to store audio afterwards
    audio_data = np.fromstring(raw_audio_data , dtype=np.int16) # array of 160 ints
    whole_audio_data.extend( audio_data ) # whole audio stream (ints, not raw)
        
    ##############################
    # Extract audio features
    # 26 log filter bank features
    audio_feature_vector = logfbank(np.array(whole_audio_data[-LAST_N_AUDIO_DATAPOINTS_FOR_AF_EXTRACTION:]),
                                    samplerate=AUDIO_RATE,winlen=0.025,winstep=0.01,
                                    nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97)[0] # takes the only row
    #print audio_feature_vector.shape == (26,)
    
    ##############################
    # Z-norm
    audio_feature_vector = (audio_feature_vector - znorm_mean) / znorm_std
        
    ##############################
    # Add new audio feature vector to log (z-normed)
    audio_frames.append( audio_feature_vector )

    ##############################
    # Predict: take last SEGMENT_LEN audio feature vectors if LSTM model (already pre-padded with zerovectors)
    lm_inference_start = time.time() # Start inference latency measurement - for latencies_inference
    if model_type == 'MLP_SI':
        audio_frames_SEGMENT = np.reshape(audio_feature_vector, (1, N_features))         # 1x26 
        angles_vector = model.predict(audio_frames_SEGMENT, batch_size=1, verbose=0)[0] 
    else:
        audio_frames_SEGMENT = np.reshape(audio_frames[-SEGMENT_LEN:], (1, SEGMENT_LEN, N_features)) # 1x300x26 matrix to make prediction on
        angles_vector = model.predict(audio_frames_SEGMENT, batch_size=1, verbose=0)[0,-1] # take most recent prediction
        # [0] since only one sequence is predicted; [-1] only last timestep is taken
    lm_inference_stop = time.time() # Stop inference latency measurement - for latencies_inference
    
    ##############################
    # Convert to radians (inversion)
    angles_vector = inv_norm_Y_vec(angles_vector)
    
    ##############################
    # Record predictions
    pose_frames.append( angles_vector )
    
    ##############################
    # Post Kalman filter
    for i in range(N_targets):
        # Filter & get estimates
        angles_vector_filt[i] = KF_list[i].update(angles_vector[i])[0] # retrieve only angle (not velocity/acceleration)
    # Record filtered predictions
    pose_frames_filt.append( angles_vector_filt )
    
    ##############################
    # Stop latency measurement
    lm_beforeCmd_stop = time.time()
    
    ##############################
    # Send commands to the robot
    for angle_i in angles_used_i:
        motionProxy.setAngles(angles_names[angle_i], angles_vector_filt[angle_i], 1.)
                
    ##############################
    # Stop latency measurement
    lm_stop = time.time()
    latencies_inference.append( lm_inference_stop - lm_inference_start )
    latencies_all.append( lm_stop - lm_start )
        
    ##############################
    # Adaptive time delay
    adaptive_dt = ((i+1.) * dt - time.time() + st)
    if adaptive_dt > 0.:
        time.sleep(0.95*adaptive_dt)
        
print("************* Done *************")
et = time.time()
print "\tTotal simulation time: ", et - st, " s = ", (et - st)/60., " min; \t\tORIGINALLY SET to: ", MAX_RECORD_SECONDS
stream.stop_stream()
stream.close()
p.terminate() 

##############################
# Save recorded audio
wf = wave.open(AUDIO_OUTPUT_FILENAME, 'wb')
wf.setnchannels(AUDIO_CHANNELS)
wf.setsampwidth(p.get_sample_size(AUDIO_FORMAT))
wf.setframerate(AUDIO_RATE)
wf.writeframes(b''.join(raw_audio_frames))
wf.close()
print "Saved audio file:", AUDIO_OUTPUT_FILENAME

##############################
# Report latency measurements
latencies_all = np.array(latencies_all)
latencies_inference = np.array(latencies_inference)
latencies_ops = latencies_all - latencies_inference

print "Overall:"
print "Mean +/- std latency: ", 1000*np.mean(latencies_all), 1000*np.std(latencies_all), "; \tMax/min: ", np.max(latencies_all), np.min(latencies_all)
print 
print "Inference:"
print "Mean +/- std latency: ", 1000*np.mean(latencies_inference), 1000*np.std(latencies_inference), "; \tMax/min: ", np.max(latencies_inference), np.min(latencies_inference)
print 
print "Ops:"
print "Mean +/- std latency: ", 1000*np.mean(latencies_ops), 1000*np.std(latencies_ops), "; \tMax/min: ", np.max(latencies_ops), np.min(latencies_ops)
print 
