In [None]:
#############################################################################################################
# Audio-driven upper-body motion synthesis on a humanoid robot
# Computer Science Tripos Part III Project
# Jan Ondras (jo356@cam.ac.uk), Trinity College, University of Cambridge
# 2017/18
#############################################################################################################
# Generate synthetic speech from a given text using the MaryTTS text-to-speech system
# and perform predictions using the pose regression models. 
#############################################################################################################

In [None]:
#######################################################################################################
# Use text-to-speech system to generate SYNTHETIC speech and then predict movements using the trained model
# 1.)  Read txt file
# 2.)  Convert to audio .wav
# 3.)  Save audio .wav
#########################################
# 4.)  Extract audio features
# 5.)  Z-normalise
# 6.)  Load model
# 7.)  Predict & Evaluate: local CCA, jerkiness; Save predictions and results
#########################################
#######################################################################################################
# To generate on robot, use AutomaticallyRecordSynthesis.ipynb

In [1]:
#######################################################################################################
# Start local MaryTTS server
#######################################################################################################
from subprocess import Popen
# Launch the local MaryTTS server and block until it accepts connections.
# The server is started asynchronously and needs some time before it can
# serve requests; reporting "running" straight after Popen() lets a fresh
# "Run All" race the next cell, which sends synthesis requests to this port.
import socket
import time

MARYTTS_ADDRESS = ("localhost", 59125)
MARYTTS_STARTUP_TIMEOUT = 60  # seconds to wait for the port to open

# Keep the process handle so the server can later be stopped from the
# notebook with marytts_server.terminate().
marytts_server = Popen(["/home/janciovec/Desktop/marytts-5.2/bin/marytts-server"])

startup_deadline = time.time() + MARYTTS_STARTUP_TIMEOUT
while True:
    try:
        # A successful TCP connect means a server is listening. This is tried
        # first so that an already-running server (which makes the process
        # launched above exit immediately) is still accepted.
        socket.create_connection(MARYTTS_ADDRESS, timeout=1).close()
        break
    except socket.error:
        if marytts_server.poll() is not None:
            raise RuntimeError("MaryTTS server exited during start-up with code %s"
                               % marytts_server.returncode)
        if time.time() > startup_deadline:
            raise RuntimeError("MaryTTS server did not open port %d within %d s"
                               % (MARYTTS_ADDRESS[1], MARYTTS_STARTUP_TIMEOUT))
        time.sleep(0.5)

print("MaryTTS running at localhost:59125")

MaryTTS running at localhost:59125


In [1]:
#######################################################################################################
# 1.)  Read txt file
# 2.)  Convert to audio .wav
# 3.)  Save audio .wav
#######################################################################################################
# from gtts import gTTS
from maryTTS import txt2wav
import os

# For every story text, synthesise one .wav per MaryTTS voice.
#
# Reads  ./../Dataset/Synthetic_TTS/SYNTHETIC_text_<ID>.txt
# Writes ./../Dataset/Synthetic_TTS/SYNTHETIC_audio_<ID>_<TTS_method>.wav
#
# print(...) is always called with a single argument, so the output is the
# same as with the Python 2 print statement used elsewhere in this notebook.

# Story IDs to process. Single IDs used in earlier runs:
#   '0', '1'
#   '2'   Glasses: as HOBBY task2: 1st person narration
#   '3'   Army:    as STORY task3: 3rd person narration
#   '5'   FINAL: Glasses: as HOBBY task2: original 3rd person narration
IDs = ['6', '7', '8', '9'] # stories Banana, Picnic, Army, Glasses: http://docs.autismresearchcentre.com/papers/1999_Jolliffe_BC_Stories.pdf

# TTS method codes: the first character selects the engine ('M' = MaryTTS),
# the remaining characters are the voice code handed to txt2wav.
# MaryTTS:    https://github.com/marytts/marytts-txt2wav/blob/python/txt2wav.py
# An earlier variant also had 'G' (Google gTTS: save .mp3, then convert with
# ffmpeg to 1 channel / 16 kHz .wav); that path is no longer implemented here.
TTS_methods = [
    'MOB', # MaryTTS, voice obadiah
    'MSP', # MaryTTS, voice spike
    'MPR', # MaryTTS, voice prudence
    'MPO'  # MaryTTS, voice poppy
]

for ID in IDs:

    TEXT_PATH = './../Dataset/Synthetic_TTS/SYNTHETIC_text_' + ID
    AUDIO_PATH_prefix = './../Dataset/Synthetic_TTS/SYNTHETIC_audio_' + ID

    ####################################
    # Load text
    print("=========================================== TEXT ==================================================")
    # 'with' guarantees the file is closed even if reading fails.
    with open(TEXT_PATH + '.txt', 'r') as f:
        TEXT = f.read()
    print(TEXT)
    print("===================================================================================================\n")

    ####################################
    # Text to speech
    for TTS_method in TTS_methods:

        AUDIO_PATH = AUDIO_PATH_prefix + '_' + TTS_method

        # Only MaryTTS is supported; fail loudly instead of reporting a file
        # as saved when nothing was written.
        if TTS_method[0] != 'M':
            raise ValueError('Unsupported TTS method: ' + TTS_method)

        txt2wav(TEXT, AUDIO_PATH + '.wav', voice=TTS_method[1:]) # saves 1 channel, at 16kHz => OK

        print("Audio saved to: " + AUDIO_PATH + '.wav')

        # To listen to the result (blocking):
        # os.system('vlc --play-and-exit ' + AUDIO_PATH + '.wav')


Katie and Emma are playing in the house. Emma picks up a banana from the fruit bowl and holds it up to her car. She says to Katie, "Look! This banana is a telephone!"

VOICE:  dfki-obadiah-hsmm 

QUERY = "http://localhost:59125/process?VOICE=dfki-obadiah-hsmm&LOCALE=en_GB&INPUT_TYPE=TEXT&OUTPUT_TYPE=AUDIO&AUDIO=WAVE&INPUT_TEXT=Katie+and+Emma+are+playing+in+the+house.+Emma+picks+up+a+banana+from+the+fruit+bowl+and+holds+it+up+to+her+car.+She+says+to+Katie%2C+%22Look%21+This+banana+is+a+telephone%21%22"
Wav file saved.
Audio saved to: ./../Dataset/Synthetic_TTS/SYNTHETIC_audio_6_MOB.wav
VOICE:  dfki-spike-hsmm 

QUERY = "http://localhost:59125/process?VOICE=dfki-spike-hsmm&LOCALE=en_GB&INPUT_TYPE=TEXT&OUTPUT_TYPE=AUDIO&AUDIO=WAVE&INPUT_TEXT=Katie+and+Emma+are+playing+in+the+house.+Emma+picks+up+a+banana+from+the+fruit+bowl+and+holds+it+up+to+her+car.+She+says+to+Katie%2C+%22Look%21+This+banana+is+a+telephone%21%22"
Wav file saved.
Audio saved to: ./../Dataset/Synthetic_TTS/SYNTHETIC_audi

Wav file saved.
Audio saved to: ./../Dataset/Synthetic_TTS/SYNTHETIC_audio_8_MPO.wav
Sarah is very long-sighted. She has only one pair of glasses, which she keeps losing. Today she has lost her glasses again and she needs to find them. She had them yesterday evening when she looked up the television programmes. She must have left them somewhere that she has been today. She asks Ted to find her glasses. She tells him that today she went to her regular early morning keep fit class, then to the post office, and last to the flower shop. Ted goes straight to the post office.

VOICE:  dfki-obadiah-hsmm 

QUERY = "http://localhost:59125/process?VOICE=dfki-obadiah-hsmm&LOCALE=en_GB&INPUT_TYPE=TEXT&OUTPUT_TYPE=AUDIO&AUDIO=WAVE&INPUT_TEXT=Sarah+is+very+long-sighted.+She+has+only+one+pair+of+glasses%2C+which+she+keeps+losing.+Today+she+has+lost+her+glasses+again+and+she+needs+to+find+them.+She+had+them+yesterday+evening+when+she+looked+up+the+television+programmes.+She+must+have+left+them+somewhe

In [None]:
#######################################################################################################
# 4.)  Extract audio features
# 5.)  Z-normalise
# 6.)  Load model
# 7.)  Predict
#######################################################################################################

from sklearn.preprocessing import StandardScaler
from postprocessingutils import save_predictions_and_eval_wo_truth_TTS
from keras.models import load_model
from python_speech_features import mfcc, delta, logfbank
import scipy.io.wavfile as wav
import numpy as np
import matplotlib.pyplot as plt
import glob

# ---------------------------------------------------------------------------
# Configuration for the prediction cell
# ---------------------------------------------------------------------------

# Sample rate (Hz) every input .wav is required to have.
AUDIO_RATE = 16000 # Hz

# Number of audio-feature frames in one input segment of the LSTM model.
SEGMENT_LEN = 300

# Label of the audio feature set (26 log filter-bank energies).
AF_type = 'AF_logFB26'

# Models to run.
model_types = ['LSTM_SI', 'MLP_SI']

# stories Banana, Picnic, Army, Glasses: http://docs.autismresearchcentre.com/papers/1999_Jolliffe_BC_Stories.pdf
IDs = ['6', '7', '8', '9']

# MaryTTS voices, in order: obadiah, spike, prudence, poppy
TTS_methods = ['MOB', 'MSP', 'MPR', 'MPO']

# Run every pose-regression model over every synthetic utterance:
#   load audio -> log filter-bank features -> z-normalise -> (segment) -> predict -> save & evaluate.
#
# print(...) is always called with a single argument, so the output is the
# same as with the Python 2 print statement used elsewhere in this notebook.

model_path_prefix = '/media/janciovec/a650f129-f9f8-47b1-9ef5-f0c2f4ddf422/jFiles/Skola/Cambridge/III/0_Project/src/ModelCheckpoints/'
save_results_path_prefix = './../Dataset/Synthetic_TTS/'

for model_type in model_types:

    if model_type == 'LSTM_SI':
        test_model_name = model_path_prefix + model_type + '/1_26_12_DROP_0.00_0.00/m_0099_0.0142_0.0183.hdf5'
    else:
        test_model_name = model_path_prefix + model_type + '/1_35_AF26/m_0360_0.0143_0.0184.hdf5'

    ##############################
    # Load model
    # The checkpoint depends only on model_type, so it is loaded once here
    # rather than again for every story/voice combination.
    print("Model: " + test_model_name)
    model = load_model(test_model_name)
    model.summary()  # prints the summary itself and returns None, so it is not wrapped in print

    for ID in IDs:
        for TTS_method in TTS_methods:

            AUDIO_PATH = './../Dataset/Synthetic_TTS/SYNTHETIC_audio_' + ID + '_' + TTS_method

            ##########################################################################################
            # Load audio data
            print("Using audio:  " + AUDIO_PATH + ".wav")
            (rate, audio_data) = wav.read(AUDIO_PATH + '.wav')
            if rate != AUDIO_RATE:
                raise ValueError('Check audio rate! Expected %d Hz, got %d Hz in %s.wav'
                                 % (AUDIO_RATE, rate, AUDIO_PATH))

            ##############################
            # Extract audio features: 26 log filter-bank energies, 25 ms window, 10 ms step
            audio_features = logfbank(audio_data,samplerate=AUDIO_RATE,winlen=0.025,winstep=0.01,
                                         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97)

            print("Shape of audio_features:  " + str(audio_features.shape))

            ##############################
            # Z-norm
            # NOTE(review): the scaler is fitted on this utterance alone; no
            # dataset-wide normalisation parameters are loaded here. Confirm this
            # matches how the models were trained.
            audio_features = StandardScaler().fit_transform(audio_features)

            ##############################
            # Create segments: audio_features->X_test
            # for realtime testing: zero-pad segments at the beginning: #segments=#frames
            if model_type == 'LSTM_SI':
                N_audio_frames = len(audio_features)
                N_features = audio_features.shape[1]
                X_test = np.zeros((N_audio_frames, SEGMENT_LEN, N_features)) # #segments=#frames
                # Prepend SEGMENT_LEN-1 zero frames so that segment j is always the
                # SEGMENT_LEN frames ending at frame j (zero-filled before the start).
                padded_features = np.vstack([np.zeros((SEGMENT_LEN - 1, N_features)), audio_features])
                for j in range(N_audio_frames):
                    X_test[j] = padded_features[j:j + SEGMENT_LEN]
            else:
                # Non-LSTM model: fed frame by frame, no segmentation.
                X_test = audio_features
            print(X_test.shape)

            ##############################
            # Predict (whole utterance in a single batch)
            test_batch_size = X_test.shape[0]
            Y_pred = model.predict(X_test, batch_size=test_batch_size, verbose=1)
            if model_type == 'LSTM_SI':
                Y_pred = Y_pred[:, -1, :] # last item from each segment is the (ONLINE) final prediction

            print("Shape of predictions:  " + str(Y_pred.shape))

            ###############################################################################################################
            # Save results: predictions will be saved in radians; for generation on robot
            # Raw and smoothed (LPBF_4)
            # also Evaluate predictions: local CCA, jerkiness, & plots
            save_predictions_and_eval_wo_truth_TTS(save_results_path_prefix + 'SYNTHETIC_pred_' + model_type + '_' + ID + '_' + TTS_method,
                                               audio_features, Y_pred, SEGMENT_LEN
                                              )
            print("Predictions saved, smoothed and evaluated (local CCA, jerkiness).")