In [None]:
#############################################################################################################
# Audio-driven upper-body motion synthesis on a humanoid robot
# Computer Science Tripos Part III Project
# Jan Ondras (jo356@cam.ac.uk), Trinity College, University of Cambridge
# 2017/18
#############################################################################################################
# Extract six types of audio features (MFCC-13, MFCC-26, MFCC-39, LogFB-26, LogFB-52, LogFB-78)
#############################################################################################################

In [1]:
# Extract audio features from .wav for each VID (40 VIDs in total)
# Several types:
# 1.) 13 MFCC:                                                                             te_AF_MFCC13.npz
# 2.) 13 MFCC + 13 differential coefficients:                                              te_AF_MFCC26.npz   (deltas capture the dynamics; N=2 => each delta is computed from the N preceding and N following frames)
# 3.) 13 MFCC + 13 differential coefficients + 13 second-order differential coefficients:  te_AF_MFCC39.npz
# 4.) 26 log filter bank:                                                                  te_AF_logFB26.npz
# 5.) 26 log filter bank + 26 differential coefficients:                                   te_AF_logFB52.npz                
# 6.) 26 log filter bank + 26 differential coefficients + 26 second-order diff coefs:      te_AF_logFB78.npz                                            

# For each type generate ONE non-normalised feature set as well as ONE z-normalised per subject

# Frame counts are the same for all 6 feature types => calculated only once, from the first type (MFCC-13)

# Done for sampling frequency 16000 Hz
# DONE
# REDONE when subject 19 eliminated
###############################################################################################################
from python_speech_features import mfcc, delta, logfbank
from sklearn.preprocessing import StandardScaler
import scipy.io.wavfile as wav
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import glob

frame_rate = 16000
NFFT = 512
AW_folder = 'AudioWav_16kHz'
TE_folder = 'TrainingExamples_16kHz'

unique_srt_VIDs = np.load('./../Dataset/'+TE_folder+'/te_unique_srt_VIDs.npz')['unique_srt_VIDs']
VIDs = [] # ordered as unique_srt_VIDs but each VID is repeated # audio windows -times

# For each VID extract audio features
for i, VID in enumerate(unique_srt_VIDs):
       
    (rate, sig) = wav.read('./../Dataset/'+AW_folder+'/' + VID + '.wav')
    
    if rate != frame_rate:
        raise ValueError('Check audio frame rate!')
#     print "Audio sampling rate: ", rate, " Hz"
#     print sig.shape

    # 1.) 13 MFCC
    # win length = 25ms, step 10ms
    mfcc_feat = mfcc(sig,rate,winlen=0.025,winstep=0.010,numcep=13, 
         nfilt=26,nfft=NFFT,lowfreq=0,highfreq=None,preemph=0.97, 
         ceplifter=22,appendEnergy=True)
    
    # Synchronise with pose features
    N_poseFrames = len(np.load('./../Dataset/LiftFromDeep/' +VID+ '.npz')['joints_3D'].astype(float))
    print VID, "#windows: ", np.shape(mfcc_feat)[0], 
    mfcc_feat = mfcc_feat[:2*N_poseFrames-1,:]
    print np.shape(mfcc_feat)[0]
    
    # 2.) 13 MFCC + 13 differential coefficients
    # Calculate differential coefs: N=2 means calculate delta features based on preceding and following N frames
    diff_1 = delta(mfcc_feat, 2)
    # Combine: append differential to original MFCCs
    diff_mfcc_feat_1 = np.concatenate((mfcc_feat, diff_1), axis=1)

    # 3.) 13 MFCC + 13 differential coefficients + 13 second-order differential coefficients
    diff_2 = delta(diff_1, 2)
    diff_mfcc_feat_2 = np.concatenate((diff_mfcc_feat_1, diff_2), axis=1)
    
    # 4.) 26 log filter bank features
    logfb = logfbank(sig,samplerate=rate,winlen=0.025,winstep=0.01,
      nfilt=26,nfft=NFFT,lowfreq=0,highfreq=None,preemph=0.97)
    logfb = logfb[:2*N_poseFrames-1,:] # sync
    
    # 5.) 26 log filter bank features + 26 differential coefficients
    diff_1_logfb = delta(logfb, 2)
    diff_logfb_feat_1 = np.concatenate((logfb, diff_1_logfb), axis=1)
    
    # 6.) 26 log filter bank features + 26 differential coefficients + 26 second-order differential coefficients
    diff_2_logfb = delta(diff_1_logfb, 2)
    diff_logfb_feat_2 = np.concatenate((diff_logfb_feat_1, diff_2_logfb), axis=1)
    
    if i == 0:
        AF_MFCC13 = mfcc_feat                                             # audio features MFCC13
        AF_MFCC26 = diff_mfcc_feat_1                                      # audio features MFCC26
        AF_MFCC39 = diff_mfcc_feat_2                                      # audio features MFCC39
        AF_logFB26  = logfb                                               # audio features logfb26
        AF_logFB52  = diff_logfb_feat_1                                   # audio features logfb52
        AF_logFB78  = diff_logfb_feat_2                                   # audio features logfb78

    else:
        AF_MFCC13 = np.concatenate((AF_MFCC13, mfcc_feat), axis=0)        # audio features MFCC13
        AF_MFCC26 = np.concatenate((AF_MFCC26, diff_mfcc_feat_1), axis=0) # audio features MFCC26
        AF_MFCC39 = np.concatenate((AF_MFCC39, diff_mfcc_feat_2), axis=0) # audio features MFCC39
        AF_logFB26 = np.concatenate((AF_logFB26,  logfb), axis=0)            # audio features logfb26
        AF_logFB52 = np.concatenate((AF_logFB52, diff_logfb_feat_1), axis=0) # audio features logfb52
        AF_logFB78 = np.concatenate((AF_logFB78, diff_logfb_feat_2), axis=0) # audio features logfb78
        
    print AF_MFCC13.shape, AF_MFCC26.shape, AF_MFCC39.shape, AF_logFB26.shape, AF_logFB52.shape, AF_logFB78.shape
    VIDs.extend( np.repeat(VID, np.shape(mfcc_feat)[0]).tolist() )
    
# Save all VIDs, associated with features, frame-wise
np.savez('./../Dataset/'+TE_folder+'/te_VIDs.npz', VIDs=VIDs)
# print np.load('./../Dataset/TrainingExamples/te_VIDs.npz')['VIDs']
print "\nTotal number of VIDs' frames: ", len(VIDs)

#######################################
from collections import Counter
print "Saving audio frame counts per VID ..."
print np.reshape(sorted(dict(Counter(VIDs)).items(), key=lambda x: x[0]), (-1,2))
frameCnts = np.reshape(sorted(dict(Counter(VIDs)).items(), key=lambda x: x[0]), (-1,2))[:,1].astype(int)
# print frameCnts
# Save frame count for each VID (ordered as sorted above)
np.savez('./../Dataset/'+TE_folder+'/te_frameCnts.npz', frameCnts=frameCnts)

#######################################
# Save all audio features (sum_i NaudioFrames_i) x feature_vector_dimension (13/26/39/26 for methods 1/2/3/4)
np.savez('./../Dataset/'+TE_folder+'/te_AF_MFCC13.npz', AF_MFCC13=AF_MFCC13)
np.savez('./../Dataset/'+TE_folder+'/te_AF_MFCC26.npz', AF_MFCC26=AF_MFCC26)
np.savez('./../Dataset/'+TE_folder+'/te_AF_MFCC39.npz', AF_MFCC39=AF_MFCC39)
np.savez('./../Dataset/'+TE_folder+'/te_AF_logFB26.npz',  AF_logFB26 =AF_logFB26)
np.savez('./../Dataset/'+TE_folder+'/te_AF_logFB52.npz',  AF_logFB52 =AF_logFB52)
np.savez('./../Dataset/'+TE_folder+'/te_AF_logFB78.npz',  AF_logFB78 =AF_logFB78)
print "Audio features, shape =", AF_MFCC13.shape, AF_MFCC26.shape, AF_MFCC39.shape, AF_logFB26.shape, AF_logFB52.shape, AF_logFB78.shape

#######################################
# Z-norm per subject
AF_MFCC13_norm = np.zeros( AF_MFCC13.shape )
AF_MFCC26_norm = np.zeros( AF_MFCC26.shape )
AF_MFCC39_norm = np.zeros( AF_MFCC39.shape )
AF_logFB26_norm = np.zeros( AF_logFB26.shape )
AF_logFB52_norm = np.zeros( AF_logFB52.shape )
AF_logFB78_norm = np.zeros( AF_logFB78.shape )

# VIDs of same subject are adjacent in the array frameCnts
offset = 0
for i in range(0, len(frameCnts), 2): # iterate over every second VID => one per subject
    tmp_range_size = frameCnts[i] + frameCnts[i+1] # total number of frames for the current subject
    print "Normalising Subject VIDs", i, i+1, 
    AF_MFCC13_norm[offset:offset+tmp_range_size] = StandardScaler().fit_transform(AF_MFCC13[offset:offset+tmp_range_size])
    AF_MFCC26_norm[offset:offset+tmp_range_size] = StandardScaler().fit_transform(AF_MFCC26[offset:offset+tmp_range_size])
    AF_MFCC39_norm[offset:offset+tmp_range_size] = StandardScaler().fit_transform(AF_MFCC39[offset:offset+tmp_range_size])
    AF_logFB26_norm[offset:offset+tmp_range_size]  = StandardScaler().fit_transform(AF_logFB26[offset:offset+tmp_range_size])
    AF_logFB52_norm[offset:offset+tmp_range_size]  = StandardScaler().fit_transform(AF_logFB52[offset:offset+tmp_range_size])
    AF_logFB78_norm[offset:offset+tmp_range_size]  = StandardScaler().fit_transform(AF_logFB78[offset:offset+tmp_range_size])
    offset += tmp_range_size
print "\nGenerating per-subject z-normalised (standardised) feature sets ..."
np.savez('./../Dataset/'+TE_folder+'/te_AF_MFCC13_norm.npz', AF_MFCC13_norm=AF_MFCC13_norm)
np.savez('./../Dataset/'+TE_folder+'/te_AF_MFCC26_norm.npz', AF_MFCC26_norm=AF_MFCC26_norm)
np.savez('./../Dataset/'+TE_folder+'/te_AF_MFCC39_norm.npz', AF_MFCC39_norm=AF_MFCC39_norm)
np.savez('./../Dataset/'+TE_folder+'/te_AF_logFB26_norm.npz',  AF_logFB26_norm =AF_logFB26_norm)
np.savez('./../Dataset/'+TE_folder+'/te_AF_logFB52_norm.npz',  AF_logFB52_norm =AF_logFB52_norm)
np.savez('./../Dataset/'+TE_folder+'/te_AF_logFB78_norm.npz',  AF_logFB78_norm =AF_logFB78_norm)

print "Z-norm audio features, shape =", AF_MFCC13_norm.shape, AF_MFCC26_norm.shape, AF_MFCC39_norm.shape, AF_logFB26_norm.shape, AF_logFB52_norm.shape, AF_logFB78_norm.shape

PID02Task2 #windows:  7636 7629
(7629, 13) (7629, 26) (7629, 39) (7629, 26) (7629, 52) (7629, 78)
PID02Task3 #windows:  6417 6409
(14038, 13) (14038, 26) (14038, 39) (14038, 26) (14038, 52) (14038, 78)
PID05Task2 #windows:  7627 7619
(21657, 13) (21657, 26) (21657, 39) (21657, 26) (21657, 52) (21657, 78)
PID05Task3 #windows:  7836 7827
(29484, 13) (29484, 26) (29484, 39) (29484, 26) (29484, 52) (29484, 78)
PID06Task2 #windows:  5902 5893
(35377, 13) (35377, 26) (35377, 39) (35377, 26) (35377, 52) (35377, 78)
PID06Task3 #windows:  6180 6171
(41548, 13) (41548, 26) (41548, 39) (41548, 26) (41548, 52) (41548, 78)
PID08Task2 #windows:  6556 6549
(48097, 13) (48097, 26) (48097, 39) (48097, 26) (48097, 52) (48097, 78)
PID08Task3 #windows:  6243 6235
(54332, 13) (54332, 26) (54332, 39) (54332, 26) (54332, 52) (54332, 78)
PID09Task2 #windows:  6974 6967
(61299, 13) (61299, 26) (61299, 39) (61299, 26) (61299, 52) (61299, 78)
PID09Task3 #windows:  6496 6487
(67786, 13) (67786, 26) (67786, 39) (6

In [7]:
###############################################################################################################
# Get audio/video durations
###############################################################################################################

import scipy.io.wavfile as wav
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import glob

durations = []
unique_srt_VIDs = sorted( [x.split('/')[-1] for x in glob.glob('./../Dataset/ImgSeq/*') ] ) # sorted VIDs

for i, VID in enumerate(unique_srt_VIDs):

    (rate, sig) = wav.read('./../Dataset/AudioWav/' + VID + '.wav')
    d = len(sig)*(1./rate)
    durations.append(d)
    print VID, ": ", d, " s"
    
total_duration = np.sum(durations)
mean_duration  = np.mean(durations)
print "\nTotal duration: ", total_duration, " s = ", total_duration/60., " min"
print "Mean  duration: ", mean_duration, " s = ", mean_duration/60., " min"

PID02Task2 :  76.370430839  s
PID02Task3 :  64.1799546485  s
PID05Task2 :  76.2775510204  s
PID05Task3 :  78.3673469388  s
PID06Task2 :  59.0251247166  s
PID06Task3 :  61.8115192744  s
PID08Task2 :  65.5731519274  s
PID08Task3 :  62.4384580499  s
PID09Task2 :  69.7527437642  s
PID09Task3 :  64.9694331066  s
PID10Task2 :  49.2495238095  s
PID10Task3 :  55.6582312925  s
PID11Task2 :  48.5529251701  s
PID11Task3 :  67.9183673469  s
PID13Task2 :  49.7139229025  s
PID13Task3 :  29.8144217687  s
PID15Task2 :  46.9507482993  s
PID15Task3 :  76.9973696145  s
PID16Task2 :  40.3562811791  s
PID16Task3 :  27.3066666667  s
PID17Task2 :  52.1287981859  s
PID17Task3 :  55.4724716553  s
PID18Task2 :  46.9507482993  s
PID18Task3 :  58.7929251701  s
PID19Task2 :  51.1535600907  s
PID19Task3 :  53.1272562358  s
PID20Task2 :  26.1224489796  s
PID20Task3 :  59.0019047619  s
PID21Task2 :  35.1317913832  s
PID21Task3 :  32.9723356009  s
PID22Task2 :  67.0360090703  s
PID22Task3 :  73.003537415  s
PID23Task2