## train, dev, test split
- train set = 3696 utterances (SA sentences removed)
- dev set = 50 speakers, 400 utterances
- core test set = 24 speakers, 192 utterances

## input preprocessing
- feature-wise zero-mean, unit-variance scaling (statistics are fit on the training set)

In [1]:
import tensorflow as tf
import numpy as np
from python_speech_features import mfcc, fbank, delta
from sklearn.preprocessing import StandardScaler
import scipy.io.wavfile as wav
import subprocess
import os, time, pickle

In [2]:
# The 61 phone labels looked up when parsing .phn transcriptions. A label's
# position in this list is the integer class index stored in the TFRecords.
phn_61 = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl', 'ch', 'd', 'dcl', 'dh',
          'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi', 'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 
          'ih', 'ix', 'iy', 'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau', 'pcl',
          'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v', 'w', 'y', 'z', 'zh']

# Folding table: 22 of the 61 labels are mapped onto a label of phn_39 (every
# value below is a member of phn_39). Labels that are not keys here already
# appear in phn_39 unchanged.
# NOTE(review): not referenced in this section -- presumably applied later,
# at decoding/scoring time; confirm against the training code.
mapping = {'ah': 'ax', 'ax-h': 'ax', 'ux': 'uw', 'aa': 'ao', 'ih': 'ix',
               'axr': 'er', 'el': 'l', 'em': 'm', 'en': 'n', 'nx': 'n',
               'eng': 'ng', 'sh': 'zh', 'hv': 'hh', 'bcl': 'h#', 'pcl': 'h#',
               'dcl': 'h#', 'tcl': 'h#', 'gcl': 'h#', 'kcl': 'h#',
               'q': 'h#', 'epi': 'h#', 'pau': 'h#'}

# The reduced 39-label set: phn_61 without the keys of `mapping`.
phn_39 = ['ae', 'ao', 'aw', 'ax', 'ay', 'b', 'ch', 'd', 'dh', 'dx', 'eh', 
             'er', 'ey', 'f', 'g', 'h#', 'hh', 'ix', 'iy', 'jh', 'k', 'l', 
             'm', 'n', 'ng', 'ow', 'oy', 'p', 'r', 's', 't', 'th', 'uh', 'uw',
             'v', 'w', 'y', 'z', 'zh']

# The 50 speaker IDs of the development set. They are compared against the
# name of the directory that contains each file under timit/test.
development_set = ['faks0', 'mmdb1', 'mbdg0', 'fedw0', 'mtdt0', 'fsem0', 'mdvc0', 'mrjm4', 'mjsw0', 'mteb0',
                  'fdac1', 'mmdm2', 'mbwm0', 'mgjf0', 'mthc0', 'mbns0', 'mers0', 'fcal1', 'mreb0', 'mjfc0',
                  'fjem0', 'mpdf0', 'mcsh0', 'mglb0', 'mwjg0', 'mmjr0', 'fmah0', 'mmwh0', 'fgjd0', 'mrjr0',
                  'mgwt0', 'fcmh0', 'fadg0', 'mrtk0', 'fnmr0', 'mdls0', 'fdrw0', 'fjsj0', 'fjmg0', 'fmml0',
                  'mjar0', 'fkms0', 'fdms0', 'mtaa0', 'frew0', 'mdlf0', 'mrcs0', 'majc0', 'mroa0', 'mrws1']

# The 24 speaker IDs of the core test set, matched the same way as
# development_set (by containing directory name under timit/test).
core_test_set = ['mdab0', 'mwbt0', 'felc0', 'mtas1', 'mwew0', 'fpas0', 'mjmp0', 'mlnt0', 'fpkt0',
             'mlll0', 'mtls0', 'fjlm0', 'mbpm0', 'mklt0', 'fnlp0', 'mcmj0', 'mjdh0', 'fmgd0',
            'mgrt0', 'mnjm0', 'fdhc0', 'mjln0', 'mpam0', 'fmld0']


TIMIT_DIR = './' # root directory of TIMIT; joined with 'timit/train' and 'timit/test'
TFRECORD_DIR = './data' # output root; files are written to TFRECORD_DIR/<feats_type>/

In [3]:
def prepare_timit_dataset(train_set=True, dev_set=True, test_set=True, feats_type='mfcc'):
    '''
    Extract features from TIMIT and write them as TFRecord files.

    For each selected split, every matching .wav file is converted with the
    command line program 'sox', turned into features, standardized with a
    StandardScaler fit on the training split, and serialized as one
    tf.train.SequenceExample into TFRECORD_DIR/<feats_type>/<split>.tfrecords.
    The scaler is stored next to the records as scaler.pkl.

    train_set, dev_set, test_set:
    - whether to build the corresponding split. 'dev' and 'test' need the
      scaler.pkl produced by a (current or earlier) run with train_set=True.
    feats_type:
    - mfcc: 13 mel frequency cepstral coefficients + delta + delta delta, total 39 dimension
    - fbank: 40 log filter bank with energy + delta + delta delta, total 123 dimension
      (any value other than 'mfcc' selects the fbank features)

    raises:
    - Exception: scaler.pkl is missing when building 'dev' or 'test'
    - ValueError: no training utterance was found
    - subprocess.CalledProcessError: sox failed to convert a file
    '''

    def read_audio(wav_path):
        '''Convert one NIST file to a temporary wav with sox and return (rate, signal).'''
        tmp_path = os.path.splitext(wav_path)[0] + '_tmp.wav'
        try:
            # Pass the argument list without shell=True: with a shell on POSIX
            # only 'sox' would be executed and the file names would be dropped.
            subprocess.check_call(['sox', wav_path, tmp_path])
            return wav.read(tmp_path)
        finally:
            # never leave the temporary file inside the corpus directory
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def extract_features(sig, rate):
        '''Return the static features with delta and delta-delta appended.'''
        if feats_type == 'mfcc':
            static = mfcc(sig, rate)
        else: # fbank
            filters, energy = fbank(sig, rate, nfilt=40)
            static = np.concatenate((np.log(filters), np.log(energy).reshape(-1, 1)), axis=1)
        first_order = delta(static, 2)
        second_order = delta(first_order, 2)
        return np.concatenate((static, first_order, second_order), axis=1)

    def read_labels(phn_path):
        '''Return the phn_61 indices of the labels listed in a .phn file.'''
        labels = []
        with open(phn_path, 'r') as f:
            for line in f:
                fields = line.split()  # start, end, label
                if not fields:
                    continue  # tolerate blank lines
                labels.append(phn_61.index(fields[2]))
        return labels

    def create_tfrecords(tfrecord_path, root_dir, fname, filter_fn):
        '''
        Write <fname>.tfrecords from the .wav files below root_dir.
        filter_fn(file, path) returns True for files that must be skipped.
        '''
        scaler_path = os.path.join(tfrecord_path, 'scaler.pkl')
        # Fail before the (slow) feature extraction, and before creating an
        # empty output file, when the scaler is not available.
        if fname != 'train' and not os.path.exists(scaler_path):
            raise Exception('scaler.pkl not exist, call with [train_set=True]')

        start = time.time()
        feats_list = []
        phoneme_list = []
        for path, _, files in os.walk(root_dir):
            for file_name in files:
                if filter_fn(file_name, path) or not file_name.endswith('wav'):
                    continue
                full_name = os.path.join(path, file_name)
                rate, sig = read_audio(full_name)
                feats_list.append(extract_features(sig, rate))
                phoneme_list.append(read_labels(os.path.splitext(full_name)[0] + '.phn'))

        if fname == 'train':
            if not feats_list:
                raise ValueError('no training utterances found under {}'.format(root_dir))
            scaler = StandardScaler()
            scaler.fit(np.concatenate(feats_list, axis=0))
            print('scaler.n_samples_seen_:', scaler.n_samples_seen_)
            with open(scaler_path, 'wb') as f:
                pickle.dump(scaler, f)
        else:
            # NOTE: pickle is only safe here because the file is written by
            # this function itself; do not point it at untrusted data.
            with open(scaler_path, 'rb') as f:
                scaler = pickle.load(f)

        writer = tf.python_io.TFRecordWriter(os.path.join(tfrecord_path, (fname + '.tfrecords')))
        try:
            for feats, phoneme in zip(feats_list, phoneme_list):
                seq_exam = tf.train.SequenceExample()
                seq_exam.context.feature['feats_dim'].int64_list.value.append(feats.shape[1])
                seq_exam.context.feature['feats_seq_len'].int64_list.value.append(feats.shape[0])
                seq_exam.context.feature['labels_seq_len'].int64_list.value.append(len(phoneme))

                # one feature-list entry per frame / per label
                for feat in scaler.transform(feats):
                    seq_exam.feature_lists.feature_list['features'].feature.add().float_list.value[:] = feat
                for p in phoneme:
                    seq_exam.feature_lists.feature_list['labels'].feature.add().int64_list.value.append(p)
                writer.write(seq_exam.SerializeToString())
        finally:
            writer.close()
        print('{} created: {} utterances - {:.0f}s'.format(fname+'.tfrecords', len(feats_list), (time.time()-start)))
    # end create_tfrecords() definition

    tfrecord_path = os.path.join(TFRECORD_DIR, feats_type)
    if not os.path.isdir(tfrecord_path):
        os.makedirs(tfrecord_path)

    # Files starting with 'sa' are excluded from every split; dev and test
    # additionally keep only the speakers (directory names) of their set.
    if train_set:
        create_tfrecords(tfrecord_path, os.path.join(TIMIT_DIR, 'timit/train'), 'train',
                         lambda file, _: file.startswith('sa'))
    if dev_set:
        create_tfrecords(tfrecord_path, os.path.join(TIMIT_DIR, 'timit/test'), 'dev',
                         lambda file, path: file.startswith('sa') or os.path.split(path)[1] not in development_set)
    if test_set:
        create_tfrecords(tfrecord_path, os.path.join(TIMIT_DIR, 'timit/test'), 'test',
                         lambda file, path: file.startswith('sa') or os.path.split(path)[1] not in core_test_set)

In [4]:
# Build the train, dev and test TFRecords (all three are enabled by default)
# using the 'fbank' feature type.
prepare_timit_dataset(feats_type='fbank')

scaler.n_samples_seen_: 1128519
train.tfrecords created: 3696 utterances - 549s
dev.tfrecords created: 400 utterances - 58s
test.tfrecords created: 192 utterances - 28s
