* sox: convert timit file format to .wav
* sklearn: feature-wise standardization (zero mean, unit variance per feature dimension), with statistics fit on the training set

* 'SA' files removed
* phoneme set reduced from 61 to 39
* 3696 train + 1344 test = 5040 utterances in total
* speech features: MFCC [number of cepstra: 13, window length: 25 ms, window step: 10 ms] + delta + delta-delta -> 39 dimensions in total

In [1]:
import numpy as np
from python_speech_features import mfcc, fbank, delta
from sklearn.preprocessing import StandardScaler
import scipy.io.wavfile as wav
import subprocess
import os, time

In [2]:
## original phonemes
# Full 61-symbol TIMIT phoneme inventory. The position of a symbol in this
# list is used as its integer label, so the order must not change.
phn_61 = """
    aa ae ah ao aw ax ax-h axr ay b bcl ch d dcl dh dx eh el em en eng epi
    er ey f g gcl h# hh hv ih ix iy jh k kcl l m n ng nx ow oy p pau pcl q
    r s sh t tcl th uh uw ux v w y z zh
""".split()

# Folding table: each key is a phoneme from phn_61 that is merged into the
# representative symbol given as its value (all values appear in phn_39).
mapping = {
    'ah': 'ax', 'ax-h': 'ax', 'ux': 'uw', 'aa': 'ao', 'ih': 'ix',
    'axr': 'er', 'el': 'l', 'em': 'm', 'en': 'n', 'nx': 'n',
    'eng': 'ng', 'sh': 'zh', 'hv': 'hh', 'bcl': 'h#', 'pcl': 'h#',
    'dcl': 'h#', 'tcl': 'h#', 'gcl': 'h#', 'kcl': 'h#',
    'q': 'h#', 'epi': 'h#', 'pau': 'h#',
}

# Reduced 39-symbol inventory: the phn_61 symbols that are not keys of
# `mapping`, in the same relative order.
phn_39 = """
    ae ao aw ax ay b ch d dh dx eh er ey f g h# hh ix iy jh k l
    m n ng ow oy p r s t th uh uw v w y z zh
""".split()

# Input corpus location and output locations for the training split.
TRAIN_DIR_TIMIT = os.path.join('timit', 'train')
TRAIN_DIR_FEATS = os.path.join('data', 'train', 'feats', 'mfcc')
TRAIN_DIR_LABELS = os.path.join('data', 'train', 'labels')

# Input corpus location and output locations for the test split.
TEST_DIR_TIMIT = os.path.join('timit', 'test')
TEST_DIR_FEATS = os.path.join('data', 'test', 'feats', 'mfcc')
TEST_DIR_LABELS = os.path.join('data', 'test', 'labels')

In [3]:
def _compute_feats(sig, rate, feat_type):
    """Return base features with their delta and delta-delta appended.

    For ``feat_type == 'mfcc'`` the base features are ``mfcc(sig, rate)``;
    for any other value they are 40 filter-bank outputs plus the frame
    energy as an extra column. The three parts are concatenated along the
    last axis.
    """
    if feat_type == 'mfcc':
        base = mfcc(sig, rate)
    else:  # filter bank + energy
        filters, energy = fbank(sig, rate, nfilt=40)
        base = np.concatenate((filters, energy.reshape(-1, 1)), axis=-1)
    first_delta = delta(base, 2)
    second_delta = delta(first_delta, 2)
    return np.concatenate((base, first_delta, second_delta), axis=-1)


def _read_phn_labels(phn_path):
    """Read a .phn transcription and return its phn_61 indices as an array.

    The phoneme symbol is taken from the third whitespace-separated field
    of every non-blank line. Raises ValueError if a symbol is not in phn_61.
    """
    labels = []
    with open(phn_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:  # tolerate blank / trailing empty lines
                continue
            labels.append(phn_61.index(fields[2]))
    return np.array(labels)


def gen_feats_labels_files(data_type='train', feat_type='mfcc'):
    """Walk a TIMIT split and write one feature and one label .npy per utterance.

    Args:
        data_type: 'train' selects the TRAIN_* directories; any other value
            selects the TEST_* directories.
        feat_type: 'mfcc' for MFCC features; any other value for filter bank
            + energy features. Deltas and delta-deltas are appended in both
            cases.

    For every file under the split's TIMIT directory whose name ends in
    'wav' and does not start with 'sa', the audio is first converted with
    `sox` into a temporary '<name>_tmp.wav', read back, and turned into
    features. Features go to '<feats dir>/<parent dir>-<name>.npy' and the
    phoneme indices (positions in phn_61, no 61 -> 39 folding is applied
    here) go to '<labels dir>/<parent dir>-<name>.npy'.

    File-name matching is case-sensitive (lower-case 'sa', 'wav', '.phn').

    NOTE: the output feature directory does not depend on `feat_type`; the
    module constants point at a '.../feats/mfcc' directory for both kinds.

    Raises:
        subprocess.CalledProcessError: if `sox` exits with a non-zero status.
    """
    if data_type == 'train':
        timit_dir, feats_dir, labels_dir = (
            TRAIN_DIR_TIMIT, TRAIN_DIR_FEATS, TRAIN_DIR_LABELS)
    else:  # data_type == 'test'
        timit_dir, feats_dir, labels_dir = (
            TEST_DIR_TIMIT, TEST_DIR_FEATS, TEST_DIR_LABELS)

    os.makedirs(feats_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    start = time.time()
    cnt = 0
    for path, dirs, files in os.walk(timit_dir):
        for file in files:
            if file.startswith('sa'):  # exclude all 'SA' files according to 'https://github.com/zzw922cn/Automatic_Speech_Recognition'
                continue
            if not file.endswith('wav'):
                continue
            if file.endswith('_tmp.wav'):
                # Leftover from an interrupted earlier run, not an utterance.
                continue

            # .wav
            fullFileName = os.path.join(path, file)
            fnameNoSuffix = os.path.splitext(fullFileName)[0]
            fNameTmp = fnameNoSuffix + '_tmp.wav'
            try:
                # Pass the argument list directly (no shell): with shell=True
                # a list would hand only 'sox' to the shell on POSIX and drop
                # the file arguments. check_call also surfaces sox failures.
                subprocess.check_call(['sox', fullFileName, fNameTmp])
                rate, sig = wav.read(fNameTmp)
            finally:
                # Always clean up the temporary file, even on failure.
                if os.path.exists(fNameTmp):
                    os.remove(fNameTmp)

            feats = _compute_feats(sig, rate, feat_type)

            # Output name: '<parent directory>-<file name without extension>'.
            utt_id = (os.path.basename(os.path.dirname(fnameNoSuffix))
                      + '-' + os.path.basename(fnameNoSuffix))
            np.save(os.path.join(feats_dir, utt_id + '.npy'), feats)

            # .phn
            phoneme = _read_phn_labels(fnameNoSuffix + '.phn')
            np.save(os.path.join(labels_dir, utt_id + '.npy'), phoneme)

            cnt += 1
    print('{}-{}: {} utterances - {:.0f}s'.format(data_type, feat_type, cnt, (time.time()-start)))

In [4]:
# Generate the MFCC feature and label files for the test split.
gen_feats_labels_files(data_type='test', feat_type='mfcc')

test-mfcc: 1344 utterances - 71s


In [5]:
# Generate the MFCC feature and label files for the training split.
gen_feats_labels_files(data_type='train', feat_type='mfcc')

train-mfcc: 3696 utterances - 197s


In [6]:
# feature-wise zero mean, unit variance
def scale_features():
    """Standardize every saved feature file in place.

    A StandardScaler is fit on all frames of all '.npy' files found in
    TRAIN_DIR_FEATS (concatenated along axis 0). Each file in
    TRAIN_DIR_FEATS and TEST_DIR_FEATS is then transformed with that scaler
    and written back to the same path, so the test data is scaled with
    training-set statistics.

    The files are overwritten: calling this again re-standardizes data that
    has already been scaled.

    Raises:
        ValueError: if TRAIN_DIR_FEATS contains no '.npy' files.
    """
    start = time.time()

    def _npy_files(directory):
        # Only feature arrays; ignore any stray entries in the directory.
        return [os.path.join(directory, name)
                for name in sorted(os.listdir(directory))
                if name.endswith('.npy')]

    train_files = _npy_files(TRAIN_DIR_FEATS)
    if not train_files:
        raise ValueError(
            'no .npy feature files found in {}'.format(TRAIN_DIR_FEATS))

    X_train = np.concatenate([np.load(fname) for fname in train_files], axis=0)
    scaler = StandardScaler()
    scaler.fit(X_train)
    del X_train  # release the concatenated copy before rewriting files

    for fname in train_files + _npy_files(TEST_DIR_FEATS):
        # transform() returns a new array and does not modify its input,
        # so the result (not the loaded array) is what must be saved.
        np.save(fname, scaler.transform(np.load(fname)))

    print('{:.0f}s elpased'.format(time.time()-start))

In [7]:
# Run the scaling pass over the saved train and test feature files.
scale_features()

42s elpased
