In [1]:
import os

# Set before TensorFlow is imported so the settings are in place at import time:
# reduce TensorFlow's C++ log output and hide CUDA devices (CPU only).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from time import time

import librosa
import librosa.display
import numpy as np
import soundfile
import tensorflow as tf
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Split the training directories into a training set and a validation set
def sep_train_val(path_timit, train_rate=0.8):
    """Split the second-level directories of ``<path_timit>/train`` into two lists.

    Every directory directly under ``<path_timit>/train`` is processed on its
    own: its sub-directories are sorted by name and the first
    ``len(subdirs) * train_rate`` of them go to the training list, the rest to
    the validation list.

    Directory listings are sorted because ``os.listdir`` returns entries in
    arbitrary order; without sorting the split could differ between runs or
    machines. Entries that are not directories are skipped.

    Args:
        path_timit: Root directory that contains a ``train`` sub-directory.
        train_rate: Fraction of each group's sub-directories used for training.

    Returns:
        A ``(train, val)`` tuple of lists of directory paths.
    """
    train_root = os.path.join(path_timit, 'train')
    train = []
    val = []
    for dr in sorted(os.listdir(train_root)):
        dr_path = os.path.join(train_root, dr)
        if not os.path.isdir(dr_path):
            continue
        entries = sorted(
            name for name in os.listdir(dr_path)
            if os.path.isdir(os.path.join(dr_path, name))
        )
        cutoff = len(entries) * train_rate
        for num, name in enumerate(entries):
            target = train if num < cutoff else val
            target.append(os.path.join(dr_path, name))

    return train, val


# Build the model input (MFCCs) and the labels from a .wav file and its .phn file
def wav2mfcc(path_file, filename, tf=0.02, ts=0.01, sr=16000, num_mfcc=20):
    """Compute per-segment MFCC frames and one-hot labels for one utterance.

    Reads ``<path_file>/<filename>.wav`` and the matching ``.phn`` file. Each
    ``.phn`` line is expected to hold ``<start> <end> <label>`` with sample
    indices; the MFCCs are computed separately for each ``wav[start:end]``
    slice and every resulting frame receives that segment's class.

    The class is decided with the module-level ``voiced`` and ``unvoiced``
    lists: ``[1, 0, 0]`` for voiced, ``[0, 1, 0]`` for unvoiced and
    ``[0, 0, 1]`` for every other label.

    Args:
        path_file: Directory containing the ``.wav`` and ``.phn`` files.
        filename: File name without extension.
        tf: Analysis window length in seconds (passed on as ``win_length``).
            Note that inside this function the name hides the ``tf`` module
            alias; the parameter name is kept for caller compatibility.
        ts: Hop between frames in seconds (passed on as ``hop_length``).
        sr: Sample rate the audio is loaded at.
        num_mfcc: Number of MFCC coefficients per frame.

    Returns:
        ``(mfccs, labels)``: the concatenated, transposed MFCC arrays of all
        segments and the concatenated label rows (three columns per frame).

    Raises:
        ValueError: From ``np.concatenate`` when no usable segment was found,
            or from ``int`` when a start/end field is not an integer.
    """
    hop = int(sr * ts)
    win = int(sr * tf)

    wav, Fs = librosa.load(os.path.join(path_file, f'{filename}.wav'), sr=sr)
    mfccs = []
    labels = []
    with open(os.path.join(path_file, f'{filename}.phn')) as f:
        for line in f:
            # split() tolerates repeated spaces/tabs and the trailing newline.
            fields = line.split()
            if len(fields) < 3:
                # Blank or truncated line: nothing to label.
                continue
            start, end, phn = int(fields[0]), int(fields[1]), fields[2]

            segment = wav[start:end]
            if segment.size == 0:
                # Zero-length segment or one past the end of the audio;
                # there is no signal to analyse.
                continue

            mfcc = librosa.feature.mfcc(y=segment, sr=Fs, n_mfcc=num_mfcc,
                                        win_length=win, hop_length=hop).T
            if phn in voiced:
                class_label = [1., 0., 0.]
            elif phn in unvoiced:
                class_label = [0., 1., 0.]
            else:
                class_label = [0., 0., 1.]

            mfccs.append(mfcc)
            # One identical label row per MFCC frame of this segment.
            labels.append(np.tile(class_label, (len(mfcc), 1)))

    return np.concatenate(mfccs), np.concatenate(labels)


# Build the model inputs and labels from every .wav/.phn pair in the given directories
def model_io(dir_list, opt):
    """Collect MFCC features and labels for all utterances in ``dir_list``.

    For every ``.wav`` file that has a ``.phn`` file of the same stem in the
    same directory, ``wav2mfcc`` is called and its results are accumulated.
    Files are visited in sorted name order so the output row order is
    reproducible.

    Args:
        dir_list: Iterable of directory paths to scan (not recursive).
        opt: Mapping providing the keys ``'tf'``, ``'ts'``, ``'sr'`` and
            ``'num_mfcc'`` that are forwarded to ``wav2mfcc``.

    Returns:
        ``(mfccs, labels)``: the per-utterance arrays concatenated along the
        first axis.

    Raises:
        ValueError: If no ``.wav``/``.phn`` pair was found at all.
    """
    mfccs = []
    labels = []
    for directory in dir_list:
        # List once per directory and use a set for the .phn lookup instead
        # of re-listing the directory for every .wav file.
        entries = set(os.listdir(directory))
        for entry in sorted(entries):
            if not entry.endswith('.wav'):
                continue
            file_name = entry[:-4]
            if f'{file_name}.phn' not in entries:
                continue
            wav_mfccs, wav_labels = wav2mfcc(
                directory, file_name, tf=opt['tf'], ts=opt['ts'],
                sr=opt['sr'], num_mfcc=opt['num_mfcc'])
            mfccs.append(wav_mfccs)
            labels.append(wav_labels)

    if not mfccs:
        raise ValueError('model_io: no .wav/.phn pairs found in the given directories')
    return np.concatenate(mfccs), np.concatenate(labels)

# MLP model
def mk_model(D_in, D_out, layers):
    """Build a fully connected Keras ``Sequential`` classifier.

    Args:
        D_in: Size of the input feature vector.
        D_out: Number of output units; the output layer uses softmax.
        layers: Sequence of hidden-layer widths (at least one entry); every
            hidden layer uses ReLU.

    Returns:
        The uncompiled ``tf.keras.Sequential`` model.
    """
    net = tf.keras.Sequential()
    dense = tf.keras.layers.Dense

    # Only the first hidden layer declares the input shape.
    net.add(dense(layers[0], input_shape=(D_in,), activation='relu'))
    for width in layers[1:]:
        net.add(dense(width, activation='relu'))

    net.add(dense(D_out, activation='softmax'))
    return net


def test_wav(path_file, opt):
    """Load one test utterance and derive per-frame features and class labels.

    The ``.phn`` file next to ``path_file`` (same path with the last four
    characters replaced by ``.phn``) supplies the segment end samples and
    labels. Labels are mapped to ``0`` (in ``voiced``), ``1`` (in
    ``unvoiced``) or ``2`` (anything else). Unknown labels fall into class 2,
    matching the ``else`` branch of ``wav2mfcc``; this also keeps the label
    list aligned with the list of segment ends.

    Frame ``k`` is treated as covering samples ``[k*Ns, k*Ns + Nf)``. It gets
    the label of the first segment whose end is not before the frame start,
    unless the larger part of the frame lies beyond that segment's end, in
    which case the next segment's label is used (if there is one).

    NOTE(review): the MFCC frames come from a separate ``librosa`` call over
    the whole signal, so their count and exact alignment may differ from the
    label frames; the caller truncates the features to the label length.

    Args:
        path_file: Path to the ``.wav`` file.
        opt: Mapping with ``'sr'``, ``'ts'`` (hop, seconds), ``'tf'`` (window,
            seconds) and ``'num_mfcc'``.

    Returns:
        ``(mfcc, labels, wav)``: transposed MFCC array, integer label array
        and the loaded waveform.
    """
    # load wavfile
    wav, Fs = librosa.load(path_file, sr=opt['sr'])

    Ns = int(Fs * opt['ts'])      # hop size in samples
    Nf = int(Fs * opt['tf'])      # frame length in samples
    K = (len(wav) - Nf) // Ns     # number of frames that receive a label

    # Read the end sample and class of every labelled segment.
    phn_end = []
    phn_label = []
    with open(f'{path_file[:-4]}.phn') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 3:
                # Blank or truncated line.
                continue
            phn = fields[2]
            if phn in voiced:
                label = 0
            elif phn in unvoiced:
                label = 1
            else:
                label = 2
            phn_end.append(int(fields[1]))
            phn_label.append(label)

    # Assign one label to each frame.
    last = len(phn_end) - 1
    wav_y = []
    for k in range(K):
        frame_start = k * Ns
        frame_end = frame_start + Nf
        for n, seg_end in enumerate(phn_end):
            if frame_start > seg_end:
                # Segment ends before this frame starts; try the next one.
                continue
            inside = frame_end < seg_end
            mostly_inside = seg_end - frame_start > frame_end - seg_end
            if inside or mostly_inside or n == last:
                wav_y.append(phn_label[n])
            else:
                # Most of the frame lies after the boundary.
                wav_y.append(phn_label[n + 1])
            break

    mfcc = librosa.feature.mfcc(y=wav, sr=Fs, n_mfcc=opt['num_mfcc'],
                                win_length=Nf, hop_length=Ns).T
    return mfcc, np.array(wav_y), wav


def test_model(path_timit, model, opt):
    """Evaluate ``model`` on ``<path_timit>/test`` and save one plot per utterance.

    The test tree is walked two levels deep (``test/<dr>/<speaker>/``). For
    every ``.wav`` file with a matching ``.phn`` file, ``test_wav`` supplies
    features and reference labels, the model predicts a class per frame, and
    a figure with waveform, prediction and reference is written to
    ``result/<dr>/<speaker>/<stem>.png``. The frame accuracy over all
    utterances is printed at the end.

    Args:
        path_timit: Root directory that contains a ``test`` sub-directory.
        model: Object with a ``predict`` method returning per-class scores,
            one row per input frame.
        opt: Mapping with ``'sr'`` and ``'ts'`` (used here) plus the keys
            required by ``test_wav``.

    Returns:
        None. Results are printed and written to disk.
    """
    test_root = os.path.join(path_timit, 'test')
    Ns = int(opt['sr'] * opt['ts'])  # samples per frame step, for plotting
    total_count = 0
    acc_count = 0
    for dr in os.listdir(test_root):
        for speaker in os.listdir(os.path.join(test_root, dr)):
            speaker_dir = os.path.join(test_root, dr, speaker)
            result_dir = os.path.join('result', dr, speaker)
            # Portable replacement for shelling out to `mkdir -p`.
            os.makedirs(result_dir, exist_ok=True)

            files = os.listdir(speaker_dir)
            present = set(files)
            for file in files:

                # if not wav file or not exist phn file
                if not file.endswith('.wav'):
                    continue
                stem = file[:-4]
                if f'{stem}.phn' not in present:
                    continue
                wav_x, wav_y, wav = test_wav(os.path.join(speaker_dir, file), opt)

                # Features and labels may differ in length; keep the overlap.
                n_frames = min(len(wav_x), len(wav_y))
                if n_frames == 0:
                    # Nothing to score (e.g. audio shorter than one frame).
                    continue
                wav_x = wav_x[:n_frames, ...]
                wav_y = wav_y[:n_frames]

                y = model.predict(wav_x)
                y_hat = np.argmax(y, axis=1)
                acc_count += int(np.sum(y_hat == wav_y))
                total_count += len(y)

                # to plot: class 0 at the waveform peak, class 1 at half of
                # it, class 2 at zero; each frame value is held for Ns samples.
                half_peak = float(np.max(wav)) / 2
                pred_y = np.repeat(half_peak * (2 - y_hat), Ns)
                true_y = np.repeat(half_peak * (2 - wav_y), Ns)
                plt.figure()
                plt.plot(wav)
                plt.plot(pred_y, 'r', label='prediction')
                plt.plot(true_y, 'b', label='ground truth')
                plt.legend(loc='lower right')
                plt.savefig(os.path.join(result_dir, f'{stem}.png'))
                plt.close()

    if total_count == 0:
        # Avoid a ZeroDivisionError when the test tree holds no usable data.
        print('test accuracy : n/a (no labelled test frames found)')
        return
    print(f'test accuracy : {acc_count/total_count*100}')


########################################
# voiced, unvoiced, silence
########################################
# Phone labels as they appear in the third field of a .phn line.
# wav2mfcc and test_wav map them to class 0 (voiced), 1 (unvoiced) or 2.
voiced = [
    'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'axr', 'ay',
    'eh', 'er', 'ey', 'ih', 'ix', 'iy', 'ow', 'oy',
    'uh', 'uw', 'ux', 'b', 'd', 'dh', 'el', 'em',
    'en', 'g', 'jh', 'l', 'm', 'n', 'ng', 'nx',
    'r', 'v', 'w', 'wh', 'y', 'z', 'zh', 'eng',
    'hv', 'bcl', 'dcl', 'gcl',
]
unvoiced = [
    'ch', 'dx', 'f', 'hh', 'k', 'p',
    'q', 's', 'sh', 't', 'th', 'ax-h',
]
silence = [
    'kcl', 'pcl', 'tcl',
    'pau', 'epi', 'h#',
]

def main():
    """Run the pipeline: build or load features, train or load the model, test.

    Two switches inside the function select the work to do:

    * ``mk_data``: when true, features and labels are computed from
      ``path_data`` and cached as ``.npy`` files under ``temp/mfcc_<n>``.
    * ``apply_train``: when true, a new model is trained and saved under
      ``temp/model/mfcc_<n>``; otherwise the saved model is loaded.

    The cached arrays are only read back when training is requested, so a
    test-only run does not require them to exist.
    """
    # option
    path_data = 'timit_wav'
    mk_data = False
    apply_train = False
    opt = dict()
    opt['sr'] = 16000
    opt['ts'] = 0.01
    opt['tf'] = 0.02
    # Other coefficient counts that were tried: 2, 6, 20.
    opt['num_mfcc'] = 13
    layers = [32, 64, 32]

    path_temp = 'temp/mfcc_'+str(opt['num_mfcc'])
    path_model = os.path.join('temp','model')
    model_file = os.path.join(path_model, 'mfcc_'+str(opt['num_mfcc']))

    if mk_data:
        # Portable replacement for shelling out to `mkdir -p`.
        os.makedirs(path_temp, exist_ok=True)
        ###################################
        # separation train and validation
        #####################################
        dir_train, dir_val = sep_train_val(path_data)
        print("sep train and val data :", len(dir_train), len(dir_val))
        train_x, train_y = model_io(dir_train, opt)
        print('model_io(train) :', train_x.shape, train_y.shape)
        np.save(f'{path_temp}/train_x', train_x)
        np.save(f'{path_temp}/train_y', train_y)
        val_x, val_y = model_io(dir_val, opt)
        print('model_io(val) :', val_x.shape, val_y.shape)
        np.save(f'{path_temp}/val_x', val_x)
        np.save(f'{path_temp}/val_y', val_y)
    else:
        print("Pass make_model_io")
        if apply_train:
            # The cached arrays are only needed for training.
            train_x = np.load(f'{path_temp}/train_x.npy')
            train_y = np.load(f'{path_temp}/train_y.npy')
            val_x = np.load(f'{path_temp}/val_x.npy')
            val_y = np.load(f'{path_temp}/val_y.npy')

    if apply_train:
        ######################################
        # train model
        ######################################
        model = mk_model(opt['num_mfcc'], 3, layers)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Stop once the validation loss has not improved for 3 epochs.
        early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
        model.fit(train_x, train_y, epochs=100, validation_data=(val_x, val_y), callbacks=[early])

        os.makedirs(path_model, exist_ok=True)
        # NOTE(review): the save path has no file extension; some Keras
        # versions may require one — confirm against the installed version.
        model.save(model_file)
    else:
        print("Pass train model")
        model = tf.keras.models.load_model(model_file)

    ########################################
    # test
    ########################################
    test_model(path_data, model, opt)

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == '__main__' :
    main()

Pass make_model_io
Pass train model
test accuracy : 86.15005886760142
