 # Packages

In [1]:
# -*- coding: utf-8 -*-
import os
import librosa
import numpy as np
# import wave
# import time
# import pyaudio
# import seaborn
# import IPython
# import pylab as plt
# import scipy.signal as signal
# import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
# from sys import argv
# from matplotlib import cm
# from scipy.fftpack import fft,ifft
# from python_speech_features import mfcc
# from scipy.spatial.distance import cdist

# Read wave files and calculate MFCC

In [2]:
def read_and_mfcc(wavepath):
    nmfcc = 24
    # python_speech_feature
    #(rate,sig) = wav.read(wavepath)
    #mfccfeat = mfcc(signal=sig, samplerate=rate, numcep=nmfcc, nfilt=26, nfft=1152)
    # librosa
    y, sr = librosa.load(wavepath)
    mfccfeat = np.transpose(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = nmfcc),[1,0])
    #print(len(mfccfeat),len(mfccfeat[0]))
    return mfccfeat

# DTW

In [3]:
def dist_for_Euclid(p1, p2):
    dist = 0.0
    elem_type = type(p1)
    if elem_type == float or elem_type == int:
        dist = float(abs(p1 - p2))
    else:
        sumval = 0.0
        for i in range(len(p1)):
            sumval += pow(p1[i] - p2[i], 2)
        dist = pow(sumval, 0.5)
    return dist

def dtw(x, y, dist = dist_for_Euclid, warp=1):
    """
    Computes Dynamic Time Warping (DTW) of two sequences.
    :param array x: N1*M array
    :param array y: N2*M array
    :param func dist: distance used as cost measure
    :param int warp: how many shifts are computed.
    Returns the minimum distance, the cost matrix, the accumulated cost matrix, and the wrap path.
    """
    assert len(x)
    assert len(y)
    r, c = len(x), len(y)
    D0 = np.zeros((r + 1, c + 1))
    D0[0, 1:] = np.inf
    D0[1:, 0] = np.inf
    D1 = D0[1:, 1:]  # view
    for i in range(r):
        for j in range(c):
            D1[i, j] = dist(x[i], y[j])
    C = D1.copy()
    for i in range(r):
        for j in range(c):
            min_list = [D0[i, j]]
            for k in range(1, warp + 1):
                i_k = min(i + k, r - 1)
                j_k = min(j + k, c - 1)
                min_list += [D0[i_k, j], D0[i, j_k]]
            D1[i, j] += min(min_list)
    if len(x)==1:
        path = np.zeros(len(y)), range(len(y))
    elif len(y) == 1:
        path = range(len(x)), np.zeros(len(x))
    else:
        path = _traceback(D0)
        
    return D1[-1, -1] / sum(D1.shape), C, D1, path


def _traceback(D):
    i, j = np.array(D.shape) - 2
    p, q = [i], [j]
    while (i > 0) or (j > 0):
        tb = np.argmin((D[i, j], D[i, j+1], D[i+1, j]))
        if tb == 0:
            i -= 1
            j -= 1
        elif tb == 1:
            i -= 1
        else:  # (tb == 2):
            j -= 1
        p.insert(0, i)
        q.insert(0, j)
    return np.array(p), np.array(q)

def dtw_display(s1, s2, distfunc=dist_for_Euclid):
    val, path = dtw(s1, s2, distfunc)
    w = len(s1)
    h = len(s2)
    mat = [[1] * w for i in range(h)]
    for node in path :
        x, y = node
        mat[y][x] = 0
    mat = np.array(mat)
    plt.subplot(2, 2, 2)
    c = plt.pcolor(mat, edgecolors='k', linewidths=4)
    plt.title('Dynamic Time Warping (%f)' % val)
    plt.subplot(2, 2, 1)
    plt.plot(s2, range(len(s2)), 'g')
    plt.subplot(2, 2, 4)
    plt.plot(range(len(s1)), s1, 'r')
    plt.show()

# Train and Test Function
### extract mfcc features from the train dataset and build a model, save it to a text file.

In [4]:
def train_each_model(dirpath, savepath, digitstr, distfun = dist_for_Euclid, standard = 0):
    filelist = os.listdir(dirpath)
    mfcclist = []
    # print(filelist)
    for i in range(len(filelist)):
        filepath = os.path.join(dirpath,filelist[i])
        # print(filepath)
        if os.path.isfile(filepath):
            (filedir,filename) = os.path.split(filepath)
            (shotname,extension) = os.path.splitext(filename)
            # print(shotname, extension)
            if extension == ".wav":
                mfccfeat = read_and_mfcc(filepath)
                # print(len(mfccfeat),len(mfccfeat[0]))
                mfcclist.append(mfccfeat)        
    # print(mfcclist[0].shape)
    count = np.zeros(len(mfcclist[standard]))
    sumall = np.zeros(mfcclist[standard].shape)
    for i in range(len(mfcclist)):
        dist, cost, acc, path = dtw(mfcclist[standard], mfcclist[i], distfun)
        for j in range(len(path[0])):
            count[int(path[0][j])] += 1
            sumall[int(path[0][j])] += mfcclist[i][path[1][j]]
    # print(count,sumall)
    avg = np.zeros(sumall.shape)
    for i in range(len(count)):
        for j in range(len(sumall[i])):
            avg[i][j] = sumall[i][j] / count[i]
    # print(avg)
    np.savetxt(os.path.join(savepath, digitstr + ".txt"), avg)
    b = np.loadtxt(os.path.join(savepath, digitstr + ".txt"), dtype = np.float32)
    # print(digitstr, b)
    
def train_all_model(traindir, savepath, distfun = dist_for_Euclid):
    #digit_list = ['zero','one','two','three','four','five','six','seven','eight','nine']
    digitlist = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for i in range(len(digitlist)):
        dirpath = os.path.join(traindir, "digit_" + digitlist[i])
        if os.path.isdir(dirpath):
            train_each_model(dirpath, savepath, digitlist[i])
            
def load_model(modeldir):
    modellist = []
    digitlist = []
    filelist = os.listdir(modeldir)
    for i in range(len(filelist)):
        modelpath = os.path.join(modeldir, filelist[i])
        # print(modelpath)
        if os.path.isfile(modelpath):
            (filedir, modelname) = os.path.split(modelpath)
            (shotname, extension) = os.path.splitext(modelname)
            # print(shotname, extension)
            if extension == ".txt":
                mfcc = np.loadtxt(modelpath, dtype = np.float32)
                modellist.append(mfcc)
                digitlist.append(shotname)
    return modellist, digitlist

def digit_recognition(modellist, digitlist, filepath, distfun = dist_for_Euclid):
    if os.path.isfile(filepath):
        (filedir,filename) = os.path.split(filepath)
        (shotname,extension) = os.path.splitext(filename)
        # print(shotname, extension)
        if extension == ".wav":
            mfccfeat = read_and_mfcc(filepath)
            # print(len(mfccfeat),len(mfccfeat[0]))
            mindist = np.inf
            for i in range(len(modellist)):
                dist, _, _, _ = dtw(modellist[i], mfccfeat)
                # print(digitlist[i], dist)
                if dist < mindist:
                    mindigit = i
                    mindist = dist
    return digitlist[mindigit]

# Processing
### Train the model and save the model to a text file

In [17]:
train_all_model("../../dataset/records", "../model3")
modellist, digitlist = load_model("../model3")

### Calculate accuracy via recognize the test dataset

In [18]:
testdir = "../../dataset/test"
filelist = os.listdir(testdir)
countall = 0
countright = 0
for i in range(len(filelist)):
    filepath = os.path.join(testdir,filelist[i])
    if os.path.isfile(filepath):
        (filedir,filename) = os.path.split(filepath)
        (shotname,extension) = os.path.splitext(filename)
        # print(shotname, extension)
        if extension == ".wav":
            countall += 1
            result = digit_recognition(modellist, digitlist, filepath)
            # print(shotname[0], result)
            if shotname[0] == result:
                countright += 1

0 5
0 0
0 7
0 5
0 7
1 9
1 9
1 9
1 9
1 9
2 2
2 7
2 2
2 2
2 2
3 9
3 5
3 7
3 5
3 2
4 5
4 5
4 7
4 5
4 4
5 5
5 7
5 5
5 5
5 5
6 6
6 6
6 6
6 6
6 6
7 5
7 7
7 7
7 7
7 7
8 8
8 8
8 8
8 8
8 8
9 9
9 9
9 9
9 9
9 9


### Output the results

In [19]:
print(countright, countall, countright / countall)

29 50 0.58


# Record wave files via a tkinter module

In [51]:
from pyaudio import PyAudio,paInt16
from datetime import datetime
import wave
from tkinter import *
import sys
import tkinter.messagebox

# define of params
NUM_SAMPLES =160
FRAMERATE = 16000
CHANNELS = 1
SAMPWIDTH = 2
FORMAT = paInt16
TIME = 125
FRAMESHIFT = 160

def save_wave_file(filename,data):
    '''save the date to the wav file'''
    wf = wave.open(filename,'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(SAMPWIDTH)
    wf.setframerate(FRAMERATE)
    wf.writeframes(b"".join(data)) # ""中间不能有空格，不然语音录入会有很多中断。
    wf.close()

# First cut the wave file and compute the 
def recognize(filepath):
    cut_invalid(filepath)
    result = digit_recognition(modellist, digitlist, filepath)
    #result = "8"
    tkinter.messagebox.showinfo('Recognition Result', "You have spoken "+result+".")
    
def my_button(root,label_text,button_text,button_stop,button_func,stop_func):
    '''create label and button'''
    info_label = Label(root,text=label_text,width=100,height=3).pack()
    record_button = Button(root,text=button_text,command=button_func,anchor='center',width=30,height=3).pack()
    exit_button = Button(root,text=button_stop,command=lambda:stop_func(root),anchor='center',width=30,height=3).pack()

def record_wave():
    '''record the input of wave'''
    savedir = "../myrecords"
    pa = PyAudio()

    stream = pa.open(format=FORMAT,
                     channels=CHANNELS,
                     rate=FRAMERATE,
                     input=True,
                     frames_per_buffer=NUM_SAMPLES)

    save_buffer = []
    count = 0
                        
    while count < TIME*4:
        string_audio_data = stream.read(NUM_SAMPLES)
        frame = count*NUM_SAMPLES/float(FRAMESHIFT)
        time = count*NUM_SAMPLES/float(FRAMERATE)
        save_buffer.append(string_audio_data)
        count += 1
   
    filename = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")+".wav"
    filepath = os.path.join(savedir, filename)
    save_wave_file(filepath, save_buffer)
      
    save_buffer = []
    print (filename+",saved.")
    
    recognize(filepath)

def record_stop(root):
    # stop record the wave
    root.destroy()
    # sys.exit(0)

def record_module():
    root = Tk()
    root.geometry('400x200+450+250')
    root.title('Record a wave')
    root.resizable(False, False)
    my_button(root, "Record a wave with digital speech and recognize it.", "Clik to record", "Exit record", record_wave, record_stop)
    root.mainloop()

# Record a wave file with digital speech and recognize it.

In [52]:
record_module()

2018-11-19_17_03_06.wav,saved.
