# DEEE725 Speech Signal Processing Lab
### 2023 Spring, Kyungpook National University 
### Instructor: Gil-Jin Jang

## Lab 01 Korean digit recognition using python-hmmlearn
version 2, 2023/03/24
source: [jayaram1125's github repository](https://github.com/jayaram1125/Single-Word-Speech-Recognition-using-GMM-HMM-)

__update description:__

1. assigns sound files 8 and 9 for test out of 0...9, the rest (0...7) are for training
    no random selection for reproducibility
2. folder structure change

> segmented/${username}/${dnum}/kdigits${trial}-${dnum}.wav
> > for example, for user "gjang", digit 2, recording trial 0 (1st)
> > "segmented/gjang/2/kdigits0-2.wav"

In [15]:
# import necessary packages
import numpy as np
import matplotlib.pyplot as plt
#from scikits.talkbox.features import mfcc
#librosa.feature.mfcc(*, y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm='ortho', lifter=0, **kwargs)[source]
from librosa.feature import mfcc
from scipy.io import wavfile
from hmmlearn import hmm
import numpy as np
import os
import warnings
import scipy.stats as sp
from time import time

warnings.filterwarnings("ignore")

__hyperparameters__ - CHANGE THEM TO IMPROVE PERFORMANCE
1. number of MFCC (feature dimension), try `num_mfcc` 6, 10, 13

2. Parameters needed to train GMMHMM: number of HMM states, number of Gaussian mixtures, diagonal or full covariance matrix, etc.

In [16]:
# 1. number of MFCC (feature dimension)
num_mfcc = 6
#num_mfcc = 10
#num_mfcc = 13
# 2. Parameters needed to train GMMHMM
m_num_of_HMMStates = 3  # number of states
m_num_of_mixtures = 2  # number of mixtures for each hidden state
m_covarianceType = 'diag'  # covariance type
m_n_iter = 10  # number of iterations
m_bakisLevel = 2

In [17]:
# extract MFCC features
def extmfcc(file):
    samplerate, d = wavfile.read(file)
    #features.append(mfcc(d, nwin=int(samplerate * 0.03), fs=samplerate, nceps= 6)[0])
    x = np.float32(d)
    hop=samplerate//100
    mc = mfcc(y=x, sr=samplerate, n_mfcc=num_mfcc, hop_length=hop, win_length=hop*2)
    return np.transpose(mc, (1,0))

__load data files__

1. find files: 
    for user `"gjang"`, digit 2, recording trial 0 (1st)
    `"segmented/gjang/2/kdigits0-2.wav"`
2. extract MFCC features for training and testing
    for each digit, indexes 4 and 9 for test, and the rest for training

In [18]:
#fpaths = []
#labels = []
spoken = []
m_trainingsetfeatures = []
m_trainingsetlabels = []
m_testingsetfeatures = []
m_testingsetlabels = []
n_folds = 5   # 0...3 for training, 4 for testing

apath = 'segmented'
count = 0
for username in os.listdir(apath):
    apath2 = apath + '/' + username    # example: segmented/gjang
    for ii in range(10):   #dnum in os.listdir(apath2):
        dnum = str(ii)
        apath3 = apath2 + '/' + dnum     # example: segmented/gjang/2
        if dnum not in spoken:
            spoken.append(dnum)
        for trial in range(10):
            file = apath3 + '/' + "kdigits{}-{}.wav".format(trial,dnum)      # segmented/gjang/2/kdigits0-2.wav
            mc = extmfcc(file)

            # display file names for the first 20 files only
            count += 1
            if count <= 20:
                print(file, dnum, end=' '); print(mc.shape, end=' ')
            elif count == 21:
                print('...'); print('')

            # 0...3 for training, 4 for testing
            if trial % n_folds == (n_folds-1):
                if count <= 20: print('testing')
                m_testingsetfeatures.append(mc)
                m_testingsetlabels.append(dnum)
            else:
                if count <= 20: print('training')
                m_trainingsetfeatures.append(mc)
                m_trainingsetlabels.append(dnum)


print('Words spoken:', spoken)
#print("number of labels and features = %d, %d" % ( len(labels), len(features) ))
#print("feature shape = ", end='')
#print(features[0].shape)

segmented/ohjihyeon/0/kdigits0-0.wav 0 (52, 6) training
segmented/ohjihyeon/0/kdigits1-0.wav 0 (43, 6) training
segmented/ohjihyeon/0/kdigits2-0.wav 0 (37, 6) training
segmented/ohjihyeon/0/kdigits3-0.wav 0 (41, 6) training
segmented/ohjihyeon/0/kdigits4-0.wav 0 (42, 6) testing
segmented/ohjihyeon/0/kdigits5-0.wav 0 (43, 6) training
segmented/ohjihyeon/0/kdigits6-0.wav 0 (42, 6) training
segmented/ohjihyeon/0/kdigits7-0.wav 0 (43, 6) training
segmented/ohjihyeon/0/kdigits8-0.wav 0 (42, 6) training
segmented/ohjihyeon/0/kdigits9-0.wav 0 (42, 6) testing
segmented/ohjihyeon/1/kdigits0-1.wav 1 (42, 6) training
segmented/ohjihyeon/1/kdigits1-1.wav 1 (41, 6) training
segmented/ohjihyeon/1/kdigits2-1.wav 1 (41, 6) training
segmented/ohjihyeon/1/kdigits3-1.wav 1 (61, 6) training
segmented/ohjihyeon/1/kdigits4-1.wav 1 (35, 6) testing
segmented/ohjihyeon/1/kdigits5-1.wav 1 (57, 6) training
segmented/ohjihyeon/1/kdigits6-1.wav 1 (46, 6) training
segmented/ohjihyeon/1/kdigits7-1.wav 1 (35, 6) trai

In [19]:
# gjang: shuffling the data (x)
# c = list(zip(features, labels))
# np.random.shuffle(c)
# features,labels = zip(*c)

In [20]:
# test and training for 100 files
ntest  = len(m_testingsetlabels)
ntrain = len(m_trainingsetlabels)
nfiles = ntest + ntrain

print("[training] number of labels and features = %d, %d" % 
        ( len(m_trainingsetlabels), len(m_trainingsetfeatures)) )
print("[test] number of labels and features = %d, %d" % 
        ( len(m_testingsetlabels), len(m_testingsetfeatures)) )

print ('Loading data completed')

[training] number of labels and features = 80, 80
[test] number of labels and features = 20, 20
Loading data completed


In [21]:
# model initialization
gmmhmmindexdict = {}
index = 0
for word in spoken:
    gmmhmmindexdict[word] = index
    index = index +1

def initByBakis(inumstates, ibakisLevel):
    startprobPrior = np.zeros(inumstates)
    startprobPrior[0: ibakisLevel - 1] = 1/float((ibakisLevel - 1))
    transmatPrior = getTransmatPrior(inumstates, ibakisLevel)
    return startprobPrior, transmatPrior

def getTransmatPrior(inumstates, ibakisLevel):
    transmatPrior = (1 / float(ibakisLevel)) * np.eye(inumstates)

    for i in range(inumstates - (ibakisLevel - 1)):
        for j in range(ibakisLevel - 1):
            transmatPrior[i, i + j + 1] = 1. / ibakisLevel

    for i in range(inumstates - ibakisLevel + 1, inumstates):
        for j in range(inumstates - i - j):
            transmatPrior[i, i + j] = 1. / (inumstates - i)

    return transmatPrior

m_startprobPrior ,m_transmatPrior = initByBakis(m_num_of_HMMStates,m_bakisLevel)

print("StartProbPrior=")
print(m_startprobPrior)

print("TransMatPrior=")
print(m_transmatPrior)

StartProbPrior=
[1. 0. 0.]
TransMatPrior=
[[0.5 0.5 0. ]
 [0.  0.5 0.5]
 [0.  0.  1. ]]


In [22]:
# acoustic model definition
class SpeechModel:
    def __init__(self,Class,label):
        self.traindata = np.zeros((0,num_mfcc))
        self.Class = Class
        self.label = label
        self.model  = hmm.GMMHMM(n_components = m_num_of_HMMStates, n_mix = m_num_of_mixtures, \
                transmat_prior = m_transmatPrior, startprob_prior = m_startprobPrior, \
                covariance_type = m_covarianceType, n_iter = m_n_iter)

In [23]:
# training GMMHMM Models 
start = time()

speechmodels = [None] * len(spoken)
for key in gmmhmmindexdict:
    speechmodels[gmmhmmindexdict[key]] = SpeechModel(gmmhmmindexdict[key],key)

for i in range(0,len(m_trainingsetfeatures)):
     for j in range(0,len(speechmodels)):
         if int(speechmodels[j].Class) == int(gmmhmmindexdict[m_trainingsetlabels[i]]):
            speechmodels[j].traindata = np.concatenate((speechmodels[j].traindata , m_trainingsetfeatures[i]))

for speechmodel in speechmodels:
    speechmodel.model.fit(speechmodel.traindata)

print ('Training completed -- {0} GMM-HMM models are built for {0} different types of words'.format(len(spoken)))
print('time elapsed: %.2f seconds' % ( time() - start ))
print (" "); print(" ")

Training completed -- 10 GMM-HMM models are built for 10 different types of words
time elapsed: 1.11 seconds
 
 


In [24]:
# testing
print("Prediction started")
m_PredictionlabelList = []

for i in range(0,len(m_testingsetfeatures)):
    scores = []
    for speechmodel in speechmodels:
         scores.append(speechmodel.model.score(m_testingsetfeatures[i]))
    id  = scores.index(max(scores))
    m_PredictionlabelList.append(speechmodels[id].Class)
    print(str(np.round(scores, 3)) + " " + str(max(np.round(scores, 3))) +" "+":"+ speechmodels[id].label)

accuracy = 0.0
count = 0
print("")
print("Prediction for Testing DataSet:")

for i in range(0,len(m_testingsetlabels)):
    print( "Label"+str(i+1)+":"+m_testingsetlabels[i])
    if gmmhmmindexdict[m_testingsetlabels[i]] == m_PredictionlabelList[i]:
       count = count+1

accuracy = 100.0*count/float(len(m_testingsetlabels))

print("")
print("accuracy ="+str(accuracy))
print("")

# end of testing

Prediction started
[-1067.984 -1389.925 -1369.197 -1336.733 -1515.543 -1299.77  -1304.884
 -1276.37  -1105.513 -1324.105] -1067.984 :0
[ -982.101 -1380.902 -1360.873 -1375.918 -1567.11  -1408.339 -1325.061
 -1273.369 -1047.708 -1400.093] -982.101 :0
[-1182.129  -887.468 -1189.939 -1304.515 -1447.553  -986.576 -1147.166
 -1164.415 -1027.541 -1061.327] -887.468 :1
[-1457.844 -1160.518 -1550.703 -1583.105 -1902.252 -1311.899 -1462.488
 -1474.446 -1321.626 -1385.078] -1160.518 :1
[-1122.272 -1232.233  -952.422 -1012.834 -1042.446 -1104.844 -1082.363
 -1167.347 -1083.392 -1329.701] -952.422 :2
[-1971.297 -2115.838 -1472.155 -1773.196 -1985.454 -1608.241 -1564.731
 -1625.342 -1740.519 -1800.439] -1472.155 :2
[-1206.125 -1452.649 -1318.973  -929.712 -1043.063 -1408.404 -1178.564
 -1121.677 -1146.838 -1534.051] -929.712 :3
[-1614.335 -1826.858 -1740.596 -1248.63  -1390.55  -1786.381 -1537.176
 -1486.657 -1512.634 -1900.368] -1248.63 :3
[-1139.745 -1333.682 -1285.552  -965.03   -911.163 -1317.1

## End of Lab 01