In [None]:
!pip -q install librosa
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [272]:
import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as idsp
from matplotlib.patches import ConnectionPatch
import scipy.spatial.distance as dist
from numpy.lib.function_base import extract

#MFCC

In [292]:
class MFCC():
  def __init__(self, path=['18020758_HoangPhuongLinh/26_30','18020909_Trần Công Minh/c1','18020909_Trần Công Minh/c3'], keep_all=False, keep_size=2):
    PATH = '/content/drive/MyDrive/Ex1-2022/02/'
    text = {}
    for p in path:
      if '18020758_HoangPhuongLinh' in p:
        text[p] = open(PATH+p+'.txt').read().split('\n')[:-1]
      else:
        text[p] = open(PATH+p+'.txt').read().split('\n')[:-1]
    
    times, commands = self.split_data(text, dict(), dict())

    sound = {}
    sr = {}
    for p in path:
      sound[p], sr[p] = librosa.load(PATH+p+'.wav', duration=22050)

    self.sounds = {}
    for p in commands:
      for e, i in enumerate(commands[p]):
        if i not in self.sounds:
          self.sounds[i] = []
        self.sounds[i].append([sound[p][int(22050*times[p][e][0]):int(22050*times[p][e][1])],sr[p]])

    self.sounds_sr = {}
    for command in self.sounds:
      if command not in self.sounds_sr:
        self.sounds_sr[command] = []
      if keep_all:
        self.sounds_sr[command]+=self.sounds[command]
      else:
        for i in range(keep_size):
          ra = np.random.randint(len(command))
          self.sounds_sr[command].append(self.sounds[command][i])

    self.mfccs_features = {}
    self.extract_feature()
  
  def split_data(self, text, times, commands):
    for p in text:
      if p not in times:
        times[p] = []
        commands[p] = []
      for i in text[p]:
        times[p].append([float(i.split('\t')[0]),float(i.split('\t')[1])])
        commands[p].append(i.split('\t')[2])
    return times, commands

  def extract_feature(self):
    for command in self.sounds_sr:
      if command not in self.mfccs_features:
        self.mfccs_features[command] = []
      for sound, sr in self.sounds_sr[command]:
        # print(len(sound))
        if len(sound)>9000:
          self.mfccs_features[command].append(self.extract_feature_mfcc(sound, sr))

  def extract_feature_mfcc(self,sound, sr):
      mfcc = librosa.feature.mfcc(y=sound, sr=sr, n_mfcc=13)
      mfcc_delta = librosa.feature.delta(mfcc)
      mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
      return np.concatenate((mfcc, mfcc_delta, mfcc_delta2))

#Speech Recognition DTW

In [None]:
class SpeechRecognitionDTW():
  def __init__(self):
    mfcc =  MFCC()
    self.features = mfcc.mfccs_features

  def dp(self, dist_mat):

      N, M = dist_mat.shape
      cost_mat = np.zeros((N + 1, M + 1))
      for i in range(1, N + 1):
          cost_mat[i, 0] = np.inf
      for i in range(1, M + 1):
          cost_mat[0, i] = np.inf

      traceback_mat = np.zeros((N, M))
      for i in range(N):
          for j in range(M):
              penalty = [
                  cost_mat[i, j],   
                  cost_mat[i, j + 1],
                  cost_mat[i + 1, j]] 
              i_penalty = np.argmin(penalty)
              cost_mat[i + 1, j + 1] = dist_mat[i, j] + penalty[i_penalty]
              traceback_mat[i, j] = i_penalty

      i = N - 1
      j = M - 1
      path = [(i, j)]
      while i > 0 or j > 0:
          tb_type = traceback_mat[i, j]
          if tb_type == 0:
              i = i - 1
              j = j - 1
          elif tb_type == 1:
              i = i - 1
          elif tb_type == 2:
              j = j - 1
          path.append((i, j))

      cost_mat = cost_mat[1:, 1:]
      return (path[::-1], cost_mat)

  def dtw(self, sound, sr):
    mfcc = librosa.feature.mfcc(y=sound, sr=sr, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    feature = np.concatenate((mfcc, mfcc_delta, mfcc_delta2))
    count = {}
    for command in self.features:
      if command not in count:
        count[command] = []
      for fea in self.features[command]:
        dist_mat = dist.cdist(feature.T, fea.T, "cosine")
        path, cost_mat = self.dp(dist_mat)
        count[command].append(cost_mat[-1, -1])
    
        average = dict([(k,np.min(v)) for (k,v) in count.items()])
    return min(average, key=average.get)

#HMM

In [None]:
!pip -q install hmmlearn

[?25l[K     |██▌                             | 10 kB 21.9 MB/s eta 0:00:01[K     |█████                           | 20 kB 14.1 MB/s eta 0:00:01[K     |███████▋                        | 30 kB 9.9 MB/s eta 0:00:01[K     |██████████                      | 40 kB 9.3 MB/s eta 0:00:01[K     |████████████▋                   | 51 kB 4.5 MB/s eta 0:00:01[K     |███████████████▏                | 61 kB 5.3 MB/s eta 0:00:01[K     |█████████████████▊              | 71 kB 5.7 MB/s eta 0:00:01[K     |████████████████████▏           | 81 kB 5.9 MB/s eta 0:00:01[K     |██████████████████████▊         | 92 kB 6.6 MB/s eta 0:00:01[K     |█████████████████████████▎      | 102 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████████▊    | 112 kB 5.2 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 122 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████████| 129 kB 5.2 MB/s 
[?25h

In [None]:
from hmmlearn import hmm
import os

In [291]:
class HMM():
   def __init__(self, n_components=4, path=['18020758_HoangPhuongLinh/','18020909_Trần Công Minh/']):
      PATH = '/content/drive/MyDrive/Ex1-2022/02/'
      subfolder = []
      for p in path:
        subfolder+=[os.path.join(PATH+p,i) for i in os.listdir(PATH+p)]
      k = [i.replace('/content/drive/MyDrive/Ex1-2022/02/', '').replace('.wav', '') for i in subfolder if 'txt' not in i]
      mfcc = MFCC(k, keep_all=True)
      features = mfcc.mfccs_features

      self.hmm_models = []

      for i in features:
        # fea = np.concatenate([mf.T for mf in features[i]])
        models = []
        hmm_trainer = hmm.GaussianHMM(n_components=n_components, covariance_type='diag', n_iter=1800)
        for j in features[i]:
          models = hmm_trainer.fit(j.T)
        self.hmm_models.append((models, i))
        hmm_trainer = None

   def get_score(self, sound, sr):
      mfcc = librosa.feature.mfcc(y=sound, sr=sr, n_mfcc=13)
      mfcc_delta = librosa.feature.delta(mfcc)
      mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
      feature = np.concatenate((mfcc, mfcc_delta, mfcc_delta2)).T
      scores=[]
      for item in self.hmm_models:
        hmm_model, label = item
        score = hmm_model.score(feature)
        scores.append(score)

      index=np.array(scores).argmax()
      return self.hmm_models[index][1]

#Đánh giá mô hình DTW

In [None]:
speechrecognitionDTW = SpeechRecognitionDTW()

In [None]:
k = 1
y, sr = librosa.load(f'/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c{k}.wav', duration=22050)
t = open(f'/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c{k}.txt').read().split('\n')
times = []
commands = []
for i in t[:-1]:
  times.append([float(i.split('\t')[0]),float(i.split('\t')[1])])
  commands.append(i.split('\t')[2])
command_dict = {}
for e, i in enumerate(commands):
  if i not in command_dict:
    command_dict[i] = []
  command_dict[i].append(times[e])

WORD = 'xuong'
s = y[int(22050*command_dict[WORD][0][0]):int(22050*command_dict[WORD][0][1])]
print(speechrecognitionDTW.dtw(s, sr))

idsp.Audio(data=s, rate=sr)

xuong


In [None]:
import tqdm
pre = []
lab = []
for k in tqdm.tqdm_notebook(range(1,101)):
  y, sr = librosa.load(f'/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c{k}.wav', duration=22050)
  t = open(f'/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c{k}.txt').read().split('\n')

  times = []
  commands = []
  for i in t[:-1]:
    times.append([float(i.split('\t')[0]),float(i.split('\t')[1])])
    commands.append(i.split('\t')[2])

  for i in range(len(times)):
    if times[i][1] - times[i][0] > 0.2:
      pre.append(speechrecognitionDTW.dtw(y[int(22050*times[i][0]):int(22050*times[i][1])], sr))
      lab.append(commands[i])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
wrong = 0
for i in range(len(pre)):
  if pre[i]!=lab[i]:
    wrong+=1

print(f'Accuracy: {(len(lab)-wrong)/len(lab)}')

find_wrong = {}
label = {}
for i in range(len(lab)):
  if lab[i].strip() not in find_wrong:
    find_wrong[lab[i].strip()] = 0
  if lab[i].strip() not in label:
    label[lab[i].strip()] = 0
  if lab[i].strip()!=pre[i].strip():
    find_wrong[lab[i].strip()]+=1
  label[lab[i].strip()]+=1
print('Accuracy:')
for i in find_wrong:
  print(f' {i}: {(label[i]-find_wrong[i])/label[i]: .3f}')

Accuracy: 0.5763765541740675
Accuracy:
 sil:  0.992
 len:  0.080
 xuong:  1.000
 phai:  0.342
 B:  0.970
 A:  0.393
 trai:  0.087
 nhay:  0.377
 ban:  0.958


# Đánh giá mô hình HMM

In [None]:
hmm_models = HMM()

In [297]:
k = 1
y, sr = librosa.load(f'/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c{k}.wav', duration=22050)
t = open(f'/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c{k}.txt').read().split('\n')
times = []
commands = []
for i in t[:-1]:
  times.append([float(i.split('\t')[0]),float(i.split('\t')[1])])
  commands.append(i.split('\t')[2])
command_dict = {}
for e, i in enumerate(commands):
  if i not in command_dict:
    command_dict[i] = []
  command_dict[i].append(times[e])

WORD = 'len'
s = y[int(22050*command_dict[WORD][0][0]):int(22050*command_dict[WORD][0][1])]

print(hmm_models.get_score(s, sr))
idsp.Audio(data=s, rate=sr)

len


In [298]:
import tqdm
pre = []
lab = []
for k in tqdm.tqdm_notebook(range(1,101)):
  y, sr = librosa.load(f'/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c{k}.wav', duration=22050)
  t = open(f'/content/drive/MyDrive/Ex1-2022/02/18021371_NguyenManhTuan/c{k}.txt').read().split('\n')

  times = []
  commands = []
  for i in t[:-1]:
    times.append([float(i.split('\t')[0]),float(i.split('\t')[1])])
    commands.append(i.split('\t')[2])

  for i in range(len(times)):
    if times[i][1] - times[i][0] > 0.2:
      pre.append(hmm_models.get_score(y[int(22050*times[i][0]):int(22050*times[i][1])], sr))
      lab.append(commands[i])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/100 [00:00<?, ?it/s]

In [299]:
wrong = 0
for i in range(len(pre)):
  if pre[i]!=lab[i]:
    wrong+=1

print(f'Accuracy: {(len(lab)-wrong)/len(lab)}')

find_wrong = {}
label = {}
for i in range(len(lab)):
  if lab[i].strip() not in find_wrong:
    find_wrong[lab[i].strip()] = 0
  if lab[i].strip() not in label:
    label[lab[i].strip()] = 0
  if lab[i].strip()!=pre[i].strip():
    find_wrong[lab[i].strip()]+=1
  label[lab[i].strip()]+=1
print('Accuracy:')
for i in find_wrong:
  print(f' {i}: {(label[i]-find_wrong[i])/label[i]: .3f}')

Accuracy: 0.1172291296625222
Accuracy:
 sil:  0.008
 len:  1.000
 xuong:  0.050
 phai:  0.000
 B:  0.000
 A:  0.000
 trai:  0.000
 nhay:  0.000
 ban:  0.000
