In [1]:
import numpy as np
import soundfile as sf
import os 
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import torch.nn.functional as F
import torch

from Lab3 import lab3_proto as proto3
from Lab3 import lab3_tools as tools3
from Lab2 import lab2_proto as proto2
from Lab2 import lab2_tools as tools2
from Lab1 import lab1_proto as proto1
from Lab1 import lab1_tools as tools1
from Lab2.prondict import prondict 

# Preparing the Data for DNN Training 

## 4.1 Target Class Definition

In [2]:
phoneHMMs = np.load("./Lab2/lab2_models_all.npz", allow_pickle=True)["phoneHMMs"].item()
phones = sorted(phoneHMMs.keys())
nstates = {phone: phoneHMMs[phone]['means'].shape[0] for phone in phones}
stateList = [ph + '_' + str(id) for ph in phones for id in range(nstates[ph])]
print(stateList)
print()
print(stateList.index('ay_2'))

['ah_0', 'ah_1', 'ah_2', 'ao_0', 'ao_1', 'ao_2', 'ay_0', 'ay_1', 'ay_2', 'eh_0', 'eh_1', 'eh_2', 'ey_0', 'ey_1', 'ey_2', 'f_0', 'f_1', 'f_2', 'ih_0', 'ih_1', 'ih_2', 'iy_0', 'iy_1', 'iy_2', 'k_0', 'k_1', 'k_2', 'n_0', 'n_1', 'n_2', 'ow_0', 'ow_1', 'ow_2', 'r_0', 'r_1', 'r_2', 's_0', 's_1', 's_2', 'sil_0', 'sil_1', 'sil_2', 'sp_0', 't_0', 't_1', 't_2', 'th_0', 'th_1', 'th_2', 'uw_0', 'uw_1', 'uw_2', 'v_0', 'v_1', 'v_2', 'w_0', 'w_1', 'w_2', 'z_0', 'z_1', 'z_2']

8


## 4.2 Forced Alignment

In [3]:
# the dataset is not on the github due to the copyright
filename = './../tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
samples, samplingrate = tools3.loadAudio(filename)
lmfcc = proto1.mfcc(samples)

wordTrans = list(tools3.path2info(filename)[2])  # Transcription using words
print(f"wordTrans: {wordTrans}")

phoneTrans = proto3.words2phones(wordTrans, prondict) # Transcription using phonemes
print(f"phoneTrans: {phoneTrans}")

utteranceHMM = proto2.concatHMMs(phoneHMMs, phoneTrans)
stateTrans = [phone + '_' + str(stateid) for phone in phoneTrans for stateid in range(nstates[phone])]  # Transcription using states
print(f"stateTrans[10]: {stateTrans[10]}")

obsloglik = tools2.log_multivariate_normal_density_diag(lmfcc, utteranceHMM["means"], utteranceHMM["covars"])
viterbiLoglik, viterbiPath = proto2.viterbi(obsloglik, np.log(utteranceHMM['startprob'][:-1]), np.log(utteranceHMM['transmat'][:-1, :-1]), forceFinalState=True)

viterbiStateTrans = [stateTrans[state] for state in viterbiPath]

trans = tools3.frames2trans(viterbiStateTrans, outfilename='z43a.lab')

wordTrans: ['z', '4', '3']
phoneTrans: ['sil', 'z', 'iy', 'r', 'ow', 'sp', 'f', 'ao', 'r', 'sp', 'th', 'r', 'iy', 'sp', 'sil']
stateTrans[10]: r_1


  viterbiLoglik, viterbiPath = proto2.viterbi(obsloglik, np.log(utteranceHMM['startprob'][:-1]), np.log(utteranceHMM['transmat'][:-1, :-1]), forceFinalState=True)


In [4]:
mspec_res = proto1.mspec(samples)

In [5]:
example = np.load("./Lab3/lab3_example.npz", allow_pickle=True)["example"].item()
# Compare each variable with its corresponding value in the example dictionary

print(f"lmfcc: {np.allclose(lmfcc, example['lmfcc'])}")
print(f"Our wordTrans: \n{wordTrans}\nCorrect wordTrans: \n{example['wordTrans']}")
print(f"Our phoneTrans: \n{phoneTrans}\nCorrect phoneTrans: \n{example['phoneTrans']}")
print(f"Our stateTrans: \n{stateTrans}\nCorrect stateTrans: \n{example['stateTrans']}")
print(f"obsloglik: {np.allclose(obsloglik, example['obsloglik'])}")
print(f"viterbiLoglik: {np.allclose(viterbiLoglik, example['viterbiLoglik'])}")
print(f"viterbiPath: {np.allclose(viterbiPath, example['viterbiPath'])}")
print(f"Our viterbiStateTrans: \n{viterbiStateTrans}\nCorrect viterbiStateTrans: \n{example['viterbiStateTrans']}")

lmfcc: True
Our wordTrans: 
['z', '4', '3']
Correct wordTrans: 
['z', '4', '3']
Our phoneTrans: 
['sil', 'z', 'iy', 'r', 'ow', 'sp', 'f', 'ao', 'r', 'sp', 'th', 'r', 'iy', 'sp', 'sil']
Correct phoneTrans: 
['sil', 'z', 'iy', 'r', 'ow', 'sp', 'f', 'ao', 'r', 'sp', 'th', 'r', 'iy', 'sp', 'sil']
Our stateTrans: 
['sil_0', 'sil_1', 'sil_2', 'z_0', 'z_1', 'z_2', 'iy_0', 'iy_1', 'iy_2', 'r_0', 'r_1', 'r_2', 'ow_0', 'ow_1', 'ow_2', 'sp_0', 'f_0', 'f_1', 'f_2', 'ao_0', 'ao_1', 'ao_2', 'r_0', 'r_1', 'r_2', 'sp_0', 'th_0', 'th_1', 'th_2', 'r_0', 'r_1', 'r_2', 'iy_0', 'iy_1', 'iy_2', 'sp_0', 'sil_0', 'sil_1', 'sil_2']
Correct stateTrans: 
['sil_0', 'sil_1', 'sil_2', 'z_0', 'z_1', 'z_2', 'iy_0', 'iy_1', 'iy_2', 'r_0', 'r_1', 'r_2', 'ow_0', 'ow_1', 'ow_2', 'sp_0', 'f_0', 'f_1', 'f_2', 'ao_0', 'ao_1', 'ao_2', 'r_0', 'r_1', 'r_2', 'sp_0', 'th_0', 'th_1', 'th_2', 'r_0', 'r_1', 'r_2', 'iy_0', 'iy_1', 'iy_2', 'sp_0', 'sil_0', 'sil_1', 'sil_2']
obsloglik: True
viterbiLoglik: True
viterbiPath: True
Our vi

## 4.3 Feature Extraction 

In [6]:
def feature_extraction(path):
  data = []

  for root, dirs, files in os.walk(path):
    for file in tqdm(files):
      if file.endswith('.wav'):
        filename = os.path.join(root, file)
        samples, samplingrate = tools3.loadAudio(filename)

        lmfcc = proto1.mfcc(samples) # Features used for HMM & DNN
        mspec_res = proto1.mspec(samples) # Features used for DNN

        wordTrans = list(tools3.path2info(filename))[2]  # Transcription using words
        phoneTrans = proto3.words2phones(wordTrans, prondict) # Transcription using phonemes
        targets = proto3.forcedAlignment(lmfcc, phoneHMMs, phoneTrans) # Align states to each utterance

        # converting targets to indices to save memory
        target_idx = np.array([stateList.index(target) for target in targets])

        data.append({'filename': filename, 'lmfcc': lmfcc,'mspec': mspec_res, 'targets': target_idx})

  return data


In [7]:
print("Extraction features from train data")
trainData = feature_extraction('../tidigits/disc_4.1.1/tidigits/train')
# Save the data to avoid computing it again
np.savez('trainData.npz', trainData=trainData)

print("Extracting features from test data")
testData = feature_extraction('../tidigits/disc_4.2.1/tidigits/test')
np.savez('testData.npz', testData=testData)

Extraction features from train data


0it [00:00, ?it/s]
0it [00:00, ?it/s]
  _, viterbi_path = viterbi(obslogik, np.log(utteranceHMM["startprob"][:-1]), np.log(utteranceHMM["transmat"][:-1, :-1]), forceFinalState=True)
100%|██████████| 77/77 [00:08<00:00,  9.61it/s]
100%|██████████| 77/77 [00:08<00:00,  9.00it/s]
100%|██████████| 77/77 [00:11<00:00,  6.84it/s]
100%|██████████| 77/77 [00:10<00:00,  7.44it/s]
100%|██████████| 77/77 [00:11<00:00,  6.82it/s]
100%|██████████| 77/77 [00:12<00:00,  6.23it/s]
100%|██████████| 77/77 [00:08<00:00,  9.08it/s]
100%|██████████| 77/77 [00:08<00:00,  9.16it/s]
100%|██████████| 77/77 [00:07<00:00, 10.95it/s]
100%|██████████| 77/77 [00:07<00:00,  9.98it/s]
100%|██████████| 77/77 [00:07<00:00, 10.50it/s]
100%|██████████| 77/77 [00:07<00:00, 10.09it/s]
100%|██████████| 77/77 [00:08<00:00,  9.50it/s]
100%|██████████| 77/77 [00:06<00:00, 11.20it/s]
100%|██████████| 77/77 [00:09<00:00,  8.40it/s]
100%|██████████| 77/77 [00:08<00:00,  9.54it/s]
100%|██████████| 77/77 [00:07<00:00, 10.31it/s]
10

Extracting features from test data


0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 77/77 [00:07<00:00, 10.99it/s]
100%|██████████| 77/77 [00:08<00:00,  9.46it/s]
100%|██████████| 77/77 [00:08<00:00,  9.29it/s]
100%|██████████| 77/77 [00:07<00:00, 10.67it/s]
100%|██████████| 77/77 [00:07<00:00, 10.13it/s]
100%|██████████| 77/77 [00:07<00:00, 10.74it/s]
100%|██████████| 77/77 [00:07<00:00, 10.40it/s]
100%|██████████| 77/77 [00:08<00:00,  9.59it/s]
100%|██████████| 77/77 [00:07<00:00, 10.68it/s]
100%|██████████| 77/77 [00:07<00:00, 10.46it/s]
100%|██████████| 77/77 [00:08<00:00,  9.15it/s]
100%|██████████| 77/77 [00:07<00:00, 10.61it/s]
100%|██████████| 77/77 [00:08<00:00,  8.66it/s]
100%|██████████| 77/77 [00:07<00:00, 10.01it/s]
100%|██████████| 77/77 [00:07<00:00,  9.68it/s]
100%|██████████| 77/77 [00:08<00:00,  8.84it/s]
100%|██████████| 77/77 [00:07<00:00, 10.76it/s]
100%|██████████| 77/77 [00:07<00:00, 10.38it/s]
100%|██████████| 77/77 [00:06<00:00, 11.21it/s]
100%|██████████| 77/77 [00:08<00:00,  9.45it/s]
10

## 4.4 Training and Validation Sets

In [8]:
def splitDataByGender(data_dict,gender, train_utterances):
  train_data, val_data = [], []
  for current_speaker in data_dict[gender].keys():
    # if train_data contains 90 or more procent of total gender utterances, remaining data is stored as validation
    if len(train_data) >= train_utterances:
      #print(f"len(train_data): {len(train_data)} > {train_utterances} --> Creating val set instead")
      val_data.extend(data_dict[gender][str(current_speaker)])
    # Otherwise, we keep adding to train data until we achieve 90%
    else:
      #print(f"len(train_data): {len(train_data)} < {train_utterances}")
      train_data.extend(data_dict[gender][str(current_speaker)])
  print(f"train_data: {len(train_data)} \t val_data: {len(val_data)}")
  return train_data, val_data

def splitData(total_data, split=0.1):
  data_by_gender = {"man":{}, "woman": {}}
  print(f"Total_data length = {len(total_data)}")
  for data in total_data:
    gender, speakerID, _, _ = tools3.path2info(data["filename"])  # path2info returns tuple (gender, speakerID, digits, repetition)
    if speakerID not in data_by_gender[gender]:
      data_by_gender[gender][speakerID] = []
    data_by_gender[gender][speakerID].append(data)

  # Calculate total utterances by summing the lengths of each gender's list
  total_male_utterances = sum(len(utterances) for utterances in data_by_gender["man"].values())
  total_female_utterances = sum(len(utterances) for utterances in data_by_gender["woman"].values())

  train_male_utterances = int(total_male_utterances * (1-split))     # compute how many male utterances to achieve 90%
  train_female_utterances = int(total_female_utterances * (1-split)) # compute how many female utterances to achieve 90%
  print(f"total male utterances: {total_male_utterances}\ntrain_male_utterances: {train_male_utterances}")
  print(f"total female utterances: {total_female_utterances}\ntrain_female_utterances: {train_female_utterances}")

  male_train_data, male_val_data = splitDataByGender(data_by_gender, "man", train_male_utterances)
  female_train_data, female_val_data = splitDataByGender(data_by_gender, "woman", train_female_utterances)

  train_data, val_data = [], []
  train_data.extend(male_train_data)
  train_data.extend(female_train_data)
  val_data.extend(male_val_data)
  val_data.extend(female_val_data)

  print(f"train data has {len(train_data)} elements")
  print(f"val data has {len(val_data)} elements")

  return train_data, val_data


In [10]:
train_data_path = "./trainData.npz"
test_data_path = "./testData.npz"
trainData = np.load(train_data_path, allow_pickle=True)["trainData"]
testData = np.load(test_data_path, allow_pickle=True)["testData"]

print(f"trainData has {len(trainData)} elements")
for key in trainData[0].keys():
  print(f"key: {key} - {trainData[0][key]}")


trainData has 8623 elements
key: filename - ../tidigits/disc_4.1.1/tidigits/train/woman/cl/za.wav
key: lmfcc - [[ 103.1708824   -82.31110635   14.29338738 ...  -38.65502294
  -155.95559352  -48.36368006]
 [ 109.42118969  -11.18161627  117.72133428 ...    8.63922846
   -40.50538129   21.4327149 ]
 [ 116.09423861  -46.88554472   25.04088065 ...  -75.15169037
   -16.42606443   67.31586026]
 ...
 [ 203.16342222  109.37593821   16.40208643 ...  -27.98419695
    -7.0104208     4.57830005]
 [ 164.15952204   87.7355812    85.18308825 ...  117.86320303
    15.88054796   27.40947736]
 [ 170.64996493  107.48479268  124.78120348 ...   -5.24549154
    49.41318624   10.30882344]]
key: mspec - [[-0.12084544  1.03020631  0.46282056 ...  2.09551978  2.27765554
   2.41654246]
 [ 2.54187435  3.06084006  3.07138977 ...  2.16677831  1.97251175
   2.2343435 ]
 [ 1.52510378  2.79239405  1.89996556 ...  1.91766317  1.89914442
   2.66109756]
 ...
 [ 4.40261508  3.47242441  2.87425463 ...  1.88805656  1.8249337

## 4.5 Acoustic Context (Dynamic Features)

In [11]:
def add_context(features, context=3):
    """
    Augments the features by adding context frames around each time step in the feature matrix.

    Args:
    features (np.array): The original feature matrix where each row is a time step and columns are features.
    context (int): The number of frames to include from before and after the current frame.

    Returns:
    np.array: An augmented feature matrix including context.
    """
    rows, cols = features.shape
    context_features = np.zeros((rows, cols * (2 * context + 1)))

    for i in range(rows):
        for j in range(-context, context + 1):
            if 0 <= i + j < rows:
                context_features[i, (j + context) * cols: (j + context + 1) * cols] = features[i + j]
            else:
                # Use mirroring for edge cases
                mirrored_index = min(max(0, i + j), rows - 1)
                context_features[i, (j + context) * cols: (j + context + 1) * cols] = features[mirrored_index]

    return context_features

In [17]:
add_context(trainData[0]["lmfcc"])

array([[103.1708824 , -82.31110635,  14.29338738, ...,   5.21621609,
        -62.65111028, -48.97697333],
       [103.1708824 , -82.31110635,  14.29338738, ..., -95.22756163,
         16.2071291 ,  40.90530715],
       [103.1708824 , -82.31110635,  14.29338738, ..., -72.15412933,
        -66.9767564 , -71.81263328],
       ...,
       [153.06616189,  26.72406375,  29.80793612, ...,  -5.24549154,
         49.41318624,  10.30882344],
       [128.26044834,  37.93693872,  99.15316669, ...,  -5.24549154,
         49.41318624,  10.30882344],
       [177.653294  ,  91.29562855,  43.55066378, ...,  -5.24549154,
         49.41318624,  10.30882344]])

## 4.6 Feature Standardisation 

In [None]:

def preprocessing(totalData, _num_classes):
  Ns = [len(data['targets']) for data in totalData]
  N = sum(Ns)
  D_lmfcc = trainData[0]["lmfcc"].shape[1]
  D_mspec = trainData[0]["mspec"].shape[1]
  X_lmfcc = np.zeros((N,D_lmfcc)).astype("float32")
  X_mspec = np.zeros((N,D_mspec)).astype("float32")
  Y = np.zeros(N)
  print(f"X_lmfcc: {X_lmfcc.shape}")
  print(f"X_mspec: {X_mspec.shape}")
  print(f"Y: {Y.shape}")

  startPointer = 0
  for i, data in enumerate(totalData):
    # add if statement for if we want to use context dependant data or not. Currently only uses non-context data as 4.5 is not done
    X_lmfcc[startPointer: startPointer+Ns[i]] = data["lmfcc"]
    X_mspec[startPointer: startPointer+Ns[i]] = data["mspec"]
    Y[startPointer: startPointer+Ns[i]] = data["targets"]
    startPointer += Ns[i] # move pointer to next empty index

  # class labels are 39., 40. and such and we need ints for the one_hot function
  Y = F.one_hot(torch.tensor(Y, dtype=torch.int64), num_classes=_num_classes)
  return X_lmfcc, X_mspec, Y

# Creating scalers to standardize the data
scalerLMFCC = StandardScaler()
scalerMSPEC = StandardScaler()

# Preprocessing the data
trainLMFCCX, trainMSPECX, trainY = preprocessing(trainData, len(stateList))
valLMFCCX, valMSPECX, valY = preprocessing(valData, len(stateList))
testLMFCCX, testMSPECX, testY = preprocessing(testData, len(stateList))

# Standardizing the data
scalerLMFCC.fit(trainLMFCCX)
trainLMFCCX = scalerLMFCC.transform(trainLMFCCX)
valLMFCCX = scalerLMFCC.transform(valLMFCCX)
testLMFCCX = scalerLMFCC.transform(testLMFCCX)

scalerMSPEC.fit(trainMSPECX)
trainMSPECX = scalerMSPEC.transform(trainMSPECX)
valMSPECX = scalerMSPEC.transform(valMSPECX)
testMSPECX = scalerMSPEC.transform(testMSPECX)

print(f"Preproccsed all data")
print(f"trainLMFCCX: {trainLMFCCX.shape} \t trainMSPECX: {trainMSPECX.shape} \t trainY: {trainY.shape}")
print(f"valLMFCCX: {valLMFCCX.shape} \t valMSPECX: {valMSPECX.shape} \t valY: {valY.shape}")
print(f"testLMFCCX: {testLMFCCX.shape} \t testMSPECX: {testMSPECX.shape} \t testY: {testY.shape}")

# 5 Phoneme Recognition with Deep Neural Networks

## 5.1 Detailed Evaluation

## 5.2 Possible Questions