In [2]:
import numpy as np

# 4 Preparing the Data for DNN Training

## 4.1 Target Class Definition

In [4]:
# Load the phone-level HMMs produced in Lab 2. The archive entry is a 0-d object
# array wrapping a dict, hence allow_pickle=True followed by .item().
phoneHMMs = np.load("../Lab2/lab2_models_all.npz", allow_pickle=True)["phoneHMMs"].item()

# Sorting the phone names fixes the ordering (and therefore the indices) of the classes.
phones = sorted(phoneHMMs)

# States per phone = number of rows of that phone's 'means' array.
nstates = {name: len(phoneHMMs[name]['means']) for name in phones}

# One class label per (phone, state) pair, formatted "<phone>_<state index>".
stateList = [f"{name}_{state_idx}" for name in phones for state_idx in range(nstates[name])]

print(stateList)
print()
# Sanity check: position of one known label in the class list.
print(stateList.index('ay_2'))

['ah_0', 'ah_1', 'ah_2', 'ao_0', 'ao_1', 'ao_2', 'ay_0', 'ay_1', 'ay_2', 'eh_0', 'eh_1', 'eh_2', 'ey_0', 'ey_1', 'ey_2', 'f_0', 'f_1', 'f_2', 'ih_0', 'ih_1', 'ih_2', 'iy_0', 'iy_1', 'iy_2', 'k_0', 'k_1', 'k_2', 'n_0', 'n_1', 'n_2', 'ow_0', 'ow_1', 'ow_2', 'r_0', 'r_1', 'r_2', 's_0', 's_1', 's_2', 'sil_0', 'sil_1', 'sil_2', 'sp_0', 't_0', 't_1', 't_2', 'th_0', 'th_1', 'th_2', 'uw_0', 'uw_1', 'uw_2', 'v_0', 'v_1', 'v_2', 'w_0', 'w_1', 'w_2', 'z_0', 'z_1', 'z_2']

8


## 4.2 Forced Alignment

In [None]:
# Worked example: force-align one utterance (z43a.wav) against its known transcription.
# NOTE(review): this path has no 'Labs/' component, unlike the dataset paths used in the
# feature-extraction cell further down — confirm which location is correct.
filename = '/content/drive/MyDrive/KTH/DD2119_Speech_Recognition/data/tidigits/tidigits/disc_4.1.1/tidigits/train/man/nw/z43a.wav'
samples, samplingrate = loadAudio(filename)
lmfcc = mfcc(samples)

# Element [2] of path2info's result is the digits field; list() splits it into one
# symbol per element (presumably the digits are a string such as 'z43' — confirm).
wordTrans = list(path2info(filename)[2])  # Transcription using words
print(f"wordTrans: {wordTrans}")

phoneTrans = words2phones(wordTrans, prondict) # Transcription using phonemes
print(f"phoneTrans: {phoneTrans}")

# Build a single HMM for the whole utterance from the per-phone models, and the
# matching list of state names ("<phone>_<state index>") in the same phone order.
utteranceHMM = concatHMMs(phoneHMMs, phoneTrans)
stateTrans = [phone + '_' + str(stateid) for phone in phoneTrans for stateid in range(nstates[phone])]  # Transcription using states
print(f"stateTrans[10]: {stateTrans[10]}")

# Score the features against the utterance HMM's Gaussians, then decode with Viterbi.
# The [:-1] slices drop the last start-probability entry and the last row/column of the
# transition matrix (presumably a non-emitting exit state added by concatHMMs — confirm).
obsloglik = log_multivariate_normal_density_diag(lmfcc, utteranceHMM["means"], utteranceHMM["covars"])
viterbiLoglik, viterbiPath = viterbi(obsloglik, np.log(utteranceHMM['startprob'][:-1]), np.log(utteranceHMM['transmat'][:-1, :-1]), forceFinalState=True)

# Translate each state index on the Viterbi path into its state name.
viterbiStateTrans = [stateTrans[state] for state in viterbiPath]

# Convert the per-frame state names into a transcription; outfilename is passed so the
# helper presumably also writes it to 'z43a.lab' — confirm against frames2trans.
trans = frames2trans(viterbiStateTrans, outfilename='z43a.lab')

In [None]:
# Second feature set for the example utterance, computed from the same samples
# (mspec is defined outside this notebook section).
mspec_res = mspec(samples)

In [None]:
# Reference values for the example utterance. The archive entry is a 0-d object array
# wrapping a dict, hence allow_pickle=True followed by .item().
example = np.load("lab3_example.npz", allow_pickle=True)["example"].item()
# Compare each variable with its corresponding value in the example dictionary.
# Numeric results are checked with np.allclose (prints True/False); the transcription
# lists are printed next to the reference values for visual comparison.

print(f"lmfcc: {np.allclose(lmfcc, example['lmfcc'])}")
print(f"Our wordTrans: \n{wordTrans}\nCorrect wordTrans: \n{example['wordTrans']}")
print(f"Our phoneTrans: \n{phoneTrans}\nCorrect phoneTrans: \n{example['phoneTrans']}")
print(f"Our stateTrans: \n{stateTrans}\nCorrect stateTrans: \n{example['stateTrans']}")
print(f"obsloglik: {np.allclose(obsloglik, example['obsloglik'])}")
print(f"viterbiLoglik: {np.allclose(viterbiLoglik, example['viterbiLoglik'])}")
print(f"viterbiPath: {np.allclose(viterbiPath, example['viterbiPath'])}")
print(f"Our viterbiStateTrans: \n{viterbiStateTrans}\nCorrect viterbiStateTrans: \n{example['viterbiStateTrans']}")

## 4.3 Feature Extraction 

In [None]:
from tqdm import tqdm
def feature_extraction(path):
  """Extract features and forced-alignment targets for every .wav file under `path`.

  The directory tree is walked recursively. For each .wav file the function
  computes both feature sets, derives the transcription from the file path,
  force-aligns it, and stores the aligned state names as indices into
  `stateList`.

  Args:
    path: root directory to search for .wav files.

  Returns:
    A list with one dict per utterance, with keys 'filename', 'lmfcc',
    'mspec' and 'targets' (a NumPy array of `stateList` indices).

  Raises:
    KeyError: if the alignment yields a state name that is not in `stateList`.
  """
  # Imported locally so the function does not rely on `os` having been pulled
  # into the notebook namespace by some other cell.
  import os

  # Build the name -> index table once instead of scanning stateList for every frame.
  state_to_idx = {state: idx for idx, state in enumerate(stateList)}

  data = []

  for root, _dirs, files in os.walk(path):
    for file in tqdm(files):
      if not file.endswith('.wav'):
        continue

      filename = os.path.join(root, file)
      samples, _samplingrate = loadAudio(filename)

      lmfcc = mfcc(samples) # Features used for HMM & DNN
      mspec_res = mspec(samples) # Features used for DNN

      # Index the digits field first, then split it with list(): same form as the
      # single-utterance example that is checked against the reference values.
      wordTrans = list(path2info(filename)[2])  # Transcription using words
      phoneTrans = words2phones(wordTrans, prondict) # Transcription using phonemes
      targets = forcedAlignment(lmfcc, phoneHMMs, phoneTrans) # Align states to each utterance

      # Store targets as integer indices rather than strings to save memory.
      target_idx = np.array([state_to_idx[target] for target in targets])

      data.append({'filename': filename, 'lmfcc': lmfcc,'mspec': mspec_res, 'targets': target_idx})

  return data


In [None]:
# Training-set extraction is disabled here; the lines are kept so the cached
# 'trainData.npz' archive can be regenerated when needed.
#print("Extraction features from train data")
#trainData = feature_extraction('/content/drive/MyDrive/KTH/DD2119_Speech_Recognition/Labs/data/tidigits/tidigits/disc_4.1.1/tidigits/train')
# Save the data to avoid computing it again
#np.savez('trainData.npz', trainData=trainData)

# Test-set extraction: walk the test tree, then cache the result next to the notebook.
print("Extracting features from test data")
test_root = '/content/drive/MyDrive/KTH/DD2119_Speech_Recognition/Labs/data/tidigits/tidigits/disc_4.2.1/tidigits/test'
testData = feature_extraction(test_root)
np.savez('testData.npz', testData=testData)

In [None]:
from google.colab import files

# Locations of the cached feature archives. These sit directly under /content,
# not under the mounted Drive folder (/content/drive/...).
train_data_path, test_data_path = '/content/trainData.npz', '/content/testData.npz'

# Hand the archive to the browser for download; only the test set is fetched here.
# files.download(train_data_path)
files.download(test_data_path)

## 4.4 Training and Validation Sets

In [None]:
def splitDataByGender(data_dict, gender, train_utterances):
  """Split one gender's utterances into training and validation lists, speaker by speaker.

  Speakers are visited in the dict's iteration order. A speaker's utterances all
  go to the training list as long as that list holds fewer than
  `train_utterances` items when the speaker is reached; every later speaker goes
  to the validation list. A speaker is therefore never split across the two
  sets, and the training list may end up larger than `train_utterances`.

  Args:
    data_dict: mapping gender -> {speaker id -> list of utterances}.
    gender: key of `data_dict` selecting which speakers to split.
    train_utterances: number of training utterances to reach before switching
      to the validation list.

  Returns:
    Tuple (train_data, val_data) of flat utterance lists.
  """
  train_data, val_data = [], []
  # Iterate over the per-speaker lists directly: looking each one up again via
  # str(speaker) would raise KeyError for speaker ids that are not strings.
  for speaker_utterances in data_dict[gender].values():
    if len(train_data) < train_utterances:
      # Target not reached yet: this whole speaker goes to training.
      train_data.extend(speaker_utterances)
    else:
      # Target reached: all remaining speakers go to validation.
      val_data.extend(speaker_utterances)
  print(f"train_data: {len(train_data)} \t val_data: {len(val_data)}")
  return train_data, val_data

def splitData(total_data, split=0.1):
  """Split utterances into training and validation sets, keeping speakers intact.

  Utterances are grouped by gender and speaker (both read from each item's
  "filename" via path2info). Each gender is then split separately with
  splitDataByGender, aiming for a fraction (1 - split) of that gender's
  utterances in the training set, and the two genders are concatenated
  (men first, then women).

  Args:
    total_data: iterable of dicts, each with a "filename" entry.
    split: fraction of each gender's utterances intended for validation.

  Returns:
    Tuple (train_data, val_data) of utterance lists.
  """
  # gender -> speakerID -> utterances; only these two genders are accepted.
  data_by_gender = {"man":{}, "woman": {}}
  print(f"Total_data length = {len(total_data)}")
  for utterance in total_data:
    # path2info returns tuple (gender, speakerID, digits, repetition)
    gender, speakerID, _digits, _repetition = path2info(utterance["filename"])
    data_by_gender[gender].setdefault(speakerID, []).append(utterance)

  # Per-gender utterance totals across all speakers.
  total_male_utterances = sum(map(len, data_by_gender["man"].values()))
  total_female_utterances = sum(map(len, data_by_gender["woman"].values()))

  # Number of utterances per gender that the training set should reach.
  train_male_utterances = int(total_male_utterances * (1-split))
  train_female_utterances = int(total_female_utterances * (1-split))
  print(f"total male utterances: {total_male_utterances}\ntrain_male_utterances: {train_male_utterances}")
  print(f"total female utterances: {total_female_utterances}\ntrain_female_utterances: {train_female_utterances}")

  male_train_data, male_val_data = splitDataByGender(data_by_gender, "man", train_male_utterances)
  female_train_data, female_val_data = splitDataByGender(data_by_gender, "woman", train_female_utterances)

  # Merge the per-gender splits into fresh lists.
  train_data = male_train_data + female_train_data
  val_data = male_val_data + female_val_data

  print(f"train data has {len(train_data)} elements")
  print(f"val data has {len(val_data)} elements")

  return train_data, val_data


In [None]:
# Load the cached, unsplit training set under its own name. The extraction cell that
# would define it is commented out, so on a fresh kernel nothing else provides it, and
# overwriting the input in place would re-split an already-split set on every re-run.
# The archive holds an object array of dicts, hence allow_pickle=True.
fullTrainData = np.load('trainData.npz', allow_pickle=True)['trainData']
trainData, valData = splitData(fullTrainData)


## 4.5 Acoustic Context (Dynamic Features)

## 4.6 Feature Standardisation 

# 5 Phoneme Recognition with Deep Neural Networks

## 5.1 Detailed Evaluation

## 5.2 Possible Questions