Papers referenced:

https://pdfs.semanticscholar.org/ed65/7f82934353854bf00d7a3c923eb3aec4370b.pdf

ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf

https://arxiv.org/pdf/1707.07413.pdf

https://arxiv.org/pdf/1803.05563.pdf

I tried converting the labels to phonemes using phonemizer(https://github.com/bootphon/phonemizer) but did not get better results. I've commented out the phonemizer parts.

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import torchaudio
import pandas as pd
import numpy as np
from numpy import ma
import librosa as lib
import librosa.display as display
import matplotlib.pyplot as plt
import os
import time
import math
#import phonemizer      #used to convert labels to phonemes (didn't make a difference during training but so far nothing has worked)
import collections
torch.backends.cudnn.enabled = True   #the ctc loss function has a special input format for cudnn so I tried seeing if
                                      #disabling it would make a difference (it did not)

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


The functions below handle converting labels to ground truth indexes as well as converting network outputs to characters.

In [3]:
ground_truth = ['blank', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
                'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', "'"]#, '.', ",", '?']

def format_label(label, ground_truth, target_len):
    label = label.lower() #convert capital letters to lowercase
    output = torch.zeros([target_len]) #all targets must be the same length for batch training
    for i in range(0, target_len):
        if i < len(label):
            if label[i] not in ground_truth: #replace any unaccounted for characters with ' '
                output[i] = ground_truth.index(' ')
            else:
                output[i] = ground_truth.index(label[i])
        else:
            output[i] = -1
    return output.int()

def second_largest(nums): #used for debugging
    largest = -99999999
    secondLargest = -99999999
    index = -1
    Lindex = -1
    for i in range(0,len(nums)):
        if nums[i] > largest:
            secondLargest = largest
            largest = nums[i]
            index = Lindex
            Lindex = i
            
        if nums[i] < largest and nums[i] > secondLargest:
            secondLargest = nums[i]
            index = i
    return secondLargest, index

def transcribe_output(output, trained, last_char):
    sentence = []
    seconds = []
    lastMax = -1
    for i in range(0,len(output)):
        maxNum = output[i][0].max(0)[1]
        letter = ground_truth[maxNum]
        if letter == 'blank':
            seconds.append(ground_truth[second_largest(output[i][0])[1]])
            if i == last_char[0]:
                seconds.append('STOP')
            continue
        elif lastMax == maxNum and not trained: #ignore duplicate letters if the network is fully trained
            continue
        else:
            lastMax = maxNum
            sentence.append(letter)
            if i == last_char[0]:
                sentence.append('STOP')
    return sentence, seconds

#extra transcribe output function added to handle phoneme outputs (requires dataset to be initialized and coverted to phonemes)
#def transcribe_output2(output, trained, last_char):
#    sentence = []
#    seconds = []
#    lastMax = -1
#    for i in range(0,len(output)):
#        maxNum = output[i][0].max(0)[1]
#        letter = dataset.phones[maxNum]
#        if letter == 'blank':
#            seconds.append(dataset.phones[second_largest(output[i][0])[1]])
#            if i == last_char[0]:
#                seconds.append('STOP')
#            continue
#        elif lastMax == maxNum and not trained: #ignore duplicate letters if the network is not fully trained
#            continue
#        else:
#            lastMax = maxNum
#            sentence.append(letter)
#            if i == last_char[0]:
#                sentence.append('STOP')
#    return sentence, seconds

In [4]:
dirRoot = "./datasets/LibriSpeech/train-clean-360/"

The class below handles importing and formatting the LibriSpeech dataset. 

In [5]:
class LibriSpeech_Dataset(Dataset):
    def __init__(self, dirRoot, n_mels, max_len, window_size, skip, style = 'lib'):
        # dirRoot: path to Librispeech dataset
        # n_mels: number of mel coefficients used in the mel spectrum (using 80)
        # max_len: max length of the audio files used in the dataset (sample rate is 16000Hz)
        # window_size: sets the input window size
        # skip: sets the number of samples to skip before creating the next input window
        # style: determines whether librosa or torchaudio is used to calculate the mel_spectrum
        
        self.dir_root = dirRoot
        self.n_mels = n_mels
        self.max_len = max_len
        self.window = window_size
        self.skip = skip
        self.max_target_len = 0
        self.outliers = []
        self.lengths = []
        self.style = style
        self.word_list = []
        self.targ_type = 'alph'
        
        #first create folder info lists
        self.speaker_list = os.listdir(dirRoot)
        self.chapter_list = []
        for i in self.speaker_list:
            self.chapter_list.append(os.listdir(dirRoot + i + '/'))
        
        #add all file names to a single list
        self.file_names = []
        self.labels = []
        x = 0
        running_label_max = 0
        running_audio_max = 0
        for i in range(0, len(self.speaker_list)): #for each speaker
            #uncomment the next two lines for dataset class debugging (skips long initialization time)
            #if x > 200:
            #    break
            for j in range(0, len(self.chapter_list[i])): #for the chapters they read
                file_path = self.speaker_list[i] + '/' + self.chapter_list[i][j] + '/' #set file path prefix
                label_file_name = self.speaker_list[i] + '-' + self.chapter_list[i][j] + '.trans.txt'
                try: 
                    label_file = pd.read_csv(self.dir_root + file_path + label_file_name)
                except FileNotFoundError:
                    print(label_file_name + ' not found')
                    continue
                for z in os.listdir(dirRoot + file_path): #for every audio file in the chapter folder
                    if z[-4:] == 'flac': #skip txt files
                        label_index = z[-9:-5]
                        label_index = int(label_index)
                        if (label_index == 0): #label_file ignores the first label entry
                            continue
                        full_label = label_file.iloc[label_index - 1][0] #0 corresponds to label 1
                        label = full_label.split(' ', 1)
                        audio = torchaudio.load(self.dir_root + file_path + z, out = None, normalization = True)
                        if audio[0].size(0) < self.max_len:  #only add audio files shorter then the max_len
                            self.labels.append(label[1])     #append the label without the file info
                            self.file_names.append(file_path + z)
                            self.lengths.append(audio[0].size(0))
                            if audio[0].size(0) > running_audio_max: #used for debugging
                                running_audio_max = audio[0].size(0)
                                self.max_len_idx = x
                            if len(label[1]) > running_label_max:    
                                #words = label[1].split(' ')
                                running_label_max = len(label[1])    #used for label padding
                                self.max_target_idx = x
                            x += 1
                self.max_label_len = running_label_max
                
    #function below is used to convert targets to phonemes and change the dataset to phoneme mode          
    #def init_phoneme_targets(self):
    #    self.targ_type = 'phone'
    #    self.phone_targets = []
    #    self.phones = ['blank', ' ', 'dh', 'ax', 'b', 'er', 'd', 'l', 'ay', 'f', 'ah', 'v', 'ae', 't', 's', 'ey', 
    #                   'hh', 'k', 'r', 'iy', 'eh', 'n', 'p', 'aa', 'm', 'ch', 'ih', 'th', 'w', 'ow', 'uw', 'z', 'ng', 
    #                   'g', 'sh', 'uh', 'ao', 'y', 'aw', 'oy', 'jh', 'zh']
    #    for i in range(0, len(self.labels)):
    #        temp = phonemizer.phonemize(self.labels[i], separator = sep)
    #        words = temp.split(' ')
    #        target = torch.zeros([self.max_label_len])
    #        ind = 0
    #        for word in words:
    #            phones = word.split('^')
    #            for phone in phones:
    #                if phone is '':
    #                    continue
    #                #if phone not in self.phones:
    #                #    self.phones.append(phone)
    #                target[ind] = self.phones.index(phone)
    #                ind += 1
    #            if word is not words[-1]:
    #                target[ind] = 1
    #                ind += 1
    #        self.phone_targets.append(target)
            
    #load an audio file using torch audio                      
    def get_audio(self, index):
        data = torchaudio.load(self.dir_root + self.file_names[index], out = None, normalization = True)
        raw_data = torchaudio.transforms.LC2CL()(data[0])
        return (raw_data)
    
    #get a mel spectrum using either librosa ('lib') or torchaudio ('torch')
    # ***torchaudio's melspectrum function does not return the same spectrum as librosa (use librosa)
    def get_melSpec(self, index, style = 'lib'):
        #lib returns a log energy mel spectrum
        if style == 'lib':
            data, sample_rate = lib.core.load(self.dir_root + self.file_names[index], sr = None)
            mel_feats = lib.feature.melspectrogram(y = data, n_mels = self.n_mels, sr = sample_rate,
                                               n_fft = 400, hop_length = 160, power = 1)
            #mel_feats = lib.core.power_to_db(mel_feats)
            mel_feats = ma.log(mel_feats)
            mel_feats = torch.from_numpy(mel_feats)
            mel_feats = mel_feats.float()
        #torch returns a decibel mel spectrum (not the same as the librosa decibel spectrum)
        elif style == 'torch':
            data = torchaudio.load(self.dir_root + self.file_names[index], out = None, normalization = True)
            raw_data = torchaudio.transforms.LC2CL()(data[0])
            mel_feats = torchaudio.transforms.MEL2(sr = data[1], n_mels = self.n_mels, ws = 400, hop = 160)(raw_data)
            mel_feats = mel_feats.permute(0, 2, 1)
            mel_feats = mel_feats[0]
            #mel_feats = lib.core.db_to_power(mel_feats.numpy(), ref = 1)
            #mel_feats = torch.from_numpy(mel_feats)
        return mel_feats
    
    #orders the audio files from shortest to longest
    def order(self):
        ordered_labels = []
        ordered_file_names = []
        ordered_lengths = []
        minIndex = -1
        for j in range(0, len(self.lengths)):
            runningMin = 999999999
            for i in range(0, len(self.lengths)):
                if self.lengths[i] < runningMin:
                    runningMin = self.lengths[i]
                    minIndex = i
            ordered_labels.append(self.labels.pop(minIndex))
            ordered_file_names.append(self.file_names.pop(minIndex))
            ordered_lengths.append(self.lengths.pop(minIndex))
        self.labels = ordered_labels
        self.file_names = ordered_file_names
        self.lengths = ordered_lengths
    
    #copies a set number of mel spectrum columns to a 1-D tensor
    def get_sample(self, feats, sampleNum):
        feats = feats.permute(1, 0)
        x = torch.zeros([self.n_mels * self.window])
        for i in range(0, self.window):
            x[i*self.n_mels:(i+1)*self.n_mels] = feats[sampleNum + i]
        return x
    
    def __getitem__(self, index):
        log_feats = self.get_melSpec(index, style = self.style)
        ref_len = self.max_len//160
        if log_feats.size(1) < ref_len:  #pad inputs to the same size
            length = log_feats.size(1)//self.skip - self.window
            padding = torch.zeros([self.n_mels, ref_len-log_feats.size(1)])
            log_feats = torch.cat([log_feats, padding], dim = 1)
        else:
            length = ref_len//self.skip - self.window
            self.outliers.append(index) #used for debugging
            log_feats = log_feats.narrow(1, 0, ref_len)
        
        formatted_feats = torch.zeros([(log_feats.size(1) - self.window)//self.skip, self.n_mels * self.window])
        for j in range(0, formatted_feats.size(0)): #copy each input window into the final output array
            formatted_feats[j] = self.get_sample(log_feats, j*self.skip)
            
        if self.targ_type == 'alph':
            target = format_label(self.labels[index], ground_truth, self.max_label_len)
        else:
            target = self.phone_targets[index]
        return formatted_feats, target, self.labels[index], length
            
    def __len__(self):
        return (len(self.labels))
    
            
timer1 = time.time()

dataset = LibriSpeech_Dataset(dirRoot, 80, 240000, 3, 3, style = 'lib')

timer2 = time.time()
print(timer2 - timer1)

print(len(dataset))

325.2520925998688
73133


The code block below is used to test librosa's and torchaudio's mel spectrum functions. Currently librosa is set to return a log energy mel spectrum and torch returns a decibel mel spectrum. Librosa's decibel spectrum does not match the torchaudio spectrum so I stopped using torchaudio. Librosa is about 10 times slower than torchaudio so finding a way to fix this bug could greatly speed up training.

In [6]:
mels = dataset.get_melSpec(1, 'torch')
libMels = dataset.get_melSpec(1, 'lib')
#librosa not currently configured to print decibels
print(mels[4][0:20])
print(libMels[4][0:20])

tensor([-67.0145, -59.8686, -56.0617, -59.7010, -63.2066, -63.2283, -60.3874,
        -64.3774, -53.2120, -58.0290, -59.6683, -58.4790, -57.3591, -59.2487,
        -76.2957, -61.0005, -80.0000, -62.4648, -67.3784, -61.4863])
tensor([-7.4339, -7.3955, -8.1012, -6.7015, -7.2257, -7.0282, -7.6561, -7.5710,
        -6.9217, -7.0185, -6.8555, -6.8587, -7.3021, -7.9134, -7.4400, -7.5288,
        -8.0137, -6.5596, -7.0451, -6.7670])


Only run the code block below if the dataset needs to be ordered,

In [8]:
#run this block to order the dataset
x = 0
timer1 = time.time()
dataset.order()
#dataset.init_phoneme_targets()
timer2 = time.time()
print(timer2-timer1)

11.980390548706055


In [9]:
#play an audio track to ensure the label matches the audio
import IPython.display as ipd
print(dataset.labels[5])
ipd.Audio(dataset.get_audio(5), rate = 16000)

IF YOU WANT A GAMELESS STATE LET THE DESTRUCTION GO ON AS IT NOW IS GOING WITH SIXTEEN THOUSAND LICENSED GUNNERS IN THE FIELD EACH YEAR AND YOU WILL SURELY HAVE IT RIGHT SOON DELAWARE


I have been using a bidirection LSTM RNN as my network. I have tried using 3 layer and 5 layer BLSTMs with both 300 and 512 hidden sizes. The network is currently configured to a 3 layer BLSTM with a hidden size of 300. I also tried training on a smaller (shorter audio tracks only) ordered dataset first then switching to a larger unordered dataset.

In [21]:
class Net(nn.Module):  
    def __init__(self, input_sz):
        super(Net, self).__init__()
        self.drop = nn.Dropout(0.3)
        self.drop2 = nn.Dropout(0.3)
        self.lstm = nn.LSTM(input_size = input_sz, hidden_size = 300, num_layers = 3, bidirectional = True, dropout = 0.3)
        self.fc1 = nn.Linear(600, 512)
        self.fc2 = nn.Linear(512, len(ground_truth))

        
    def forward(self, x): #expected dims: batch, sample, data
        x = x.permute(1, 0, 2).contiguous()
        x, self.hidden = self.lstm(x, self.hidden)
        x = x.clamp(min = -50, max = 50)  #mentioned in a paper for stable CTC training
        x = self.drop(x)
        y = self.fc1(x.view(x.size(0) * x.size(1), x.size(2)).contiguous())  #fc input dims: sample x batch, hidden_size x 2
        y = self.drop2(y)
        y = self.fc2(y)
        return F.log_softmax(y.view(x.size(0), x.size(1), y.size(1)).contiguous(), dim = 2) #final dims: sample, batch, length of ground truth

    def init_hidden(self, batch_size):
        mean = torch.zeros([6, batch_size, 300])
        std = torch.zeros([6, batch_size, 300])
        std += (1/512)**(0.5)
        return (torch.normal(mean, std).to(device),
                torch.normal(mean, std).to(device))


model = Net(dataset.n_mels * dataset.skip).to(device)
#state_dict = torch.load('./asr_test.t7')
print(model)

Net(
  (drop): Dropout(p=0.3)
  (drop2): Dropout(p=0.3)
  (lstm): LSTM(240, 300, num_layers=3, dropout=0.3, bidirectional=True)
  (fc1): Linear(in_features=600, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=29, bias=True)
)


In [22]:
batch_size = 8
kwargs = {'num_workers': 8, 'pin_memory': True} if device == 'cuda' else {}
#change shuffle to False when performing ordered training
train_loader = torch.utils.data.DataLoader(dataset, batch_size = batch_size, shuffle = True, **kwargs)

I have tried training the network with a learn rate of 0.001 and 0.0001. The scheduler is currently configured to never actually decrease the learn rate (100 epochs would take weeks using librosa).

In [23]:
optimizer = torch.optim.ASGD(model.parameters(), lr = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 100, gamma = 0.1) 
ctc_loss = torch.nn.CTCLoss(blank = 0)

Go to https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/loss.py for details on the PyTorch CTC loss function.

In [24]:
def train(model, epoch):
    model.train()
    dataLoadStart = time.time()
    avg_loss = 0
    for batch_idx, (data, target, labels, lengths) in enumerate(train_loader):
        start = time.time()
        #print(start-dataLoadStart)
        lengths = lengths.to(device)
        #print("Data load time: " + str(start-dataLoadStart))
        loop_bound = data.size(0)
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        model.hidden = model.init_hidden(loop_bound)
        target_lens = torch.zeros([loop_bound]).to(device)
        out = model(data)
        
        for i in range(0, loop_bound):    #find the unpadded length of each target
            target_lens[i] = len(labels[i])
            #target_lens[i] = len(target[i].nonzero())
        loss = ctc_loss(out, target, tuple(lengths), tuple(target_lens))
        loss.backward()
        avg_loss += loss.data
        
        #for name, x in model.named_parameters(): #clamping gradients stabilizes CTC training
        #    if name not in ['fc1.weight', 'fc2.weight', 'fc1.bias', 'fc2.bias']:
        #        x.grad = x.grad.clamp(min = -1, max = 1)
        
        optimizer.step()
        end = time.time()
        #print("Network execution time: " + str(end-start))
        if batch_idx%100 == 0:
            print(loss.data)
        if batch_idx%200 == 0: #print average loss for the epoch and transcribe the first output in the batch
            print("Epoch " + str(epoch) + " Average loss: " + str(avg_loss/(batch_idx + 1)))
            print(labels[0])
            print(transcribe_output(out.cpu(), True, lengths))
        dataLoadStart = time.time()
    

So far I haven't seen any reduction in the average loss during training. The first list of characters printed is the network's outputs for the first item in the batch transcribed to letters and the second list is the second most likely character when the network chose the blank token. STOP was added to get an idea of where the outputs that correspond to input padding begin.

In [25]:
for i in range(1,31):
    print("Beginning epoch " + str(i))
    scheduler.step()
    train(model, i)
    #test(model) 

Beginning epoch 1
tensor(5.8851, device='cuda:0')
Epoch 1 Average loss: tensor(5.8851, device='cuda:0')
I MEAN I'M GLAD I'VE LIVED
(['r', 'h', 'r', 'y', 'r', 'u', 'r', 'h', 'a', 'r', 'r', 'n', 'v', ' ', 'k', 'p', 'h', 'p', 'r', ' ', 'h', 'p', 'p', 'k', 'r', 'r', 'm', 'p', 'y', 'r', 'p', 'y', 'k', 'p', 'h', 'b', 'm', 'v', 'u', 'v', 'a', 'z', 'u', 'r', 'p', 'p', 'p', 'p', 'k', 'p', 'h', 'v', 'y', 'p', 'z', 'a', 'p', 'u', 'a', 'h', 'm', 'k', 'z', 'k', 'v', 'p', 'k', 'k', 'k', 'y', 'm', 'k', ' ', 'k', 'k', 'n', 'r', 'a', 'a', 'h', 'k', 'k', 'k', 'a', 'k', 'a', 'a', 'STOP', 'a', 'k', 'k', ' ', 'k', 'k', 'm', 'a', 'a', 'a', 'a', 'm', 'a', 'a', 'a', 'a', 'a', 'm', 'a', 'r', 'u', 'm', 'y', 'y', 'm', 'm', 'm', 'm', 'm', 'y', 'a', 'm', 'r', 'm', 'm', 'm', 'r', 'r', 'm', 'm', 'a', 'a', 'a', 'a', 'm', 'a', 'k', 'm', 'a', 'm', 'a', 'a', 'a', 'm', 'r', 'm', 'a', 'm', 'r', 'r', 'r', 'a', 'm', 'a', 'm', 'r', 'v', 'r', 'm', 'm', 'h', 'r', 'r', 'r', 'k', 'a', 'm', 'a', 'u', 'a', 'r', 'm', 'a', 'm', 'y',

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), './asr_test.t7')