# writeup

Steps to run code:  
- Place this notebook in the same folder with all npy files used
- Run the following code cells one by one except the appendix part (which is used for debugging)
- Get the result in test_result.csv
  
Model architecture:  
- The model consists of encoder, decoder and attention part
- Encoder has a bidirectional LSTM layer followed by three pBLSTM layers, each halving the sequence length, and two linear layers that compute the keys and values
- Attention is calculated in decoder, using key, value, and the hidden state of decoder as query
- Decoder has an embedding layer, two LSTM layers implemented with LSTMCell, and a linear layer to predict the next letter
  
Hyperparameters:
- Input data uses data augmentation, specifically frequency masking with param=5 and time masking with param=30
- Encoder has a hidden size of 256, and decoder 512
- The size of key and value are both 128
- Decoder uses weight tying between embedding layer and prediction layer
- Use Adam with initial learning rate 1e-3 and weight decay 5e-6
- Use a scheduler to halve the learning rate if the validation distance doesn't improve by 0.2% for 3 epochs
- Use cross entropy loss

Description:  
First, a mapping between letters and their indices is built by collecting all letters that appear in the training and validation transcripts; it is represented by a list index2letter and a dictionary letter2index. The labels are converted into alphabet indices letter by letter, and in each batch the dataloader randomly samples 32 utterances together with their text labels.
After getting a batch, the model first computes its representation in the encoder with the pyramidal LSTM, then derives the keys and values for every time step. The decoder then decodes the representation step by step, computing attention at each step from all keys and values and the decoder's hidden state. In addition, the lengths of the input sequences are used to mask the attention, and the lengths of the labels are used to mask the loss, so that padded positions are ignored. The loss is summed over each sequence, and the average loss over the batch is used for backpropagation.
The current state of the model is saved whenever its validation Levenshtein distance sets a new record. After training completes, the last saved model state is loaded and used for the speech-to-text task.

# Set up


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torchaudio
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
from Levenshtein import distance
import seaborn as sns
import time
import random

In [2]:
# Pick the compute device and size the DataLoader worker pool to match:
# GPU runs use 4 loader workers, CPU runs load data in the main process.
cuda = torch.cuda.is_available()
if cuda:
    device = torch.device("cuda")
    num_workers = 4
else:
    device = torch.device("cpu")
    num_workers = 0
print(f"Cuda = {cuda} with num_workers = {num_workers}")

Cuda = True with num_workers = 4
Cuda = True with num_workers = 4


# Load data

In [3]:
# Load the training and validation utterance arrays from the notebook's folder.
# NOTE(review): allow_pickle=True lets np.load unpickle stored objects, so only
# load .npy files that come from a trusted source.
train_X = np.load("train.npy", allow_pickle=True)
val_X = np.load("dev.npy", allow_pickle=True)

In [4]:
# preprocess transcripts, transform it to indices and build dictionary

# retrieve all characters from train and dev transcripts
def create_dictionaries(raw_transcripts):
    '''
    Create dictionaries for letter2index and index2letter transformations

    Args:
        raw_transcripts: iterable of transcript collections; every item of a
          collection is decoded with str(item, "utf-8") and scanned character
          by character

    Return:
        a list of all possible letters for index2letter (it always starts with
        "<sos>", "<eos>" and " ", followed by the letters in order of first
        appearance), and a dictionary for letter2index
    '''
    letter_list = ["<sos>", "<eos>", " "]   # include start/end of sentence and space
    seen = set(letter_list)                 # constant-time membership test
    for transcript in raw_transcripts:
        for word in transcript:
            for character in str(word, "utf-8"):
                # NUL is never a letter (presumably padding of the stored byte
                # strings -- TODO confirm). Skipping it here replaces the former
                # unconditional list.remove('\x00'), which raised ValueError
                # whenever the data contained no NUL at all.
                if character == '\x00' or character in seen:
                    continue
                seen.add(character)
                letter_list.append(character)
    dictionary = {character: i for i, character in enumerate(letter_list)}
    return letter_list, dictionary


def transform_letter_to_index(raw_transcripts, letter2index):
    '''
    Encode every transcript as an array of letter indices.

    Args:
        raw_transcripts: Raw text transcripts with the shape of (N, ); each
          transcript is a sequence of byte-string words
        letter2index: dictionary mapping each letter (and "<sos>"/"<eos>") to its index

    Return:
        transcripts: a list with a length of N; each element is a numpy array
          holding <sos>, the indices of the space-joined sentence, then <eos>
    '''
    def encode(transcript):
        # words are stored as bytes; decode them and rebuild the sentence
        sentence = " ".join(str(word, "utf-8") for word in transcript)
        indices = [letter2index["<sos>"]]       # decoder start token
        indices.extend(letter2index[c] for c in sentence)
        indices.append(letter2index["<eos>"])   # decoder end token
        return np.array(indices)

    return [encode(transcript) for transcript in raw_transcripts]


# transform indices back to letters for submission
# enter a 1D tensor consisting of indices of characters in a sequence, return the sequence string
def transform_index_to_letter(sentence_indices, index2letter):
    '''
    Decode a sequence of letter indices back into a string.

    "<sos>" entries are dropped and decoding stops at the first "<eos>";
    everything after it is ignored.
    '''
    letters = []
    for index in sentence_indices:
        symbol = index2letter[index]
        if symbol == "<eos>":
            break
        if symbol != "<sos>":
            letters.append(symbol)
    return "".join(letters)

In [5]:
# Load the raw transcripts, build the vocabulary, then encode every transcript as letter indices.
# NOTE(review): allow_pickle=True lets np.load unpickle stored objects; only use trusted files.
train_Y_raw = np.load("train_transcripts.npy", allow_pickle=True)
val_Y_raw = np.load("dev_transcripts.npy", allow_pickle=True)
# the vocabulary is collected from both the training and the validation transcripts
index2letter, letter2index = create_dictionaries([train_Y_raw, val_Y_raw])
train_Y = transform_letter_to_index(train_Y_raw, letter2index)
val_Y = transform_letter_to_index(val_Y_raw, letter2index)

In [7]:
# define dataset
# Data augmentation pipeline: frequency masking followed by time masking.
# LASDataset.collate_fn applies it to each utterance only when train=True.
train_transform = nn.Sequential(
    torchaudio.transforms.FrequencyMasking(freq_mask_param=5),
    torchaudio.transforms.TimeMasking(time_mask_param=30)
)

class LASDataset(Dataset):
    '''
    Dataset of utterances with optional index-encoded transcripts.

    Samples are (x, y) pairs when labels are given and bare x otherwise (test).
    collate_fn pads a list of samples into batch-first tensors and reports the
    original lengths so that attention and loss can be masked later.
    '''

    def __init__(self, X, Y, letter2index, train=False):
        # keep references to the caller's arrays instead of copying, to save memory
        self.X = X    # (number of samples, indefinite number of time steps, feature_length), ndarray of objects(ndarrays)
        self.Y = Y    # (number of samples, indefinite number of time steps), ndarray of objects(ndarrays); None for test
        self.length = len(X)
        self.letter2index = letter2index
        self.train = train    # True enables data augmentation in collate_fn

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        # test data has no labels
        if self.Y is None:
            return self.X[index]
        return self.X[index], self.Y[index]

    def collate_fn(self, data):
        '''
        Pad a list of samples into batch-first tensors.

        Return:
            train/val: padded_inputs (float), padded_labels (long), input_lengths, label_lengths
            test: padded_inputs (float), input_lengths
        '''
        sample_count = len(data)
        input_lengths = torch.zeros(sample_count).long()   # used to add mask in attention
        label_lengths = torch.zeros(sample_count).long()   # used to add mask in loss

        # test: inputs only
        if self.Y is None:
            inputs = []
            for i, x in enumerate(data):
                inputs.append(torch.tensor(x))
                input_lengths[i] = len(x)
            padded_inputs = rnn.pad_sequence(inputs, batch_first=True)  # batch_size*longest_input_length*feature_length
            return padded_inputs.float(), input_lengths

        # train/val: inputs and labels
        inputs, labels = [], []
        for i, (x, y) in enumerate(data):
            features = torch.tensor(x)
            if self.train:
                # data augmentation; the transform is applied on the transposed
                # (feature, time) layout, then the result is transposed back
                features = train_transform(features.permute(1, 0)).permute(1, 0)
            inputs.append(features)
            labels.append(torch.tensor(y))
            input_lengths[i] = len(x)
            label_lengths[i] = len(y)
        padded_inputs = rnn.pad_sequence(inputs, batch_first=True)  # batch_size*longest_input_length*feature_length
        # labels are padded with the <eos> index instead of 0
        padded_labels = rnn.pad_sequence(labels, batch_first=True, padding_value=self.letter2index["<eos>"])
        return padded_inputs.float(), padded_labels.long(), input_lengths, label_lengths

In [8]:
# load training data
# set drop_last=True for counting samples in calculating average accuracy
# Build the training and validation loaders; both drop the last incomplete batch
# so that every batch holds exactly batch_size samples.
# NOTE(review): drop_last=True on the validation loader means the trailing
# samples of the dev set are never evaluated -- confirm this is intended.
batch_size = 32
train_dataset = LASDataset(train_X, train_Y, letter2index, train=True)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=num_workers, collate_fn=train_dataset.collate_fn)
val_dataset = LASDataset(val_X, val_Y, letter2index)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True, num_workers=num_workers, collate_fn=val_dataset.collate_fn)

# Model definition

In [9]:
class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM layer.

    Adjacent time steps are concatenated pairwise, which halves the sequence
    length and doubles the feature size, and the result is fed to a
    single-layer bidirectional LSTM.

    Args:
        input_dim: feature size seen by the LSTM, i.e. twice the feature size of
          the tensor passed to forward
        hidden_dim: hidden size of each LSTM direction
    '''
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.blstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True, bidirectional=True)

    def forward(self, x):
        '''
        x: N * time_steps * feature_dim (padded)
        returns: N * (time_steps // 2) * (hidden_dim * 2)
        '''
        batch_size, seq_len, feature_dim = x.shape
        half_len = seq_len // 2
        # keep an even number of frames (the last frame of an odd-length batch
        # is dropped), then merge every two neighbouring frames into one vector
        paired = x[:, :half_len * 2, :].reshape(batch_size, half_len, feature_dim * 2)
        out, _ = self.blstm(paired)
        return out

In [10]:
class Encoder(nn.Module):
    '''
    Listener: a bidirectional LSTM followed by three pBLSTM layers.

    Takes the utterances as inputs and returns the key, the value and the
    sequence lengths after time reduction. Key and value are linear projections
    of the output of the last pBLSTM layer.
    '''
    def __init__(self, input_dim, encoder_hidden_dim, key_value_size=128):
        super().__init__()
        blstm_out_dim = encoder_hidden_dim * 2   # forward + backward hidden states

        # The first LSTM at the very bottom
        self.lstm = nn.LSTM(input_dim, encoder_hidden_dim, num_layers=1, batch_first=True, bidirectional=True)

        # Each pBLSTM consumes two concatenated BiLSTM outputs (2 * blstm_out_dim)
        # and produces one vector of size blstm_out_dim
        self.pblstm1 = pBLSTM(blstm_out_dim * 2, encoder_hidden_dim)
        self.pblstm2 = pBLSTM(blstm_out_dim * 2, encoder_hidden_dim)
        self.pblstm3 = pBLSTM(blstm_out_dim * 2, encoder_hidden_dim)

        # Linear projections producing Key and Value for attention (same size for both)
        self.key_network = nn.Linear(blstm_out_dim, key_value_size)
        self.value_network = nn.Linear(blstm_out_dim, key_value_size)

    def forward(self, x, x_len):
        '''
        x: N * seq_len * feature_length (padded, seq_len is the longest sequence length)
        x_len: N original sequence lengths
        returns: key and value of shape N * (seq_len//8) * key_value_size, and
          the N sequence lengths after three halvings (used to mask attention)
        '''
        # Bottom BiLSTM runs on a packed sequence so padding is skipped
        packed_in = rnn.pack_padded_sequence(x, x_len.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed_in)
        output, out_lengths = rnn.pad_packed_sequence(packed_out, batch_first=True)   # N * seq_len * (hidden*2), with N lengths

        # pBLSTM layers work on padded tensors, so the lengths are halved by hand
        # alongside every layer
        for layer in (self.pblstm1, self.pblstm2, self.pblstm3):
            output = layer(output)
            out_lengths = out_lengths // 2

        return self.key_network(output), self.value_network(output), out_lengths

In [11]:
def plot_attention(attention):
    '''Clear the current matplotlib figure and show `attention` as a seaborn heatmap.'''
    plt.clf()
    sns.heatmap(attention, cmap='GnBu')
    plt.show()

class Attention(nn.Module):
    '''
    Dot-product attention between one decoder query and all encoder time steps:
        energy = bmm(key, query)
        attention = softmax(masked energy)
        context = bmm(attention, value)
    '''
    def __init__(self):
        super().__init__()

    def forward(self, query, key, value, mask):
        '''
        Args:
            query: N * key_value_size, query of one time step
            key/value: N * in_seq_len * key_value_size, the whole input key/value sequence
            mask: N * in_seq_len boolean tensor, True where a position must be ignored
        Return:
            context of shape N * key_value_size, and the attention weights of
            shape N * in_seq_len (returned for visualization)
        '''
        # batched dot product of the query with every key -> (N, in_seq_len)
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)
        # push masked positions to a very negative score so softmax gives them ~0 weight
        energy = energy.masked_fill(mask, -1e9)
        attention = torch.softmax(energy, dim=1)
        # attention-weighted sum of the values -> (N, key_value_size)
        context = torch.bmm(attention.unsqueeze(1), value).squeeze(1)
        return context, attention

In [12]:
class Decoder(nn.Module):
    '''
    Attention-based character decoder (speller) built from two stacked LSTMCells.

    Each iteration of the loop in forward handles one output time step, which is why
    LSTMCell is used instead of LSTM. The hidden state of the second LSTMCell is used
    as the attention query, and the resulting context is concatenated with that hidden
    state to predict the next character. In train mode the input of a step is either
    the ground-truth character (teacher forcing) or the previous prediction with
    Gumbel noise.

    NOTE(review): __init__ reads the module-level letter2index and forward reads the
    module-level device; both must be defined before this class is used.
    '''
    def __init__(self, vocab_size, decoder_hidden_dim, embed_dim, key_value_size=128, max_out_len=600):
        super(Decoder, self).__init__()
        # <eos> is passed as padding_idx of the embedding
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=letter2index['<eos>'])
        # first cell consumes the character embedding concatenated with the previous context
        self.lstm1 = nn.LSTMCell(input_size=embed_dim + key_value_size, hidden_size=decoder_hidden_dim)
        # second cell outputs key_value_size features so its hidden state can be used directly as the query
        self.lstm2 = nn.LSTMCell(input_size=decoder_hidden_dim, hidden_size=key_value_size)
      
        self.attention = Attention()
        self.vocab_size = vocab_size
        self.character_prob = nn.Linear(2 * key_value_size, vocab_size)   # input_dim is key_value_size(context) + key_value_size(hidden state of lstm2)
        # weight tying between the prediction layer and the embedding layer
        # NOTE(review): the shapes only agree when embed_dim == 2 * key_value_size -- confirm callers respect this
        self.character_prob.weight = self.embedding.weight
        self.key_value_size = key_value_size

        self.max_out_len = max_out_len      # default maximum output sequence length

    def forward(self, key, value, encoder_len, y=None, mode='train', teacher_forcing=0.9):
        '''
        Args:
            key :(B, T, key_value_size) - Output of the Encoder Key projection layer
            value: (B, T, key_value_size) - Output of the Encoder Value projection layer
            encoder_len: (B, ) - valid lengths along T, used to build the attention mask
            y: (B, text_len) - Batch input of text label with text_length, used in teacher forcing;
               only read when mode == 'train'
            mode: 'train' decodes y.shape[1] steps; any other value decodes max_out_len steps
                  and always feeds back the previous prediction
            teacher_forcing: probability of feeding the ground-truth character at a step in train mode
        Return:
            predictions: (B, number of decoded steps, vocab_size) outputs of the prediction layer
        '''

        batch_size, key_seq_max_len, key_value_size = key.shape

        # get maximum length of results, or set to default value in testing, then just generate sequences of that length
        # get teacher forcing ground truths in training
        if mode == 'train':
            max_len =  y.shape[1]
            label_embeddings = self.embedding(y)
        else:
            max_len = self.max_out_len

        # Attention mask is created once, outside the loop: True marks positions at or beyond each sequence's length
        mask = torch.arange(key.size(1)).unsqueeze(0) >= encoder_len.unsqueeze(1)   # Make use of broadcasting: (1, seq_len), (batch_size, 1) -> (batch_size, seq_len)
        mask = mask.to(device)
        
        # output of every time step
        predictions = []
        # all-zero (batch_size, 1) tensor: its argmax is index 0 for every sample, so the first fed-back character is index 0
        # NOTE(review): this relies on "<sos>" having index 0 in the vocabulary
        prediction = torch.zeros(batch_size, 1).to(device)
        hidden_states = [None, None]  # initial hidden states of every layer
        
        # initialize context to 0 since the first timestep doesn't have context
        context = torch.zeros(batch_size, key_value_size).to(device)
        
        for i in range(max_len):
            # get one-time-step embedding in a batch of shape (batch_size, embed_size)
            if mode == 'train':
                # one random draw per step decides between teacher forcing and feeding back the prediction
                if np.random.random() > teacher_forcing:
                    if i > 0:
                        prediction = F.gumbel_softmax(prediction) # Gumbel noise
                    char_embed = self.embedding(prediction.argmax(dim=-1))
                # Teacher Forcing
                else:
                    char_embed = label_embeddings[:,i]
            else:
                # greedy feedback of the previous step's prediction
                char_embed = self.embedding(prediction.argmax(dim=-1))

            y_context = torch.cat([char_embed, context], dim=1)   # (batch_size, embed_size+key_value_size)
            hidden_states[0] = self.lstm1(y_context, hidden_states[0])
            lstm1_hidden = hidden_states[0][0]
            hidden_states[1] = self.lstm2(lstm1_hidden, hidden_states[1])
            output = hidden_states[1][0]    # (batch_size, key_value_size)
            
            # Compute attention from the output of the second LSTM Cell
            context, attention_score = self.attention(output, key, value, mask)  # directly use hidden state as query
            
            output_context = torch.cat([output, context], dim=1)
            prediction = self.character_prob(output_context)    # (batch_size, vocab_size)
            predictions.append(prediction.unsqueeze(1))
        return torch.cat(predictions, dim=1)    # (batch_size, max_len, vocab_size)

In [13]:
class LASmodel(nn.Module):
    '''
    End-to-end sequence to sequence model: a thin wrapper that chains the
    Encoder (listener) and the Decoder (speller).
    '''
    def __init__(self, input_dim, vocab_size, encoder_hidden_dim, decoder_hidden_dim, embed_dim, key_value_size=128):
        super().__init__()
        self.encoder = Encoder(input_dim, encoder_hidden_dim, key_value_size=key_value_size)
        self.decoder = Decoder(vocab_size, decoder_hidden_dim, embed_dim, key_value_size=key_value_size)

    def forward(self, x, x_len, y=None, mode='train', teacher_force=0.9):
        '''
        Encode the utterances x (with lengths x_len) and decode them.
        y, mode and teacher_force are forwarded to the decoder unchanged.
        Returns the decoder's predictions.
        '''
        key, value, encoder_len = self.encoder(x, x_len)
        return self.decoder(key, value, encoder_len, y=y, mode=mode, teacher_forcing=teacher_force)

# Training

In [14]:
# train one epoch, return the average training loss
def train_epoch(model, train_loader, criterion, optimizer, teacher_force_rate):
    '''
    Train the model for one epoch.

    Args:
        model: called as model(inputs, input_lengths, labels, mode="train", teacher_force=...)
        train_loader: yields (padded_inputs, padded_labels, input_lengths, label_lengths)
        criterion: loss returning one value per element (no reduction), so padding can be masked
        optimizer: optimizer stepped once per batch
        teacher_force_rate: teacher forcing probability forwarded to the model

    Return:
        the average over batches of the per-batch loss (sum over time steps divided by batch size)
    '''
    training_loss = 0
    model.train()
    # wrap the loader itself so that tqdm knows the number of batches and can show progress/ETA
    for padded_inputs, padded_labels, input_lengths, label_lengths in tqdm(train_loader):
        # 1) Set the inputs to the device.
        padded_inputs = padded_inputs.to(device)
        padded_labels = padded_labels.to(device)
        input_lengths = input_lengths.to(device)
        label_lengths = label_lengths.to(device)
        # 2) Pass the inputs and the lengths of speech into the model.
        # the decoder input is <sos>+sequence and its target is sequence+<eos>, while padded_labels=<sos>+sequence+<eos>
        # so the last element of each label sequence is removed from the input
        predictions = model(padded_inputs, input_lengths, padded_labels[:,:-1], mode="train", teacher_force=teacher_force_rate)
        # 3) Mask built from the label lengths: True for positions after <eos> (length-1 targets are valid)
        mask_loss = torch.arange(padded_labels.size(1)-1).unsqueeze(0).to(device) >= (label_lengths-1).unsqueeze(1)
        # 4) Calculate the loss and zero out the padding part
        batch_size, max_len, vocab_size = predictions.shape
        loss = criterion(predictions.reshape(-1, vocab_size), padded_labels[:,1:].reshape(-1)) # remove <sos> in labels to calculate loss
        loss.masked_fill_(mask_loss.reshape(-1), 0)
        loss = loss.sum() / batch_size   # add up losses of all time steps, then divide by batch size
        training_loss += loss.item()
        # 5) Backward on the masked loss
        loss.backward()
        # 6) Optional: use torch.nn.utils.clip_grad_norm_(model.parameters(), 2) to clip the gradient
        # 7) Take a step with the optimizer, then clear the gradients for the next batch
        optimizer.step()
        optimizer.zero_grad()
    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batch
    training_loss /= max(len(train_loader), 1)
    return training_loss
        

# validation of classification task, return the average loss and LD
def evaluate(model, val_loader, criterion):
    '''
    Evaluate the model on a validation loader.

    Args:
        model: called as model(inputs, input_lengths, labels, mode="test")
        val_loader: yields (padded_inputs, padded_labels, input_lengths, label_lengths)
        criterion: loss returning one value per element (no reduction), so padding can be masked

    Return:
        (val_loss, val_LD): the masked loss summed over time steps and the
        Levenshtein distance, both averaged per sample. Averaging per sample
        (instead of averaging batch means) stays correct when batches differ in size.

    Raises:
        ValueError: if the loader yields no sample
    '''
    total_loss = 0.0
    total_LD = 0
    total_samples = 0
    model.eval()
    with torch.no_grad():
        for padded_inputs, padded_labels, input_lengths, label_lengths in val_loader:
            padded_inputs = padded_inputs.to(device)
            padded_labels = padded_labels.to(device)
            input_lengths = input_lengths.to(device)
            label_lengths = label_lengths.to(device)

            predictions = model(padded_inputs, input_lengths, padded_labels[:,:-1], mode="test")

            # simple greedy search decoding on the FULL-length predictions, exactly as test() does.
            # Decoding the truncated predictions would hide hypotheses that run past the
            # longest label of the batch and under-report the distance.
            # .tolist() moves the indices to the host once instead of once per character.
            predicted_indices = torch.argmax(predictions, dim=2).tolist()
            label_indices = padded_labels[:,1:].tolist()

            # masked loss: in test mode the predictions are longer than the labels, so truncate them
            target_len = padded_labels.size(1) - 1
            logits = predictions[:,:target_len]
            batch_size, max_len, vocab_size = logits.shape
            mask_loss = torch.arange(target_len).unsqueeze(0).to(device) >= (label_lengths-1).unsqueeze(1)
            # remove <sos> in labels to calculate loss
            loss = criterion(logits.reshape(-1, vocab_size), padded_labels[:,1:].reshape(-1))
            loss.masked_fill_(mask_loss.reshape(-1), 0)
            total_loss += loss.sum().item()   # add up losses of all time steps of all samples

            # convert results back to text, and accumulate the edit distance
            for predicted, label in zip(predicted_indices, label_indices):
                letter_seq = transform_index_to_letter(predicted, index2letter)
                letter_label_seq = transform_index_to_letter(label, index2letter)
                total_LD += distance(letter_seq, letter_label_seq)
            total_samples += batch_size
    if total_samples == 0:
        raise ValueError("val_loader produced no samples to evaluate")
    return total_loss / total_samples, total_LD / total_samples

def test(model, test_loader):
    '''
    Transcribe every utterance of test_loader with greedy decoding.

    test_loader yields (padded_inputs, input_lengths); the returned list holds
    one predicted string per utterance, in loader order.
    '''
    final_pred = []
    model.eval()
    with torch.no_grad():
        for padded_inputs, input_lengths in test_loader:
            padded_inputs = padded_inputs.to(device)
            input_lengths = input_lengths.to(device)
            predictions = model(padded_inputs, input_lengths, mode="test")
            # simple greedy search decoding: most likely character at every step
            predicted_indices = torch.argmax(predictions, dim=2)
            # convert every sequence of the batch back to text
            final_pred.extend(
                transform_index_to_letter(predicted_indices[i], index2letter)
                for i in range(len(padded_inputs))
            )
    return final_pred

In [None]:
# create model and other stuff
feature_length = 40
learningRate = 1e-3
weightDecay = 5e-6
vocab_size = len(index2letter)
listener_hidden_size = 256
speller_hidden_size = 512
key_value_size = 128
embed_size = 256
model = LASmodel(feature_length, vocab_size, listener_hidden_size, speller_hidden_size, embed_size, key_value_size)
model.to(device)
model_name = "LAS_DA_gumbel"
# exist_ok=True: re-running this cell must not fail because the folder already exists
os.makedirs(f"model/{model_name}", exist_ok=True)

criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = optim.Adam(model.parameters(), lr=learningRate, weight_decay=weightDecay)
# scheduler = optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=3, threshold=0.002)

# To resume from a saved checkpoint, uncomment the following lines:
# checkpoint = torch.load("best_checkpoint")
# model.load_state_dict(checkpoint["model_state_dict"])
# # checkpoint["optimizer_state_dict"]["param_groups"][0]["lr"] = 5e-4
# criterion = checkpoint["loss"]
# optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# trained_epoch = checkpoint["epoch"] + 41
# # scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=3, threshold=0.1)


def save_checkpoint(epoch, filename):
    '''Save the model/optimizer/scheduler state of `epoch` to model/<model_name>/<filename>.'''
    torch.save({
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": criterion,
        "scheduler_state_dict": scheduler.state_dict()
    }, f"model/{model_name}/{filename}")


# train model
epoch_num = 60
best_val_LD = np.inf
for epoch in range(epoch_num):
    # teacher forcing starts at 0.9 and decreases by 0.05 every 5 epochs
    teacher_force_rate = 0.9 - 0.05 * (epoch//5)
    training_loss = train_epoch(model, train_dataloader, criterion, optimizer, teacher_force_rate)
    val_loss, val_LD = evaluate(model, val_dataloader, criterion)
    scheduler.step(val_LD)
    # keep the checkpoint with the best validation distance so far
    if val_LD < best_val_LD:
        best_val_LD = val_LD
        save_checkpoint(epoch, "best_checkpoint")
    # periodic snapshot every 10 epochs
    if epoch % 10 == 9:
        save_checkpoint(epoch, f"checkpoint_{epoch}")
    # read the learning rate through the optimizer's public API rather than a private scheduler attribute
    current_lr = [group["lr"] for group in optimizer.param_groups]
    log_line = f"Epoch: {epoch}, training loss: {training_loss}, validation loss: {val_loss}, validation Levenshtein Distance: {val_LD}, learning rate: {current_lr}"
    with open(f"model/{model_name}/training_logs.txt", 'a') as logfile:
        logfile.write(log_line + "\n")
    print(log_line)

# Test

In [16]:
# test data
# Load the test utterances and wrap them in an unlabeled dataset (Y=None);
# shuffle=False keeps the predictions in file order.
# NOTE(review): allow_pickle=True lets np.load unpickle stored objects; only use trusted files.
test_X = np.load("test.npy", allow_pickle=True)
test_dataset = LASDataset(test_X, None, letter2index)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, collate_fn=test_dataset.collate_fn)

In [19]:
# test: restore the best checkpoint, transcribe the test set and write the submission file
import csv

checkpoint = torch.load(f"model/{model_name}/best_checkpoint")
model.load_state_dict(checkpoint["model_state_dict"])
prediction = test(model, test_dataloader)
# csv.writer quotes any prediction containing a comma or a quote character, so the
# file stays a valid two-column CSV; ordinary predictions are written exactly as before
with open("test_result.csv", 'w', newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["id", "label"])
    writer.writerows(enumerate(prediction))

In [None]:
!kaggle competitions submit -c 11785-homework-4-part-2-las-slack -f test_result.csv -m "second submission"

## Appendix

In [None]:
class LockedDropout(nn.Module):
    """ LockedDropout applies the same dropout mask to every time step.

    ref: https://github.com/salesforce/awd-lstm-lm/blob/master

    Args:
        dropout (float): Probability of an element in the dropout mask to be zeroed,
            in the range [0, 1).

    Raises:
        ValueError: if dropout is outside [0, 1).
    """
    def __init__(self, dropout=0.5):
        # initialise the Module machinery before assigning any attribute
        super().__init__()
        if not 0 <= dropout < 1:
            raise ValueError(f"dropout must be in [0, 1), got {dropout}")
        self.dropout = dropout

    def forward(self, x):
        """
        Args:
            x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to
                apply dropout too.

        Returns:
            x unchanged in eval mode or when dropout is 0; otherwise x multiplied by a
            mask that is sampled once per (batch, hidden) position and shared by all
            time steps, rescaled by 1 / (1 - dropout).
        """
        # the original condition read the undefined attribute self.p and raised
        # AttributeError in training mode
        if not self.training or not self.dropout:
            return x
        keep_prob = 1 - self.dropout
        # one Bernoulli draw per (batch, hidden) position; dim 0 has size 1 so that
        # broadcasting reuses the same mask for every time step
        mask = x.new_empty(1, x.size(1), x.size(2), requires_grad=False).bernoulli_(keep_prob)
        mask = mask.div_(keep_prob)
        return x * mask

In [None]:
class MiniEncoder(nn.Module):
    '''
    Reduced encoder used for debugging: a single bidirectional LSTM without
    any pBLSTM layer, so the sequence length is not reduced.
    Key and value are linear projections of the BiLSTM output.
    '''
    def __init__(self, input_dim, encoder_hidden_dim, key_value_size=128):
        super().__init__()
        blstm_out_dim = encoder_hidden_dim * 2   # forward + backward hidden states

        # The only recurrent layer of this encoder
        self.lstm = nn.LSTM(input_dim, encoder_hidden_dim, num_layers=1, batch_first=True, bidirectional=True)

        # Linear projections producing Key and Value for attention (same size for both)
        self.key_network = nn.Linear(blstm_out_dim, key_value_size)
        self.value_network = nn.Linear(blstm_out_dim, key_value_size)

    def forward(self, x, x_len):
        '''
        x: N * seq_len * feature_length (padded, seq_len is the longest sequence length)
        x_len: N original sequence lengths
        returns: key and value of shape N * seq_len * key_value_size, the N
          sequence lengths (used to mask attention), and the raw BiLSTM output
          of shape N * seq_len * (hidden_dim*2)
        '''
        # run the BiLSTM on a packed sequence so padding is skipped
        packed_in = rnn.pack_padded_sequence(x, x_len.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed_in)
        output, out_lengths = rnn.pad_packed_sequence(packed_out, batch_first=True)

        return self.key_network(output), self.value_network(output), out_lengths, output

In [None]:
class MiniDecoder(nn.Module):
    '''
    Single-LSTMCell attention decoder of the mini (debugging) model.

    Every iteration of the loop in forward handles one output time step: the
    previous prediction is embedded, concatenated with the previous context,
    pushed through one LSTMCell, and the cell's hidden state is used as the
    attention query. The hidden state concatenated with the new context is then
    projected to vocabulary logits by a linear layer whose weight matrix is
    shared with the embedding.

    Notes:
        - decoder_hidden_dim is accepted but not used; the LSTMCell hidden size
          is key_value_size.
        - teacher_forcing and gumbel_noise are stored but not applied yet: the
          decoder always feeds back the argmax of its own previous prediction.
    '''
    def __init__(self, vocab_size, decoder_hidden_dim, embed_dim, key_value_size=128, max_out_len=600, teacher_forcing=0.9, gumbel_noise=0.9):
        super(MiniDecoder, self).__init__()
        # The output layer consumes [hidden_state, context] (2 * key_value_size
        # features) and reuses the embedding matrix (vocab_size, embed_dim) as
        # its weight, so both widths have to agree. Fail here with a clear
        # message instead of with a shape error deep inside forward().
        if embed_dim != 2 * key_value_size:
            raise ValueError(
                f"weight tying requires embed_dim == 2 * key_value_size, "
                f"got embed_dim={embed_dim} and key_value_size={key_value_size}"
            )

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=letter2index['<eos>'])
        # In this implementation the size of (key = query) = (value = context) = LSTMCell hidden size
        self.lstm1 = nn.LSTMCell(input_size=embed_dim + key_value_size, hidden_size=key_value_size)

        self.attention = Attention()
        self.vocab_size = vocab_size
        self.character_prob = nn.Linear(2 * key_value_size, vocab_size)   # input is hidden_state + context
        self.character_prob.weight = self.embedding.weight                # weight tying
        self.key_value_size = key_value_size

        self.max_out_len = max_out_len          # number of decoding steps outside training
        self.teacher_forcing = teacher_forcing  # teacher forcing probability (not applied yet)
        self.gumbel_noise = gumbel_noise        # gumbel noise rate (not applied yet)

    def forward(self, key, value, encoder_len, y=None, mode='train'):
        '''
        Decode a whole batch step by step.

        Args:
            key: (B, T, key_value_size) - output of the encoder key projection
            value: (B, T, key_value_size) - output of the encoder value projection
            encoder_len: (B,) - valid length of every encoded sequence, used to
                mask attention over padded frames
            y: (B, text_len) - text labels; only text_len is used, to fix the
                number of decoding steps in 'train' / 'pretrain' mode
            mode: 'train', 'pretrain' (attention is skipped, context stays zero)
                or anything else for inference; 'test' additionally prints
                debugging information for the first sample

        Returns:
            'pretrain': predictions of shape (B, max_len, vocab_size)
            otherwise: (predictions, attention) where attention stacks, over
            the time steps, the attention score of the first sample in the
            batch (presumably (max_len, T) - depends on the Attention module)

        Raises:
            ValueError: if y is missing in 'train' / 'pretrain' mode
        '''
        batch_size, key_seq_max_len, key_value_size = key.shape
        # Allocate on the device the encoder outputs live on, so the decoder
        # does not rely on a module-level `device` variable.
        run_device = key.device
        training_like = mode == 'train' or mode == "pretrain"

        # Number of decoding steps: label length in training, default otherwise.
        if training_like:
            if y is None:
                raise ValueError(f"y (text labels) is required in mode '{mode}'")
            max_len = y.shape[1]
        else:
            max_len = self.max_out_len

        # Attention mask, built once outside the loop. Broadcasting
        # (1, T) >= (B, 1) -> (B, T); True marks padded encoder frames.
        positions = torch.arange(key_seq_max_len, device=run_device).unsqueeze(0)
        mask = positions >= encoder_len.to(run_device).unsqueeze(1)

        predictions = []   # logits of every time step
        # All-zero "logits" make the first argmax return index 0, i.e. the
        # first input character is vocabulary entry 0 (assumed to be <sos> -
        # TODO confirm against letter2index).
        prediction = torch.zeros(batch_size, 1, device=run_device)
        hidden_state = None   # LSTMCell state; None lets the cell start from zeros

        # The first time step has no context yet.
        context = torch.zeros(batch_size, key_value_size, device=run_device)

        attention = []   # attention score of the first sample, one entry per step
        for i in range(max_len):
            if not training_like and i>0 and i<20:
                print(f"i={i} prediction: {prediction.argmax(dim=-1)[0]}")   # focus on the first sample in the batch
            # TODO: teacher forcing and Gumbel noise would be applied here;
            # for now the previous greedy prediction is always fed back.
            char_embed = self.embedding(prediction.argmax(dim=-1))   # (batch_size, embed_size)

            y_context = torch.cat([char_embed, context], dim=1)   # (batch_size, embed_size+key_value_size)
            hidden_state = self.lstm1(y_context, hidden_state)
            output = hidden_state[0]    # (batch_size, hidden_size), used as the attention query
            if mode == "test" and i>0 and i<20:
                # attention_score is the one computed in the previous step
                print(f"context: {context[0]}\nattention_score: {attention_score[0]}")
                print(f"query: {output[0]}\n")

            # In pretraining attention is skipped and the context stays zero.
            if mode != "pretrain":
                context, attention_score = self.attention(output, key, value, mask)
                attention.append(attention_score[0])

            output_context = torch.cat([output, context], dim=1)
            prediction = self.character_prob(output_context)    # (batch_size, vocab_size)
            predictions.append(prediction.unsqueeze(1))

        if mode == "pretrain":
            return torch.cat(predictions, dim=1)
        return torch.cat(predictions, dim=1), torch.stack(attention, dim=0)

In [None]:
class MiniLASmodel(nn.Module):
    '''
    End-to-end sequence-to-sequence wrapper that chains MiniEncoder and
    MiniDecoder.

    forward always returns a (predictions, attention) pair; attention is None
    in "pretrain" mode, where the encoder outputs are zeroed before decoding.
    In "test" mode the encoder outputs and the key/value projection parameters
    are printed for debugging.
    '''
    def __init__(self, input_dim, vocab_size, encoder_hidden_dim, decoder_hidden_dim, embed_dim, key_value_size=128):
        super().__init__()
        self.encoder = MiniEncoder(input_dim, encoder_hidden_dim, key_value_size=key_value_size)
        self.decoder = MiniDecoder(vocab_size, decoder_hidden_dim, embed_dim, key_value_size=key_value_size)

    def forward(self, x, x_len, y=None, mode='train'):
        key, value, encoder_len, out = self.encoder(x, x_len)

        if mode == "pretrain":
            # Wipe the acoustic information in place; in this mode the decoder
            # returns predictions only, so attention is reported as None.
            for encoded in (key, value, out):
                encoded *= 0
            return self.decoder(key, value, encoder_len, y=y, mode=mode), None

        if mode == "test":
            # Debug dump of the encoder outputs and projection parameters.
            print(f"out\n{out}{out.shape}\nkey\n{key}{key.shape}\nvalue\n{value}{value.shape}\n")
            print(f"key_weight\n{self.encoder.key_network.weight}\n")
            print(f"key_bias\n{self.encoder.key_network.bias}\n")
            print(f"value_weight\n{self.encoder.value_network.weight}\n")
            print(f"value_bias\n{self.encoder.value_network.bias}\n")

        predictions, attention = self.decoder(key, value, encoder_len, y=y, mode=mode)
        return predictions, attention

In [None]:
# train one epoch, return the average training loss
def train_epoch_mini(model, train_loader, criterion, optimizer, mode):
    """Run one training epoch and return the mean per-batch loss.

    For every batch the per-token losses are masked to drop padding, summed,
    and divided by the batch size; that value is back-propagated and also
    accumulated for the returned average.

    Args:
        model: called as model(inputs, input_lengths, labels, mode=mode) and
            expected to return a (predictions, attention) pair.
        train_loader: yields (padded_inputs, padded_labels, input_lengths,
            label_lengths) batches. Labels are assumed to be
            <sos> + sequence + <eos>, with label_lengths counting both markers.
        criterion: loss returning one value per token (reduction='none').
        optimizer: optimizer over the model parameters.
        mode: forwarded to the model, e.g. "train" or "pretrain".

    Returns:
        The average of the per-batch losses, or 0.0 for an empty loader.
    """
    training_loss = 0.0
    model.train()
    # Wrap the loader itself (not enumerate(loader)) so tqdm can read its
    # length and display the total number of batches.
    for padded_inputs, padded_labels, input_lengths, label_lengths in tqdm(train_loader):
        # 1) Set the inputs to the device.
        padded_inputs = padded_inputs.to(device)
        padded_labels = padded_labels.to(device)
        input_lengths = input_lengths.to(device)
        label_lengths = label_lengths.to(device)

        # Clear gradients before the forward pass so that nothing left over
        # from an earlier (possibly interrupted) run leaks into this update.
        optimizer.zero_grad()

        # 2) Pass the inputs and the lengths of speech into the model.
        # The decoder input is <sos>+sequence and its target is sequence+<eos>,
        # so drop the last label for the input and the first one for the target.
        predictions, _ = model(padded_inputs, input_lengths, padded_labels[:, :-1], mode=mode)
        targets = padded_labels[:, 1:]

        # 3) Mask of padded target positions; length-1 because <sos> was removed.
        steps = torch.arange(targets.size(1), device=device).unsqueeze(0)
        mask_loss = steps >= (label_lengths - 1).unsqueeze(1)

        # 4) Per-token loss, with the padding part zeroed out.
        batch_size, _, vocab_size = predictions.shape
        token_loss = criterion(predictions.reshape(-1, vocab_size), targets.reshape(-1))
        token_loss.masked_fill_(mask_loss.reshape(-1), 0)
        # Add up the losses of all time steps, then divide by the batch size.
        loss = token_loss.sum() / batch_size

        training_loss += loss.item()
        # 5) Backward on the masked loss.
        loss.backward()
        # 6) Optional: torch.nn.utils.clip_grad_norm_(model.parameters(), 2) to clip the gradient.
        # 7) Take a step with the optimizer.
        optimizer.step()

    # Guard against an empty loader instead of dividing by zero.
    return training_loss / max(len(train_loader), 1)
        

# validation pass, return the average loss and the average Levenshtein distance
def evaluate_mini(model, val_loader, criterion):
    """Evaluate the model on val_loader with greedy decoding.

    The model is run in "test" mode (which also triggers its debug printing),
    its predictions are truncated to the label length, and for every batch the
    masked loss and the mean edit distance per utterance are computed. The
    attention of every batch is plotted and the predicted indices are printed.

    Args:
        model: called as model(inputs, input_lengths, labels, mode="test") and
            expected to return a (predictions, attention) pair.
        val_loader: yields (padded_inputs, padded_labels, input_lengths,
            label_lengths) batches. Labels are assumed to be
            <sos> + sequence + <eos>, with label_lengths counting both markers.
        criterion: loss returning one value per token (reduction='none').

    Returns:
        (val_loss, val_LD): the per-batch loss (sum over unmasked tokens
        divided by batch size) and the per-utterance edit distance, both
        averaged over the batches; (0.0, 0.0) for an empty loader.
    """
    val_loss = 0.0
    val_LD = 0.0
    model.eval()
    with torch.no_grad():
        for padded_inputs, padded_labels, input_lengths, label_lengths in val_loader:
            padded_inputs = padded_inputs.to(device)
            padded_labels = padded_labels.to(device)
            input_lengths = input_lengths.to(device)
            label_lengths = label_lengths.to(device)

            targets = padded_labels[:, 1:]   # remove <sos> from the labels
            target_len = targets.size(1)

            predictions, attention_score = model(padded_inputs, input_lengths, padded_labels[:, :-1], mode="test")
            # In test mode the decoder generates its default maximum number of
            # steps, so truncate to the label length before computing the loss.
            predictions = predictions[:, :target_len]

            plot_attention(attention_score[:target_len].cpu().detach().numpy())

            # Masked loss, computed the same way as in training.
            steps = torch.arange(target_len, device=device).unsqueeze(0)
            mask_loss = steps >= (label_lengths - 1).unsqueeze(1)
            batch_size, _, vocab_size = predictions.shape

            token_loss = criterion(predictions.reshape(-1, vocab_size), targets.reshape(-1))
            token_loss.masked_fill_(mask_loss.reshape(-1), 0)
            # Add up the losses of all time steps, then divide by the batch size.
            val_loss += (token_loss.sum() / batch_size).item()

            # Simple greedy search decoding.
            predicted_indices = torch.argmax(predictions, dim=2)
            print(predicted_indices)

            # Convert the results back to text and accumulate the distance of
            # this batch; it is averaged over the batch before being added to
            # the running total, so val_LD is a per-utterance mean.
            batch_LD = 0
            for predicted, reference in zip(predicted_indices, targets):
                letter_seq = transform_index_to_letter(predicted, index2letter)
                letter_label_seq = transform_index_to_letter(reference, index2letter)
                batch_LD += distance(letter_seq, letter_label_seq)
            val_LD += batch_LD / batch_size

    # Guard against an empty loader instead of dividing by zero.
    num_batches = max(len(val_loader), 1)
    return val_loss / num_batches, val_LD / num_batches

In [23]:
# Build the mini LAS model, its loss and its optimizer for a trial run on the
# simple dataset.
feature_length = 40
learningRate = 1e-3
weightDecay = 1e-6
vocab_size = len(index2letter)
listener_hidden_size = 256
speller_hidden_size = 512
key_value_size = 128
embed_size = 256

model = MiniLASmodel(
    input_dim=feature_length,
    vocab_size=vocab_size,
    encoder_hidden_dim=listener_hidden_size,
    decoder_hidden_dim=speller_hidden_size,
    embed_dim=embed_size,
    key_value_size=key_value_size,
).to(device)

# Per-token losses are kept (no reduction) so the training and validation
# loops can mask out padding before summing.
criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = optim.Adam(
    model.parameters(),
    lr=learningRate,
    weight_decay=weightDecay,
)
# Learning-rate scheduling is disabled for the mini model; uncomment to enable:
#scheduler = optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=3)

In [None]:
# Pretrain the model in "pretrain" mode, where the encoder outputs are zeroed
# and the decoder skips attention.
epoch_pretrain = 10
for epoch in range(epoch_pretrain):
    training_loss = train_epoch_mini(model, train_dataloader, criterion, optimizer, mode="pretrain")
    val_loss, val_LD = evaluate_mini(model, val_dataloader, criterion)
    # No scheduler is stepped here: the scheduler for this optimizer is
    # commented out in the setup cell, so calling scheduler.step() would either
    # raise a NameError or step a stale scheduler bound to another optimizer.
    print(f"Epoch: {epoch}, training loss: {training_loss}, validation loss: {val_loss}, validation Levenshtein Distance: {val_LD}")

In [None]:
# Main training run of the mini model: one training pass and one validation
# pass per epoch, with the metrics reported after each epoch. No learning-rate
# scheduler is stepped.
epoch_num = 10
for epoch in range(epoch_num):
    training_loss = train_epoch_mini(
        model=model,
        train_loader=train_dataloader,
        criterion=criterion,
        optimizer=optimizer,
        mode="train",
    )
    val_loss, val_LD = evaluate_mini(
        model=model,
        val_loader=val_dataloader,
        criterion=criterion,
    )
    print(f"Epoch: {epoch}, training loss: {training_loss}, validation loss: {val_loss}, validation Levenshtein Distance: {val_LD}")