# Encoder-decoder for sequence-to-sequence tasks

In this tutorial, we will walk through a simple example of an encoder-decoder model with an attention mechanism.

This tutorial focuses on implementation. For the basic concepts, refer to the lecture and recitation slides.


## Toy task: English spelling to pronunciation

We consider the same toy task as in the last recitation: predicting the pronunciation (as a sequence of phonemes) of an English word given its spelling.

The model architecture and hyperparameters are for demonstration purposes only. Do not copy them into your actual homework.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# !nvidia-smi

print("changed again 2")


changed again 2


In [None]:
import numpy as np
from zipfile import ZipFile
import torch

from torch import nn
from torch.nn.utils.rnn import *
from torch.utils.data import Dataset, DataLoader, TensorDataset


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE  # bare expression with no side effect

# Shared width: used below as the default key/value size of the encoder and
# decoder and as the hidden size passed to the model in main().
HIDDEN_SIZE = 256

In [None]:
# Load the pre-extracted .npy arrays from the mounted Google Drive folder.
# The commented-out ZipFile snippets show how the archives could be unpacked
# first (note that they point at /content/gdrive, not /content/drive).
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so only load
# files from a trusted source.

# loading test data
# with ZipFile("/content/gdrive/My Drive/Kaggle/test_new.npy.zip") as f:
#     f.extractall()
test_data = np.load("/content/drive/My Drive/Kaggle/test_new.npy",allow_pickle=True,encoding='bytes')

# loading dev data
# with ZipFile("/content/gdrive/My Drive/Kaggle/dev_new.npy.zip") as f:
#     f.extractall()
dev_data = np.load("/content/drive/My Drive/Kaggle/dev_new.npy",allow_pickle=True,encoding='bytes')
dev_labels = np.load("/content/drive/My Drive/Kaggle/dev_transcripts.npy",allow_pickle=True,encoding='bytes')

# loading training data
# with ZipFile("/content/gdrive/My Drive/Kaggle/train_new.npy.zip") as f:
#     f.extractall()
train_data = np.load("/content/drive/My Drive/Kaggle/train_new.npy",allow_pickle=True,encoding='bytes')
# with ZipFile("/content/gdrive/My Drive/Kaggle/train_transcripts.npy.zip") as f:
#     f.extractall()
train_labels = np.load("/content/drive/My Drive/Kaggle/train_transcripts.npy",allow_pickle=True,encoding='bytes')

In [None]:
# Print the number of utterances in the train and dev feature arrays
# (only the feature arrays are inspected; the label arrays are not printed).
print("The sample size of train & train_labels is: ",train_data.shape)
print("The sample size of dev & dev_labels is: ",dev_data.shape)
print('')

# Print the shape of the first training utterance and of its transcript.
print("The sample input for training set is: \n",train_data[0].shape)
print("The sample output for training set is: \n",train_labels[0].shape)


The sample size of train & train_labels is:  (24724,)
The sample size of dev & dev_labels is:  (1106,)

The sample input for training set is: 
 (477, 40)
The sample output for training set is: 
 (14,)


## steps for data preprocessing (word level)

1. Insert \<s> and \</s> before and after each sentence  
2. Pack and pad the sequences before passing them to the data loader
3. How should the word list be constructed (what about words never seen before)? The same word list will also be applied to the dev set
4. 


Every target sequence is prepended with `<s>` and appended with `</s>`. This is necessary for the decoder to predict the first token and the end of the sequence.

## Preprocessing

In [None]:
# Output vocabulary. A symbol's position in this list is its integer id:
# '<pad>' is 0, the letters and punctuation follow, '<sos>' is 33 and '<eos>' is 34.
LETTER_LIST = ['<pad>', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', \
               'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-', "'", '.', '_', '+', ' ','<sos>','<eos>']
               
def create_dictionaries(letter_list):
    '''
    Build the two lookup tables between symbols and their integer ids.

    :param letter_list: sequence of symbols; a symbol's position is its id
    :return: (letter2index, index2letter) dictionaries
    '''
    index2letter = dict(enumerate(letter_list))
    letter2index = {letter: index for index, letter in index2letter.items()}
    return letter2index, index2letter

def transform_letter_to_index(transcript, letter_list):
    '''
    Encode every transcript as a tensor of character ids.

    Each transcript's words are joined with single spaces, split into
    characters and wrapped in '<sos>' ... '<eos>'.

    :param transcript :(N, ) Transcripts; each item is an array of words
    :param letter_list: Letter list defined above
    :return letter_to_index_list: list of N LongTensors of character ids
    '''
    letter2index, _ = create_dictionaries(letter_list)

    letter_to_index_list = []
    for words in transcript:
        sentence = " ".join(words.astype(str))
        symbols = ['<sos>', *sentence, '<eos>']
        letter_to_index_list.append(torch.LongTensor([letter2index[s] for s in symbols]))

    return letter_to_index_list

def decode_sentence(predictions, letter_list):
    '''
    Greedily decode a batch of per-step scores into strings.

    For every sequence the highest-scoring symbol is taken at each step and
    symbols are collected until the first '<eos>' (which is not included).

    :param predictions: (total_batch_size, max_lens, vocab_size)
    :param letter_list: vocabulary whose positions are the symbol ids
    :return decoded_list: list of strings
    '''
    _, index2letter = create_dictionaries(letter_list)

    # One batched arg-max and a single transfer to Python ints, instead of an
    # .item() call per decoded symbol.
    _, best_ids = torch.max(predictions, dim=2)  # (total_batch_size, max_lens)

    decoded_list = []
    for sequence_ids in best_ids.tolist():
        letters = []
        for symbol_id in sequence_ids:
            letter = index2letter[symbol_id]
            # if reach <eos>, early break
            if letter == '<eos>':
                break
            letters.append(letter)
        decoded_list.append("".join(letters))

    return decoded_list


In [None]:
# decoded_list = decode_sentence(predictions, LETTER_LIST)
# print(len(decoded_list))

## Speech2TextDataset

In [None]:
def normalize(x, m, s):
    """Standardise ``x``: subtract ``m`` and divide the result by ``s``."""
    centered = x - m
    return centered / s


def normalize_to(train):
    """Standardise ``train`` with its own mean and std taken along axis 0."""
    mean = train.mean(axis=0)
    std = train.std(axis=0)
    return normalize(train, mean, std)


class Speech2TextDataset(Dataset):
    """Dataset of utterance features, optionally paired with transcripts.

    Features are stored as given; normalisation is applied later by the
    collate functions, not here.

    :param data: iterable of per-utterance feature arrays (frames first)
    :param labels: transcripts, encoded with ``transform_letter_to_index``;
                   only read when ``isTrain`` is true
    :param isTrain: when true, items also carry the encoded transcript
    """

    def __init__(self,data, labels, isTrain = True):
        # One bulk copy per utterance: building a tensor from a Python list of
        # per-frame arrays is far slower.
        self.X = [torch.tensor(np.asarray(word)) for word in data]
        self.X_lens = torch.LongTensor([len(seq) for seq in self.X])
        self.isTrain = isTrain
        if isTrain:
            self.Y = transform_letter_to_index(labels, LETTER_LIST)
            self.Y_lens = torch.LongTensor([len(seq) for seq in self.Y])

    def __getitem__(self,index):
        """Return (features, n_frames[, transcript_ids, transcript_len])."""
        if self.isTrain:
            return self.X[index], self.X_lens[index], self.Y[index], self.Y_lens[index]
        return self.X[index], self.X_lens[index]

    def __len__(self):
        """Number of utterances."""
        return len(self.X)

#### when using pad_sequence or pack_padded_sequence, remember batch_first=True ####
#### this is mandatory when using CTCLoss

def my_collate_train(batch):
    """Collate (features, n_frames, transcript, transcript_len) items.

    Features are standardised per utterance with ``normalize_to`` and padded
    time-first (``pad_sequence`` default); transcripts are padded batch-first.

    :return: (padded_features, feature_lens, padded_transcripts, transcript_lens)
    """
    features, feature_lens, transcripts, transcript_lens = zip(*batch)

    padded_features = pad_sequence([normalize_to(utterance) for utterance in features])
    padded_transcripts = pad_sequence(list(transcripts), batch_first=True)

    return (
        padded_features,
        torch.LongTensor(list(feature_lens)),
        padded_transcripts,
        torch.LongTensor(list(transcript_lens)),
    )

def my_collate_test(batch):
    """Collate (features, n_frames) items for inference.

    Features are standardised per utterance with ``normalize_to`` and padded
    time-first (``pad_sequence`` default).

    :return: (padded_features, feature_lens)
    """
    features, feature_lens = zip(*batch)
    padded_features = pad_sequence([normalize_to(utterance) for utterance in features])
    return padded_features, torch.LongTensor(list(feature_lens))

In [None]:
# Wrap the training arrays in a dataset and a shuffled loader (batches of 32,
# collated with my_collate_train).
train_dataset = Speech2TextDataset(train_data,train_labels)
train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=32,shuffle=True,collate_fn=my_collate_train)

In [None]:
# Wrap the dev arrays the same way as the training set.
# NOTE(review): shuffle=True is not needed for evaluation — confirm it is intended.
dev_dataset = Speech2TextDataset(dev_data,dev_labels)
dev_loader = torch.utils.data.DataLoader(dev_dataset,batch_size=32, shuffle=True, collate_fn=my_collate_train)

In [None]:
# Test set: no transcripts (isTrain=False), original order kept (shuffle=False),
# collated with my_collate_test.
test_dataset = Speech2TextDataset(test_data,None,isTrain=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = 32, shuffle=False, collate_fn = my_collate_test)

## Locked Dropout

In [None]:
# https://github.com/salesforce/awd-lstm-lm/blob/dfd3cb0235d2caf2847a4d53e1cbd495b781b5d2/locked_dropout.py#L5
class LockedDropout(nn.Module):
    """ LockedDropout applies the same dropout mask to every time step.

    **Thank you** to Sales Force for their initial implementation of :class:`WeightDrop`. Here is
    their `License
    <https://github.com/salesforce/awd-lstm-lm/blob/master/LICENSE>`__.

    Args:
        p (float): Probability of an element in the dropout mask to be zeroed.
            NOTE(review): the default of 0.9 drops 90% of the activations —
            confirm this is intended.

    Raises:
        ValueError: if ``p`` is outside ``[0, 1]``.
    """

    def __init__(self, p=0.9):
        # Set up nn.Module before assigning attributes on it.
        super().__init__()
        if not 0 <= p <= 1:
            raise ValueError("dropout probability has to be between 0 and 1, but got {}".format(p))
        self.p = p

    def forward(self, x):
        """
        Args:
            x (:class:`torch.FloatTensor` [sequence length, batch size, rnn hidden size]): Input to
                apply dropout too.

        Returns:
            ``x`` unchanged in eval mode or when ``p`` is 0; otherwise ``x``
            multiplied by one mask shared by all time steps and rescaled by
            ``1 / (1 - p)``.
        """
        if not self.training or not self.p:
            return x
        if self.p == 1:
            # Everything is dropped; skip the 0 / 0 rescale that would yield NaN.
            return x * 0
        keep_prob = 1 - self.p
        # A single (1, batch, hidden) mask broadcasts over the time axis.
        mask = x.new_empty(1, x.size(1), x.size(2)).bernoulli_(keep_prob).div_(keep_prob)
        return x * mask

    def __repr__(self):
        return self.__class__.__name__ + '(' \
            + 'p=' + str(self.p) + ')'

## pBLSTM

In [None]:
class pBLSTM(nn.Module):
    """Pyramidal bidirectional LSTM layer.

    Runs a bidirectional LSTM (``hidden_size // 2`` units per direction) and
    then concatenates every pair of consecutive output frames, halving the
    time axis and doubling the feature axis.
    """

    def __init__(self, input_size, hidden_size):
        super(pBLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size // 2, bidirectional = True)

    def forward(self, X, lengths):
        """
        :param X: (max_len, batch_size, input_size), padded input sequences
        :param lengths: (batch_size, ), valid length of every sequence
        :returns: out: (ceil(max_len / 2), batch_size, 2 * lstm_output_size)
                  rLens: (batch_size, ) int64 tensor on ``out``'s device with
                         the halved (rounded up) lengths
        """
        # pack_padded_sequence expects int64 lengths on the CPU.
        cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64, device='cpu')
        packed = pack_padded_sequence(X, cpu_lengths, enforce_sorted=False)
        out, _ = self.lstm(packed)
        out, out_lens = pad_packed_sequence(out)  # (max_len, batch, lstm_output_size)
        max_len, batch_size, feature_size = out.shape

        # An odd number of frames gets one zero frame so frames pair up evenly.
        if max_len % 2 == 1:
            out = torch.cat((out, out.new_zeros(1, batch_size, feature_size)), dim=0)
            max_len += 1

        # Go batch-first so that adjacent frames of one sequence are contiguous,
        # merge each pair, then return to time-first.
        out = out.transpose(0, 1).reshape(batch_size, max_len // 2, feature_size * 2)
        out = out.transpose(0, 1)

        # ceil(len / 2) for every sequence, on the same device as the output.
        rLens = ((out_lens + 1) // 2).to(out.device)

        return out, rLens


## Encoder

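The encoder is a stack of three pyramidal bi-directional LSTM (pBLSTM) layers, with locked dropout after the first two, followed by two linear projections.
The encoder does not return the final hidden state of the LSTM; it returns the projected `key` and `value` sequences together with the reduced sequence lengths, which the decoder's attention consumes.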

notation: `batch_size` is the batch size, `max_len` is the maximum input sequence length, `hidden_size` is the hidden size.

In [None]:
class Encoder(nn.Module):
    """Three stacked pBLSTM layers followed by key and value projections."""

    def __init__(self, input_size, hidden_size, key_dim=HIDDEN_SIZE, value_dim=HIDDEN_SIZE):
        super(Encoder, self).__init__()

        # Every pBLSTM concatenates pairs of frames, so its output is twice as
        # wide as its internal LSTM output (widths below assume an even hidden_size):
        #   lstm1 -> hidden_size * 2, lstm2 -> hidden_size * 4, lstm3 -> hidden_size * 4
        self.lstm1 = pBLSTM(input_size, hidden_size)
        self.lstm2 = pBLSTM(hidden_size * 2, hidden_size * 2)
        self.lstm3 = pBLSTM(hidden_size * 4, hidden_size * 2)

        # locked dropout, shared by the first two layers
        self.drop = LockedDropout()

        # bias-free projections of the last layer's output
        projection_in = hidden_size * 4
        self.KeyLinear = nn.Linear(projection_in, key_dim, bias=False)
        self.ValueLinear = nn.Linear(projection_in, value_dim, bias=False)

    def forward(self, X, lengths):
        """
        :param X: (max_len, batch_size, input_size), input sequences
        :param lengths: (batch_size, ), lengths of input sequences
        :returns: key: (batch_size, reduced_len, key_dim)
                  value: (batch_size, reduced_len, value_dim)
                  lens: (batch_size, ), lengths after the three reductions
        """
        hidden, lens = self.lstm1(X, lengths)
        hidden, lens = self.lstm2(self.drop(hidden), lens)
        hidden, lens = self.lstm3(self.drop(hidden), lens)

        # Project, then switch to batch-first for the attention module.
        key = self.KeyLinear(hidden).transpose(0, 1)
        value = self.ValueLinear(hidden).transpose(0, 1)
        return key, value, lens

## Attention

This is the simplest "dot product" attention, meaning that every attention logit is the dot product of a target (`query`) vector and a source (`context`) vector. Such an attention model has no parameters, but you may use more advanced attention mechanisms with learnable parameters in the real world.

Since there are a lot of uncommon operators, every line of code is annotated with input and output tensor sizes. 

The attention vectors are not used by other computations, but are returned for visualization. You will often want to visualize the attention matrix when debugging sequence-to-sequence models.

In [None]:
class Attention(nn.Module):
    '''
    Parameter-free dot-product attention.

    Attention is calculated using key, value and query from Encoder and decoder:
        energy = bmm(key, query)
        attention = softmax(energy)   (padded positions excluded)
        context = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        '''
        :param query: (batch_size, key_size) Query from the Decoder for one time step
        :param key: (batch_size, max_len, key_size) Key projection from the Encoder
        :param value: (batch_size, max_len, value_size) Value projection from the Encoder
        :param lens: (batch_size,) number of valid encoder steps per sample
        :return context: (batch_size, value_size) attended context
        :return attention: (batch_size, max_len) attention weights that can be plotted
        '''
        # energy = bmm(key, query)
        energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)  # (batch_size, max_len)

        # Positions at or beyond a sample's length are padding.
        positions = torch.arange(key.shape[1], device=key.device).unsqueeze(0)
        mask = positions >= lens.to(key.device).unsqueeze(1)

        # Padding must get (effectively) minus infinity before the softmax so
        # it receives zero weight; an energy of 0 would still be attended to.
        energy = energy.masked_fill(mask, torch.finfo(energy.dtype).min)

        # attention = softmax(energy)
        attention = nn.functional.softmax(energy, dim=1)

        # context = bmm(attention, value)
        context = torch.bmm(attention.unsqueeze(1), value).squeeze(1)  # (batch_size, value_size)

        return context, attention
        

In [None]:
# test attention model

# batch_size = 5
# timesteps = 4
# key_size = 3
# value_size = 3

# query = torch.ones(batch_size, key_size)
# key = torch.ones(batch_size, timesteps, key_size)
# value = torch.ones( batch_size, timesteps,value_size)
# lens = torch.ones(batch_size)*4
# query, key, value, lens = query.to(DEVICE), key.to(DEVICE), value.to(DEVICE), lens.to(DEVICE)
# result = Attention()(query, key, value, lens)
# print("**Example Expected:**\n{result}",result)

## Decoder

Each `forward` call of decoder deals with only one timestep.

Here, we use the LSTM output as the query of attention and concatenate the attended context with the LSTM output. There are many other (better) ways to use attention context in the decoder.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# import torch.distributions.gumbel.Gumbel
class Decoder(nn.Module):
    '''
    Attention decoder that emits one character distribution per time step.

    Each forward call runs the whole decoding loop, one time step at a time,
    thus LSTMCell is used instead of LSTM.
    The output of the first LSTMCell is the attention query; the attended
    context is fed to the second LSTMCell and concatenated with its output
    before the final linear layer. Because of that wiring, ``hidden_dim`` must
    equal both ``key_size`` (query . key) and ``value_size`` (context into lstm2).
    Gumbel noise and teacher forcing are applied in training mode.
    '''
    # Id of '<sos>' in the output vocabulary (position 33 of LETTER_LIST).
    SOS_INDEX = 33

    def __init__(self, vocab_size, hidden_dim, value_size=HIDDEN_SIZE, key_size=HIDDEN_SIZE, isAttended=True):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)

        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)

        self.hidden_size = hidden_dim

    def forward(self, key, values, rLens,rate, text=None,  isTrain=True, isVal = False):
        '''
        :param key: (N, T, key_size) Output of the Encoder Key projection layer
        :param values: (N, T, value_size) Output of the Encoder Value projection layer
        :param rLens: (N,) number of valid encoder steps per sample
        :param rate: teacher-forcing probability (used when isTrain is true)
        :param text: (N, text_len) Batch of target character ids; required when
                     isTrain or isVal is true
        :param isTrain: teacher forcing + Gumbel noise, length taken from text
        :param isVal: greedy decoding, length taken from text
        :return predictions: (N, max_len - 1, vocab_size) unnormalised character scores
        '''
        batch_size = key.shape[0]
        device = key.device

        if isTrain:
            max_len = text.shape[1]
            embeddings = self.embedding(text)  # (N, text_len, hidden_dim)
        elif isVal:
            max_len = text.shape[1]
        else:
            # if prediction, then set a max_len
            max_len = 250

        predictions = []
        hidden_states = [None, None]
        prediction = None

        # if attention, init context (as wide as the encoder values)
        if self.isAttended:
            context = values.new_zeros(batch_size, values.shape[2])

        # use max_len - 1 because we want to ignore eos
        for i in range(max_len - 1):

            ############# SET INPUT #################

            # Symbol produced at the previous step. The first step always
            # starts from <sos>; the arg-max of a (N, 1) tensor would be 0,
            # i.e. <pad>.
            if i == 0:
                prev_symbol = torch.full((batch_size,), self.SOS_INDEX, dtype=torch.long, device=device)
            elif isTrain:
                # Gumbel noise on the scores before picking a symbol.
                noisy = torch.distributions.gumbel.Gumbel(prediction, 0.1).sample()
                prev_symbol = noisy.argmax(dim=-1)
            else:
                prev_symbol = prediction.argmax(dim=-1)
            prev_embed = self.embedding(prev_symbol)

            if isTrain:
                # Teacher forcing: per sample, use the ground truth with probability `rate`.
                use_truth = torch.as_tensor(
                    np.random.binomial(1, rate, size=(batch_size, 1)),
                    dtype=prev_embed.dtype, device=device)
                char_embed = use_truth * embeddings[:, i, :] + (1 - use_truth) * prev_embed
            else:
                char_embed = prev_embed

            ############## LSTM layers ##############

            if self.isAttended:
                inp = torch.cat([char_embed, context], dim=1)  # (N, hidden_dim + value_size)
                hidden_states[0] = self.lstm1(inp, hidden_states[0])

                # query = output of the first LSTM cell
                context, _ = self.attention(hidden_states[0][0], key, values, rLens)  # (N, value_size)
            else:
                # Without attention the encoder value at step i stands in for
                # the context (clamped to the last encoder step).
                context = values[:, min(i, values.shape[1] - 1), :]
                inp = torch.cat([char_embed, context], dim=1)
                hidden_states[0] = self.lstm1(inp, hidden_states[0])

            # hidden_states[k] = (output, cell state) of LSTM cell k
            hidden_states[1] = self.lstm2(context, hidden_states[1])
            output = hidden_states[1][0]  # (N, key_size)

            ############## Linear Layers ##############

            prediction = self.character_prob(torch.cat([output, context], dim=1))  # (N, vocab_size)

            # predictions store the value from prediction
            predictions.append(prediction.unsqueeze(1))

        return torch.cat(predictions, dim=1)  # (N, max_len - 1, vocab_size)

## Seq2Seq Model
Integrate all parts of the model

In [None]:
class Seq2Seq(nn.Module):
    '''
    End-to-end sequence to sequence model: a thin wrapper that chains the
    Encoder and the Decoder.
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=HIDDEN_SIZE, key_size=HIDDEN_SIZE, isAttended=False):
        super(Seq2Seq, self).__init__()
        # value_size, key_size and isAttended are accepted but not forwarded:
        # both sub-modules are built with their own defaults.
        self.encoder = Encoder(input_dim, hidden_dim)
        self.decoder = Decoder(vocab_size, hidden_dim)

    def forward(self, speech_input, speech_len, text_input=None,rate = 0.9,isTrain=True,isVal = False):
        '''
        :param speech_input: padded speech features for the encoder
        :param speech_len: lengths of the speech inputs
        :param text_input: target text, needed in train and validation mode
        :param rate: teacher-forcing rate handed to the decoder
        :param isTrain: train mode (takes precedence over isVal)
        :param isVal: validation mode; decoder gets the text with isTrain=False
        :return predictions: (batch_size, max_len, vocab_size)
        '''
        key, value, rLens = self.encoder(speech_input, speech_len)

        # train mode: the decoder receives the text
        if isTrain == True:
            return self.decoder(key, value, rLens, rate, text_input)

        # validation mode: text is passed, decoder runs with isTrain=False
        if isVal == True:
            return self.decoder(key, value, rLens, rate, text_input, isTrain=False, isVal=True)

        # prediction mode: no text, output length is bounded inside the decoder
        return self.decoder(key, value, rLens, rate, text=None, isTrain=False)

## Training and Testing





In [None]:
!pip install python-Levenshtein



In [None]:
import Levenshtein

In [None]:
def translate(Y, Y_lens, letter_list):
    """Turn one row of target ids back into text.

    All ids in ``Y`` are mapped to symbols; the first symbol and everything
    from position ``Y_lens - 1`` onwards are then dropped before joining.
    """
    _, index2letter = create_dictionaries(letter_list)
    symbols = [index2letter[token.item()] for token in Y]
    return "".join(symbols[1:Y_lens-1])

def LevenshteinDistance(prediction, Y, Y_lens):
    '''
    Mean edit distance between decoded predictions and reference texts.

    :param prediction: score tensor; dims 1 and 2 are swapped before it is
                       handed to decode_sentence, which expects
                       (batch_size, max_lens, vocab_size)
    :param Y: reference ids (batch_size, max_lens)
    :param Y_lens: reference lengths (batch_size,)
    :return distance: average Levenshtein Distance (scalar)
    '''
    hypotheses = decode_sentence(prediction.transpose(1,2),LETTER_LIST)

    total = 0
    for row, hypothesis in enumerate(hypotheses):
        reference = translate(Y[row], Y_lens[row], LETTER_LIST)
        total += Levenshtein.distance(reference, hypothesis)

    return total / len(hypotheses)


In [None]:

def train(model, train_loader, criterion, optimizer, epoch):
    """Run one training epoch of the seq2seq model.

    :param model: Seq2Seq model; moved to DEVICE and put in train mode
    :param train_loader: yields (X, X_lens, Y, Y_lens) batches
    :param criterion: loss returning one value per target position
                      (no reduction); it is masked and averaged here
    :param optimizer: optimizer stepping the model parameters
    :param epoch: epoch number, used for logging only
    """
    model.train()
    model.to(DEVICE)
    start_epoch = time.time()
    start = time.time()
    print("Learning rate for epoch ", epoch, " is", optimizer.param_groups[0]['lr'])

    loss = None
    for batch_id, (X, X_lens, Y, Y_lens) in enumerate(train_loader):
        # Set the inputs to the device.
        X, X_lens, Y, Y_lens = X.to(DEVICE), X_lens.to(DEVICE), Y.to(DEVICE), Y_lens.to(DEVICE)

        # Pass the inputs and the speech lengths into the model
        # (teacher-forcing rate fixed at 0.6).
        predictions = model(X, X_lens, Y, rate=0.6)
        predictions = predictions.transpose(1,2) # (batch_size, vocab_size, max_len_Y)

        # Targets are Y shifted by one (<sos> dropped), so each sequence has
        # Y_lens - 1 real positions; everything beyond that is padding.
        lens  =  Y_lens - 1
        label_mask = torch.arange(predictions.size(2),device=DEVICE).unsqueeze(0) >= lens.unsqueeze(1) # (batch_size, max_len_Y)

        loss = criterion(predictions,Y[:,1:])

        # Zero the loss on padded positions so only real tokens are trained
        # on (the same masking test() applies).
        loss = loss.masked_fill(label_mask, 0)

        # Average over the real tokens of the batch.
        n_tokens = lens.sum()
        loss = loss.sum() / n_tokens

        # Backward pass and optimizer step (gradient clipping is not enabled).
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # calculate the perplexity
        avg_loss = loss
        perplexity  = torch.exp(avg_loss)

        # Print the training loss every 10 batches.
        if batch_id % 10 == 0:
            print("Avg 10 batchs takes ",round(( time.time() - start)/60,2)," min")
            print('EPOCH ',epoch, 'batch ',batch_id, ': Loss:', round(avg_loss.item(),4), 'Perplexity:', round(perplexity.item(),4))

        # inspect the Levenshtein Distance of output
        if batch_id % 300 == 0:
            avg_distance = LevenshteinDistance(predictions, Y, Y_lens)
            print(" ********* Epoch ", epoch, " Average Levenshtein Distance is ****: ", avg_distance)

    end = time.time()
    # `loss` stays None when the loader produced no batch.
    if loss is not None:
        print("Training loss after one epoch is:", loss.item())
    print("Time take for an epoch is:", round((end - start_epoch)/60,2), " min")

def test(model, test_loader,criterion, epoch):
    """Evaluate the model on a labelled loader.

    :param model: Seq2Seq model; evaluated in eval mode and switched back to
                  train mode before returning
    :param test_loader: yields (X, X_lens, Y, Y_lens) batches
    :param criterion: loss returning one value per target position (no reduction)
    :param epoch: epoch number, used for logging only
    :return ppl: exp of the per-batch masked loss averaged over batches
    """
    model.eval()
    model.to(DEVICE)

    avg_loss = 0
    avg_distance = 0
    count = 0

    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for X, X_lens, Y, Y_lens in test_loader:
            X, X_lens, Y, Y_lens = X.to(DEVICE), X_lens.to(DEVICE), Y.to(DEVICE), Y_lens.to(DEVICE)

            # greedy decoding, output length taken from Y
            predictions = model(X, X_lens, Y, isTrain = False, isVal = True) # (batch_size, max_len_Y, vocab_size)
            predictions = predictions.transpose(1,2) # (batch_size, vocab_size, max_len_Y)

            # Targets are Y without <sos>: Y_lens - 1 real positions per sequence.
            lens  =  Y_lens - 1
            label_mask = torch.arange(predictions.size(2),device=DEVICE).unsqueeze(0) >= lens.unsqueeze(1)
            loss = criterion(predictions,Y[:,1:])

            # mask loss on padded positions
            loss = loss.masked_fill(label_mask, 0)

            # average over the real tokens of the batch
            n_tokens = lens.sum()
            loss = loss.sum() / n_tokens

            # Levenshtein Distance
            avg_distance += LevenshteinDistance(predictions, Y, Y_lens)

            # avg_loss
            avg_loss += float(loss)
            count += 1

    avg_loss /= count
    avg_distance /= count

    ppl = np.exp(avg_loss)

    # inspection
    print(" ********************* Epoch ",epoch, " ******************")
    print("Test loss  ",epoch, " is ",avg_loss)
    print("Perplexity is: ",ppl)
    print("Avg distance is: ", avg_distance)
    model.train()
    return ppl

## Main Function


In [None]:
import time

# Module-level criterion kept for any cell that still references it;
# main() builds its own unreduced criterion below.
criterion = nn.CrossEntropyLoss(reduction='sum')


def main():
    """Train a fresh Seq2Seq model, validate each epoch and save a checkpoint per epoch.

    Checkpoints are written to the Drive folder as ``classifier<epoch+1>.pt``.
    The epoch with the lowest dev perplexity is reported at the end so that
    checkpoint can be picked for inference.
    """
    model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), hidden_dim=HIDDEN_SIZE)
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

    # Per-position loss: test() masks padded positions before averaging
    # (train() presumably does the same), so the loss must not be reduced.
    # reduction='none' is the supported spelling of the deprecated reduce=False.
    criterion = nn.CrossEntropyLoss(reduction='none')
    nepochs = 50

    # test() returns the dev perplexity; track the best epoch by that value.
    best_ppl = float('inf')
    best_epoch = 0
    for epoch in range(nepochs):
        train(model, train_loader, criterion, optimizer, epoch)
        val_ppl = test(model, dev_loader, criterion, epoch)
        if val_ppl < best_ppl:
            best_ppl = val_ppl
            best_epoch = epoch

        # save model
        model_save_name = 'classifier' + str(epoch+1)+ '.pt'
        path = F"/content/drive/My Drive/Kaggle/{model_save_name}"
        torch.save(model, path)

    print("Best dev perplexity", best_ppl, "at epoch", best_epoch,
          "(checkpoint classifier" + str(best_epoch + 1) + ".pt)")


if __name__ == '__main__':
    main()



Learning rate for epoch  0  is 0.0003
Avg 10 batchs takes  0.01  min
EPOCH  0 batch  0 : Loss: 6.5948 Perplexity: 731.2764
 ********* Epoch  0  Average Levenshtein Distance is ****:  185.21875
Avg 10 batchs takes  0.12  min
EPOCH  0 batch  10 : Loss: 5.3709 Perplexity: 215.0579
Avg 10 batchs takes  0.23  min
EPOCH  0 batch  20 : Loss: 4.2494 Perplexity: 70.065
Avg 10 batchs takes  0.34  min
EPOCH  0 batch  30 : Loss: 4.4383 Perplexity: 84.6338
Avg 10 batchs takes  0.44  min
EPOCH  0 batch  40 : Loss: 3.9522 Perplexity: 52.0519
Avg 10 batchs takes  0.55  min
EPOCH  0 batch  50 : Loss: 3.5515 Perplexity: 34.8661
Avg 10 batchs takes  0.66  min
EPOCH  0 batch  60 : Loss: 3.2776 Perplexity: 26.5115
Avg 10 batchs takes  0.77  min
EPOCH  0 batch  70 : Loss: 3.3857 Perplexity: 29.5372
Avg 10 batchs takes  0.88  min
EPOCH  0 batch  80 : Loss: 3.238 Perplexity: 25.4835
Avg 10 batchs takes  0.99  min
EPOCH  0 batch  90 : Loss: 3.1208 Perplexity: 22.6643
Avg 10 batchs takes  1.09  min
EPOCH  0 bat

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
EPOCH  19 batch  710 : Loss: 2.4254 Perplexity: 11.3069
Avg 10 batchs takes  7.55  min
EPOCH  19 batch  720 : Loss: 2.383 Perplexity: 10.8377
Avg 10 batchs takes  7.66  min
EPOCH  19 batch  730 : Loss: 2.4089 Perplexity: 11.1222
Avg 10 batchs takes  7.76  min
EPOCH  19 batch  740 : Loss: 2.4229 Perplexity: 11.2781
Avg 10 batchs takes  7.86  min
EPOCH  19 batch  750 : Loss: 2.4053 Perplexity: 11.0812
Avg 10 batchs takes  7.97  min
EPOCH  19 batch  760 : Loss: 2.3283 Perplexity: 10.2607
Avg 10 batchs takes  8.07  min
EPOCH  19 batch  770 : Loss: 2.4427 Perplexity: 11.5038
Training loss after one epoch is: 2.4283413887023926
Time take for an epoch is: 8.09  min
 ********************* Epoch  19  ******************
Test loss   19  is  3.7454959869384767
Perplexity is:  42.329997138731024
Avg distance is:  201.3947420634921
Learning rate for epoch  20  is 0.0003
Avg 10 batchs takes  0.01  min
EPOCH  20 batch  0 : Loss: 2.3848 P

## Inference
• Pass only the utterance and the [start] character to your model  
• Generate text from your model by sampling from the predicted distribution for some number of steps.  
• Generate many samples in this manner for each test utterance (100s or 1000s). You only do this on the test set to generate the Kaggle submission, so the run time shouldn’t matter.  
• Calculate the sequence lengths for each generated sequence by finding the first [end] character  
• Now run each of these generated samples back through your model to give each a loss value  
• Take the randomly generated sample with the best loss value, optionally re-weighted or modified in some way like the paper

In [None]:
# generate input and decoding output
def inference(model, test_loader):
    """Run the model over an unlabelled loader and return all predictions.

    Args:
        model: network called as ``model(X, X_lens, isTrain=False)``.
        test_loader: iterable yielding ``(X, X_lens)`` batches.

    Returns:
        The per-batch model outputs concatenated along dimension 0.
    """
    model.eval()
    predictions = []
    # No gradients are needed for prediction; without this every stored
    # output would keep its autograd graph alive until the function returns.
    with torch.no_grad():
        for X, X_lens in test_loader:
            X, X_lens = X.to(DEVICE), X_lens.to(DEVICE)
            predictions.append(model(X, X_lens, isTrain=False))
            del X, X_lens
    # Restore training mode for any caller that continues training.
    model.train()
    return torch.cat(predictions, dim=0)
    # decoding
    
# Load a saved checkpoint, score it on the dev set, then write the Kaggle submission.
import pandas as pd

# get the best model

###############
model_id = 41
###############
model_save_name = 'classifier' + str(model_id+1)+ '.pt'
path = F"/content/drive/My Drive/Kaggle/{model_save_name}"
# The checkpoint is a whole pickled model (saved with torch.save(model, path)).
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
# NOTE(review): torch.load unpickles arbitrary objects, so only load checkpoints
# you produced yourself; newer PyTorch releases may also require
# weights_only=False to load a full model - confirm against the installed version.
model = torch.load(path, map_location=DEVICE)
model.to(DEVICE)

# test
# reduction='none' is the supported spelling of the deprecated reduce=False;
# test() masks padded positions itself and needs the unreduced loss.
criterion = nn.CrossEntropyLoss(reduction='none')
test(model, dev_loader, criterion, 0)

predictions = inference(model, test_loader)
print(predictions.shape)
decoded_list = decode_sentence(predictions,LETTER_LIST)

# save results
submission = pd.read_csv("test_sample_submission.csv")
submission['Predicted'] = decoded_list
print(submission.head())
submission.to_csv('/content/drive/My Drive/Kaggle/submission_no_noise.csv',index=False)

In [None]:
def RandomSearch(model, X, X_lens, search_width):
    """Decode a batch ``search_width`` times and keep the best-scoring output per sample.

    Each pass calls ``model(X, X_lens, isTrain=False)`` and scores the result
    with ``calScore``. This is only useful if the model's inference pass is
    stochastic (presumably it samples or adds noise - confirm in the decoder).

    Args:
        model: network called as ``model(X, X_lens, isTrain=False)``; every
            call must return a tensor of the same shape.
        X: input batch, passed to the model unchanged.
        X_lens: input lengths, passed to the model unchanged.
        search_width: number of decoding passes; must be at least 1.

    Returns:
        A tensor shaped like one model output, holding for every sample the
        prediction from the pass with the highest score (first pass wins ties).

    Raises:
        ValueError: if ``search_width`` is less than 1.
    """
    if search_width < 1:
        raise ValueError("search_width must be at least 1")

    best_prediction = None
    best_score = None
    for _ in range(search_width):
        # prediction: (batch, max_len, vocab_size)
        prediction = model(X, X_lens, isTrain=False)
        # score: (batch, 1)
        score = calScore(prediction)

        if best_prediction is None:
            # Seed from the first pass instead of a zero tensor: this takes the
            # output shape from the model itself and guarantees every sample
            # gets a real prediction even when all of its scores are <= 0.
            best_prediction, best_score = prediction, score
            continue

        improved = score > best_score  # (batch, 1), strict so ties keep the earlier pass
        best_score = torch.where(improved, score, best_score)
        # (batch, 1, 1) broadcasts over the time and vocabulary dimensions.
        best_prediction = torch.where(improved.unsqueeze(2), prediction, best_prediction)

    return best_prediction



def calScore(prediction, eos_index=34):
    '''
        Score each predicted sequence by its average per-step maximum value.

        For every sample the largest value and its vocabulary index are taken
        at each time step. If ``eos_index`` is ever the arg-max, only the steps
        before its first occurrence are summed and the sum is divided by
        (first position + 1); otherwise all steps are summed and divided by
        max_len.

        input prediction: (batch_size, max_len, vocab_size)
        input eos_index: vocabulary index treated as the end-of-sequence token
        return score: (batch_size, 1), on the same device as ``prediction``
    '''
    batch_size = prediction.shape[0]
    max_len = prediction.shape[1]
    score = torch.zeros((batch_size, 1), device=prediction.device)

    # Per-step maximum and its vocabulary index, both (batch_size, max_len).
    best_val, best_idx = torch.max(prediction, dim=2)

    for i in range(batch_size):
        eos_positions = (best_idx[i] == eos_index).nonzero()
        if eos_positions.numel() > 0:
            # Stop at the first end-of-sequence step.
            first_eos = eos_positions[0, 0].item()
            score[i] = best_val[i, :first_eos].sum().item() / (first_eos + 1)
        else:
            # No end-of-sequence predicted: average over the full length.
            score[i] = best_val[i].sum().item() / max_len

    return score

# def calScore(prediction, score):
#     # prediction: (max_len,)


# generate input and decoding output
def inference(model, test_loader, isRandom=False):
    """Collect model outputs for every batch of an unlabelled loader.

    With ``isRandom`` set, each batch is decoded through ``RandomSearch``
    using three passes; otherwise the model is called once per batch.
    The outputs are concatenated along dimension 0 and returned, and the
    model is switched back to training mode before returning.
    """
    collected = []
    with torch.no_grad():
        model.eval()
        for X, X_lens in test_loader:
            X, X_lens = X.to(DEVICE), X_lens.to(DEVICE)
            batch_output = (
                RandomSearch(model, X, X_lens, 3)
                if isRandom
                else model(X, X_lens, isTrain=False)
            )
            collected.append(batch_output)
            # Drop the per-batch references before the next batch is loaded.
            del X, X_lens, batch_output

        # Wait for pending GPU work to finish before leaving the no-grad scope.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        model.train()
    return torch.cat(collected, dim=0)

# criterion = nn.CrossEntropyLoss(reduce=False)
# model = torch.load("model/classifier39.pt")
# model.to(DEVICE)

# test(model, dev_loader,criterion, 0)
# # print(test(model, dev_loader, criterion, 0))
# predictions = inference(model, test_loader, isRandom=True)
# print(predictions.shape)
# decoded_list = decode_sentence(predictions,LETTER_LIST)

# # # save results
# import pandas as pd
# submission = pd.read_csv("test_sample_submission.csv")
# submission['Predicted'] = decoded_list
# print(submission.head())
# submission.to_csv('submission_no_noise.csv',index=False)

# # compared result
# predictions = inference(model, test_loader)
# print(predictions.shape)
# decoded_list = decode_sentence(predictions,LETTER_LIST)
# # save results
# import pandas as pd
# submission = pd.read_csv("test_sample_submission.csv")
# submission['Predicted'] = decoded_list
# print(submission.head())
# submission.to_csv('submission_no_noise.csv',index=False)

  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "
  "type " + container_type.__name__ + ". It won't be checked "


Seq2Seq(
  (encoder): Encoder(
    (lstm1): pBLSTM(
      (lstm): LSTM(40, 128, bidirectional=True)
    )
    (lstm2): pBLSTM(
      (lstm): LSTM(512, 256, bidirectional=True)
    )
    (lstm3): pBLSTM(
      (lstm): LSTM(1024, 256, bidirectional=True)
    )
    (KeyLinear): Linear(in_features=1024, out_features=256, bias=False)
    (ValueLinear): Linear(in_features=1024, out_features=256, bias=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(35, 256, padding_idx=0)
    (lstm1): LSTMCell(512, 256)
    (lstm2): LSTMCell(256, 256)
    (attention): Attention()
    (character_prob): Linear(in_features=512, out_features=35, bias=True)
  )
)

In [None]:
# Run inference with its default arguments over the whole test loader and
# print the shape of the concatenated output tensor.
predictions = inference(model, test_loader)
print(predictions.shape)
# decode_sentence is defined elsewhere in this notebook; presumably it turns
# the predictions into one text string per utterance using LETTER_LIST - confirm.
decoded_list = decode_sentence(predictions,LETTER_LIST)
# save results
import pandas as pd
# Start from the sample submission file (read from the current working
# directory) and overwrite its 'Predicted' column with the decoded output.
submission = pd.read_csv("test_sample_submission.csv")
submission['Predicted'] = decoded_list
print(submission.head())
# Written to the current working directory, without the DataFrame index.
submission.to_csv('submission_no_noise.csv',index=False)

torch.Size([523, 249, 35])
   Id                                          Predicted
0   0  THE COMPANY ALSO DONATED FIFTY THOUSAND DOLLAR...
1   1  MR. BENATZ'S CALL FOR MANDATORY TESTING HASN'T...
2   2  A NEWS RERELIES ISSUED YESTERDAY SAID THAT THE...
3   3  THE BUY OUTPLAN IS CONTINUED ON ALLEGHENY RECE...
4   4  HOWEVER INCREASING THE COST OF RESEARCHERS NOT...
