## Models.py


In [5]:
import torch
import torch.nn as nn
import torch.nn.utils as utils
import math as m
import random as r

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


In [6]:
class Attention(nn.Module):
    '''
    Masked dot-product attention between one decoder query and the encoder
    keys/values of every utterance in the batch:

        energy    = bmm(key, query)
        attention = softmax(energy) over the encoder time axis
        context   = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, linear):
        '''
        :param query: (N, key_size) decoder state used as the query
        :param key: (N, T, key_size) encoder key projections (batch first)
        :param value: (N, T, value_size) encoder value projections (batch first)
        :param linear: (N, T) tensor whose zero entries mark encoder positions
                       to ignore; non-zero entries are attended to
        :return context: (N, value_size) attention-weighted sum of `value`
        :return attention_mask: (N, T, 1) masked weights that were actually
                                applied; they sum to 1 over T for every row
                                that has at least one unmasked position
        '''
        # (N, T, key_size) x (N, key_size, 1) -> (N, T, 1)
        energy = torch.bmm(key, query.unsqueeze(2))

        # Build the mask out of place. Writing into `linear` (or `linear.data`)
        # would silently overwrite the encoder output that autograd has saved
        # for the backward pass.
        mask = (linear.detach() != 0).to(energy.dtype).unsqueeze(2)

        # Normalise over the time axis (dim=1). Normalising over dim=0 would
        # mix the utterances of a batch with each other.
        attention = nn.functional.softmax(energy, dim=1)

        # Drop the masked positions and renormalise so the weights sum to one.
        attention_mask = nn.functional.normalize(attention * mask, p=1, dim=1)

        # (N, 1, T) x (N, T, value_size) -> (N, 1, value_size) -> (N, value_size)
        context = torch.bmm(attention_mask.permute(0, 2, 1), value).squeeze(1)

        return context, attention_mask


In [8]:
class Encoder(nn.Module):
    '''
    Encoder takes the padded utterances as input and returns the attention
    keys and values, which are linear projections of the output of a
    bidirectional LSTM followed by three pyramidal bidirectional LSTMs.
    Each pyramidal stage halves the time resolution, so the output has
    roughly one eighth as many time steps as the input.
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128, key_size=128):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_dim

        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True, batch_first=True)

        # Pyramidal stages: each consumes two concatenated bidirectional
        # frames, hence 2 * (2 * hidden_dim) input features.
        self.lstm1 = nn.LSTM(input_size=hidden_dim*4, hidden_size=hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hidden_dim*4, hidden_size=hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.lstm3 = nn.LSTM(input_size=hidden_dim*4, hidden_size=hidden_dim, num_layers=1, bidirectional=True, batch_first=True)

        # Keys project to key_size and values to value_size (the two sizes
        # were previously swapped, which only worked when they were equal).
        self.key_network = nn.Linear(hidden_dim*2, key_size)
        self.value_network = nn.Linear(hidden_dim*2, value_size)

    @staticmethod
    def _pyramid_step(packed, layer):
        '''
        Halve the time resolution of `packed` and run it through `layer`.

        :param packed: PackedSequence produced by the previous LSTM
        :param layer: batch-first LSTM expecting twice the feature size
        :return: PackedSequence output of `layer`
        '''
        padded, lens = utils.rnn.pad_packed_sequence(packed, batch_first=True)
        batch_size, steps, features = padded.shape

        # Drop a trailing odd frame so that frames can be paired up.
        steps -= steps % 2
        paired = padded[:, :steps, :].reshape(batch_size, steps // 2, features * 2)

        # Integer halving; in-place true division is rejected on integer
        # tensors. Packing needs every length to be at least one.
        lens = torch.clamp(lens // 2, min=1)

        repacked = utils.rnn.pack_padded_sequence(paired, lengths=lens, batch_first=True, enforce_sorted=False)
        outputs, _ = layer(repacked)
        return outputs

    def forward(self, x, lens):
        '''
        :param x: (N, T, input_dim) padded utterances, batch first
        :param lens: length of every utterance in `x`, as accepted by
                     pack_padded_sequence
        :return keys: (N, T', key_size)
        :return value: (N, T', value_size)
        :return final_length: (N,) number of valid steps per utterance in T'
        :return: (N, T') first feature channel of the last LSTM output; it is
                 zero at padded positions because pad_packed_sequence pads
                 with zeros
        '''
        packed = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=True, enforce_sorted=False)
        outputs, _ = self.lstm(packed)

        for layer in (self.lstm1, self.lstm2, self.lstm3):
            outputs = self._pyramid_step(outputs, layer)

        linear_input, final_length = utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)

        return keys, value, final_length, linear_input[:, :, 0]



In [9]:
class Decoder(nn.Module):
    '''
    Character decoder that unrolls two stacked LSTMCells one time step at a
    time. The hidden state of the second cell is the attention query; the
    resulting context is concatenated with that hidden state to predict the
    next character and is fed back into the first cell at the next step.
    During training the input character is either the ground truth or a
    Gumbel-perturbed sample of the previous prediction (teacher forcing).
    '''
    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True,
                 sos_index=33, max_decode_len=250):
        '''
        :param vocab_size: number of output characters
        :param hidden_dim: embedding size and hidden size of the first cell
        :param value_size: size of the encoder values / attention context
        :param key_size: size of the encoder keys and of the attention query
        :param isAttended: use attention; otherwise feed `values[:, i, :]`
        :param sos_index: vocabulary index that starts every decoded sequence
        :param max_decode_len: number of steps decoded when no text is given
        '''
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.value_size = value_size
        self.hidden_size = hidden_dim
        self.sos_index = sos_index
        self.max_decode_len = max_decode_len

        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)

        self.isAttended = isAttended
        if isAttended:
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)

    def forward(self, key, values, text, TF, lengths, isTrain, isVal):
        '''
        :param key: (N, T, key_size) encoder keys, batch first
        :param values: (N, T, value_size) encoder values, batch first
        :param text: (N, text_len) input characters (<sos> ... last char);
                     only read when isTrain or isVal is set
        :param TF: probability of feeding the model's own (Gumbel-perturbed)
                   prediction instead of the ground truth; only read when
                   isTrain is set
        :param lengths: forwarded unchanged to the attention module as its
                        fourth argument
        :param isTrain: teacher-forced training mode
        :param isVal: greedy decoding for exactly text_len steps
        :return predictions: (N, steps, vocab_size) unnormalised scores
        '''
        # Imported here because the module-level imports next to this class
        # do not provide Gumbel.
        from torch.distributions.gumbel import Gumbel

        batch_size = key.shape[0]
        device = key.device

        if isTrain or isVal:
            max_len = text.shape[1]
        else:
            max_len = self.max_decode_len

        embeddings = None
        tf_rate = 0.0
        if isTrain:
            embeddings = self.embedding(text)  # (N, text_len, hidden_dim)
            tf_rate = float(TF)

        values_new = None
        if self.isAttended:
            # No context exists before the first attention call.
            attn_out = values.new_zeros(batch_size, self.value_size)
        else:
            # Without attention, step i consumes values[:, i, :]; pad with
            # zeros when the encoder output is shorter than the decode length.
            missing = max_len - values.shape[1]
            if missing > 0:
                filler = values.new_zeros(batch_size, int(missing), self.value_size)
                values_new = torch.cat([values, filler], dim=1)
            else:
                values_new = values

        predictions = []
        hidden_states = [None, None]
        # Start from a one-hot <sos> "prediction".
        prediction = torch.zeros(batch_size, self.vocab_size, device=device)
        prediction[:, self.sos_index] = 1

        for i in range(max_len):
            if isTrain:
                if r.random() > tf_rate:
                    # Teacher forcing: feed the ground-truth character.
                    char_embed = embeddings[:, i, :]
                else:
                    # Feed the model's own guess, perturbed with Gumbel noise
                    # of scale (1 - TF). A non-positive scale is not a valid
                    # distribution, so fall back to the plain argmax.
                    noise_scale = 1 - tf_rate
                    if noise_scale > 0:
                        scale = torch.tensor([noise_scale], device=device)
                        noisy = Gumbel(prediction.detach(), scale).sample()
                    else:
                        noisy = prediction
                    char_embed = self.embedding(noisy.argmax(dim=-1))
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))

            if not self.isAttended:
                attn_out = values_new[:, i, :]

            inp = torch.cat([char_embed, attn_out], dim=1)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])
            hidden_states[1] = self.lstm2(hidden_states[0][0], hidden_states[1])

            # The second cell's hidden state is the attention query.
            output = hidden_states[1][0]
            if self.isAttended:
                attn_out, _ = self.attention(output, key, values, lengths)

            prediction = self.character_prob(torch.cat([output, attn_out], dim=1))  # (N, vocab_size)
            predictions.append(prediction.unsqueeze(1))

        return torch.cat(predictions, dim=1)



In [10]:
class Seq2Seq(nn.Module):
    '''
    End-to-end sequence-to-sequence model: a thin wrapper that runs the
    Encoder on the speech input and the Decoder on its keys and values.
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True):
        '''
        :param input_dim: number of features per speech frame
        :param vocab_size: number of output characters
        :param hidden_dim: hidden size shared by encoder and decoder
        :param value_size: size of the attention values
        :param key_size: size of the attention keys
        :param isAttended: whether the decoder uses attention
        '''
        super(Seq2Seq, self).__init__()
        # Forward the size/attention options; they used to be accepted here
        # and then silently ignored.
        self.encoder = Encoder(input_dim, hidden_dim, value_size=value_size, key_size=key_size)
        self.decoder = Decoder(vocab_size, hidden_dim, value_size=value_size, key_size=key_size,
                               isAttended=isAttended)

    def forward(self, speech_input, speech_len, text_input, TF, isTrain, isVal):
        '''
        :param speech_input: padded utterances passed to the encoder
        :param speech_len: utterance lengths passed to the encoder
        :param text_input: decoder input text; ignored unless isTrain or isVal
        :param TF: teacher-forcing value passed through to the decoder
        :param isTrain: training mode flag passed to the decoder
        :param isVal: validation mode flag passed to the decoder
        :return: (predictions, lengths, linear) where `lengths` and `linear`
                 are the third and fourth outputs of the encoder
        '''
        key, value, lengths, linear = self.encoder(speech_input, speech_len)
        # The decoder only receives text when it is needed (train / val).
        text = text_input if (isTrain or isVal) else None
        predictions = self.decoder(key, value, text, TF, linear, isTrain, isVal)
        return predictions, lengths, linear


## Train_test.py


In [11]:
import time
import torch
import pdb 
from torch.autograd import Variable
from Levenshtein import distance as levenshtein_distance
### Add Your Other Necessary Imports Here! ###

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


In [12]:
def train(model, train_loader, criterion, optimizer, epoch, TF):
    '''
    Train `model` for one pass over `train_loader`.

    :param model: called as model(X, X_len, Y_sos, TF=..., isTrain=True,
                  isVal=False); its first output is the (N, T, vocab) scores
    :param train_loader: yields (X, Y_sos, X_len, Y_len, Y_char_first)
    :param criterion: loss applied to the flattened scores and targets; the
                      result is multiplied element-wise by the padding mask
    :param optimizer: optimizer stepped once per batch
    :param epoch: unused
    :param TF: teacher-forcing value handed to the model
    :return: (perplexity accumulated over the epoch, elapsed seconds)
    '''
    model.train()
    model.to(DEVICE)
    start = time.time()

    cum_words = 0.0
    cum_loss = 0.0
    running_loss = 0.0

    print('='*60)
    print("Training", len(train_loader), "batches")
    print('='*60)

    # torch.autograd.set_detect_anomaly(True) is no longer switched on for
    # every batch: it is a debugging aid that slows the backward pass.
    for batch_idx, (X, Y_sos, X_len, Y_len, Y_char_first) in enumerate(train_loader):
        # All data must live on the same device as the model.
        X = X.to(DEVICE)
        Y_sos = Y_sos.to(DEVICE)
        Y_char_first = Y_char_first.to(DEVICE)

        # Inputs are <sos>..last char, targets are first char..<eos>.
        outputs, _, _ = model(X, X_len, Y_sos, TF=TF, isTrain=True, isVal=False)
        loss = criterion(outputs.view(-1, outputs.shape[2]).float(), Y_char_first.view(-1).long())

        # 1 for real positions, 0 for padding. Built out of place: writing the
        # mask into Y_sos would overwrite the indices the embedding layer
        # saved for its backward pass.
        pad_mask = (Y_sos.reshape(-1) != 0).to(loss.dtype)
        loss_mask = (loss * pad_mask).sum()

        loss_mask.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        optimizer.step()
        optimizer.zero_grad()

        # NOTE(review): Y_len is one larger per sentence than the number of
        # predicted tokens counted by pad_mask; confirm the intended
        # normaliser for the perplexity.
        cum_words += Y_len.sum().item()
        cum_loss += loss_mask.item()
        running_loss = m.exp(cum_loss / cum_words)

        if batch_idx % 100 == 99:
            print('Batch: {:} Cumulative Time: {:.4f}s Train Perplexity: {:.4f}'.format(batch_idx + 1, time.time() - start, running_loss))
            print('='*60)

        torch.cuda.empty_cache()

    time_final = time.time() - start
    return running_loss, time_final

In [13]:
def _indices_to_text(indices, letter_list, eos_index):
    '''Join the letters for `indices` (a list of ints), stopping before the first eos_index.'''
    if eos_index in indices:
        indices = indices[:indices.index(eos_index)]
    return ''.join(letter_list[j] for j in indices)


def val(model, val_loader, criterion, optimizer, epoch, TF):
    '''
    Evaluate `model` on `val_loader` without updating it.

    :param model: called as model(X, X_len, Y_sos, TF=..., isTrain=False,
                  isVal=True); its first output is the (N, T, vocab) scores
    :param val_loader: yields (X, Y_sos, X_len, Y_len, Y_char_first)
    :param criterion: loss applied to the flattened scores and targets; the
                      result is multiplied element-wise by the padding mask
    :param optimizer: unused
    :param epoch: unused
    :param TF: teacher-forcing value handed to the model
    :return: (perplexity, elapsed seconds, mean Levenshtein distance per
             sentence); the distance is 0.0 when the loader is empty
    '''
    model.eval()
    model.to(DEVICE)
    start = time.time()

    cum_words = 0.0
    cum_loss = 0.0
    running_loss = 0.0
    dist = 0.0
    total_sentences = 0.0
    avg_dist = 0.0
    eos_index = LETTER_LIST.index('<eos>')

    # No gradients are needed for evaluation; this avoids building the graph.
    with torch.no_grad():
        for batch_idx, (X, Y_sos, X_len, Y_len, Y_char_first) in enumerate(val_loader):
            X = X.to(DEVICE)
            Y_sos = Y_sos.to(DEVICE)
            Y_char_first = Y_char_first.to(DEVICE)

            outputs, _, _ = model(X, X_len, Y_sos, TF=TF, isTrain=False, isVal=True)
            loss = criterion(outputs.view(-1, outputs.shape[2]).float(), Y_char_first.view(-1).long())

            # 1 for real positions, 0 for padding; built without touching Y_sos.
            pad_mask = (Y_sos.reshape(-1) != 0).to(loss.dtype)
            loss_mask = (loss * pad_mask).sum()

            # Greedy characters, compared with the reference up to <eos>.
            _, character = torch.max(outputs.float(), dim=2)
            predicted_rows = character.cpu().tolist()
            gold_rows = Y_char_first.cpu().tolist()
            for predicted, gold in zip(predicted_rows, gold_rows):
                sentence = _indices_to_text(predicted, LETTER_LIST, eos_index)
                sentence_gold = _indices_to_text(gold, LETTER_LIST, eos_index)
                dist += levenshtein_distance(sentence, sentence_gold)

            # NOTE(review): Y_len is one larger per sentence than the number
            # of predicted tokens counted by pad_mask; confirm the intended
            # normaliser for the perplexity.
            cum_words += Y_len.sum().item()
            cum_loss += loss_mask.item()
            running_loss = m.exp(cum_loss / cum_words)

            total_sentences += outputs.shape[0]
            avg_dist = dist / total_sentences

            torch.cuda.empty_cache()

    time_final = time.time() - start
    return running_loss, time_final, avg_dist



## Dataloader.py


In [14]:
import numpy as np
import torch
from torch.utils.data import Dataset 
from torch.nn.utils.rnn import *


In [15]:
def load_data(data_dir='.'):
    '''
    Load the numpy files containing the utterance and transcript data.

    :param data_dir: directory holding train_new.npy, dev_new.npy,
                     test_new.npy, train_transcripts.npy and
                     dev_transcripts.npy; defaults to the working directory
    :return: (speech_train, speech_valid, speech_test,
              transcript_train, transcript_valid)
    '''
    import os

    def _load(filename):
        # NOTE(security): allow_pickle=True unpickles the file and can run
        # arbitrary code; only load files from a trusted source.
        return np.load(os.path.join(data_dir, filename), allow_pickle=True, encoding='bytes')

    speech_train = _load('train_new.npy')
    speech_valid = _load('dev_new.npy')
    speech_test = _load('test_new.npy')

    transcript_train = _load('train_transcripts.npy')
    transcript_valid = _load('dev_transcripts.npy')

    return speech_train, speech_valid, speech_test, transcript_train, transcript_valid


In [16]:
def transform_letter_to_index(transcript, letter_list):
    '''
    Transform alphabetical input to numerical input by replacing each letter
    with its index in letter_list. Every sentence is framed as
    <sos> + letters (words separated by ' ') + <eos>.

    :param transcript: (N, ) sequence of sentences, each a sequence of words
                       given as bytes (decoded as UTF-8) or str
    :param letter_list: vocabulary; must contain '<sos>', '<eos>' and ' '
    :return: 1-D object array of length N holding one integer index array
             per sentence
    :raises KeyError: if a letter is not in letter_list
    '''
    # One dictionary lookup per letter instead of a linear list search.
    # setdefault keeps the first index for duplicates, like list.index.
    index_of = {}
    for idx, symbol in enumerate(letter_list):
        index_of.setdefault(symbol, idx)
    sos, eos, space = index_of['<sos>'], index_of['<eos>'], index_of[' ']

    idx_transcript = []
    for arr in transcript:
        idx_arr = [sos]
        for j, word in enumerate(arr):
            if j > 0:
                idx_arr.append(space)
            str_word = word.decode('UTF-8') if isinstance(word, bytes) else str(word)
            idx_arr.extend(index_of[letter] for letter in str_word)
        # Always close the sentence, including an empty one.
        idx_arr.append(eos)
        idx_transcript.append(np.array(idx_arr))

    # Sentences have different lengths, so fill an object array explicitly;
    # np.array() on ragged rows is rejected by recent NumPy versions.
    nump_idx_transcript = np.empty(len(idx_transcript), dtype=object)
    for i, idx_arr in enumerate(idx_transcript):
        nump_idx_transcript[i] = idx_arr

    return nump_idx_transcript

In [17]:

class Speech2TextDataset(Dataset):
    '''
    Dataset for the speech-to-text data. In training mode every item is a
    (speech, text) pair of tensors; otherwise it is the speech tensor alone.
    '''
    def __init__(self, speech, text=None, isTrain=True):
        '''
        :param speech: indexable collection of per-utterance numpy arrays
        :param text: indexable collection of index sequences aligned with
                     `speech`; required when isTrain is set
        :param isTrain: whether items include the text
        :raises ValueError: if isTrain is set but no text is given
        '''
        if isTrain and text is None:
            # Fail at construction instead of with an AttributeError on the
            # first __getitem__ call.
            raise ValueError("text must be provided when isTrain is True")
        self.speech = speech
        self.isTrain = isTrain
        self.text = text

    def __len__(self):
        return len(self.speech)

    def __getitem__(self, index):
        speech = torch.tensor(self.speech[index].astype(np.float32))
        if self.isTrain:
            return speech, torch.tensor(self.text[index])
        return speech


In [18]:
def collate_train(batch_data):
    '''
    Collate (speech, text) pairs into padded batch tensors.

    :param batch_data: sequence of (speech, text) pairs
    :return: (padded speech, padded text without its last element,
              speech lengths, full text lengths, padded text without its
              first element); padding value is 0 and batch comes first
    '''
    utterances, transcripts = zip(*batch_data)

    # Lengths are measured before the transcripts are shifted.
    utterance_lens = torch.LongTensor([len(u) for u in utterances])
    transcript_lens = torch.LongTensor([len(t) for t in transcripts])

    speech_pad = pad_sequence(
        [torch.FloatTensor(u) for u in utterances],
        batch_first=True, padding_value=0)
    # Decoder input: everything but the last element.
    decoder_in_pad = pad_sequence(
        [torch.LongTensor(t[:-1]) for t in transcripts],
        batch_first=True, padding_value=0)
    # Decoder target: everything but the first element.
    decoder_out_pad = pad_sequence(
        [torch.LongTensor(t[1:]) for t in transcripts],
        batch_first=True, padding_value=0)

    return speech_pad, decoder_in_pad, utterance_lens, transcript_lens, decoder_out_pad

In [19]:
def collate_test(batch_data):
    '''
    Collate unlabelled utterances into one zero-padded, batch-first tensor.

    :param batch_data: sequence of per-utterance speech items
    :return: (padded speech, speech lengths)
    '''
    utterance_lens = torch.LongTensor([len(u) for u in batch_data])
    speech_pad = pad_sequence(
        [torch.FloatTensor(u) for u in batch_data],
        batch_first=True, padding_value=0)
    return speech_pad, utterance_lens


## Main.py


In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.distributions.gumbel import Gumbel

In [21]:
# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Output vocabulary; a character's position in this list is its index.
LETTER_LIST = [
    '<pad>',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    '-', "'", '.', '_', '+', ' ',
    '<sos>', '<eos>',
]

print(len(LETTER_LIST))
print(DEVICE)

35
cuda


In [22]:
def weights_init(layer):
    '''
    Initialise one module in place; intended for use with `module.apply`.

    Embedding weights are drawn uniformly from [-0.1, 0.1]. For LSTM and
    LSTMCell modules, parameters with two or more dimensions get an
    orthogonal initialisation and the remaining ones a standard normal.
    Every other module type is left untouched.
    '''
    if isinstance(layer, nn.Embedding):
        torch.nn.init.uniform_(layer.weight.data, a=-0.1, b=0.1)
        return

    if not isinstance(layer, (nn.LSTM, nn.LSTMCell)):
        return

    for tensor in layer.parameters():
        if tensor.dim() >= 2:
            torch.nn.init.orthogonal_(tensor.data)
        else:
            torch.nn.init.normal_(tensor.data)

In [23]:
# Build the model, optimizer and loss. The names defined in this cell
# (model, optimizer, criterion, nepochs, batch_size, the loaders and
# speech_test) are read by the later training and inference cells.
# NOTE(review): input_dim=40 presumably matches the number of features per
# speech frame in the .npy files — confirm against the data.
model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), hidden_dim=256)
model.apply(weights_init)
optimizer = optim.Adam(model.parameters(), lr=1e-3) 
# reduction='none' keeps one loss value per position so that train() and
# val() can multiply it by their padding mask before summing.
criterion = nn.CrossEntropyLoss(reduction='none')
nepochs = 100
# Use a tiny batch when running on the CPU.
batch_size = 64 if DEVICE == 'cuda' else 2

speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()

# Convert the transcripts to index sequences over LETTER_LIST.
character_text_train = transform_letter_to_index(transcript_train, LETTER_LIST)
character_text_valid = transform_letter_to_index(transcript_valid, LETTER_LIST)

train_dataset = Speech2TextDataset(speech_train, character_text_train)
val_dataset = Speech2TextDataset(speech_valid, character_text_valid)

# Both loaders use collate_train because both datasets yield (speech, text).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_train)


In [None]:
# Main training loop: one train pass and one validation pass per epoch, with
# the results printed and appended to `<name>_logs.txt`, and a checkpoint
# saved after every epoch.
# NOTE(review): `name` (file prefix) and `TF` (teacher-forcing value) are not
# defined in the cells shown here — confirm they are set before this runs.
for epoch in range(1, nepochs+1):
    # Teacher-forcing schedule: raise TF by 0.1 at these epochs.
    if epoch in (4, 7, 10, 25, 40):
        TF += 0.1
    # Learning-rate schedule. A fresh Adam optimizer is created, which also
    # discards the previous optimizer state.
    if epoch == 25: optimizer = optim.Adam(model.parameters(), lr=5e-4)
    if epoch == 30: optimizer = optim.Adam(model.parameters(), lr=1e-4)
#     model.load_state_dict(torch.load('normalize_weight_init_epoch_' + str(epoch) + '_model.pt'))

    # The context manager closes the log even if train() or val() raises.
    with open(name + "_logs.txt", "a") as log:
        # train
        perplexity_train, time_train = train(model, train_loader, criterion, optimizer, epoch, TF)
        print('Epoch: {:.0f} Train Time: {:.4f}s Train Perplexity: {:.4f}'.format(epoch, time_train, perplexity_train))
        log.write('Epoch: {:.0f} Train Time: {:.4f}s Train Perplexity: {:.4f}\n'.format(epoch, time_train, perplexity_train))

        # val
        perplexity_val, time_val, avg_dist = val(model, val_loader, criterion, optimizer, epoch, TF)
        print('Epoch: {:.0f} Val Time: {:.4f}s Val Perplexity: {:.4f} Val dist: {:.2f}'.format(epoch, time_val, perplexity_val, avg_dist))
        log.write('Epoch: {:.0f} Val Time: {:.4f}s Val Perplexity: {:.4f} Val dist: {:.2f}\n'.format(epoch, time_val, perplexity_val, avg_dist))

        print('='*60)
        log.write('='*60 + '\n')

    # save model
    torch.save(model.state_dict(), name + '_epoch_' + str(epoch) + '_model.pt')
#     torch.save(optimizer.state_dict(), name + '_epoch_' + str(epoch) + '_optimizer.pt')

## Write_test.py



In [30]:
# Unlabelled test split: items are speech tensors only, kept in file order.
test_dataset = Speech2TextDataset(speech_test, text=None, isTrain=False)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_test,
)

In [31]:
# Open the submission file and write the CSV header. The handle stays open
# on purpose: the inference cell below writes one row per utterance to it
# and then closes it.
testout = open("submission.csv", "w")
testout.write('Id,Predicted\n')

13

In [32]:

# Load the chosen checkpoint, greedily decode the test set and write one
# "<id>,<sentence>" row per utterance to the already-open submission file.
# map_location lets a checkpoint saved on the GPU load on a CPU-only machine.
model.load_state_dict(torch.load("weight_gumbel_epoch_83_model.pt", map_location=DEVICE))

model.eval()
model.to(DEVICE)
batch_size = 64  # kept for later cells; ids below no longer depend on it
eos_index = LETTER_LIST.index('<eos>')

# Running id instead of batch_idx * batch_size + i: test_loader was built
# with the earlier batch_size, which is not 64 on the CPU.
next_id = 0
try:
    with torch.no_grad():
        for batch_idx, (X, X_len) in enumerate(test_loader):
            X = X.to(DEVICE)

            # Greedy search; TF is only read by the decoder in training mode.
            outputs, _, _ = model(X, X_len, None, TF=0.0, isTrain=False, isVal=False)
            _, character = torch.max(outputs.float(), dim=2)

            for row in character.cpu().tolist():
                # <eos> and everything after it is not part of the sentence.
                if eos_index in row:
                    row = row[:row.index(eos_index)]
                sentence = ''.join(LETTER_LIST[j] for j in row)

                testout.write(str(next_id) + "," + sentence + '\n')
                next_id += 1
finally:
    # Close the submission file even if decoding fails part-way.
    testout.close()