In [24]:
DATA_PATH = "data/"
MODEL_PATH = "models/"
num_workers = 4
BATCH_SIZE = 64

In [25]:
!pip install editdistance

[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
[33mYou are using pip version 10.0.1, however version 20.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [26]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from torch import nn
from torch.nn.utils.rnn import *
import os
from torch.autograd import Variable
import torch.nn.functional as F
import time
import sys
import editdistance
import pickle
import random
import editdistance

In [27]:
device = torch.device('cuda:0')
# set SEED
os.environ["SEED"] = "999"
torch.manual_seed(999)
np.random.seed(0)

In [28]:
!nvidia-smi

Sun May  3 23:58:08 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            On   | 00000000:00:1E.0 Off |                    0 |
| N/A   49C    P0    27W /  70W |    662MiB / 15109MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [29]:
class Vocabulary(object):
    def __init__(self):
        self.char2idx = {}
        self.char_list = []
        self.size = 0

    def add_char(self, char):
        if not char in self.char2idx:
            self.char2idx[char] = self.size
            self.char_list.append(char)
            self.size += 1

    def __call__(self, char):
        if not char in self.char2idx:
            return self.char2idx['<unk>']
        return self.char2idx[char]

    def __len__(self):
        return self.size

In [30]:
def build_vocab(overwrite = True):
    vocab_file = DATA_PATH + 'vocab.pkl'
    # if file exist then load files
    if os.path.exists(vocab_file) and (overwrite == False):
        print ("loading pre-extracted vocabulary")
        vocab = pickle.load(open(DATA_PATH + 'vocab.pkl', 'rb'))
        return vocab

    print ("Loading data...")
    #dev_transcripts = np.load(DATA_PATH+"dev_transcripts.npy", allow_pickle=True)
    transcripts = np.load(DATA_PATH+"train_transcripts.npy", allow_pickle=True)
    #transcripts = np.append(transcripts, dev_transcripts)
    print("Loaded data.")

    vocab = Vocabulary()
    vocab.add_char('<pad>')
    vocab.add_char('<sos>')
    vocab.add_char('<eos>')
    vocab.add_char('<unk>')
    vocab.add_char(' ')

    for s in transcripts:
        for w in s:
            for c in w.decode():
                vocab.add_char(c)

    print("Total character size: {}".format(len(vocab)))
    pickle.dump(vocab, open(vocab_file, 'wb'))
    return vocab

In [31]:
vocab = build_vocab(overwrite=True)

Loading data...
Loaded data.
Total character size: 36


In [32]:
print (vocab.char_list)

['<pad>', '<sos>', '<eos>', '<unk>', ' ', 'T', 'H', 'E', 'F', 'M', 'A', 'L', 'P', 'R', 'O', 'D', 'U', 'C', 'S', 'I', 'W', 'Y', 'N', 'G', 'V', 'B', 'K', 'Q', 'X', 'J', 'Z', '-', "'", '.', '_', '+']


In [33]:
def load_data(x_path,y_path=""):
    x = np.load(x_path, allow_pickle=True,encoding='bytes')
    print ("X:")
    print ("Number of utterances " + str(x.shape[0]))
    print ("Number of dimentions " + str(x[0].shape[1]))
    print ("Avg length of utterances " + str(np.mean([i.shape[0] for i in x])))
    if y_path:
        print ("Y:")
        transcripts = np.load(y_path, allow_pickle=True,encoding='bytes')
        #print (transcripts)
        y = []
        for s in transcripts:
            new = []
            new.append(vocab("<sos>"))
            for w in s:
                for c in w.decode():
                    new.append(vocab(c))
                new.append(vocab(" "))
            new.append(vocab("<eos>"))
            y.append(np.array(new))
        y = np.array(y)
        print ("Avg length of transcripts in char " + str(np.mean([i.shape[0] for i in y])))
        return x, y
    return x

In [34]:
dev_x, dev_y = load_data(DATA_PATH+"dev_new.npy",y_path=DATA_PATH+"dev_transcripts.npy")

X:
Number of utterances 1106
Number of dimentions 40
Avg length of utterances 626.7793851717902
Y:
Avg length of transcripts in char 100.27305605786619


In [35]:
dev_y[0].shape

(77,)

In [36]:
class Speech2TextDataset(Dataset):
    '''
    Dataset class for the speech to text data, this may need some tweaking in the
    getitem method as your implementation in the collate function may be different from
    ours. 
    '''
    def __init__(self, speech, text=None, isTrain=True):
        self.speech = speech
        self.isTrain = isTrain
        if (isTrain):
            self.text = text

    def __len__(self):
        return self.speech.shape[0]

    def __getitem__(self, index):
        if (self.isTrain == True):
            return torch.tensor(self.speech[index].astype(np.float32)), torch.tensor(self.text[index])
        else:
            return torch.tensor(self.speech[index].astype(np.float32))

In [37]:
def collate_pad(batch):
    batch_size = len(batch)
    #order = list(range(batch_size))
    #batch = zip(batch, order)
    #batch = sorted(batch, key=lambda x: x[0][0].shape[0], reverse=True) # sort, decreasing seq length
    #order = [i[1] for i in batch]
    #batch = [i[0] for i in batch]
    if len(batch[0]) == 2:
        x, y = zip(*batch)
    else:
        x = batch
        y = None
    
    x_len = torch.LongTensor([i.shape[0] for i in x])
    x = pad_sequence(x)
    if y != None:
        y_len = torch.LongTensor([i.shape[0] for i in y])
        y = pad_sequence(y)
        return x, x_len, y, y_len
    else:
        return x, x_len#, order

In [38]:
dev_dataloader = DataLoader(Speech2TextDataset(dev_x, dev_y), 
                            shuffle=False, 
                            batch_size=BATCH_SIZE, 
                            collate_fn = collate_pad,
                            pin_memory=True)

In [39]:
#train_dataloader = dev_dataloader

In [40]:
train_x, train_y = load_data(DATA_PATH+"train_new.npy",y_path=DATA_PATH+"train_transcripts.npy")
train_dataloader = DataLoader(Speech2TextDataset(train_x, train_y), 
                              shuffle=True, 
                              batch_size=BATCH_SIZE, 
                              collate_fn = collate_pad,
                              pin_memory=True)

X:
Number of utterances 24724
Number of dimentions 40
Avg length of utterances 651.3022164698269
Y:
Avg length of transcripts in char 105.88023782559456


In [41]:
len(train_dataloader)

387

In [42]:
for x, x_len, y, y_len in train_dataloader:
    print (x.shape) #T, batch, in_dim
    print (x_len)
    print (y.shape) #T, batch
    print (y_len)
    break

torch.Size([1350, 64, 40])
tensor([ 552,  744,  435,  893,  375,  516,  606,  515,  437,  515,  627,  575,
         557,  790,  197,  394,  642, 1017,  594,  565,  496,  593, 1073,  373,
         752,  619, 1175, 1030,  776,  836,  198, 1350,  593,  400, 1101, 1233,
         806,  846,  490,  659,  898,  523, 1151,  532,  540,  552,  980,  947,
         713,  627,  888,  780,  605,  954,  977,  384,  621,  312,  736,  627,
         381,  516,  578, 1101])
torch.Size([209, 64])
tensor([ 84, 108,  73, 145,  73,  82,  90,  84,  70,  91, 111,  83, 101, 152,
         21,  76,  83, 160,  94,  94,  82, 106, 166,  52, 102, 124, 169, 150,
        138, 112,  26, 188,  89,  53, 139, 184, 133, 150,  87, 120, 168,  89,
        209,  78,  79,  90, 163, 174, 136,  91, 134, 154, 121, 163, 153,  52,
        126,  55, 134,  86,  49,  78,  88, 176])


In [43]:
class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM
    The length of utterance (speech input) can be hundereds to thousands of frames long.
    The Paper reports that a direct LSTM implementation as Encoder resulted in slow convergence,
    and inferior results even after extensive training.
    The major reason is inability of AttendAndSpell operation to extract relevant information
    from a large number of input steps.
    '''
    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_size=input_dim * 2, hidden_size=hidden_dim, num_layers=1, bidirectional=True)

    def forward(self, x):
        '''
        :param x : (L * N * in_dim) input to the pBLSTM --> packed
        :return output: (L, N, H) encoded sequence from pyramidal Bi-LSTM 
        '''
        x, lengths = pad_packed_sequence(x)
        x = x.transpose(0, 1).contiguous() #N * T * in_dim

        T = x.shape[1]
        
        if T % 2 != 0:
            T -= 1
            x = x[:, :T, :]
        x = x.view(x.shape[0], T//2, x.shape[2] * 2).contiguous()
        #x = x.contiguous().view(x.shape[0], T//2, 2, x.shape[2])
        #x = torch.mean(x, dim=3) # N, T//2 * in_dim*2
        x = x.transpose(0, 1).contiguous() # T//2 * N * in_dim*2
        x = pack_padded_sequence(x, lengths//2, enforce_sorted=False)
        output = self.blstm(x)[0]
        return output

In [44]:
class Attention(nn.Module):
    '''
    Attention is calculated using key, value and query from Encoder and decoder.
    Below are the set of operations you need to perform for computing attention:
        energy = bmm(key, query)
        attention = softmax(energy)
        context = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value,lengths):
        '''
        :param query :(N, context_size) Query is the output of LSTMCell from Decoder
        :param key: (T, N, key_size) Key Projection from Encoder per time step
        :param value: (T, N, value_size) Value Projection from Encoder per time step
        :return output: Attended Context
        :return attention_mask: Attention mask that can be plotted  
        '''
        key = key.transpose(0,1)#(N, T, key_szie)
        value = value.transpose(0,1)#(N, T, value_size)
        attention = torch.bmm(key, query.unsqueeze(2)).squeeze(2) #(batch_size, max_len)
        mask = torch.arange(key.size(1)).unsqueeze(0) < lengths.unsqueeze(1)
        mask = mask.cuda()
        attention = nn.functional.softmax(attention, dim=1)
        attention = mask.float()*attention
        attention = F.normalize(attention, dim=1, p=1)

        out = torch.bmm(attention.unsqueeze(1), value).squeeze(1)

        return out, attention

In [45]:
class Encoder(nn.Module):
    '''
    Encoder takes the utterances as inputs and returns the key and value.
    Key and value are nothing but simple projections of the output from pBLSTM network.
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128,key_size=128):
        super(Encoder, self).__init__()
        self.blstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        
        ### Add code to define the blocks of pBLSTMs! ###
        pBLSTMs = [pBLSTM(hidden_dim * 2, hidden_dim),
                   pBLSTM(hidden_dim * 2, hidden_dim),
                   pBLSTM(hidden_dim * 2, hidden_dim)]
        self.pBLSTMs = torch.nn.ModuleList(pBLSTMs)

        self.key_network = nn.Linear(hidden_dim*2, value_size)
        self.value_network = nn.Linear(hidden_dim*2, key_size)

    def forward(self, x, lens):
        rnn_inp = pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        outputs, _ = self.blstm(rnn_inp) #outputs: L x N x H*2

        ### Use the outputs and pass it through the pBLSTM blocks! ###
        for pBLSTM in self.pBLSTMs:
            outputs = pBLSTM(outputs) 

        linear_input, lens = pad_packed_sequence(outputs)
        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)

        return keys, value, lens

In [46]:
class Decoder(nn.Module):
    '''
    As mentioned in a previous recitation, each forward call of decoder deals with just one time step, 
    thus we use LSTMCell instead of LSLTM here.
    The output from the second LSTMCell can be used as query here for attention module.
    In place of value that we get from the attention, this can be replace by context we get from the attention.
    Methods like Gumble noise and teacher forcing can also be incorporated for improving the performance.
    '''
    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size) #query
        self.query_layer = nn.Linear(hidden_dim, hidden_dim)

        self.isAttended = isAttended
        if (isAttended == True):
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)

    def forward(self, key, values, lengths, text=None, isTrain=True, tf = 0.3):
        '''
        :param key :(T, N, key_size) Output of the Encoder Key projection layer
        :param values: (T, N, value_size) Output of the Encoder Value projection layer
        :param text: (N, text_len) Batch input of text with text_length
        :param isTrain: Train or eval mode
        :return predictions: Returns the character perdiction probability 
        '''
        #print ("key")
        #print (key.shape) #130 64 128
        #print ("values")
        #print (values.shape)
        batch_size = key.shape[1]

        if (isTrain == True):
            #print ("txt")
            #print (text.shape)
            max_len =  text.shape[0]
            embeddings = self.embedding(text) #[L, N, dim]
            #print ("embeddings")
            #print (embeddings.shape) 
        else:
            max_len = 250

        predictions = []
        hidden_states = [None, None]
        attentions = []
        prediction = torch.ones(batch_size,1).cuda()
        context=values[0,:,:]
        for i in range(max_len - 1):
            # * Implement Gumble noise and teacher forcing techniques 
            # * When attention is True, replace values[i,:,:] with the context you get from attention.
            # * If you haven't implemented attention yet, then you may want to check the index and break 
            #   out of the loop so you do you do not get index out of range errors. 
            if (isTrain):
                if random.random() < tf:
                    char_embed = self.embedding(prediction.argmax(dim=-1))
                else:
                    char_embed = embeddings[i,:,:]
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))

            inp = torch.cat([char_embed, context], dim=1)
            #print ('inp')
            #print (inp.shape)
            hidden_states[0] = self.lstm1(inp, hidden_states[0])
            
            inp_2 = hidden_states[0][0]
            #print ("inp_2")
            #print (inp_2.shape)
            hidden_states[1] = self.lstm2(inp_2, hidden_states[1])
            
            output = hidden_states[1][0]
            ### Compute attention from the output of the second LSTM Cell ###
            query = self.query_layer(output)
            if self.isAttended:
                context, attention = self.attention(query, key, values, lengths)
                attentions.append(attention.detach())

            prediction = self.character_prob(torch.cat([output, context], dim=1)) #N*V
            predictions.append(prediction.unsqueeze(1))
  
        return torch.cat(predictions, dim=1), torch.stack(attentions) #short_in_dim * N * out_len

In [49]:
class Seq2Seq(nn.Module):
    '''
    We train an end-to-end sequence to sequence model comprising of Encoder and Decoder.
    This is simply a wrapper "model" for your encoder and decoder.
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim)
        self.decoder = Decoder(vocab_size, hidden_dim)

    def forward(self, speech_input, speech_len, text_input=None, isTrain=True, tf = 0.3):
        key, value, lengths = self.encoder(speech_input, speech_len)
        if (isTrain == True):
            predictions, attentions = self.decoder(key, value, lengths, text=text_input, tf = tf)
            
        else:
            predictions, attentions = self.decoder(key, value, lengths, isTrain=False, tf = tf)
        return predictions, attentions
        

def init_model(m):
    if isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight.data)
        nn.init.normal_(m.bias.data)
    if isinstance(m, nn.LSTMCell) or isinstance(m, nn.GRUCell):
        for name, param in m.named_parameters():
            if 'weight' in name:
                nn.init.orthogonal_(param.data)
            if 'bias' in name:
                nn.init.normal_(param.data)

In [50]:
model = Seq2Seq(input_dim=40, vocab_size=len(vocab), hidden_dim=128)
model.apply(init_model)
model = model.cuda()
print (model)

Seq2Seq(
  (encoder): Encoder(
    (blstm): LSTM(40, 128, bidirectional=True)
    (pBLSTMs): ModuleList(
      (0): pBLSTM(
        (blstm): LSTM(512, 128, bidirectional=True)
      )
      (1): pBLSTM(
        (blstm): LSTM(512, 128, bidirectional=True)
      )
      (2): pBLSTM(
        (blstm): LSTM(512, 128, bidirectional=True)
      )
    )
    (key_network): Linear(in_features=256, out_features=128, bias=True)
    (value_network): Linear(in_features=256, out_features=128, bias=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(36, 128, padding_idx=0)
    (lstm1): LSTMCell(256, 128)
    (lstm2): LSTMCell(128, 128)
    (query_layer): Linear(in_features=128, out_features=128, bias=True)
    (attention): Attention()
    (character_prob): Linear(in_features=256, out_features=36, bias=True)
  )
)


In [51]:
model.load_state_dict(torch.load(MODEL_PATH+'1588543810_3.pt'))

<All keys matched successfully>

In [52]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1, threshold=0.1, verbose=True)
criterion = nn.CrossEntropyLoss(reduction = 'none').cuda()

In [53]:
def get_distance(predict,real):
    distance = 0
    total = len(predict)
    for i in range(total):
        distance += editdistance.eval(predict[i], real[i])
    return distance/total

In [54]:
def plot_weights(attentions, epoch):
    fig = plt.figure()
    plt.imshow(attentions, interpolation='nearest', cmap='hot')
    fig.savefig("epoch%d.png" % (epoch))
    plt.close()

In [55]:
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt

def plot_grad_flow(parameters):
    ave_grads = []
    max_grads = []
    layers = []
    for n, p in parameters:
        if p.requires_grad and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean())
            max_grads.append(p.grad.abs().max())
    plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")
    plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads) + 1, lw=2, color="k")
    plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom=-0.001, top=0.02)
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D ([0], [0], color="c", lw=4),
                Line2D ([0], [0], color="b", lw=4),
                Line2D ([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])
    plt.savefig('gradient.png', bbox_inches='tight')
    plt.close()

In [None]:
mean_train_losses = []
mean_train_perplexity = []
mean_valid_losses = []
mean_valid_distance = []
mean_valid_perplexity = []
epochs = 24
best_model = None
for epoch in range(epochs):
    model.train()
    train_losses = []
    train_perplexity = []
    for k, (x_batch, x_len, y_batch, y_len) in tqdm(enumerate(train_dataloader)):
        with torch.autograd.set_detect_anomaly(True):
            optimizer.zero_grad()
            x_batch = x_batch.cuda() #T, N, 40
            y_batch = y_batch.cuda() #L, N
            predictions, attentions = model(x_batch, x_len, text_input=y_batch, isTrain=True, tf = 0.4) # N, L-1, dim
            labels = y_batch.transpose(0,1)[:, 1:]
            loss = criterion(predictions.contiguous().view(-1, len(vocab)), labels.contiguous().view(-1)) 
            mask = torch.zeros(labels.size())
            for i in range(len(y_len)):
                mask[i, :y_len[i]-1] = 1
            mask = mask.view(-1).cuda()
            masked_loss = loss * mask
            loss = torch.sum(masked_loss)
            loss = loss/torch.sum(mask)
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), 2)
            
            
            perplexity = torch.exp(torch.mean(masked_loss))
            
            if k %50 == 0:
                _, output = torch.max(predictions, dim=2)

                pred = "".join(vocab.char_list[x] for x in output[0][:y_len[0]-2].tolist())
                true = "".join(vocab.char_list[x] for x in labels[0][:y_len[0]-2].tolist())
                print ("=====")
                print ("%s\n%s"%(true,pred))
                print ("======")
            
            train_perplexity.append(perplexity.item())
            train_losses.append(loss.item())
            print (np.mean(loss.item()))
            plot_weights(attentions.detach().cpu().numpy()[:, -1], epoch)
            plot_grad_flow(model.named_parameters())
            optimizer.step()
    

    model.eval()
    valid_losses = []
    valid_distance = []
    valid_perplexity = []
    with torch.no_grad():
        for k, (x_batch, x_len, y_batch, y_len) in tqdm(enumerate(dev_dataloader)):
            x_batch = x_batch.cuda() #T, N, 40
            y_batch = y_batch.cuda() #N, L
            print (x_batch.shape)
            predictions, _ = model(x_batch, x_len, text_input = y_batch, isTrain=False) # N, L-1, dim
            predictions = predictions[:, :y_batch.shape[0]-1, :]
            labels = y_batch.transpose(0,1)[:, 1:]
            loss = criterion(predictions.contiguous().view(-1, len(vocab)), labels.contiguous().view(-1)) 
            mask = torch.zeros(labels.size())
            for i in range(len(y_len)):
                mask[i, :y_len[i]-1] = 1
            mask = mask.view(-1).cuda()
            masked_loss = loss * mask
            loss = torch.sum(masked_loss)/torch.sum(mask)
            perplexity = torch.mean(torch.exp(loss))
            
            _, output = torch.max(predictions, dim=2)
            
            preds = []
            reals = []
            #print (labels.shape)
            #print (output.shape)
            for i in range(output.size(0)):
                end_pos = (output[i].tolist().index(vocab.char2idx['<eos>']) if vocab.char2idx['<eos>'] in output[i].tolist() else output[i].size(0))
                pred = "".join(vocab.char_list[x] for x in output[i].tolist()[:end_pos])
                true = "".join(vocab.char_list[x] for x in labels[i][:y_len[i]-2].tolist())
                preds.append(pred)
                reals.append(true)
            print ("=====")
            print ("%s\n%s"%(true,pred))
            print ("======")
            valid_distance.append(get_distance(preds,reals))
            valid_losses.append(loss.item())
            valid_perplexity.append(perplexity.item())
    
    mean_train_losses.append(np.mean(train_losses))
    mean_valid_losses.append(np.mean(valid_losses))
    mean_valid_perplexity.append(np.mean(valid_perplexity))
    mean_valid_distance.append(np.mean(valid_distance))
    scheduler.step(np.mean(valid_distance))
    if (best_model is None) or (np.mean(valid_distance) < min(mean_valid_distance)):
        best_model = model

    
    print('epoch {}: train loss : {:.4f}, train perplexity : {:.4f}, valid loss : {:.4f}, valid perplexity: {:.2f} valid distance : {:.2f}'\
         .format(epoch+1, np.mean(train_losses),np.mean(train_perplexity), np.mean(valid_losses), np.mean(valid_perplexity), np.mean(valid_distance)))

    torch.save(model.state_dict(), MODEL_PATH+'%d_%d.pt'%(int(time.time()), epoch))
torch.save(best_model.state_dict(), MODEL_PATH+'best_%d.pt'%int(time.time()))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



=====
ALMOST ALL STUDENTS WHO ARE ACCEPTED INTO MEDICAL SCHOOLS OBTAIN A MEDICAL DEGREE 
ALMOST ALL STUDENTS WHO ARE ACCEPTED INTO MEDICAL SCHOOLS OBTAIN A MEDICAL DEGREE 
0.3144131600856781
0.34172675013542175
0.36042365431785583
0.31770119071006775
0.23715148866176605
0.37366628646850586
0.3135872483253479
0.2504916191101074
0.28825119137763977
0.32774338126182556
0.2699856162071228
0.3087587356567383
0.3003072738647461
0.25621938705444336
0.23564964532852173
0.34931278228759766
0.24299149215221405
0.2870831787586212
0.3042893707752228
0.3216700553894043
0.31491464376449585
0.26117753982543945
0.2773367166519165
0.23107270896434784
0.30117085576057434
0.24976834654808044
0.269864022731781
0.28853997588157654
0.31672102212905884
0.24410520493984222
0.26866576075553894
0.32528823614120483
0.3760913014411926
0.24864813685417175
0.2566514313220978
0.40447914600372314
0.252943754196167
0.2764131426811218
0.4487375319004059
0.2985667288303375
0.26057925820350647
0.2723957896232605
0.313454

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

torch.Size([1043, 64, 40])
=====
PAINEWEBBER WAS ONE OF THE EARLY WALL STREET FIRMS TO GET INTO VENTURE CAPITAL 
PAIN WEBERER WAS ONE OF THE EARLY WALL STREET FIRMS TO GET INTEVENTURE CAPITI PIPINE 
torch.Size([1237, 64, 40])
=====
MUCH OF THE GROUND BEEF CONSUMED IN THE UNITED STATES COMES FROM DAIRY COWS 
MUCH OF THE GROUND BEEF CONSUMED THIL THE UNITED STATES COMES FROM DAIRY COWS 
torch.Size([1335, 64, 40])
=====
FOR INVESTORS WILLING TO TAKE MORE RISK COMMA THE NEARBY CONTRACT MONTH COMMA WHICH TRADED WITHOUT PRICE LIMITS COMMA WAS ONE ALTERNATIVE PERIOD 
FOR INVESTORS LINE TO TAKE MORE RISE COMMA THE NEAR BY CONTRACT MONTH COMMA WHICH TRADE WITH OUT PRISELY ITS COMMA WITH WALL TERNATIVE PERIOD 
torch.Size([1166, 64, 40])
=====
IN ONE SENSE THE U. S. ADMINISTRATION'S SUPPORT OF THE MEXICAN PLAN ISOLATES BANKAMERICA AND MANUFACTURERS HANOVER 
IN ONE CENTS THE USEMED MINISTRATION SUPPORT OF THE NEXT CONPLAN I  Y LATES BANK AMERICA MANUFACTORS AND OVER 
torch.Size([1179, 64, 40])
===

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

=====
DOUBLE-QUOTE THERE IS AN ATMOSPHERE OF DISTRUST THROUGHOUT THE MINISTRY COMMA DOUBLE-QUOTE SAYS MR. MIRANDA COMMA WHO NOW GIVES FRENCH LESSONS HERE PERIOD 
DOUBLE-QUOTE THERE IS AN ATMOSPHERE DISDISTRUST THROUGHOUT THE MINISTRY COMMA DOUBLE-QUOTE SAYS MR. MARRNDA COMMA WHO DOW GISE  FRENCH LUSI  ' HERE <eos>ERIOD 
0.33252885937690735
0.2841033339500427
0.24754643440246582
0.24213308095932007
0.2609356641769409
0.27128729224205017
0.3056928813457489
0.3985348641872406
0.3035445511341095
0.30865392088890076
0.2793140113353729
0.29716628789901733
0.36483484506607056
0.2196824550628662
0.2812917232513428
0.29388052225112915
0.23191846907138824
0.26346611976623535
0.28213033080101013
0.27604940533638
0.23919489979743958
0.3168918192386627
0.20282702147960663
0.37879812717437744
0.25047624111175537
0.23071548342704773
0.18166100978851318
0.24045626819133759
0.22723908722400665
0.24340033531188965
0.2901761531829834
0.23248420655727386
0.2524130642414093
0.26299047470092773
0.2213990986

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

torch.Size([1043, 64, 40])
=====
PAINEWEBBER WAS ONE OF THE EARLY WALL STREET FIRMS TO GET INTO VENTURE CAPITAL 
PAIN WEEBER WAS ONE OF THE EARLY WALL STREEN FIRMS TO GET INTERVENTURE CAPITA PAINT 
torch.Size([1237, 64, 40])
=====
MUCH OF THE GROUND BEEF CONSUMED IN THE UNITED STATES COMES FROM DAIRY COWS 
MUCH OF THE GROUND BEEF CONSUMED TO THE UNITED STATES COMES FROM DAIRY COWS 
torch.Size([1335, 64, 40])
=====
FOR INVESTORS WILLING TO TAKE MORE RISK COMMA THE NEARBY CONTRACT MONTH COMMA WHICH TRADED WITHOUT PRICE LIMITS COMMA WAS ONE ALTERNATIVE PERIOD 
FOR INVESTOR LINETO TAKE MORE RIS COMMA THE NEARBY CONTRACT MONTH COMMA WHICH TRADED WITH OUT PRISELYMITS COMMA WILL WALL TERTATED PERIOD 
torch.Size([1166, 64, 40])
=====
IN ONE SENSE THE U. S. ADMINISTRATION'S SUPPORT OF THE MEXICAN PLAN ISOLATES BANKAMERICA AND MANUFACTURERS HANOVER 
IN ONE CENTS THE HAS BED MINISTRATION SUPPORT OF THE MEXT CONPLAN ISLATES BANKAMARIPA A MANUFACTORS AND OVER 
torch.Size([1179, 64, 40])
=====
THE E

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

=====
P. N. C.'S MERGER WITH CITIZENS LAST YEAR GAVE IT ACCESS TO THE THRIVING MID HYPHEN SOUTH REGION PERIOD 
P. N. C.'S MERGER WITH CITIZENS LAST YEAR GAVIN T ACCESS TO THE THRIVING MID HYPHEN SOUTH REGION PERIOD 
0.2957105040550232
0.21666871011257172
0.24641340970993042
0.22243475914001465
0.22376132011413574
0.21776805818080902
0.19745168089866638
0.22591395676136017
0.20546755194664001
0.2232978790998459
0.2120232880115509
0.21020616590976715
0.18969805538654327
0.1959037184715271
0.24996982514858246
0.192496657371521
0.27565184235572815
0.23428481817245483
0.23316651582717896
0.15325772762298584
0.28169456124305725
0.20304058492183685
0.2419043332338333
0.23974137008190155
0.17602184414863586
0.22536325454711914
0.25026604533195496
0.2443057745695114
0.23873770236968994
0.20914320647716522
0.1828048974275589
0.18811634182929993
0.19959327578544617
0.1813986748456955
0.15150536596775055
0.21635854244232178
0.2954350709915161
0.22435227036476135
0.21302051842212677
0.3236799836158

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

torch.Size([1043, 64, 40])
=====
PAINEWEBBER WAS ONE OF THE EARLY WALL STREET FIRMS TO GET INTO VENTURE CAPITAL 
PAIN WEBBER WAS ONE OF THE EARLY WALL STREET FIRMS TO GET INTEVENTURE CAPITA 
torch.Size([1237, 64, 40])
=====
MUCH OF THE GROUND BEEF CONSUMED IN THE UNITED STATES COMES FROM DAIRY COWS 
MUCH OF THE GROUND BEEF CONSUMED THE UNITED STATES COMES FROM DAIRY COWS 
torch.Size([1335, 64, 40])
=====
FOR INVESTORS WILLING TO TAKE MORE RISK COMMA THE NEARBY CONTRACT MONTH COMMA WHICH TRADED WITHOUT PRICE LIMITS COMMA WAS ONE ALTERNATIVE PERIOD 
FOR INVESTOR LINETO TAKE MORE RIS COMMA THE NEARBY CONTRACT MONTH COMMA WHICH TRADED WITHOUT PRISELY MITS COMMA WILL WALL TERNATIVE PERIOD 
torch.Size([1166, 64, 40])
=====
IN ONE SENSE THE U. S. ADMINISTRATION'S SUPPORT OF THE MEXICAN PLAN ISOLATES BANKAMERICA AND MANUFACTURERS HANOVER 
IN ONE CENTS THE USEM DADMINISTRATION SUPPORT OF THE MEXT CONPLAN I FLAIDS BANK AMERICA  MANUFACTORS AND OVER 
torch.Size([1179, 64, 40])
=====
THE ENORMOUS 

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

=====
PHILOSOPHERS OF EDUCATION OFTEN DIFFER IN THEIR VIEWS ON THE NATURE OF KNOWLEDGE 
PHILOSOPHERS OF EDUCATION OFTEN DIFFER IN THEIR VIEWS ON THE NATURE OF KNOWLEDGE 
0.21108156442642212
0.19690877199172974
0.11958964169025421
0.16798503696918488
0.14637507498264313
0.18679717183113098
0.15929493308067322
0.16076044738292694
0.17127877473831177
0.140704944729805
0.1855994313955307
0.17050617933273315
0.14416556060314178
0.17253975570201874
0.25059932470321655
0.14550605416297913
0.14178623259067535
0.13688504695892334
0.1668386608362198
0.18115033209323883
0.20015180110931396
0.19376088678836823
0.24234478175640106
0.15519392490386963
0.1565723419189453
0.15509240329265594
0.16573035717010498
0.24360617995262146
0.13959351181983948
0.19003820419311523
0.1929168999195099
0.16576050221920013
0.18515774607658386
0.16863863170146942
0.13171932101249695
0.12055066227912903
0.2051866352558136
0.13996702432632446
0.1257396936416626
0.17377786338329315
0.13038772344589233
0.1638005375862121

In [None]:
torch.save(model.state_dict(), MODEL_PATH+'%d_%d.pt'%(int(time.time()), epoch))

In [None]:
test_x = load_data(DATA_PATH+"test_new.npy")
test_dataloader = DataLoader(Speech2TextDataset(text_x, isTrain = False), 
                              shuffle=False, 
                              batch_size=BATCH_SIZE, 
                              collate_fn = collate_pad,
                              pin_memory=True)

In [None]:
for x, _ in test_dataloader:
    print (x.shape)
    break

In [None]:
model.eval()
preds = []
with torch.no_grad():
    for k, (x_batch, x_len) in tqdm(enumerate(test_dataloader)):
        x_batch = x_batch.cuda() #T, N, 40
        predictions, _ = model(x_batch, x_len, text_input = y_batch, isTrain=False) # N, L-1, dim
        predictions = predictions[:, :y_batch.shape[0]-1, :]
        _, output = torch.max(predictions, dim=2)
        for i in range(output.size(0)):
            end_pos = (output[i].tolist().index(vocab.char2idx['<eos>']) if vocab.char2idx['<eos>'] in output[i].tolist() else output[i].size(0))
            pred = "".join(vocab.char_list[x] for x in output[i].tolist()[:end_pos])
            preds.append(pred)
out_df = pd.DataFrame()
out_df['Id'] = np.arange(0, len(test_x))
out_df['Predicted'] = preds
out_df.head()

In [None]:
SUBMISSION_PATH = "submission/"
file_name = SUBMISSION_PATH+"submission_%d.csv"%int(time.time())
out_df.to_csv(file_name,index=False)