# Recurrent Language Model

In [107]:
path_to_utils = '../../utils'

dataset = 'amazon'
if dataset == 'amazon':
    path_to_data = './amazon_review_data'
else:
    path_to_data = './data'

In [34]:
import os
import sys

sys.path.append(path_to_utils)

import loading_text_and_tokenization
import torch
import numpy as np
import torch.nn as nn
import random
import math
import torch.nn.functional as F

import pickle

In [3]:
USE_CUDA = True
device = torch.device("cuda" if USE_CUDA else "cpu")

In [4]:
batch_size  = 32

if dataset == 'amazon':
    batch_size = 64

### Loading Data and turning into batches

In [5]:
corpus_filename = dataset+'_obj.p'

if os.path.exists(corpus_filename):
    corpus = pickle.load( open( corpus_filename, "rb" ) )
else:
    corpus = loading_text_and_tokenization.Corpus(path_to_data)
    pickle.dump( corpus, open( corpus_filename, "wb" ) )

Adding words from train to dictionary..
tokenizing train..
tokenizing valid..


In [6]:
print ("Train dataset size is {}".format(len(corpus.train)))
print ("Val dataset size is {}".format(len(corpus.valid)))


Train dataset size is 16616677
Val dataset size is 1860569


### Aside: torch.Tensor.narrow

In [7]:
dummy_tensor = torch.arange(0, 10);
print(dummy_tensor)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


In [8]:
dummy_tensor.narrow(0, 1, 5)

tensor([1, 2, 3, 4, 5])

In [9]:
dummy_tensor.narrow(0, 5, 4)

tensor([5, 6, 7, 8])

All we need to understand is that .narrow() is just a way to do indexing. When we do dummy_tensor.narrow(0, i, j) we are indexing dummy_tensor[i:i+j]

In [10]:
def batchify(data, bsz, random_start_idx=False):
    # calculate total number of batches that fit cleanly
    nbatch = data.size(0) // bsz
    if random_start_idx:
        start_idx = random.randint(0, data.size(0) % bsz - 1)
    else:
        start_idx = 0
        
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    # Nice thing about this: 
    # u don't need to pad since every sequence now has same length
    data = data.narrow(0, start_idx, nbatch * bsz)
    
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


In [11]:

def get_batch(source, i, max_seq_len):
    seq_len = min(max_seq_len, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target


In [36]:
clip = 0.25
log_interval = 200
criterion = nn.CrossEntropyLoss()

def repackage_hidden(h):
    """
        Wraps hidden states in new Tensors, to detach them from their history.
    """
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
    
def train_step(model, lr, epoch):
    model.train()
    total_loss = 0.
    hidden = model.init_hidden(batch_size)
    
    # We shuffle train data every epoch
    train_data = batchify(corpus.train, batch_size, random_start_idx=True)
    
    for batch, i in enumerate(range(0, train_data.size(0) - 1, max_seq_len)):
        data, targets = get_batch(train_data, i, max_seq_len)
        
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            if p.grad is not None:
                p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch %log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // max_seq_len, lr,
                cur_loss, math.exp(cur_loss)))
            total_loss = 0
            
    return model


### Perplexity

Description here

In [89]:
# perplexity evaluation for a given corpus
def evaluate(model, data_source, max_seq_len, eval_batch_size=32, calc_prob = False):
    model.eval()
    total_loss = 0.
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, max_seq_len):
            data, targets = get_batch(data_source, i, max_seq_len)
            
            output, hidden = model(data, hidden)
            if calc_prob:
                out = output.clone();
                out = F.softmax(out, dim = 2)

                sum_prob = 0;
                for idx in range(len(targets)):
                    sum_prob += torch.log(out[idx, 0, targets[idx]])
                    
            output_flat = output.view(-1, vocab_size)
            
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
            

    if calc_prob:
        return total_loss / len(data_source), torch.exp(sum_prob).item()
    return total_loss / len(data_source)

### Train for $n$ epochs

In [23]:
def train_for_n_epochs(model, filename, num_epochs = 10, lr=20):
    best_val_loss = np.inf
    val_data = batchify(corpus.valid, batch_size, random_start_idx=True)
    for epoch in range(1, num_epochs+1):
        model = train_step(model, lr, epoch)
        val_loss = evaluate(model, val_data, max_seq_len, batch_size)
        print('-' * 89)
        print('| end of epoch {:3d} | valid loss {:5.2f} | '
                    'valid ppl {:8.2f}'.format(epoch, 
                                               val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(filename, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
    return model

## RNN Model

In [15]:
## Common Model Parameters

embed_size = 200
hidden_size = 200
num_layers = 2
num_epochs = 20
lr = 20.0
dropout = 0.2
max_seq_len = 35

if dataset == 'amazon':
    num_epochs = 5;
    max_seq_len = 70


vocab_size = len(corpus.dictionary)

In [16]:
vocab_size

21134

In [18]:
filename_rnn = 'rnn_'+dataset+'.pth';

In [19]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, vocab_size)

        self.init_weights()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        return (weight.new_zeros(self.num_layers, bsz, self.hidden_size),
                    weight.new_zeros(self.num_layers, bsz, self.hidden_size))

In [20]:
model_rnn = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

### Training RNN Model or Loading Saved Model

In [21]:
train_again = False

In [24]:
if os.path.exists(filename_rnn) and (not train_again):
    model_rnn = torch.load(filename_rnn);
else:
    model_rnn = train_for_n_epochs(model = model_rnn, 
                               filename = filename_rnn,
                               num_epochs = num_epochs)


| epoch   1 |   200/ 3709 batches | lr 20.00 | loss  4.10 | ppl    60.44
| epoch   1 |   400/ 3709 batches | lr 20.00 | loss  4.06 | ppl    58.07
| epoch   1 |   600/ 3709 batches | lr 20.00 | loss  4.07 | ppl    58.35
| epoch   1 |   800/ 3709 batches | lr 20.00 | loss  4.03 | ppl    56.12
| epoch   1 |  1000/ 3709 batches | lr 20.00 | loss  4.01 | ppl    55.40
| epoch   1 |  1200/ 3709 batches | lr 20.00 | loss  4.02 | ppl    55.93
| epoch   1 |  1400/ 3709 batches | lr 20.00 | loss  3.99 | ppl    53.89
| epoch   1 |  1600/ 3709 batches | lr 20.00 | loss  3.96 | ppl    52.59
| epoch   1 |  1800/ 3709 batches | lr 20.00 | loss  3.97 | ppl    52.83
| epoch   1 |  2000/ 3709 batches | lr 20.00 | loss  3.95 | ppl    51.71
| epoch   1 |  2200/ 3709 batches | lr 20.00 | loss  3.94 | ppl    51.37
| epoch   1 |  2400/ 3709 batches | lr 20.00 | loss  3.94 | ppl    51.21
| epoch   1 |  2600/ 3709 batches | lr 20.00 | loss  3.95 | ppl    51.96
| epoch   1 |  2800/ 3709 batches | lr 20.00 | loss

  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/ 3709 batches | lr 20.00 | loss  3.92 | ppl    50.53
| epoch   2 |   400/ 3709 batches | lr 20.00 | loss  3.90 | ppl    49.17
| epoch   2 |   600/ 3709 batches | lr 20.00 | loss  3.91 | ppl    49.75
| epoch   2 |   800/ 3709 batches | lr 20.00 | loss  3.88 | ppl    48.31
| epoch   2 |  1000/ 3709 batches | lr 20.00 | loss  3.87 | ppl    48.05
| epoch   2 |  1200/ 3709 batches | lr 20.00 | loss  3.89 | ppl    49.04
| epoch   2 |  1400/ 3709 batches | lr 20.00 | loss  3.86 | ppl    47.47
| epoch   2 |  1600/ 3709 batches | lr 20.00 | loss  3.84 | ppl    46.49
| epoch   2 |  1800/ 3709 batches | lr 20.00 | loss  3.85 | ppl    47.07
| epoch   2 |  2000/ 3709 batches | lr 20.00 | loss  3.83 | ppl    46.13
| epoch   2 |  2200/ 3709 batches | lr 20.00 | loss  3.83 | ppl    46.13
| epoch   2 |  2400/ 3709 batches | lr 20.00 | loss  3.83 | ppl    46.14
| epoch   2 |  2600/ 3709 batches | lr 20.00 | loss  3.85 | ppl    46.99
| epoch   2 |  2800/ 3709 batches | lr 20.00 | loss

### Scoring The Sentence

In [58]:
def score_sentences(model, sent_list):
    ppl_list = []
    for sent in sent_list:
        tokens = corpus.tokenize_sentence(sent)
        test_sent_idx = batchify(tokens, 1)
        loss, prob = evaluate(model, test_sent_idx, len(tokens), 1, calc_prob = True)
        ppl_list.append((sent, math.exp(loss), prob))
    return ppl_list

In [71]:
test_sentences = [ 'how are you?', \
                'dividend yields have been bolstered by stock declines', \
             'stock bolstered declines dividend by yields have been', \
             'artificial neural networks are computing systems vaguely inspired by the biological neural networks', \
             'Cho is so cool', \
             'roberta is so cool', \
              'this cloth is nice', \
                 'a']

score_sentences(model_rnn, test_sentences)

[('how are you?', 85.30433599726925, 1.8884946939579095e-08),
 ('dividend yields have been bolstered by stock declines',
  335.5168345683966,
  1.8559707221977514e-23),
 ('stock bolstered declines dividend by yields have been',
  329.04241356628273,
  2.2117348419944873e-23),
 ('artificial neural networks are computing systems vaguely inspired by the biological neural networks',
  111.46072618656794,
  2.1892486535251042e-29),
 ('Cho is so cool', 33.46978211595802, 2.380864927431503e-08),
 ('roberta is so cool', 33.46978211595802, 2.380864927431503e-08),
 ('this cloth is nice', 66.64289569409148, 7.607302565659779e-10),
 ('a', 66.33855753905767, 0.00022723140136804432)]

## Generation

In [28]:
def generate_words(model, n_words = 100):
    #n_words number of words to generate
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(1)
    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
    # input = torch.tensor([[corpus.dictionary.word2idx['a']]], dtype=torch.long).to(device)
    
    sentence = corpus.dictionary.idx2word[input.item()] + ' '
    model.eval()
    with torch.no_grad():  # no tracking history
        for i in range(n_words):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(1.0).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]

            input.fill_(word_idx)
  
            word = corpus.dictionary.idx2word[word_idx]

            sentence += word +' '
        
    return sentence

In [29]:
generate_words(model_rnn)

'veil , protects the <unk> and the material / potent <unk> to not be quite comfortable heat , no where . <eos> this item is so cute ! lightweight and comfortable . they are a bit on a close side . . . but individual books are a great first place a great <unk> . these are a really nice , lightweight and comfy piece . <eos> for the price , or a good present for the other buyers , at the money this is the most exciting pack i can see . it works , but the closest did the '

## RNN + Self Attention

In [30]:
class RNN_SelfAttn_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, 
                 dropout = 0.5, idropout = 0.5, self_attention = True):
        super(RNN_SelfAttn_Model, self).__init__()

        
        self.drop = nn.Dropout(dropout)
        
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRUCell(hidden_size, hidden_size, bias=True);

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.memory_rnn = nn.GRUCell(embed_size, hidden_size)
        self.projector_summ = nn.Sequential(nn.Dropout(idropout),
                                            nn.Linear(embed_size + hidden_size, hidden_size),
                                            nn.Dropout(idropout))
        
        self.decoder = nn.Linear(hidden_size, vocab_size)
        self.vocab_size = vocab_size;
        
        self.self_attention = self_attention;

    def forward(self, input, memory):
        emb = self.encoder(input)
        emb = F.relu(emb)
        
#         print(emb.shape)
#         emb = emb.transpose(0, 1);
        return_scores = torch.empty(emb.size(0), emb.size(1), self.vocab_size).to(input.device)        
        
        for t in range(emb.size(0)):
            current_vec = emb[t];
            
            if self.self_attention:
                selected_memory, attention0 = self.attention(current_vec, memory)
            else:
                selected_memory, attention0 = memory[:, 0, :], None;

            # recurrent
            mem_out = self.memory_rnn(current_vec, selected_memory);

            # update memory
            memory = torch.cat([mem_out[:, None, :], memory[:, :-3, :]], dim=1)
    
            scores = self.decoder(mem_out)
            return_scores[t] = scores
            
        return return_scores.contiguous(), memory

    def attention(self, input, memory):
        # select memory to use
        concat_vec = torch.cat([input,  memory[:, 0, :]], dim=1);
        projected_vec = self.projector_summ(concat_vec);
    
        dot_product_values = torch.bmm(memory, projected_vec.unsqueeze(-1)).squeeze(-1)/ math.sqrt(self.hidden_size);
        
        weights =  F.softmax(dot_product_values, dim = 1).unsqueeze(-1);
        
        selected_memory = torch.sum( memory * weights, dim=1)
        return selected_memory, weights

    def init_hidden(self, bsz):
        return torch.zeros(bsz, 1, self.hidden_size).to(self.decoder.weight.device)


In [31]:
filename_rnn_selfattn = 'rnn_selfattn'+dataset+'.pth';

In [32]:
model_rnn_selfattn = RNN_SelfAttn_Model(vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

In [37]:
if os.path.exists(filename_rnn_selfattn) and (not train_again):
    model_rnn_selfattn = torch.load(filename_rnn_selfattn);
else:
    model_rnn_selfattn = train_for_n_epochs(model = model_rnn_selfattn, 
                               filename = filename_rnn_selfattn,
                               num_epochs = num_epochs)


| epoch   1 |   200/ 3709 batches | lr 20.00 | loss 13.35 | ppl 625810.23
| epoch   1 |   400/ 3709 batches | lr 20.00 | loss  8.75 | ppl  6324.29
| epoch   1 |   600/ 3709 batches | lr 20.00 | loss  6.56 | ppl   706.73
| epoch   1 |   800/ 3709 batches | lr 20.00 | loss  5.57 | ppl   261.60
| epoch   1 |  1000/ 3709 batches | lr 20.00 | loss  5.10 | ppl   164.67
| epoch   1 |  1200/ 3709 batches | lr 20.00 | loss  4.92 | ppl   137.68
| epoch   1 |  1400/ 3709 batches | lr 20.00 | loss  4.76 | ppl   116.37
| epoch   1 |  1600/ 3709 batches | lr 20.00 | loss  4.63 | ppl   102.72
| epoch   1 |  1800/ 3709 batches | lr 20.00 | loss  4.55 | ppl    94.64
| epoch   1 |  2000/ 3709 batches | lr 20.00 | loss  4.48 | ppl    88.11
| epoch   1 |  2200/ 3709 batches | lr 20.00 | loss  4.42 | ppl    82.87
| epoch   1 |  2400/ 3709 batches | lr 20.00 | loss  4.39 | ppl    80.55
| epoch   1 |  2600/ 3709 batches | lr 20.00 | loss  4.37 | ppl    79.24
| epoch   1 |  2800/ 3709 batches | lr 20.00 | los

  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/ 3709 batches | lr 20.00 | loss  4.23 | ppl    69.06
| epoch   2 |   400/ 3709 batches | lr 20.00 | loss  4.18 | ppl    65.46
| epoch   2 |   600/ 3709 batches | lr 20.00 | loss  4.18 | ppl    65.67
| epoch   2 |   800/ 3709 batches | lr 20.00 | loss  4.15 | ppl    63.27
| epoch   2 |  1000/ 3709 batches | lr 20.00 | loss  4.12 | ppl    61.72
| epoch   2 |  1200/ 3709 batches | lr 20.00 | loss  4.14 | ppl    62.86
| epoch   2 |  1400/ 3709 batches | lr 20.00 | loss  4.10 | ppl    60.15
| epoch   2 |  1600/ 3709 batches | lr 20.00 | loss  4.07 | ppl    58.74
| epoch   2 |  1800/ 3709 batches | lr 20.00 | loss  4.08 | ppl    58.93
| epoch   2 |  2000/ 3709 batches | lr 20.00 | loss  4.05 | ppl    57.48
| epoch   2 |  2200/ 3709 batches | lr 20.00 | loss  4.04 | ppl    57.00
| epoch   2 |  2400/ 3709 batches | lr 20.00 | loss  4.04 | ppl    57.06
| epoch   2 |  2600/ 3709 batches | lr 20.00 | loss  4.06 | ppl    57.81
| epoch   2 |  2800/ 3709 batches | lr 20.00 | loss

KeyboardInterrupt: 

### Scoring Sentences

In [90]:
score_sentences(model_rnn_selfattn, test_sentences)

[('how are you?', 101.36184006487102, 9.47329770184524e-09),
 ('dividend yields have been bolstered by stock declines',
  228.70603132037192,
  5.841191633299324e-22),
 ('stock bolstered declines dividend by yields have been',
  226.93469120306304,
  6.264579970671918e-22),
 ('artificial neural networks are computing systems vaguely inspired by the biological neural networks',
  206.47568442864693,
  3.906920411508972e-33),
 ('Cho is so cool', 39.82627319195157, 9.980484882987639e-09),
 ('roberta is so cool', 39.82627319195157, 9.980484882987639e-09),
 ('this cloth is nice', 41.82467837258902, 7.813343749774049e-09),
 ('a', 279.6846450160066, 1.278389390790835e-05)]

### Generating Sentences

In [91]:
generate_words(model_rnn_selfattn)

"string . they get easily fit and made the crease after 1 hour i received promised macy ' s only the ones i had to change my computer barely jewels and power shorts were dragging put together on my head instead of the first time buckled are related and see through them . <eos> my son loved sapphire stockings seat and this khaki is perfect . do not recommend it as you should be damaged and love these overall excellent quality . the style is great or blue and are very easy to use on and just outside for what moves "

# Self Attention

In [103]:
class SelfAttn_Model(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, 
                 dropout = 0.5, idropout = 0.5, self_attention = True):
        super(SelfAttn_Model, self).__init__()

        
        self.drop = nn.Dropout(dropout)
        
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRUCell(hidden_size, hidden_size, bias=True);

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
    
        self.projector_summ = nn.Sequential(nn.Dropout(idropout),
                                            nn.Linear(embed_size + hidden_size, hidden_size),
                                            nn.Dropout(idropout))
        
        self.decoder = nn.Linear(embed_size + hidden_size, vocab_size)
        self.vocab_size = vocab_size;
        
        self.self_attention = self_attention;

    def forward(self, input, memory):
        emb = self.encoder(input)
        emb = F.relu(emb)
        
#         print(emb.shape)
#         emb = emb.transpose(0, 1);
        return_scores = torch.empty(emb.size(0), emb.size(1), self.vocab_size).to(input.device)        
        
        for t in range(emb.size(0)):
            current_vec = emb[t];
            
            if self.self_attention:
                selected_memory, attention0 = self.attention(current_vec, memory)
            else:
                selected_memory, attention0 = memory[:, 0, :], None;

            # recurrent
            mem_out = current_vec;

            # update memory
            memory = torch.cat([mem_out[:, None, :], memory[:, :-25, :]], dim=1)
    
#             print(mem_out.shape)
#             print(selected_memory.shape)
            scores = self.decoder( torch.cat([mem_out, selected_memory], dim = 1) )
            return_scores[t] = scores
            
        return return_scores.contiguous(), memory

    def attention(self, input, memory):
        # select memory to use
        concat_vec = torch.cat([input,  memory[:, 0, :]], dim=1);
        projected_vec = self.projector_summ(concat_vec);
    
        dot_product_values = torch.bmm(memory, projected_vec.unsqueeze(-1)).squeeze(-1)/ math.sqrt(self.hidden_size);
        
        weights =  F.softmax(dot_product_values, dim = 1).unsqueeze(-1);
        
        selected_memory = torch.sum( memory * weights, dim=1)
        return selected_memory, weights

    def init_hidden(self, bsz):
        return torch.zeros(bsz, 1, self.hidden_size).to(self.decoder.weight.device)


In [104]:
filename_selfattn = 'selfattn'+dataset+'.pth';

In [105]:
model_selfattn = SelfAttn_Model(vocab_size, embed_size, hidden_size, num_layers, dropout).to(device)

In [None]:
train_again = True
if os.path.exists(filename_selfattn) and (not train_again):
    model_selfattn = torch.load(filename_selfattn);
else:
    model_selfattn = train_for_n_epochs(model = model_selfattn, 
                               filename = filename_selfattn,
                               num_epochs = num_epochs)


| epoch   1 |   200/ 3709 batches | lr 20.00 | loss  4.48 | ppl    88.47
| epoch   1 |   400/ 3709 batches | lr 20.00 | loss  4.44 | ppl    85.17
| epoch   1 |   600/ 3709 batches | lr 20.00 | loss  4.47 | ppl    87.19
| epoch   1 |   800/ 3709 batches | lr 20.00 | loss  4.44 | ppl    84.60
| epoch   1 |  1000/ 3709 batches | lr 20.00 | loss  4.44 | ppl    84.47
| epoch   1 |  1200/ 3709 batches | lr 20.00 | loss  4.45 | ppl    85.97
| epoch   1 |  1400/ 3709 batches | lr 20.00 | loss  4.43 | ppl    84.09
| epoch   1 |  1600/ 3709 batches | lr 20.00 | loss  4.41 | ppl    81.86
| epoch   1 |  1800/ 3709 batches | lr 20.00 | loss  4.43 | ppl    83.82
| epoch   1 |  2000/ 3709 batches | lr 20.00 | loss  4.40 | ppl    81.30
| epoch   1 |  2200/ 3709 batches | lr 20.00 | loss  4.40 | ppl    81.47
| epoch   1 |  2400/ 3709 batches | lr 20.00 | loss  4.40 | ppl    81.42
| epoch   1 |  2600/ 3709 batches | lr 20.00 | loss  4.44 | ppl    84.37
| epoch   1 |  2800/ 3709 batches | lr 20.00 | loss

  "type " + obj.__name__ + ". It won't be checked "


| epoch   2 |   200/ 3709 batches | lr 20.00 | loss  4.43 | ppl    83.73
| epoch   2 |   400/ 3709 batches | lr 20.00 | loss  4.39 | ppl    80.61
| epoch   2 |   600/ 3709 batches | lr 20.00 | loss  4.42 | ppl    83.04
| epoch   2 |   800/ 3709 batches | lr 20.00 | loss  4.39 | ppl    81.02
| epoch   2 |  1000/ 3709 batches | lr 20.00 | loss  4.37 | ppl    79.41
| epoch   2 |  1200/ 3709 batches | lr 20.00 | loss  4.41 | ppl    82.67
| epoch   2 |  1400/ 3709 batches | lr 20.00 | loss  4.37 | ppl    79.19
| epoch   2 |  1600/ 3709 batches | lr 20.00 | loss  4.35 | ppl    77.71
| epoch   2 |  1800/ 3709 batches | lr 20.00 | loss  4.37 | ppl    79.08
| epoch   2 |  2000/ 3709 batches | lr 20.00 | loss  4.35 | ppl    77.74
| epoch   2 |  2200/ 3709 batches | lr 20.00 | loss  4.35 | ppl    77.77
| epoch   2 |  2400/ 3709 batches | lr 20.00 | loss  4.36 | ppl    78.10
| epoch   2 |  2600/ 3709 batches | lr 20.00 | loss  4.38 | ppl    80.07
| epoch   2 |  2800/ 3709 batches | lr 20.00 | loss

### Scoring Sentences

In [108]:
score_sentences(model_selfattn, test_sentences)

[('how are you?', 103.00469458609216, 8.883246138680079e-09),
 ('dividend yields have been bolstered by stock declines',
  706.8707342755534,
  2.2695685399091972e-26),
 ('stock bolstered declines dividend by yields have been',
  562.5903504763468,
  1.7712076968178442e-25),
 ('artificial neural networks are computing systems vaguely inspired by the biological neural networks',
  270.4123081023086,
  8.94595511602657e-35),
 ('Cho is so cool', 44.463124679970925, 5.754389853507291e-09),
 ('roberta is so cool', 44.463124679970925, 5.754389853507291e-09),
 ('this cloth is nice', 70.44770105140095, 5.763228783095542e-10),
 ('a', 60.45458798167272, 0.00027361573302187026)]

### Generating Sentences

In [109]:
generate_words(model_selfattn)

"shelve for a girl run a little disappointed to wear them for so long , it ! <eos> cute red for example and requirements to have a 3 different pair of these were more comfortable , nice , love orthaheel sandals are the right , it has to stretch but it people far at all . i had to the store with these panties , based on a while payless bag face and holic of your manufacture or wearing them on a thin costume ( it ' s agent for a couple runs big , it was too big people with "