# Recurrent Language Model

In [1]:
# Directory handed to the Corpus loader further down.
path_to_data = './data'
# Directory appended to sys.path so the project-local helper module can be imported.
path_to_utils = '../../utils'

In [23]:
import math
import os
import random
import sys

import numpy as np
import torch
import torch.nn as nn

sys.path.append(path_to_utils)

import loading_text_and_tokenization

In [3]:
# Set to True to select the CUDA device instead of the CPU.
USE_CUDA = False
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

In [4]:
# Number of parallel token streams the corpus is split into by batchify().
batch_size = 32

### Loading Data and turning into batches

In [5]:
# Build the corpus from the files under path_to_data. The cells below read
# corpus.train / corpus.valid / corpus.test and corpus.dictionary from it.
# NOTE(review): Corpus is defined in the project-local loading_text_and_tokenization
# module; presumably each split is a 1-D tensor of token ids — confirm there.
corpus = loading_text_and_tokenization.Corpus(path_to_data)

In [6]:
# Report how many entries each split holds.
print("Train dataset size is %d" % len(corpus.train))
print("Val dataset size is %d" % len(corpus.valid))
print("Test dataset size is %d" % len(corpus.test))

Train dataset size is 2088628
Val dataset size is 217646
Test dataset size is 245569


### Aside: torch.Tensor.narrow

In [7]:
# A small 1-D tensor (0..9) to demonstrate narrow() on.
dummy_tensor = torch.arange(10)
print(dummy_tensor)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


In [8]:
# 5 elements along dimension 0 starting at index 1, i.e. dummy_tensor[1:6].
dummy_tensor.narrow(0, 1, 5)

tensor([1, 2, 3, 4, 5])

In [9]:
# 4 elements along dimension 0 starting at index 5, i.e. dummy_tensor[5:9].
dummy_tensor.narrow(0, 5, 4)

tensor([5, 6, 7, 8])

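All we need to understand is that `.narrow()` is just another way to slice a tensor: `tensor.narrow(dim, start, length)` returns `length` elements along dimension `dim`, beginning at index `start`. So `dummy_tensor.narrow(0, i, j)` is the same as `dummy_tensor[i:i+j]`.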

In [10]:
def batchify(data, bsz, random_start_idx=False):
    """Reshape a 1-D token stream into ``bsz`` parallel columns.

    Args:
        data: 1-D tensor holding the whole token stream.
        bsz: number of columns (parallel streams) to produce.
        random_start_idx: if True, start the kept window at a random offset
            instead of 0, so different calls cut the stream at different
            positions.

    Returns:
        A contiguous tensor of shape ``(data.size(0) // bsz, bsz)``; column
        ``k`` holds the ``k``-th consecutive slice of the kept window.
    """
    # Number of full rows that fit, and how many elements are left over.
    nbatch = data.size(0) // bsz
    remainder = data.size(0) % bsz

    if random_start_idx:
        # Any offset in [0, remainder] still leaves nbatch * bsz elements after
        # it. randint is inclusive on both ends, so this also works when the
        # length is an exact multiple of bsz (remainder == 0 -> offset 0),
        # where the previous `remainder - 1` upper bound raised ValueError.
        start_idx = random.randint(0, remainder)
    else:
        start_idx = 0

    # Trim off the elements that would not fit cleanly. Because every column
    # then has the same length, no padding is needed.
    data = data.narrow(0, start_idx, nbatch * bsz)

    # Evenly divide the data across the bsz columns.
    data = data.view(bsz, -1).t().contiguous()
    return data


In [11]:
# Cut each split into batch_size parallel columns (no random offset).
train_data, val_data, test_data = (
    batchify(split, batch_size)
    for split in (corpus.train, corpus.valid, corpus.test)
)

## RNN Model

In [12]:
class RNNModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection onto the vocabulary.

    The same dropout rate is applied to the embeddings, between LSTM layers,
    and to the LSTM output before it is decoded.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        super(RNNModel, self).__init__()
        # Remembered so init_hidden() can build correctly shaped zero states.
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Draw encoder/decoder weights uniformly from [-0.1, 0.1]; zero the decoder bias."""
        bound = 0.1
        self.encoder.weight.data.uniform_(-bound, bound)
        self.decoder.weight.data.uniform_(-bound, bound)
        self.decoder.bias.data.zero_()

    def forward(self, input, hidden):
        """Run one chunk through the network.

        Args:
            input: integer token ids; the LSTM is not batch_first, so the
                leading axis is time and the second is the batch.
            hidden: ``(h, c)`` LSTM state, e.g. from ``init_hidden``.

        Returns:
            ``(logits, hidden)`` where ``logits`` keeps the first two axes of
            the LSTM output and has the vocabulary size as its last axis.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        rnn_out = self.drop(rnn_out)

        # Collapse the first two axes so the linear layer sees a 2-D matrix,
        # then restore them on the way out.
        steps, bsz, width = rnn_out.size(0), rnn_out.size(1), rnn_out.size(2)
        logits = self.decoder(rnn_out.view(steps * bsz, width))
        return logits.view(steps, bsz, logits.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero ``(h, c)`` pair shaped ``(num_layers, bsz, hidden_size)``.

        The states are created from an existing parameter so they share its
        dtype and device.
        """
        reference = next(self.parameters())
        state_shape = (self.num_layers, bsz, self.hidden_size)
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))

In [13]:
# --- optimisation settings ---
num_epochs = 10
lr = 0.1            # step size of the manual SGD update in train()
max_seq_len = 35    # upper bound on the number of time steps per chunk

# --- architecture ---
embed_size = 200
hidden_size = 400
num_layers = 2
dropout = 0.3

vocab_size = len(corpus.dictionary)
model = RNNModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
)

In [14]:

def get_batch(source, i, max_seq_len):
    """Slice one input chunk and its next-step targets out of ``source``.

    Starting at row ``i``, take up to ``max_seq_len`` rows as the input. The
    targets are the same rows shifted down by one, flattened to 1-D. The chunk
    is shortened near the end so the shifted targets stay inside ``source``.
    """
    rows_left = len(source) - 1 - i
    seq_len = max_seq_len if max_seq_len < rows_left else rows_left
    stop = i + seq_len

    data = source[i:stop]
    target = source[i + 1:stop + 1].view(-1)
    return data, target


In [25]:
# Print a running training-loss line every this many batches.
log_interval = 200
# Maximum gradient norm passed to clip_grad_norm_ in train().
clip = 0.3

def repackage_hidden(h):
    """Detach a hidden state from the graph that produced it.

    A tensor is returned detached. Any other iterable (e.g. the LSTM's
    ``(h, c)`` pair) is processed element by element and returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()
    
def train():
    """Train ``model`` for one pass over ``corpus.train`` with truncated BPTT.

    Reads these notebook globals: ``model``, ``criterion``, ``corpus``,
    ``batch_size``, ``max_seq_len``, ``vocab_size``, ``clip``, ``lr``,
    ``log_interval`` and ``epoch`` (set by the epoch loop and used only in the
    log line). Prints the running average loss and perplexity every
    ``log_interval`` batches. Returns nothing.
    """
    model.train()
    total_loss = 0.
    hidden = model.init_hidden(batch_size)

    # Re-cut the training stream from a random start offset each epoch so the
    # chunk boundaries differ between epochs. This is not a shuffle: token
    # order is preserved. The name is local and shadows the module-level
    # train_data, which is left untouched.
    train_data = batchify(corpus.train, batch_size, random_start_idx=True)

    for batch, i in enumerate(range(0, train_data.size(0) - 1, max_seq_len)):
        data, targets = get_batch(train_data, i, max_seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Plain SGD step: p <- p - lr * grad. Uses the keyword `alpha` form
        # because the positional add_(scalar, tensor) overload is deprecated.
        # Parameters that received no gradient are skipped.
        for p in model.parameters():
            if p.grad is not None:
                p.data.add_(p.grad.data, alpha=-lr)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval

            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // max_seq_len, lr,
                cur_loss, math.exp(cur_loss)))
            total_loss = 0


### Perplexity

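Perplexity is the exponential of the average per-token cross-entropy loss, $\mathrm{ppl} = \exp(\text{loss})$, which is how the training and validation logs in this notebook compute it (`math.exp(loss)`). Lower is better.

TODO: add the fuller description from Roberta.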

In [16]:
# perplexity evaluation for a given corpus
def evaluate(data_source, max_seq_len, eval_batch_size=32):
    """Return the average cross-entropy loss of ``model`` over ``data_source``.

    Args:
        data_source: batchified tensor (rows are time steps, columns are
            parallel streams), as produced by ``batchify``.
        max_seq_len: maximum number of rows fed to the model per chunk.
        eval_batch_size: batch dimension of the initial hidden state; it must
            equal the number of columns in ``data_source``.

    Returns:
        The loss averaged over all predicted rows as a Python float;
        ``math.exp`` of it is the perplexity. Returns 0.0 if ``data_source``
        is too short to form a single (input, target) pair.

    Reads the notebook globals ``model``, ``criterion`` and ``vocab_size``.
    """
    model.eval()
    total_loss = 0.
    # Rows actually scored. Targets are the inputs shifted by one, so this ends
    # up as len(data_source) - 1, not len(data_source).
    total_steps = 0
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, max_seq_len):
            data, targets = get_batch(data_source, i, max_seq_len)

            output, hidden = model(data, hidden)
            output_flat = output.view(-1, vocab_size)

            # criterion returns the mean over the chunk, so weight it by the
            # chunk length to get a correct average over chunks of unequal size.
            total_loss += len(data) * criterion(output_flat, targets).item()
            total_steps += len(data)
            hidden = repackage_hidden(hidden)
    return total_loss / total_steps if total_steps else 0.0

## Training

In [17]:
# Lowest validation loss seen so far; +inf so that the first epoch always counts as an improvement.
best_val_loss = float("inf")
# Loss used by both train() and evaluate().
criterion = nn.CrossEntropyLoss()

In [26]:
# Epoch loop: train, validate, checkpoint on improvement, otherwise anneal lr.
# `epoch` and `lr` are read as globals by train().
for epoch in range(1, num_epochs+1):
    train()
    # Size the evaluation hidden state from val_data itself rather than relying
    # on evaluate()'s default of 32, so changing batch_size cannot break it.
    val_loss = evaluate(val_data, max_seq_len, eval_batch_size=val_data.size(1))
    print('-' * 89)
    print('| end of epoch {:3d} | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, 
                                           val_loss, math.exp(val_loss)))
    print('-' * 89)
    # Save the model if the validation loss is the best we've seen so far.
    # `is None` (not a truthiness test) so that a best loss of exactly 0.0 is
    # not mistaken for "no best loss recorded yet".
    if best_val_loss is None or val_loss < best_val_loss:
        with open('model.pt', 'wb') as f:
            torch.save(model, f)
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0


KeyboardInterrupt: 

In [27]:
# Vocabulary size: the same quantity stored in vocab_size and passed to RNNModel.
len(corpus.dictionary)

33278