# Section 2: RNNs in PyTorch

## Goals
1. Build a simple RNN classifier
2. Learn about PyTorch's in-built RNN modules (LSTM etc.)

(Roughly follows http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html)

In [345]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torchtext
from torchtext.vocab import Vectors, GloVe

### Part 1: Building an RNN sentiment classifier
#### Part 1.1: Generating the data

First we'll generate some toy data. The task will be to recall an integer at a certain position in a sequence. 
For a sequence a<sub>1</sub> a<sub>12</sub> a<sub>3</sub> a<sub>4</sub> a<sub>5</sub> the output might be a<sub>3</sub>.

In [624]:
# number of training examples
n_train = 2000

# number of validation examples
n_val = 1000

# length of each sequence
n_length = 10

# examples per batch
n_batch = 32

# size of the vocabulary
n_vocab = 20

# position to be recalled
answer_pos = n_length-1

# generate random sequences
train_seq = Variable(torch.Tensor(n_train, n_length).random_(0, n_vocab).long())
val_seq = Variable(torch.Tensor(n_val, n_length).random_(0, n_vocab).long())

# choose the correct labels
train_labels = train_seq.clone()[:, answer_pos]
val_labels = val_seq.clone()[:, answer_pos]

# group data into batches
train_iter = []
for i in range(0, n_train, n_batch):
    batch_seq = train_seq[i:i+n_batch]
    batch_labels = train_labels[i:i+n_batch]
    if (batch_seq.size()[0] == n_batch):
        train_iter.append([batch_seq, batch_labels])
    
val_iter = []
for i in range(0, n_val, n_batch):
    batch_seq = val_seq[i:i+n_batch]
    batch_labels = val_labels[i:i+n_batch]
    if (batch_seq.size()[0] == n_batch):
        val_iter.append([batch_seq, batch_labels])

#### Part 1.2 Build the model (version 1)

The RNN module will be a PyTorch model like any other, with init a forward functions. This network:
1. Takes as input the word at a particular point in the sequence, as well as the hidden state at the previous state of the network
2. Uses nn.Embedding to get a vector for the word
3. Concatenate the embedding and the hidden state
4. Apply a linear layer to get the next hidden state
5. Apply a linear layer to get the output
6. Output both 

In [476]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, vocab_size):
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, input_size)
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        combined = torch.cat((embedded, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

#### Part 1.3: Train the model

Now we can initialize and train the network:

In [513]:
def train_batch(model, criterion, optim, batch, label):
    # initialize hidden vector
    hidden = Variable(torch.zeros(n_batch, n_hidden))

    # clear gradients
    rnn.zero_grad()

    # calculate forward pass
    for i in range(batch.size()[1]):
        output, hidden = model(batch[:, i], hidden)

    # calculate loss    
    loss = criterion(output, label)

    # backpropagate and step
    loss.backward()
    optim.step()
    
    return loss.data[0]

In [514]:
# training loop
def train(model, criterion, optim):
    for e in range(n_epochs):
        batches = 0
        epoch_loss = 0
        avg_loss = 0
        for batch, label in train_iter:
            batch_loss = train_batch(model, criterion, optim, batch, label)
            batches += 1
            epoch_loss += batch_loss
            avg_loss = ((avg_loss * (batches - 1)) + batch_loss) / batches
        
        print("Epoch ", e, " Loss: ", epoch_loss)
        


In [515]:
# size of the hidden vector
n_hidden = 3

# initialize the network
rnn = RNN(n_vocab, n_hidden, n_vocab, n_vocab)

n_epochs = 30
learning_rate = .05
criterion = nn.NLLLoss()
optim = torch.optim.SGD(rnn.parameters(), lr = learning_rate)

train(rnn, criterion, optim)

Epoch  0  Loss:  73.60259747505188
Epoch  1  Loss:  43.383880376815796
Epoch  2  Loss:  26.73484981060028
Epoch  3  Loss:  17.657559633255005
Epoch  4  Loss:  12.494984596967697
Epoch  5  Loss:  9.371681913733482
Epoch  6  Loss:  7.358956411480904
Epoch  7  Loss:  5.98852775990963
Epoch  8  Loss:  5.011282339692116
Epoch  9  Loss:  4.287019722163677
Epoch  10  Loss:  3.732741631567478
Epoch  11  Loss:  3.2970485016703606
Epoch  12  Loss:  2.9468020275235176
Epoch  13  Loss:  2.659862741827965
Epoch  14  Loss:  2.4209661558270454
Epoch  15  Loss:  2.2192936949431896
Epoch  16  Loss:  2.0469901897013187
Epoch  17  Loss:  1.898227520287037
Epoch  18  Loss:  1.7685999162495136
Epoch  19  Loss:  1.6547195129096508
Epoch  20  Loss:  1.5539430230855942
Epoch  21  Loss:  1.4641783721745014
Epoch  22  Loss:  1.3837508670985699
Epoch  23  Loss:  1.3113047368824482
Epoch  24  Loss:  1.2457311227917671
Epoch  25  Loss:  1.186115127056837
Epoch  26  Loss:  1.1316951923072338
Epoch  27  Loss:  1.081

#### Part 1.4: Test the model

Testing the model is similar to training it:

In [343]:
def test_batch(batch, label):
    if (batch.size()[0] != n_batch):
        return 0, 0
    
    # initialize hidden state
    hidden = Variable(torch.zeros(n_batch, n_hidden))
    
    # calculate forward pass
    for i in range(batch[0].size()[0]):
        output, hidden = rnn(batch[:, i], hidden)
        
    # calculate predictions
    _, pred = output.max(1)

    # calculate number of correct predictions
    correct = (pred == label).long().sum().data[0]
    return correct, n_batch

Then calculate the total score by looping through the batches:

In [344]:
# Test loop

batch_num = 0
correct = 0
total = 0
for i in range(len(val_iter)):
    batch, label = val_iter[i]
    batch_correct, batch_size = test_batch(batch, label)
    batch_num += 1
    correct += batch_correct
    total += batch_size
    
print("Percent correct: ", correct / total)

Percent correct:  0.9949596774193549


### Part 2: Using PyTorch RNN modules

PyTorch's RNN capabilities live [here](http://pytorch.org/docs/master/nn.html#recurrent-layers). We can use it as follows (note that the input is batched along the **second** dimension):

In [588]:
n_input = 10
n_hidden = 20
n_layers = 2
n_batch = 3
n_length = 5
rnn = nn.LSTM(n_input, n_hidden, n_layers)
input = Variable(torch.randn(n_length, n_batch, n_input))
h0 = Variable(torch.randn(n_layers, n_batch, n_hidden))
c0 = Variable(torch.randn(n_layers, n_batch, n_hidden))
output, hn = rnn(input, (h0, c0))
print(output, hn)

Variable containing:
(0 ,.,.) = 

Columns 0 to 8 
   0.2231 -0.0203 -0.0854 -0.2982 -0.0656  0.0930 -0.2293  0.6071 -0.0953
 -0.2261  0.1648 -0.5386  0.0828  0.2245 -0.2529 -0.0072 -0.2858  0.2296
 -0.0753 -0.3349 -0.4099  0.1039  0.0125  0.0802  0.2097 -0.1543 -0.2422

Columns 9 to 17 
  -0.1708  0.1044 -0.0797 -0.3872  0.1516  0.0719  0.0070 -0.2724 -0.0536
  0.2972  0.1917 -0.3889  0.2385 -0.0287  0.0066 -0.0919  0.0148  0.1382
 -0.0847 -0.0421 -0.1286 -0.1308 -0.0199  0.1316  0.1084 -0.2401 -0.2508

Columns 18 to 19 
  -0.0918 -0.1930
  0.0485 -0.0829
  0.0594 -0.0006

(1 ,.,.) = 

Columns 0 to 8 
   0.0831  0.0110 -0.0568 -0.2536  0.0285  0.0186 -0.1289  0.2785 -0.1063
 -0.1375  0.0868 -0.3380  0.0548  0.1371 -0.0536  0.0038 -0.0434  0.1007
 -0.1194 -0.1133 -0.4001  0.1453  0.1121  0.0408  0.0839  0.0398 -0.1405

Columns 9 to 17 
  -0.0189  0.0240 -0.1874 -0.3466  0.1337 -0.0032  0.0994 -0.1832 -0.1327
  0.2092  0.0882 -0.3217 -0.0023 -0.0114  0.0006  0.0346 -0.0807  0.0927
  0.04

We'll define a custom module to apply this module to our problem. This module will embed each integer, then apply the LSTM to the sequence, and then apply a linear and a softmax to get probabilities for each class:

In [612]:
class MyLSTM(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, vocab_size, n_layers):
        super(MyLSTM, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden):
        # embed the input integers
        embedded = self.embedding(input)
        
        # put the batch along the second dimension
        embedded = embedded.transpose(0, 1)
        
        # apply the LSTM
        output, hidden = self.lstm(embedded, hidden)
        
        # apply the linear and the softmax
        output = self.softmax(self.linear(output))

        return output, hidden

Training and testing are essentially the same as before, except that we no longer need to manually loop in the forward pass:

In [607]:
def train_batch(model, criterion, optim, batch, label):
    # initialize hidden vectors
    hidden = (Variable(torch.zeros(n_layers, n_batch, n_hidden)), Variable(torch.zeros(n_layers, n_batch, n_hidden)))

    # clear gradients
    rnn.zero_grad()

    # calculate forward pass
    output, hidden = model(batch, hidden)

    # calculate loss    
    loss = criterion(output[answer_pos], label)

    # backpropagate and step
    loss.backward()
    optim.step()
    
    return loss.data[0]

In [608]:
# training loop
def train(model, criterion, optim):
    for e in range(n_epochs):
        batches = 0
        epoch_loss = 0
        avg_loss = 0
        for batch, label in train_iter:
            batch_loss = train_batch(model, criterion, optim, batch, label)
            batches += 1
            epoch_loss += batch_loss
            avg_loss = ((avg_loss * (batches - 1)) + batch_loss) / batches
        
        print("Epoch ", e, " Loss: ", epoch_loss)
        


In [622]:
# size of the embeddings and vectors
n_embedding = 128
n_hidden = 128

# number of layers
n_layers = 1

# initialize LSTM
rnn = MyLSTM(n_embedding, n_hidden, n_vocab, n_vocab, n_layers)

n_epochs = 30
learning_rate = .1
criterion = nn.NLLLoss()
optim = torch.optim.SGD(rnn.parameters(), lr = learning_rate)

train(rnn, criterion, optim)

Epoch  0  Loss:  182.85484051704407
Epoch  1  Loss:  117.944895029068
Epoch  2  Loss:  78.74753439426422
Epoch  3  Loss:  65.38788288831711
Epoch  4  Loss:  60.50836080312729
Epoch  5  Loss:  58.166617691516876
Epoch  6  Loss:  56.78475075960159
Epoch  7  Loss:  55.852413058280945
Epoch  8  Loss:  55.16654431819916
Epoch  9  Loss:  54.63217830657959
Epoch  10  Loss:  54.199151039123535
Epoch  11  Loss:  53.83822554349899
Epoch  12  Loss:  53.530878841876984
Epoch  13  Loss:  53.26531022787094
Epoch  14  Loss:  53.03429317474365
Epoch  15  Loss:  52.83198964595795
Epoch  16  Loss:  52.65340077877045
Epoch  17  Loss:  52.49481302499771
Epoch  18  Loss:  52.35341036319733
Epoch  19  Loss:  52.22695928812027
Epoch  20  Loss:  52.11368536949158
Epoch  21  Loss:  52.01213473081589
Epoch  22  Loss:  51.92097508907318
Epoch  23  Loss:  51.838898718357086
Epoch  24  Loss:  51.76471447944641
Epoch  25  Loss:  51.697418332099915
Epoch  26  Loss:  51.636178970336914
Epoch  27  Loss:  51.5803039669

In [623]:
# Test loop

batch_num = 0
correct = 0
total = 0
for i in range(len(val_iter)):
    batch, label = val_iter[i]
    batch_correct, batch_size = test_batch(batch, label)
    batch_num += 1
    correct += batch_correct
    total += batch_size
    
print("Percent correct: ", correct / total)

Percent correct:  0.04939516129032258
