In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [2]:
# Toy dimensions shared by the small demo cells below.
num_layers = 1   # number of stacked recurrent layers
hidden_dim = 4   # size of the hidden state
input_dim = 2    # features per time step
batch_size = 3   # sequences per batch

# Example sequence lengths; the demo cells pad to max(input_length[0]).
input_length = [
    [2, 5, 3],
    [5, 3, 4],
]

# RNN vs RNNCell

|              nn.RNN              |                 nn.RNNCell                 |
|:--------------------------------:|:------------------------------------------:|
|       unrolling whole steps      |            unrolling single step           |
| internally uses CuDNN => faster! | similar to TensorFlow's RNN implementation |

## nn.RNN

* Args:
    * input_size (int)
    * hidden_size (int)
    
    
* Optional Args
    * num_layers (int)
    * nonlinearity (str; 'tanh' or 'relu', default 'tanh')
    * bias (bool)
    * batch_first (bool)
    * dropout (float)
    * bidirectional (bool)


* inputs:
    * input (seq_len, batch_size, input_size), or (batch_size, seq_len, input_size) if batch_first=True
    * h_0 (num_layers*num_directions, batch_size, hidden_size) => default: zeros


* outputs:
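    * output (seq_len, batch_size, hidden_size*num_directions), or (batch_size, seq_len, hidden_size*num_directions) if batch_first=True => outputs from all time steps
    * h_n (num_layers*num_directions, batch_size, hidden_size) => hidden state at the last time step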

In [3]:
# batch_first=True: tensors are laid out as (batch, seq_len, feature)
# instead of the default (seq_len, batch, feature).
rnn = nn.RNN(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
rnn

RNN(2, 4, batch_first=True)

In [4]:
# Show the module's raw attribute dict (parameters, hooks, buffers, ...).
rnn.__dict__

{'_all_weights': [['weight_ih_l0',
   'weight_hh_l0',
   'bias_ih_l0',
   'bias_hh_l0']],
 '_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x10cce1358>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih_l0', Parameter containing:
                0.1542  0.2868
                0.4675  0.3869
               -0.4708  0.0957
               -0.0607 -0.3307
               [torch.FloatTensor of size 4x2]),
              ('weight_hh_l0', Parameter containing:
                0.4665  0.4471 -0.1223 -0.1255
               -0.3009  0.0404  0.3506  0.2915
               -0.0427 -0.1958 -0.0956 -0.0010
               -0.1599  0.1421  0.3546  0.2858
               [torch.FloatTensor of size 4x4]),
              ('bias_ih_l0', Parameter containing:
               -0.4124
                0.4614
               -0.2966
               -0.4744
               [torch.FloatTensor of 

In [5]:
# inputs: (input, h_0)
# h_0 is all zeros; x is random and padded to the longest length in input_length[0].
h_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
x = Variable(torch.randn(batch_size, max(input_length[0]), input_dim))
print(x.size(), h_0.size())

torch.Size([3, 5, 2]) torch.Size([1, 3, 4])


In [6]:
# outputs: (output, h_n)
# Omitting h_0 makes the RNN start from an all-zero hidden state, so both calls
# must agree. The results are tuples of tensors, so they are compared tensor by
# tensor: `tuple == tuple` would need the truth value of an element-wise
# comparison, which is ambiguous for multi-element tensors.
output, h_n = rnn(x, h_0)
output_default, h_n_default = rnn(x)  # default hidden_state: zeros
assert torch.equal(output.data, output_default.data)
assert torch.equal(h_n.data, h_n_default.data)
(output_default, h_n_default)

(Variable containing:
 (0 ,.,.) = 
  -0.4937  0.6139 -0.1481 -0.0467
  -0.6928  0.5946 -0.8060  0.5793
  -0.7778 -0.1236  0.3310  0.3964
  -0.8046  0.8517 -0.4521  0.4131
  -0.6221  0.5989 -0.3003  0.3700
 
 (1 ,.,.) = 
  -0.1279  0.8874 -0.2820 -0.4299
  -0.2864  0.1046  0.0895 -0.0148
  -0.3075  0.9133 -0.5585 -0.1959
   0.4079  0.9657 -0.8523 -0.5091
   0.0712 -0.1737 -0.0917 -0.2649
 
 (2 ,.,.) = 
  -0.1643  0.8875 -0.3835 -0.3749
   0.0408  0.5818  0.0137 -0.3660
  -0.1139  0.6956 -0.3992 -0.1453
   0.1199  0.8176 -0.3207 -0.4262
  -0.2451  0.3560 -0.6392  0.1604
 [torch.FloatTensor of size 3x5x4], Variable containing:
 (0 ,.,.) = 
  -0.6221  0.5989 -0.3003  0.3700
   0.0712 -0.1737 -0.0917 -0.2649
  -0.2451  0.3560 -0.6392  0.1604
 [torch.FloatTensor of size 1x3x4])

## nn.LSTM

* Args:
    Same as RNN, except that LSTM has no nonlinearity option


* inputs:
    * input (seq_len, batch_size, input_size), or (batch_size, seq_len, input_size) if batch_first=True
    * (h_0, c_0) ( (num_layers*num_directions, batch_size, hidden_size), (num_layers*num_directions, batch_size, hidden_size) )
    

* outputs:
    * output (seq_len, batch_size, hidden_size*num_directions), or (batch_size, seq_len, hidden_size*num_directions) if batch_first=True
    * (h_n, c_n) ( (num_layers*num_directions, batch_size, hidden_size), (num_layers*num_directions, batch_size, hidden_size) )

In [7]:
# batch_first=True: tensors are laid out as (batch, seq_len, feature)
# instead of the default (seq_len, batch, feature).
lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
lstm

LSTM(2, 4, batch_first=True)

In [8]:
# Show the module's raw attribute dict (parameters, hooks, buffers, ...).
lstm.__dict__

{'_all_weights': [['weight_ih_l0',
   'weight_hh_l0',
   'bias_ih_l0',
   'bias_hh_l0']],
 '_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x10cce1358>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih_l0', Parameter containing:
                0.2075  0.1240
               -0.4426 -0.1420
                0.2059 -0.1617
                0.0331  0.1155
               -0.3827  0.3408
                0.1857 -0.4769
               -0.3418  0.1522
                0.1211 -0.1431
               -0.1919  0.1294
               -0.2866  0.4115
                0.2031  0.0550
                0.1276 -0.3864
                0.1753  0.4090
               -0.4320  0.0387
                0.4692 -0.0808
               -0.4207 -0.4574
               [torch.FloatTensor of size 16x2]),
              ('weight_hh_l0', Parameter containing:
                0.0571  0.3927 -0.4739 -0.11

In [9]:
# inputs: (input, (h_0, c_0))
# Hidden and cell states start at zero; x is random and padded to the
# longest length in input_length[0].
state_shape = (num_layers, batch_size, hidden_dim)
h_0 = Variable(torch.zeros(*state_shape))
c_0 = Variable(torch.zeros(*state_shape))
x = Variable(torch.randn(batch_size, max(input_length[0]), input_dim))
print(x.size(), h_0.size(), c_0.size())

torch.Size([3, 5, 2]) torch.Size([1, 3, 4]) torch.Size([1, 3, 4])


In [10]:
# outputs: (output, (h_n, c_n))
# Omitting (h_0, c_0) makes the LSTM start from all-zero states, so both calls
# must agree. The results are nested tuples of tensors, so they are compared
# tensor by tensor: `tuple == tuple` would need the truth value of an
# element-wise comparison, which is ambiguous for multi-element tensors.
output, (h_n, c_n) = lstm(x, (h_0, c_0))
output_default, (h_n_default, c_n_default) = lstm(x)  # default hidden_state: zeros
assert torch.equal(output.data, output_default.data)
assert torch.equal(h_n.data, h_n_default.data)
assert torch.equal(c_n.data, c_n_default.data)
(output_default, (h_n_default, c_n_default))

(Variable containing:
 (0 ,.,.) = 
   0.0980  0.1053 -0.0314 -0.0607
   0.0346 -0.0967 -0.0848 -0.0226
   0.0642 -0.0468 -0.0770 -0.0332
   0.0911  0.0320 -0.0814 -0.0897
   0.0938  0.0322 -0.1014 -0.0860
 
 (1 ,.,.) = 
   0.0345 -0.0157 -0.0198 -0.0285
  -0.0534 -0.0918  0.0209  0.0170
  -0.0201 -0.1370 -0.0198  0.0058
   0.0136 -0.0286  0.0665 -0.0239
   0.0149 -0.1291 -0.0229 -0.0558
 
 (2 ,.,.) = 
   0.0349 -0.0071 -0.0059 -0.0305
  -0.0463 -0.0324  0.1087 -0.0223
   0.0213 -0.0644 -0.0057 -0.0642
   0.0527 -0.0527 -0.0573 -0.0716
   0.0659 -0.0081 -0.0125 -0.0426
 [torch.FloatTensor of size 3x5x4], (Variable containing:
  (0 ,.,.) = 
    0.0938  0.0322 -0.1014 -0.0860
    0.0149 -0.1291 -0.0229 -0.0558
    0.0659 -0.0081 -0.0125 -0.0426
  [torch.FloatTensor of size 1x3x4], Variable containing:
  (0 ,.,.) = 
    0.2556  0.0605 -0.4365 -0.1640
    0.0391 -0.3420 -0.0614 -0.1318
    0.1007 -0.0275 -0.0259 -0.3034
  [torch.FloatTensor of size 1x3x4]))

## nn.RNNCell

* Args:
    * input_size (int)
    * hidden_size (int)


* Optional Args
    * nonlinearity (str; 'tanh' or 'relu', default 'tanh')
    * bias (bool)
    
    
* inputs:
    * input (batch_size, input_size)
    * hidden (batch_size, hidden_size)


* outputs:
    * h (batch, hidden_size) => current output

In [11]:
# A cell processes one time step per call; the caller does the unrolling.
rnncell = nn.RNNCell(input_size=input_dim, hidden_size=hidden_dim)
rnncell

RNNCell(2, 4)

In [12]:
# Show the module's raw attribute dict (parameters, hooks, buffers, ...).
rnncell.__dict__

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x10cce1358>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih', Parameter containing:
                0.0462 -0.4683
                0.3904  0.3451
                0.4196  0.3587
               -0.0410  0.4752
               [torch.FloatTensor of size 4x2]),
              ('weight_hh', Parameter containing:
               -0.4353  0.3403  0.3750  0.2516
                0.3241 -0.2271 -0.0170  0.1571
               -0.0019 -0.2814 -0.0746  0.0587
               -0.1815 -0.0939  0.2721 -0.3193
               [torch.FloatTensor of size 4x4]),
              ('bias_ih', Parameter containing:
                0.4209
                0.0529
               -0.2916
                0.2168
               [torch.FloatTensor of size 4]),
              ('bias_hh', Parameter containing:
                0.1012
               -0.

In [13]:
# Unroll the cell by hand: one call per time step, carrying the hidden state.
x = Variable(torch.randn(batch_size, max(input_length[0]), input_dim))  # input
h = Variable(torch.zeros(batch_size, hidden_dim))  # initial hidden

outputs = []
for step in range(x.size(1)):
    # x[:, step] is the (batch_size, input_dim) slice for this time step.
    h = rnncell(x[:, step], h)
    outputs.append(h)

outputs

[Variable containing:
  0.4928  0.4206 -0.2249  0.2194
  0.5528 -0.5719 -0.8863  0.1242
  0.9087 -0.5160 -0.8612 -0.6502
 [torch.FloatTensor of size 3x4], Variable containing:
 -0.6993  0.8970  0.5518  0.8582
  0.3512 -0.7760 -0.9596 -0.6016
 -0.3001  0.2738 -0.5236 -0.1354
 [torch.FloatTensor of size 3x4], Variable containing:
  0.8768  0.1437 -0.4419  0.3393
  0.0924  0.1465 -0.4722 -0.3160
  0.6768 -0.3463 -0.7402 -0.1467
 [torch.FloatTensor of size 3x4], Variable containing:
 -0.6900  0.7996  0.1189  0.6671
  0.4396  0.2118 -0.4229 -0.0176
 -0.7075  0.3605 -0.4519  0.5858
 [torch.FloatTensor of size 3x4], Variable containing:
  0.8538 -0.1093 -0.6029  0.1427
  0.0054 -0.0776 -0.7328  0.2528
  0.3136  0.0215 -0.4674  0.5588
 [torch.FloatTensor of size 3x4]]

## nn.LSTMCell

* Args:
    * input_size (int)
    * hidden_size (int)


* Optional Args
    * bias (bool)
    
    
* inputs:
    * input (batch_size, input_size)
    * (h_0, c_0) ( (batch_size, hidden_size), (batch_size, hidden_size) )


* outputs:
    * (h_1, c_1) ( (batch_size, hidden_size), (batch_size, hidden_size) ) => current output

In [14]:
# A cell processes one time step per call; the caller does the unrolling.
lstmcell = nn.LSTMCell(input_size=input_dim, hidden_size=hidden_dim)
lstmcell

LSTMCell(2, 4)

In [15]:
# Show the module's raw attribute dict (parameters, hooks, buffers, ...).
lstmcell.__dict__

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x10cce1358>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih', Parameter containing:
                0.2900 -0.4561
                0.0900 -0.0536
               -0.3224  0.1832
               -0.3194 -0.2830
                0.4909 -0.2421
                0.3469 -0.2925
                0.3753  0.4415
               -0.4281 -0.0260
                0.3189  0.1862
                0.3742  0.3724
                0.1882  0.0592
                0.3040 -0.4041
               -0.4183 -0.4442
               -0.0532 -0.0383
                0.4141 -0.4651
               -0.1902  0.3165
               [torch.FloatTensor of size 16x2]),
              ('weight_hh', Parameter containing:
                0.1674 -0.4753  0.4099 -0.4182
               -0.2049  0.2616  0.1919 -0.2565
                0.0788  0.3431  0.4055 -0.3193

In [16]:
# Unroll the cell by hand: one call per time step, carrying (h, c) forward.
x = Variable(torch.randn(batch_size, max(input_length[0]), input_dim))  # input

# initial hidden and cell state
h = Variable(torch.zeros(batch_size, hidden_dim))
c = Variable(torch.zeros(batch_size, hidden_dim))

outputs = []
for step in range(x.size(1)):
    # x[:, step] is the (batch_size, input_dim) slice for this time step.
    state = lstmcell(x[:, step], (h, c))
    h, c = state
    outputs.append((h, c))

outputs

[(Variable containing:
   0.0239  0.0653 -0.0766 -0.1305
  -0.0394 -0.0399 -0.1233 -0.2497
   0.0266  0.0803 -0.0728 -0.1402
  [torch.FloatTensor of size 3x4], Variable containing:
   0.0717  0.1722 -0.1206 -0.2100
  -0.0871 -0.1005 -0.2579 -0.3753
   0.0849  0.2139 -0.1184 -0.2210
  [torch.FloatTensor of size 3x4]), (Variable containing:
  -0.1323 -0.1289 -0.1874 -0.2948
  -0.0244 -0.0166 -0.1732 -0.3553
  -0.0934 -0.0941 -0.1721 -0.1685
  [torch.FloatTensor of size 3x4], Variable containing:
  -0.2126 -0.3268 -0.3410 -0.5166
  -0.0610 -0.0423 -0.3720 -0.5718
  -0.1590 -0.2392 -0.2537 -0.3164
  [torch.FloatTensor of size 3x4]), (Variable containing:
  -0.0877 -0.1197 -0.2127 -0.3029
   0.0088  0.0429 -0.1944 -0.3726
  -0.1382 -0.1583 -0.2033 -0.3050
  [torch.FloatTensor of size 3x4], Variable containing:
  -0.1927 -0.3154 -0.3559 -0.5410
   0.0256  0.1121 -0.4053 -0.6216
  -0.2564 -0.4212 -0.3663 -0.5253
  [torch.FloatTensor of size 3x4]), (Variable containing:
   0.0350  0.0593 -0.20

# (Naive) Speed Comparison between LSTM and LSTMCell

In [17]:
from time import time

In [24]:
# Benchmark-sized settings; these overwrite the toy values defined earlier.
n_epochs = 1000    # timed repetitions per benchmark

num_layers = 1
max_seq_len = 50   # time steps per sequence
hidden_dim = 40
input_dim = 40
batch_size = 30

In [25]:
# Fresh modules sized for the benchmark settings.
lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
lstmcell = nn.LSTMCell(input_dim, hidden_dim)

## Feeding predefined inputs
### - used in RNN Encoder

### LSTM

In [26]:
# Wall-clock time for n_epochs forward passes of nn.LSTM, each consuming the
# whole sequence in a single call. Timing is done manually with time(); a bare
# `%timeit` line with no statement measures nothing, so it is not used here.
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))
h_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
c_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))

start = time()

for _ in range(n_epochs):
    outputs_ = lstm(x, (h_0, c_0))

print(time() - start)

16.018003940582275


### LSTMCell

In [27]:
# Wall-clock time for n_epochs passes of nn.LSTMCell, unrolled step by step in
# Python. Timing is done manually with time(); a bare `%timeit` line with no
# statement measures nothing, so it is not used here.
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(n_epochs):

    # Reset the state for every pass so each one starts from zeros.
    h = Variable(torch.zeros(batch_size, hidden_dim))
    c = Variable(torch.zeros(batch_size, hidden_dim))

    for i in range(max_seq_len):
        h, c = lstmcell(x[:, i, :], (h, c))

print(time() - start)

16.192583322525024


## Feeding output from last step
### - used in RNN Decoder

### LSTM

In [28]:
# Decoder-style benchmark for nn.LSTM: each step's output is fed back in as
# the next step's input, so the module is called once per time step.
# Timing is done manually with time(); a bare `%timeit` line with no statement
# measures nothing, so it is not used here.
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(n_epochs):

    h = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
    c = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
    # Slice 0:1 keeps the time axis, giving the 3-D input nn.LSTM expects.
    # Named step_input (not `input`) to avoid shadowing the builtin.
    step_input = x[:, 0:1, :]

    for _step in range(max_seq_len):
        output, (h, c) = lstm(step_input, (h, c))
        # Feeding output back only works because hidden_dim == input_dim here.
        step_input = output

print(time() - start)

25.086395025253296


### LSTMCell

In [30]:
# Decoder-style benchmark for nn.LSTMCell: each step's hidden state is fed back
# in as the next step's input, mirroring the nn.LSTM decoder benchmark.
# Timing is done manually with time(); a bare `%timeit` line with no statement
# measures nothing, so it is not used here.
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(n_epochs):

    h = Variable(torch.zeros(batch_size, hidden_dim))
    c = Variable(torch.zeros(batch_size, hidden_dim))
    # Named step_input (not `input`) to avoid shadowing the builtin.
    step_input = x[:, 0, :]

    for _step in range(max_seq_len):
        h, c = lstmcell(step_input, (h, c))
        # Feed the previous output back in (valid because hidden_dim == input_dim
        # here); a real decoder would usually embed the argmax token instead.
        step_input = h

print(time() - start)

14.893877267837524


# Rules of thumb
## - Use RNN when Encoding
## - Use RNNCells when decoding + (complex architecture)