In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [2]:
batch_size = 3
input_length = [[2,5,3], [5,3,4]]
input_dim = 2
hidden_dim = 4
num_layers = 1

# RNN vs RNNCell

|              nn.RNN              |                 nn.RNNCell                 |
|:--------------------------------:|:------------------------------------------:|
|       unrolling whole steps      |            unrolling single step           |
| internally uses CuDNN => faster! | similar to TensorFlow's RNN implementation |

## nn.RNN

* Args:
    * input_size (int)
    * hidden_size (int)
    
    
* Optional Args
    * num_layers (int)
    * nonlinearity (activation function; ex. nn.Tanh, nn.ReLU)
    * bias (bool)
    * batch_first (bool)
    * dropout (float)
    * bidirectional (bool)


* inputs:
    * input (seq_len, batch_size, input_size)
    * h_0 (num_layers*num_directions, batch_size, hidden_size) => default: zeros


* outputs:
    * output (seq_len, batch_size, hidden_size* num_directions) => outputs from whole steps
    * h_n (num_layers, num_directions, batch, hidden_size) => last output

In [3]:
rnn = nn.RNN(input_dim, hidden_dim, batch_first=True)
rnn

RNN(2, 4, batch_first=True)

In [4]:
vars(rnn)

{'_all_weights': [['weight_ih_l0',
   'weight_hh_l0',
   'bias_ih_l0',
   'bias_hh_l0']],
 '_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1090d0780>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih_l0', Parameter containing:
                0.0477 -0.4365
                0.2555  0.4368
                0.3737 -0.1157
               -0.1752  0.3154
               [torch.FloatTensor of size 4x2]),
              ('weight_hh_l0', Parameter containing:
                0.2164  0.2660  0.4514  0.4022
               -0.3634  0.4438  0.4042 -0.1533
                0.4506 -0.1276  0.4494 -0.1418
                0.4890 -0.1199 -0.1734 -0.2589
               [torch.FloatTensor of size 4x4]),
              ('bias_ih_l0', Parameter containing:
               -0.1168
                0.3833
               -0.3831
                0.1456
               [torch.FloatTensor of 

In [5]:
# inputs: (input, h_0)
x = Variable(torch.randn([batch_size, max(input_length[0]), input_dim]))
h_0 = Variable(torch.zeros([num_layers, batch_size, hidden_dim]))
print(x.size(), h_0.size())

torch.Size([3, 5, 2]) torch.Size([1, 3, 4])


In [6]:
# outputs: (output, h_n)
assert rnn(x, h_0) == rnn(x) # default hidden_state: zeros
rnn(x)

(Variable containing:
 (0 ,.,.) = 
  -0.0806  0.5714  0.4102  0.5077
   0.9465 -0.4646  0.5344 -0.5717
  -0.0087  0.0118  0.6334  0.8715
   0.5138  0.2963  0.0419  0.4018
   0.7385 -0.3995  0.3720  0.3232
 
 (1 ,.,.) = 
   0.1152 -0.3016 -0.3987  0.6483
   0.3387 -0.8053 -0.6235  0.5949
   0.0015 -0.5674  0.1528  0.5908
   0.5571 -0.5701  0.0896  0.2579
   0.2509 -0.2802  0.4774  0.6495
 
 (2 ,.,.) = 
  -0.4368  0.5026 -0.2393  0.7874
   0.3199  0.0372 -0.6229  0.2809
   0.3174 -0.8161 -0.5950  0.6668
  -0.4400 -0.4234 -0.3515  0.8216
   0.0635 -0.2178 -0.3839  0.2879
 [torch.FloatTensor of size 3x5x4], Variable containing:
 (0 ,.,.) = 
   0.7385 -0.3995  0.3720  0.3232
   0.2509 -0.2802  0.4774  0.6495
   0.0635 -0.2178 -0.3839  0.2879
 [torch.FloatTensor of size 1x3x4])

## nn.LSTM

* Args:
    Same as RNN


* inputs:
    * input (seq_len, batch_size, input_size)
    * (h_0, c_0) ( (num_layers*num_directions, batch_size, hidden_size), (num_layers*num_directions, batch_size, hidden_size) )
    

* outputs:
    * output (seq_len, batch_size, hidden_size*num_directions)
    * (h_n, c_n) ( (num_layers*num_directions, batch_size, hidden_size), (num_layers*num_directions, batch_size, hidden_size) )

In [7]:
lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
lstm

LSTM(2, 4, batch_first=True)

In [8]:
vars(lstm)

{'_all_weights': [['weight_ih_l0',
   'weight_hh_l0',
   'bias_ih_l0',
   'bias_hh_l0']],
 '_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1090d0780>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih_l0', Parameter containing:
               -0.2709 -0.4489
               -0.1869 -0.2587
                0.1093  0.2780
               -0.3045 -0.3138
                0.2527 -0.2527
                0.2759 -0.1833
                0.4317  0.2930
               -0.0862  0.2169
               -0.2793 -0.3404
               -0.3539 -0.1586
                0.3308  0.3165
               -0.1210 -0.4383
                0.4473  0.1359
               -0.0468  0.0816
               -0.3572 -0.4207
               -0.4613 -0.1007
               [torch.FloatTensor of size 16x2]),
              ('weight_hh_l0', Parameter containing:
                0.4584  0.3134 -0.0054 -0.20

In [9]:
# inputs: (input, (h_0, c_0))
x = Variable(torch.randn([batch_size, max(input_length[0]), input_dim]))
h_0 = Variable(torch.zeros([num_layers, batch_size, hidden_dim]))
c_0 = Variable(torch.zeros([num_layers, batch_size, hidden_dim]))
print(x.size(), h_0.size(), c_0.size())

torch.Size([3, 5, 2]) torch.Size([1, 3, 4]) torch.Size([1, 3, 4])


In [10]:
# outputs: (output, (h_n, c_n))
assert lstm(x, (h_0, c_0)) == lstm(x) # default hidden_state: zeros
lstm(x)

(Variable containing:
 (0 ,.,.) = 
   0.1413 -0.1147 -0.1643 -0.0261
   0.0758 -0.1829 -0.1464 -0.1577
   0.0350 -0.1714 -0.1701 -0.3384
   0.0329 -0.2155 -0.1239 -0.2429
   0.0831 -0.2252 -0.1652 -0.2530
 
 (1 ,.,.) = 
   0.1805 -0.0931 -0.2023  0.0825
   0.1397  0.0705 -0.3338  0.0517
   0.3162 -0.1030 -0.2764  0.0282
   0.1159 -0.1791 -0.1993 -0.1239
   0.1401 -0.1082 -0.3206 -0.2668
 
 (2 ,.,.) = 
   0.1474  0.0213 -0.2579  0.1579
   0.1962  0.0346 -0.3296  0.1853
   0.2079 -0.0936 -0.2783 -0.0278
   0.2046 -0.0725 -0.3544 -0.0412
   0.1711 -0.1166 -0.2978 -0.1789
 [torch.FloatTensor of size 3x5x4], (Variable containing:
  (0 ,.,.) = 
    0.0831 -0.2252 -0.1652 -0.2530
    0.1401 -0.1082 -0.3206 -0.2668
    0.1711 -0.1166 -0.2978 -0.1789
  [torch.FloatTensor of size 1x3x4], Variable containing:
  (0 ,.,.) = 
    0.1531 -0.6141 -0.3403 -0.5543
    0.4037 -0.2540 -0.5825 -0.4177
    0.4052 -0.2837 -0.6384 -0.3207
  [torch.FloatTensor of size 1x3x4]))

## nn.RNNCell

* Args:
    * input_size (int)
    * hidden_size (int)


* Optional Args
    * nonlinearity (activation function; ex. nn.Tanh, nn.ReLU)
    * bias (bool)
    
    
* inputs:
    * input (batch_size, input_size)
    * hidden (batch_size, hidden_size)


* outputs:
    * h (batch, hidden_size) => current output

In [11]:
rnncell = nn.RNNCell(input_dim, hidden_dim)
rnncell

RNNCell(2, 4)

In [12]:
vars(rnncell)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1090d0780>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih', Parameter containing:
                0.1422 -0.0114
                0.0492  0.0542
                0.3163  0.1715
               -0.2685 -0.2623
               [torch.FloatTensor of size 4x2]),
              ('weight_hh', Parameter containing:
               -0.4082 -0.0328  0.4141  0.3738
               -0.1749 -0.4950 -0.1493 -0.3171
               -0.4664 -0.0455  0.3793  0.4158
                0.4268 -0.1448 -0.2446 -0.4422
               [torch.FloatTensor of size 4x4]),
              ('bias_ih', Parameter containing:
                0.3230
                0.4584
                0.4306
                0.3714
               [torch.FloatTensor of size 4]),
              ('bias_hh', Parameter containing:
               -0.0188
                0.

In [13]:
# input
x = Variable(torch.randn([batch_size, max(input_length[0]), input_dim]))

# initial hidden
h = Variable(torch.zeros(batch_size, hidden_dim))

outputs = []
for i in range(max(input_length[0])):
    h = rnncell(x[:, i, :], h)
    outputs.append(h)

outputs

[Variable containing:
  0.2853  0.5973  0.6975  0.5364
  0.3273  0.5900  0.7028  0.5684
  0.3868  0.5996  0.7596  0.5069
 [torch.FloatTensor of size 3x4], Variable containing:
  0.4475 -0.0430  0.5622  0.6679
  0.5876  0.0053  0.7822  0.4333
  0.5635 -0.0311  0.7148  0.5793
 [torch.FloatTensor of size 3x4], Variable containing:
  0.6099  0.3228  0.8517  0.3340
  0.4487  0.2991  0.7331  0.5217
  0.5437  0.3630  0.8779  0.0896
 [torch.FloatTensor of size 3x4], Variable containing:
  0.5700  0.0596  0.6230  0.7865
  0.4346  0.1853  0.7587  0.3493
  0.3037  0.2479  0.6677  0.5082
 [torch.FloatTensor of size 3x4], Variable containing:
  0.4834  0.1054  0.5993  0.7135
  0.5092  0.2283  0.7026  0.6245
  0.5871  0.2827  0.8744  0.1081
 [torch.FloatTensor of size 3x4]]

## nn.LSTMCell

* Args:
    * input_size (int)
    * hidden_size (int)


* Optional Args
    * bias (bool)
    
    
* inputs:
    * input (batch_size, input_size)
    * (h_0, c_0) ( (batch_size, hidden_size), (batch_size, hidden_size) )


* outputs:
    * (h_1, c_1) ( (batch_size, hidden_size), (batch_size, hidden_size) ) => current output

In [14]:
lstmcell = nn.LSTMCell(input_dim, hidden_dim)
lstmcell

LSTMCell(2, 4)

In [15]:
vars(lstmcell)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x1090d0780>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih', Parameter containing:
                0.3406 -0.1945
               -0.1697 -0.0063
               -0.3277  0.3388
               -0.1071 -0.4594
                0.4088  0.0952
               -0.2136 -0.1484
               -0.4779  0.4432
                0.0056 -0.2826
                0.1696  0.4416
                0.2224 -0.0016
                0.4745 -0.2597
                0.1774 -0.3403
                0.3643  0.3717
                0.3297  0.2010
                0.1882  0.0392
               -0.0858 -0.2674
               [torch.FloatTensor of size 16x2]),
              ('weight_hh', Parameter containing:
               -0.1149 -0.3341  0.2543  0.2980
               -0.3893  0.3343  0.1346 -0.2728
               -0.2206 -0.2333 -0.1868  0.4417

In [16]:
# input
x = Variable(torch.randn([batch_size, max(input_length[0]), input_dim]))

# initial hidden
h = Variable(torch.zeros(batch_size, hidden_dim))
c = Variable(torch.zeros(batch_size, hidden_dim))

outputs = []
for i in range(max(input_length[0])):
    h, c = lstmcell(x[:, i, :], (h, c))
    outputs.append((h, c))

outputs

[(Variable containing:
   0.2027  0.0043 -0.1469 -0.0637
   0.1030  0.0821  0.0680  0.0982
   0.3911  0.1224 -0.0046 -0.0310
  [torch.FloatTensor of size 3x4], Variable containing:
   0.3808  0.0098 -0.3388 -0.1540
   0.2036  0.1805  0.1373  0.1944
   0.5825  0.2047 -0.0085 -0.0900
  [torch.FloatTensor of size 3x4]), (Variable containing:
   0.5542  0.1539 -0.0017 -0.0301
   0.0017  0.0428  0.0570  0.1746
   0.0129 -0.0097 -0.0314  0.0871
  [torch.FloatTensor of size 3x4], Variable containing:
   0.9116  0.2515 -0.0030 -0.0857
   0.0047  0.1236  0.1313  0.3173
   0.0491 -0.0396 -0.0804  0.1453
  [torch.FloatTensor of size 3x4]), (Variable containing:
   0.6459  0.1116  0.0093 -0.0447
   0.2258  0.1078  0.0807  0.0544
   0.0288  0.0293  0.0122  0.1176
  [torch.FloatTensor of size 3x4], Variable containing:
   1.2309  0.1918  0.0172 -0.1420
   0.4170  0.2161  0.1650  0.1216
   0.0711  0.0764  0.0269  0.2199
  [torch.FloatTensor of size 3x4]), (Variable containing:
   0.6643  0.1644  0.11

# (Naive) Speed Comparison between LSTM vs LSTMCell

In [17]:
from time import time

In [18]:
batch_size = 30
input_dim = 40
hidden_dim = 40
max_seq_len = 50
num_layers = 1

In [19]:
lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
lstmcell = lstmcell = nn.LSTMCell(input_dim, hidden_dim)

## Feeding predefined inputs
### - used in RNN Encoder

### LSTM

In [20]:
%timeit
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))
h_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
c_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))

start = time()

for _ in range(500):
    outputs_ = lstm(x, (h_0, c_0))

print(time() - start)

9.440079927444458


### LSTMCell

In [21]:
%timeit
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(500):
    
    h = Variable(torch.zeros(batch_size, hidden_dim))
    c = Variable(torch.zeros(batch_size, hidden_dim))

    for i in range(max_seq_len):
        h, c = lstmcell(x[:, i, :], (h, c))

print(time() - start)

11.81505012512207


## Feeding output from last step
### - used in RNN Decoder

### LSTM

In [22]:
%timeit
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(500):
    
    h = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
    c = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
    input = x[:, 0:1, :]
    
    for i in range(max_seq_len):
        output, (h, c) = lstm(input, (h, c))
        input = output
        
print(time() - start)

16.5055410861969


### LSTMCell

In [23]:
%timeit
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(5000):
    
    h = Variable(torch.zeros(batch_size, hidden_dim))
    c = Variable(torch.zeros(batch_size, hidden_dim))
    input = x[:, 0, :]
    
    for i in range(5):
        h, c = lstmcell(input, (h, c))

print(time() - start)

9.428470849990845


# Rule of thumbs
## - Use RNN when Encoding
## - Use RNNCells when decoding + (complex architecture)