In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Toy input sequence: SEQ_LEN time steps, each one a (1, NUM_FEATURES) tensor
# drawn from torch.randn. (The sequence has 4 steps; 5 is the feature count.)
SEQ_LEN = 4       # number of time steps in the sequence
NUM_FEATURES = 5  # features per time step, i.e. the LSTM input_size
inputs = [torch.randn(1, NUM_FEATURES) for _ in range(SEQ_LEN)]

There are __4__ inputs (time steps), and each input has __5__ features, so the LSTM __input dim = 5__

In [3]:
# Show the raw list: 4 separate tensors, each of shape (1, 5).
inputs

[tensor([[ 1.7585,  0.9087,  1.2925, -0.9285,  0.2344]]),
 tensor([[-0.2671, -0.8850, -2.1523, -1.6194, -1.7567]]),
 tensor([[ 0.3331,  1.6464, -0.0294,  0.9305,  0.4814]]),
 tensor([[-0.8014, -0.0726, -0.9579,  1.3642, -0.5175]])]

In [4]:
# Join the per-step (1, 5) tensors along dim 0 into a (4, 5) tensor, then view
# it as (seq_len, 1, features) so that each time step carries a batch axis.
reshape = torch.cat(inputs, dim=0).view(len(inputs), 1, -1)

In [5]:
# Display the reshaped tensor together with its size, (4, 1, 5).
reshape, reshape.size()

(tensor([[[ 1.7585,  0.9087,  1.2925, -0.9285,  0.2344]],
 
         [[-0.2671, -0.8850, -2.1523, -1.6194, -1.7567]],
 
         [[ 0.3331,  1.6464, -0.0294,  0.9305,  0.4814]],
 
         [[-0.8014, -0.0726, -0.9579,  1.3642, -0.5175]]]),
 torch.Size([4, 1, 5]))

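Here `hidden_size` = 3, i.e. the dimension of the hidden state.

In this case, 

at each time step (the shapes below use textbook vanilla-RNN notation; `nn.LSTM` itself has no `why` output projection, and its output is `h_t`, of size `hidden_size`), 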

### xt.shape = (features, 1) => (5, 1)

### yt.shape = (features, 1) => (5, 1)

### ht.shape = (hidden_size, 1) => (3 ,1) 

### whh.shape = (hidden_size, hidden_size) =>(3,3)

### wxh.shape = (hidden_size, features) => (3,5)

### why.shape = (features, hidden_size) => (5,3)

__For each element in the input sequence, each layer computes the following function__

$$
\begin{array}{l}
i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t = f_t \odot c_{(t-1)} + i_t \odot g_t \\
h_t = o_t \odot \tanh(c_t)
\end{array}
$$

where $\sigma$ is the sigmoid function and $\odot$ is the element-wise product.

![1.png](lstm-img/lstm1.png)
![2.png](lstm-img/lstm2.png)
![3.png](lstm-img/lstm3.png)
![4.png](lstm-img/lstm4.png)

In [6]:
# Two stacked LSTM layers: 5 input features per time step, 3 hidden units.
lstm = nn.LSTM(input_size=5, hidden_size=3, num_layers=2)

* __input_size__ – The number of expected features in the input x
* __hidden_size__ – The number of features in the hidden state h
* __num_layers__ – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. Default: 1 

(__i.e. how many LSTM layers are stacked on top of one another__)

* __bias__ – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
* __batch_first__ – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False
* __dropout__ – If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0
* __bidirectional__ – If True, becomes a bidirectional LSTM. Default: False

## step1: initialize hidden
`hidden` is a tuple,
(h_0, c_0).
In this case, both h_0 and c_0 have shape (2,1,3)

                                    (
                                    
                                     2  => lstm_num_layer * num_directions 
                                     1  => mini-batch_size,
                                     3  => hidden_size
                                     
                                     )
__num_directions__ is 1 by default (when bidirectional is False)

__num_directions__ is 2 (when bidirectional is True)

Thanks to the explanations at https://en.wikipedia.org/wiki/Bidirectional_recurrent_neural_networks#/media/File:RNN_BRNN.png

https://discuss.pytorch.org/t/what-is-num-directions-of-nn-lstm/11663

In [7]:
# Random initial state (h_0, c_0); each tensor has shape
# (num_layers * num_directions, batch, hidden_size) = (2, 1, 3).
hidden = tuple(torch.randn(2, 1, 3) for _ in range(2))

## step2: construct input
The input to the LSTM has shape (4,1,5)

                 (
                 
                  4 => sequence_num,
                  1 => mini-batch_size,
                  5 => number of features at each time step
                  
                 )

In [8]:
# Build the full-sequence input: concatenate the per-step tensors along dim 0
# and view the result as (seq_len, batch=1, features), which is the layout
# used when batch_first is False.
input1 = torch.cat(inputs, dim=0).view(len(inputs), 1, -1)
input1.shape

torch.Size([4, 1, 5])

In [9]:
# Side-by-side report of the original list of tensors and the single
# (seq_len, batch, features) tensor built from it.
report_template = (
    "before inputs is:\n {}\n\n"
    "before length of inputs is: {}\n\n"
    "after input1 is:\n {}\n\n"
    "after input1.size is: {}"
)
print(report_template.format(inputs, len(inputs), input1, input1.size()))

before inputs is:
 [tensor([[ 1.7585,  0.9087,  1.2925, -0.9285,  0.2344]]), tensor([[-0.2671, -0.8850, -2.1523, -1.6194, -1.7567]]), tensor([[ 0.3331,  1.6464, -0.0294,  0.9305,  0.4814]]), tensor([[-0.8014, -0.0726, -0.9579,  1.3642, -0.5175]])]

before length of inputs is: 4

after input1 is:
 tensor([[[ 1.7585,  0.9087,  1.2925, -0.9285,  0.2344]],

        [[-0.2671, -0.8850, -2.1523, -1.6194, -1.7567]],

        [[ 0.3331,  1.6464, -0.0294,  0.9305,  0.4814]],

        [[-0.8014, -0.0726, -0.9579,  1.3642, -0.5175]]])

after input1.size is: torch.Size([4, 1, 5])


### input of lstm is:

__input1, (h0, c0)__

### Output of lstm is:

__output, (hn, cn)__


#### If the input is only one step of a sequence, then the output is

__output__ of lstm is (1,1,3)

                 (
                
                  1 => sequence_num,
                  1 => mini-batch_size,
                  3 => num_directions * hidden_size
                  
                 )
              
__num_directions__ is 1 by default (when bidirectional is False)

__num_directions__ is 2 (when bidirectional is True)

output __hidden__ is __h_n__, __c_n__,

both of them have shape of (2,1,3)

                                (

                                 2  => lstm_num_layer * num_directions 
                                 1  => mini-batch_size,
                                 3  => hidden_size

                                 )
                                 
The last slice of __h_n__ (the hidden state of the last layer) is equal to __out__

hidden[0][1,:,:] == out

In [10]:
# Feed the sequence to the LSTM one time step at a time. The (h_n, c_n) tuple
# returned by each call is passed back in on the next call, so the state
# carries across steps.
for step_input in inputs:
    # Each step is viewed as (seq_len=1, batch=1, features).
    out, hidden = lstm(step_input.view(1, 1, -1), hidden)
    h_n, c_n = hidden  # hidden is a tuple: (hidden state, cell state)
    print("out is:\n{}\n out.shape is:\n{}\n\n".format(out, out.size()))
    print("h_n is:\n{}\n\n, h_n shape is:\n{}\n"
          "c_n is:\n{}\n\n, c_n shape is:\n{}\n".format(h_n, h_n.size(), c_n, c_n.size()))

out is:
tensor([[[ 0.0486, -0.0038, -0.1808]]])
 out.shape is:
torch.Size([1, 1, 3])


h_n is:
tensor([[[ 0.1133,  0.0932, -0.0258]],

        [[ 0.0486, -0.0038, -0.1808]]])

, h_n shape is:
torch.Size([2, 1, 3])
c_n is:
tensor([[[ 0.1691,  0.1439, -0.0691]],

        [[ 0.1078, -0.0114, -0.5262]]])

, c_n shape is:
torch.Size([2, 1, 3])

out is:
tensor([[[ 0.1031, -0.0778, -0.0843]]])
 out.shape is:
torch.Size([1, 1, 3])


h_n is:
tensor([[[ 0.0335, -0.1472, -0.1614]],

        [[ 0.1031, -0.0778, -0.0843]]])

, h_n shape is:
torch.Size([2, 1, 3])
c_n is:
tensor([[[ 0.0408, -0.2807, -0.2694]],

        [[ 0.2861, -0.2007, -0.2837]]])

, c_n shape is:
torch.Size([2, 1, 3])

out is:
tensor([[[ 0.1225, -0.0928, -0.0572]]])
 out.shape is:
torch.Size([1, 1, 3])


h_n is:
tensor([[[-0.0245,  0.3293, -0.0703]],

        [[ 0.1225, -0.0928, -0.0572]]])

, h_n shape is:
torch.Size([2, 1, 3])
c_n is:
tensor([[[-0.0572,  0.5126, -0.1620]],

        [[ 0.3279, -0.2472, -0.1559]]])

, c_n shape i

#### If the input is the whole sequence, then the output is

__output__ of lstm is (4,1,3)

                 (
                
                  4 => sequence_num,
                  1 => mini-batch_size,
                  3 => num_directions * hidden_size
                  
                 )

output __hidden__ is __h_n__, __c_n__,

both of them have shape of (2,1,3)

                                (

                                 2  => lstm_num_layer * num_directions 
                                 1  => mini-batch_size,
                                 3  => hidden_size

                                 )
                                 
The last slice of __out__ is equal to the last slice of __h_n__

out[3,:,:] == hidden[0][1,:,:]

In [12]:
# Alternatively, run the entire sequence through the LSTM in a single call.
# The first return value, "out", holds the hidden state for every time step
# of the sequence; the second, "hidden", is the (h_n, c_n) state after the
# final step only.
#
# That is why the last slice of "out" matches the last slice of h_n:
# out[3,:,:] is the same as hidden[0][1,:,:]
#
# "out" gives access to all hidden states in the sequence, while "hidden"
# lets you continue the sequence (and backpropagate) by passing it back to
# the lstm at a later time.
hidden = tuple(torch.randn(2, 1, 3) for _ in range(2))  # fresh random (h_0, c_0)
out, hidden = lstm(input1, hidden)

print("out is :\n{}\n\n, out.size is {}".format(out, out.size()))
print("hidden is :\n{}\n\n".format(hidden))

# Last time step of "out" versus the last layer's entry in h_n.
print("last output is:\n {}\n\n".format(out[3]))
print("h_n in last layer is:\n {}\n\n".format(hidden[0][1]))

out is :
tensor([[[ 0.0678, -0.2624, -0.1539]],

        [[ 0.0935, -0.1988, -0.0568]],

        [[ 0.1031, -0.1867, -0.0445]],

        [[ 0.1161, -0.1871, -0.0482]]])

, out.size is torch.Size([4, 1, 3])
hidden is :
(tensor([[[ 0.0016,  0.3302, -0.1628]],

        [[ 0.1161, -0.1871, -0.0482]]]), tensor([[[ 0.0041,  0.5225, -0.2216]],

        [[ 0.3147, -0.5357, -0.1303]]]))


last output is:
 tensor([[ 0.1161, -0.1871, -0.0482]])


h_n in last layer is:
 tensor([[ 0.1161, -0.1871, -0.0482]])




In [16]:
# NOTE: -1 here indexes dim 1, the mini-batch axis (size 1), so every time step
# is kept and the result is (seq_len, hidden_size) = (4, 3). To select the last
# time step instead, index dim 0 (e.g. out[-1]).
out[:,-1,:].size()

torch.Size([4, 3])

In [14]:
# Full output shape: (seq_len=4, batch=1, hidden_size=3).
out.size()

torch.Size([4, 1, 3])

In [15]:
# Hidden state of the last LSTM layer at each of the 4 time steps.
out

tensor([[[ 0.0678, -0.2624, -0.1539]],

        [[ 0.0935, -0.1988, -0.0568]],

        [[ 0.1031, -0.1867, -0.0445]],

        [[ 0.1161, -0.1871, -0.0482]]])