In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
inputs = [torch.randn(1, 5) for _ in range(4)]  # a sequence of 4 time steps, each a (1, 5) tensor holding 5 features

There are __4__ inputs, each input has __5__ features, so LSTM __input dim = 5__

In [7]:
# Show the raw list of per-step tensors.
inputs

[tensor([[-0.9922,  0.7802, -0.6727,  1.9629, -0.1278]]),
 tensor([[ 0.2803,  0.5734,  1.4915, -0.6404, -1.5168]]),
 tensor([[ 0.2716, -0.9033, -1.8071, -0.5176, -0.0370]]),
 tensor([[-0.4774, -0.5221,  0.4505,  0.3980,  0.0160]])]

In [8]:
# Concatenate the per-step tensors along dim 0, then view the result as
# (number of steps, 1, remaining features).
stacked = torch.cat(inputs)
reshape = stacked.view(len(inputs), 1, -1)

In [9]:
# Display the reshaped tensor together with its shape.
(reshape, reshape.shape)

(tensor([[[-0.9922,  0.7802, -0.6727,  1.9629, -0.1278]],
 
         [[ 0.2803,  0.5734,  1.4915, -0.6404, -1.5168]],
 
         [[ 0.2716, -0.9033, -1.8071, -0.5176, -0.0370]],
 
         [[-0.4774, -0.5221,  0.4505,  0.3980,  0.0160]]]),
 torch.Size([4, 1, 5]))

`hidden_size` is 3, i.e. the dimension of the hidden state.

In this case,

at each time step:

### xt.shape = (features, 1) => (5, 1)

### yt.shape = (features, 1) => (5, 1)

### ht.shape = (hidden_size, 1) => (3 ,1) 

### whh.shape = (hidden_size, hidden_size) =>(3,3)

### wxh.shape = (hidden_size, features) => (3,5)

### why.shape = (features, hidden_size) => (5,3)

In the figures below, $g_{t}$ is the same as $\tilde{C_{t}}$.

`input_size` is the number of features.

para0 : [24,10] = 4*hidden_size, input_size

$W_{ii}, W_{if},W_{ig},W_{io}$

para1 : [24,6] = 4*hidden_size, hidden_size
$W_{hi}, W_{hf},W_{hg},W_{ho}$

para2 : [24] = 4*hidden_size
$b_{ii}, b_{if},b_{ig},b_{io}$

para3 : [24] = 4*hidden_size
$b_{hi}, b_{hf},b_{hg},b_{ho}$

In [10]:
# Inspect the parameter tensors of a single-layer LSTM with input_size=10 and
# hidden_size=6. `nn` is already imported in the notebook's first cell, so it
# is not re-imported here.
# Every parameter has a leading dimension of 4 * hidden_size = 24 because the
# weights/biases of the four gates (i, f, g, o) are stored stacked together.
lstm1 = nn.LSTM(input_size=10, hidden_size=6)
for i, para in enumerate(lstm1.parameters()):
    print(i, para.size())

0 torch.Size([24, 10])
1 torch.Size([24, 6])
2 torch.Size([24])
3 torch.Size([24])


__For each element in the input sequence, each layer computes the following function__

\begin{split}\begin{array}{ll}
i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t = f_t \odot c_{(t-1)} + i_t \odot g_t \\
h_t = o_t \odot \tanh(c_t)
\end{array}\end{split}

![1.png](lstm-img/lstm1.png)
![2.png](lstm-img/lstm2.png)
![3.png](lstm-img/lstm3.png)
![4.png](lstm-img/lstm4.png)

In [10]:
# Two stacked LSTM layers: 5 input features per time step, hidden state of size 3.
lstm = nn.LSTM(input_size=5, hidden_size=3, num_layers=2)

* __input_size__ – The number of expected features in the input x
* __hidden_size__ – The number of features in the hidden state h
* __num_layers__ – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. Default: 1 

(__this is how many LSTM layers are stacked on top of each other__)

* __bias__ – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
* __batch_first__ – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False
* __dropout__ – If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0
* __bidirectional__ – If True, becomes a bidirectional LSTM. Default: False

## step1: initialize hidden
`hidden` is a tuple `(h_0, c_0)`.
In this case, both h_0 and c_0 have shape (2,1,3):

                                    (
                                    
                                     2  => lstm_num_layer * num_directions 
                                     1  => mini-batch_size,
                                     3  => hidden_size
                                     
                                     )
__num_directions__ is 1 by default (when bidirectional is False)

__num_directions__ is 2 (when bidirectional is True)

Thanks to this figure: https://en.wikipedia.org/wiki/Bidirectional_recurrent_neural_networks#/media/File:RNN_BRNN.png

and to this discussion: https://discuss.pytorch.org/t/what-is-num-directions-of-nn-lstm/11663

In [23]:
# Initial state (h_0, c_0); each tensor is
# (num_layers * num_directions, batch, hidden_size) = (2, 1, 3).
state_shape = (2, 1, 3)
hidden = (torch.randn(*state_shape), torch.randn(*state_shape))

## step2: construct input
input of lstm is (4,1,5)

                 (
                 
                  4 => sequence_num,
                  1 => mini-batch_size,
                  5 => number of features at each time step
                  
                 )

In [24]:
# Join the per-step tensors and expose them as (seq_len, batch=1, features),
# the layout nn.LSTM expects when batch_first is False.
seq_len = len(inputs)
input1 = torch.cat(inputs).view(seq_len, 1, -1)
input1.shape

torch.Size([4, 1, 5])

In [25]:
# Show the original list of tensors next to the single stacked tensor built from it.
report = ("before inputs is:\n {}\n\n"
          "before length of inputs is: {}\n\n"
          "after input1 is:\n {}\n\n"
          "after input1.size is: {}")
print(report.format(inputs, len(inputs), input1, input1.size()))

before inputs is:
 [tensor([[ 0.6622,  1.6556,  0.6693, -0.8457,  0.8894]]), tensor([[-0.6806,  0.5081, -0.4722, -1.0739,  0.9023]]), tensor([[ 0.7480,  0.8019, -0.2547,  0.0481,  1.5207]]), tensor([[-0.2853, -1.1381, -0.9574, -0.8077,  0.1744]])]

before length of inputs is: 4

after input1 is:
 tensor([[[ 0.6622,  1.6556,  0.6693, -0.8457,  0.8894]],

        [[-0.6806,  0.5081, -0.4722, -1.0739,  0.9023]],

        [[ 0.7480,  0.8019, -0.2547,  0.0481,  1.5207]],

        [[-0.2853, -1.1381, -0.9574, -0.8077,  0.1744]]])

after input1.size is: torch.Size([4, 1, 5])


### input of lstm is:

__input1, (h0, c0)__

### Output of lstm is:

__output, (hn, cn)__


#### If the input is only one step of a sequence, then the output is

__output__ of lstm is (1,1,3)

                 (
                
                  1 => sequence_num,
                  1 => mini-batch_size,
                  3 => num_directions * hidden_size
                  
                 )
              
__num_directions__ is 1 by default (when bidirectional is False)

__num_directions__ is 2 (when bidirectional is True)

output __hidden__ is __h_n__, __c_n__,

both of them have shape of (2,1,3)

                                (

                                 2  => lstm_num_layer * num_directions 
                                 1  => mini-batch_size,
                                 3  => hidden_size

                                 )
                                 
The last slice of __h_n__ (the hidden state of the last layer) is equal to __out__

hidden[0][1,:,:] == out

In [34]:
for step_input in inputs:
    # Feed a single time step, viewed as (seq_len=1, batch=1, features).
    # The state returned here is passed back in on the next iteration.
    out, hidden = lstm(step_input.view(1, 1, -1), hidden)
    # hidden is the tuple (h_n, c_n)
    h_n, c_n = hidden
    print("out is:\n{}\n out.shape is:\n{}\n\n".format(out, out.size()))
    print("h_n is:\n{}\n\n, h_n shape is:\n{}\n"
          "c_n is:\n{}\n\n, c_n shape is:\n{}\n".format(h_n, h_n.size(), c_n, c_n.size()))

out is:
tensor([[[ 0.1718, -0.1224, -0.1719]]])
 out.shape is:
torch.Size([1, 1, 3])


h_n is:
tensor([[[ 0.0776,  0.0843,  0.2449]],

        [[ 0.1718, -0.1224, -0.1719]]])

, h_n shape is:
torch.Size([2, 1, 3])
c_n is:
tensor([[[ 0.4472,  0.2617,  0.5426]],

        [[ 0.6434, -0.2376, -0.3676]]])

, c_n shape is:
torch.Size([2, 1, 3])

out is:
tensor([[[ 0.1687, -0.1234, -0.1716]]])
 out.shape is:
torch.Size([1, 1, 3])


h_n is:
tensor([[[ 0.0863,  0.1179,  0.3284]],

        [[ 0.1687, -0.1234, -0.1716]]])

, h_n shape is:
torch.Size([2, 1, 3])
c_n is:
tensor([[[ 0.3224,  0.3522,  0.5558]],

        [[ 0.6452, -0.2376, -0.3670]]])

, c_n shape is:
torch.Size([2, 1, 3])

out is:
tensor([[[ 0.1711, -0.1238, -0.1735]]])
 out.shape is:
torch.Size([1, 1, 3])


h_n is:
tensor([[[ 0.0935,  0.1233,  0.3011]],

        [[ 0.1711, -0.1238, -0.1735]]])

, h_n shape is:
torch.Size([2, 1, 3])
c_n is:
tensor([[[ 0.3851,  0.5083,  0.4121]],

        [[ 0.6501, -0.2374, -0.3724]]])

, c_n shape i

#### If the input is the whole sequence, then the output is

__output__ of lstm is (4,1,3)

                 (
                
                  4 => sequence_num,
                  1 => mini-batch_size,
                  3 => num_directions * hidden_size
                  
                 )

output __hidden__ is __h_n__, __c_n__,

both of them have shape of (2,1,3)

                                (

                                 2  => lstm_num_layer * num_directions 
                                 1  => mini-batch_size,
                                 3  => hidden_size

                                 )
                                 
The last slice of __out__ is equal to the last slice of __h_n__

out[3,:,:] == hidden[0][1,:,:]

In [38]:
# Alternative: push the entire sequence through the LSTM in a single call.
#
# - `out` holds the last layer's hidden state for every time step.
# - `hidden` is (h_n, c_n): the state of every layer after the final step.
#   Passing it back to the LSTM later lets you continue the sequence and
#   backpropagate through it.
#
# Consequently the last time step of `out` equals the last layer of h_n:
# out[3,:,:] is the same as hidden[0][1,:,:] (compare the two prints below).
h_0 = torch.randn(2, 1, 3)  # fresh random initial state, replacing any earlier `hidden`
c_0 = torch.randn(2, 1, 3)
out, hidden = lstm(input1, (h_0, c_0))
print(out, out.size())

print(hidden)

print(out[-1])
print(hidden[0][-1])


tensor([[[ 0.0840,  0.0881, -0.5538]],

        [[ 0.2312,  0.0122, -0.2594]],

        [[ 0.1971, -0.0598, -0.2299]],

        [[ 0.1827, -0.0999, -0.1956]]]) torch.Size([4, 1, 3])
(tensor([[[ 0.0360,  0.1725,  0.3835]],

        [[ 0.1827, -0.0999, -0.1956]]]), tensor([[[ 0.1035,  0.3549,  0.5074]],

        [[ 0.7407, -0.1862, -0.4313]]]))
tensor([[ 0.1827, -0.0999, -0.1956]])
tensor([[ 0.1827, -0.0999, -0.1956]])
