In [61]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [81]:
# Seed the global RNG so that "Restart Kernel -> Run All" reproduces the same
# random tensors (and the same LSTM weights / initial states drawn later).
torch.manual_seed(1)

# Toy sequence: 4 time steps, each a (1, 5) tensor, i.e. 5 features per step.
inputs = [torch.randn(1, 5) for _ in range(4)]

There are __4__ inputs, each input has __5__ features, so LSTM __input dim = 5__

In [82]:
# Show the raw sequence: a Python list holding one (1, 5) tensor per time step.
inputs

[tensor([[ 0.0786,  0.5960,  2.1105, -0.3480,  1.2661]]),
 tensor([[ 0.1282, -0.3206,  1.8067,  0.9972, -0.4360]]),
 tensor([[-0.6857, -1.5372, -0.7193,  0.3391, -0.7413]]),
 tensor([[-0.6050, -0.5376,  0.3297,  2.1191,  0.4590]])]

In [83]:
# Join the per-step tensors along dim 0, then view the result with a
# singleton middle axis: (seq_len, batch=1, features).
reshape = torch.cat(inputs, dim=0).view(len(inputs), 1, -1)

In [84]:
# Display the packed tensor together with its shape.
reshape, reshape.shape

(tensor([[[ 0.0786,  0.5960,  2.1105, -0.3480,  1.2661]],
 
         [[ 0.1282, -0.3206,  1.8067,  0.9972, -0.4360]],
 
         [[-0.6857, -1.5372, -0.7193,  0.3391, -0.7413]],
 
         [[-0.6050, -0.5376,  0.3297,  2.1191,  0.4590]]]),
 torch.Size([4, 1, 5]))

Here `hidden_size = 3` is the dimension of the hidden state.

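In this case, at each time step $t$ (shapes below use vanilla-RNN notation, where `why` projects the hidden state back to feature space; `nn.LSTM` itself has no such projection and returns $h_t$ as its output):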

### xt.shape = (features, 1) => (5, 1)

### yt.shape = (features, 1) => (5, 1)

### ht.shape = (hidden_size, 1) => (3 ,1) 

### whh.shape = (hidden_size, hidden_size) =>(3,3)

### wxh.shape = (hidden_size, features) => (3,5)

### why.shape = (features, hidden_size) => (5,3)

__For each element in the input sequence, each layer computes the following function__

$$
\begin{array}{ll}
i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t = f_t \odot c_{(t-1)} + i_t \odot g_t \\
h_t = o_t \odot \tanh(c_t)
\end{array}
$$

![1.png](lstm-img/lstm1.png)
![2.png](lstm-img/lstm2.png)
![3.png](lstm-img/lstm3.png)
![4.png](lstm-img/lstm4.png)

In [87]:
# Two stacked LSTM layers mapping 5 input features to a 3-dimensional hidden state.
lstm = nn.LSTM(input_size=5, hidden_size=3, num_layers=2)

* __input_size__ – The number of expected features in the input x
* __hidden_size__ – The number of features in the hidden state h
* __num_layers__ – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. Default: 1 

(__i.e. the number of LSTM layers stacked on top of each other__)

* __bias__ – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
* __batch_first__ – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False
* __dropout__ – If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0
* __bidirectional__ – If True, becomes a bidirectional LSTM. Default: False

## step1: initialize hidden
`hidden` is a tuple `(h0, c0)`.
In this case, both `h0` and `c0` have shape (2, 1, 3):

                                    (
                                    
                                     2  => lstm_num_layer,
                                     1  => mini-batch_size,
                                     3  => hidden_size
                                     
                                     )

In [108]:
# Initial (h0, c0) pair; each tensor is (num_layers=2, batch=1, hidden_size=3).
hidden = tuple(torch.randn(2, 1, 3) for _ in range(2))

## step2: construct input
The input of the LSTM has shape (4, 1, 5):

                 (
                 
                  4 => sequence length (number of time steps),
                  1 => mini-batch_size,
                  5 => number of features at each time step
                  
                 )

In [109]:
# Pack the list of per-step tensors into one (seq_len, batch, features) tensor,
# the layout nn.LSTM expects when batch_first is left at its default (False).
input1 = torch.cat(inputs, dim=0).view(len(inputs), 1, -1)
input1.shape

torch.Size([4, 1, 5])

In [110]:
# Summarise the data before (list of tensors) and after (single 3-D tensor) packing.
print(
    "before inputs is:\n {}\n\n"
    "before length of inputs is: {}\n\n"
    "after input1 is:\n {}\n\n"
    "after input1.size is: {}".format(
        inputs,
        len(inputs),
        input1,
        input1.shape,
    )
)

before inputs is:
 [tensor([[ 0.0786,  0.5960,  2.1105, -0.3480,  1.2661]]), tensor([[ 0.1282, -0.3206,  1.8067,  0.9972, -0.4360]]), tensor([[-0.6857, -1.5372, -0.7193,  0.3391, -0.7413]]), tensor([[-0.6050, -0.5376,  0.3297,  2.1191,  0.4590]])]

before length of inputs is: 4

after input1 is:
 tensor([[[ 0.0786,  0.5960,  2.1105, -0.3480,  1.2661]],

        [[ 0.1282, -0.3206,  1.8067,  0.9972, -0.4360]],

        [[-0.6857, -1.5372, -0.7193,  0.3391, -0.7413]],

        [[-0.6050, -0.5376,  0.3297,  2.1191,  0.4590]]])

after input1.size is: torch.Size([4, 1, 5])


### input of lstm is:

__input1, (h0, c0)__

### Output of lstm is:

__output, (hn, cn)__


In [66]:

# Option 1: step through the sequence one element at a time.
# Each call sees a single (seq_len=1, batch=1, features) slice; the returned
# `hidden` is the (h, c) tuple, which is fed back in on the next step.
for i in inputs:
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# Option 2: process the entire sequence in one call.
# The first value returned by the LSTM ("out") holds the top layer's hidden
# state for every time step; the second ("hidden") is the final (h_n, c_n)
# pair for every layer. Hence out[-1] equals hidden[0][-1], the last layer's
# final hidden state.
# "out" gives access to all hidden states in the sequence, while "hidden"
# lets you continue the sequence and backpropagate by passing it to the
# lstm again at a later time.

# Add the extra 2nd (batch) dimension. `list(inputs)` makes this cell safe to
# re-run: it accepts both the original list of (1, 5) tensors and the
# (seq_len, 1, 5) tensor that `inputs` is rebound to below.
inputs = torch.cat(list(inputs)).view(len(inputs), 1, -1)

# Clean out the hidden state. Each tensor must be
# (num_layers, batch, hidden_size) to match `lstm`; a hard-coded leading 1
# raises a shape-mismatch error for a stacked (num_layers > 1) LSTM.
hidden = (torch.randn(lstm.num_layers, 1, lstm.hidden_size),
          torch.randn(lstm.num_layers, 1, lstm.hidden_size))
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[ 0.0829,  0.0525,  0.3308]]]) (tensor([[[ 0.0829,  0.0525,  0.3308]]]), tensor([[[ 0.1171,  0.5475,  1.0197]]]))
tensor([[[ 0.3024,  0.6301, -0.0483]]]) (tensor([[[ 0.3024,  0.6301, -0.0483]]]), tensor([[[ 0.7309,  1.2399, -0.1758]]]))
tensor([[[ 0.1797,  0.2369,  0.1919]]]) (tensor([[[ 0.1797,  0.2369,  0.1919]]]), tensor([[[ 0.4599,  1.5573,  0.2789]]]))
tensor([[[ 0.1223,  0.1286,  0.1049]]]) (tensor([[[ 0.1223,  0.1286,  0.1049]]]), tensor([[[ 0.1961,  1.3332,  0.4209]]]))
tensor([[[ 0.4101,  0.5140, -0.1696]]]) (tensor([[[ 0.4101,  0.5140, -0.1696]]]), tensor([[[ 0.4990,  0.7224, -0.2178]]]))


In [34]:
# Inspect `inputs` after the previous cell rebound it from a list of per-step
# tensors to a single 3-D tensor.
inputs

tensor([[[ 1.7789, -2.6498,  1.9009, -1.2295,  0.0009, -1.2562, -0.6465,
           1.2039,  0.4597,  0.2007]],

        [[-0.8459,  1.1001,  1.5433,  0.3010, -1.3644, -0.7850, -1.0751,
          -0.8574,  1.8880, -0.9413]],

        [[ 0.0915,  0.1007, -0.1625,  0.5021, -0.0464, -0.7785, -0.8695,
          -0.2019, -0.5868,  0.0673]],

        [[-0.1955,  0.4135,  0.0048,  0.7767,  0.5062,  1.2176,  0.5187,
          -0.7878, -1.2677, -0.6992]],

        [[-0.2435, -0.3629,  1.2211,  2.8558,  1.2008,  0.5523, -0.4086,
           2.0123, -1.2107, -0.2737]]])