[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/itmorn/AI.handbook/blob/main/DL/torch/nn/Recurrent/LSTMCell.ipynb)

# RNNCell  VS  LSTMCell

在RNNCell中是由hx来维护“记忆”的，但是由于其变化较快，下一时刻的哪怕是不重要的输入都会大幅度改变之前的“记忆”，这就导致RNNCell不能从较长的序列中提取有效信息；  
而在LSTMCell中，使用cx来维护“记忆”，hx（与当前输入一起）主要用来计算各个门，从而控制向“记忆”中加入信息和遗忘信息；比如某时刻遇到一个不重要的输入，门就可以控制不对记忆做任何操作。当输出当前步的隐状态时，还要用输出门去控制当前时刻记忆中每个位置输出的强度。因此这种结构就决定了，我们能从较长的序列中提取有效信息。


# 为什么叫LSTM
长短期记忆网络（Long Short-Term Memory），主要强调它能够维持比较长程或短程的记忆。

# LSTMCell

**定义**：   
torch.nn.LSTMCell(input_size, hidden_size, bias=True, device=None, dtype=None)

**公式**：
$$\begin{array}{ll}
        i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
        f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
        g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
        o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
        c' = f * c + i * g \\
        h' = o * \tanh(c') \\
\end{array}$$

**参数**：  
- input_size (int) – The number of expected features in the input x.  时间序列某一时刻的特征向量长度

- hidden_size (int) – The number of features in the hidden state h.  隐藏层向量长度

- bias (bool) – If False, then the layer does not use bias weights b_ih and b_hh. Default: True.  是否加待学习的偏置项


# 图解前向过程
<p align="center">
<img src="./imgs/LSTM2-notation.png"
    width="700" /></p>
    
<p align="center">
<img src="./imgs/LSTM3-chain.png"
    width="1000" /></p>

    
<p align="center">
<img src="./imgs/LSTMCell.svg"
    width="2000" /></p>

<!-- <p align="center">
<a href="https://raw.githubusercontent.com/itmorn/AI.handbook/main/DL/torch/nn/Recurrent/imgs/RNNCell.svg">
<img src="./imgs/RNNCell.svg"
    width="2000" /></a></p> -->

In [1]:
# Reference computation using the library implementation (nn.LSTMCell).
import torch
import torch.nn as nn

torch.manual_seed(666)

L, N = 2, 1          # sequence length (time steps), batch size
H_in, H_out = 3, 4   # input feature size, hidden-state size

# The random draws happen in this order: input, h_0, c_0, then the cell's
# weights. Keeping the order fixed keeps every value reproducible.
input = torch.randn(L, N, H_in)  # (time_steps, batch, input_size)
h = torch.randn(N, H_out)        # (batch, hidden_size): decides how the memory is changed
c = torch.randn(N, H_out)        # (batch, hidden_size): holds the memory itself
for label, tensor in (("input", input), ("h_0", h), ("c_0", c)):
    print(f"{label}:\n", tensor, "\n")

# Bias terms are disabled to keep the accompanying diagram simple.
lstm_cell = nn.LSTMCell(H_in, H_out, bias=False)
for weight_name in ("weight_hh", "weight_ih"):
    print(f"{weight_name}:\n", getattr(lstm_cell, weight_name), "\n")

# Feed the sequence one time step at a time, carrying (h, c) forward and
# collecting the hidden state produced at every step.
output = []
for step, x_t in enumerate(input, start=1):
    h, c = lstm_cell(x_t, (h, c))
    print(f"h_{step}:\n", h, "\n")
    print(f"c_{step}:\n", c, "\n")
    output.append(h)

input:
 tensor([[[-2.1188,  0.0635, -1.4555]],

        [[-0.0126, -0.1548, -0.0927]]]) 

h_0:
 tensor([[ 2.5916,  0.4542, -0.6890, -0.9962]]) 

c_0:
 tensor([[0.1856, 0.1476, 0.8628, 0.2379]]) 

weight_hh:
 Parameter containing:
tensor([[-0.0506, -0.1730, -0.3312,  0.0733],
        [-0.1884, -0.2347,  0.1158,  0.3620],
        [-0.1595,  0.2099, -0.4129,  0.0649],
        [ 0.4904, -0.2916, -0.2753, -0.2733],
        [ 0.1248,  0.1446, -0.4906, -0.3950],
        [-0.4422,  0.3924, -0.4710, -0.3778],
        [ 0.2299,  0.0562, -0.3475,  0.2820],
        [ 0.0986,  0.2850,  0.0672,  0.2846],
        [ 0.2367, -0.2018, -0.2490, -0.3651],
        [-0.0249, -0.4682,  0.0340,  0.3999],
        [ 0.1624,  0.0436,  0.3629, -0.4253],
        [ 0.0332, -0.3253, -0.1894, -0.3643],
        [-0.4892, -0.3443, -0.4984, -0.0707],
        [ 0.1803, -0.3030, -0.2147, -0.1813],
        [-0.2879,  0.1350, -0.3416,  0.1918],
        [ 0.3599,  0.4089,  0.2544,  0.0915]], requires_grad=True) 

weight_ih:


In [88]:
# Manual computation of the first time step, for comparison with nn.LSTMCell.
# All vectors are written as column vectors; the numbers are the 4-decimal
# values printed by the library run.
X_1 = torch.tensor([[-2.1188,  0.0635, -1.4555]]).t()
h_0 = torch.tensor([[2.5916,  0.4542, -0.6890, -0.9962]]).t()
c_0 = torch.tensor([[0.1856, 0.1476, 0.8628, 0.2379]]).t()

weight_hh = torch.tensor([
    [-0.0506, -0.1730, -0.3312,  0.0733],
    [-0.1884, -0.2347,  0.1158,  0.3620],
    [-0.1595,  0.2099, -0.4129,  0.0649],
    [0.4904, -0.2916, -0.2753, -0.2733],
    [0.1248,  0.1446, -0.4906, -0.3950],
    [-0.4422,  0.3924, -0.4710, -0.3778],
    [0.2299,  0.0562, -0.3475,  0.2820],
    [0.0986,  0.2850,  0.0672,  0.2846],
    [0.2367, -0.2018, -0.2490, -0.3651],
    [-0.0249, -0.4682,  0.0340,  0.3999],
    [0.1624,  0.0436,  0.3629, -0.4253],
    [0.0332, -0.3253, -0.1894, -0.3643],
    [-0.4892, -0.3443, -0.4984, -0.0707],
    [0.1803, -0.3030, -0.2147, -0.1813],
    [-0.2879,  0.1350, -0.3416,  0.1918],
    [0.3599,  0.4089,  0.2544,  0.0915],
])

weight_ih = torch.tensor([
    [-0.4336,  0.3111, -0.2333],
    [-0.4399, -0.1921, -0.2115],
    [0.3916,  0.1119, -0.0959],
    [-0.0424, -0.0969, -0.4728],
    [0.4485, -0.0107, -0.3058],
    [-0.4678, -0.4511,  0.0390],
    [0.0608,  0.0176,  0.2500],
    [-0.0070, -0.0432, -0.2586],
    [-0.1025, -0.2100, -0.1622],
    [-0.3667, -0.0440,  0.2744],
    [-0.1232,  0.2019,  0.0582],
    [0.3487, -0.1404,  0.3816],
    [0.1317,  0.1924, -0.3796],
    [0.4379,  0.1783,  0.1848],
    [-0.2799, -0.0428,  0.1167],
    [-0.0991, -0.2574, -0.4477],
])

# Stack [W_hh | W_ih] side by side and [h; x] on top of each other so that a
# single matrix product yields W_hh @ h + W_ih @ x for all four gates at once.
weight_hh_and_ih = torch.cat((weight_hh, weight_ih), dim=1)
hidden_and_input = torch.cat((h_0, X_1), dim=0)
result_mm = weight_hh_and_ih @ hidden_and_input
print(result_mm)

# PyTorch stores the gate rows in the order: input, forget, cell (g), output.
# The σ1/σ2/tanh3/σ4 names keep the numbering used in the original cell.
pre_input, pre_forget, pre_cell, pre_output = torch.chunk(result_mm, 4, dim=0)
σ1 = torch.sigmoid(pre_forget)   # forget gate f
σ2 = torch.sigmoid(pre_input)    # input gate i
tanh3 = torch.tanh(pre_cell)     # candidate memory g
σ4 = torch.sigmoid(pre_output)   # output gate o

kept_memory = c_0 * σ1       # part of the old memory that survives
added_memory = σ2 * tanh3    # new information written into the memory
c_1 = kept_memory + added_memory
squashed_memory = torch.tanh(c_1)
h_1 = squashed_memory * σ4

c_1, h_1
# Values printed by nn.LSTMCell for the same step:
# c_1: tensor([[ 0.8194, -0.0751,  0.6597, -0.4974]], grad_fn=<AddBackward0>) 
# h_1: tensor([[ 0.2202, -0.0278,  0.2574, -0.3888]], grad_fn=<MulBackward0>) 
# The manual result agrees with the library result (up to rounding of the
# 4-decimal inputs).

tensor([[ 1.2235],
        [ 0.1924],
        [-0.7812],
        [ 2.3723],
        [ 0.6148],
        [ 0.6389],
        [ 0.0883],
        [ 0.4436],
        [ 1.4970],
        [-0.3242],
        [ 0.8035],
        [-0.8715],
        [-0.7247],
        [-0.5273],
        [-0.2200],
        [ 1.6973]])


(tensor([[ 0.8194],
         [-0.0751],
         [ 0.6596],
         [-0.4973]]),
 tensor([[ 0.2202],
         [-0.0278],
         [ 0.2574],
         [-0.3888]]))

# netron可视化计算图
我们也可以把LSTMCell导出为ONNX文件，然后通过 https://netron.app/ 查看它的计算图

<p align="center">
<img src="./imgs/LSTMCell_netron.svg"
    width="800" /></p>

In [1]:
# Export the LSTMCell to an ONNX file so its computation graph can be viewed
# in a graph viewer such as https://netron.app/.
import torch
import torch.nn as nn
import onnx
import onnx.shape_inference  # imported explicitly: infer_shapes is used below
import onnx.utils
import onnx.version_converter

torch.manual_seed(666)

L = 2  # sequence_length, i.e. number of time steps
N = 1  # batch_size
H_in = 3  # input_size: length of the input feature vector
H_out = 4  # hidden_size: length of the hidden-state vector

input = torch.randn(L, N, H_in)  # (time_steps, batch, input_size)
# h and c are not handed to the exporter below. They are still drawn so that
# the random stream, and therefore the cell's initial weights, stays the same
# as in a run that creates input, h and c before the cell.
h = torch.randn(N, H_out)  # (batch, hidden_size)
c = torch.randn(N, H_out)  # (batch, hidden_size)

# Bias terms are disabled to keep the exported graph simple.
lstm_cell = nn.LSTMCell(H_in, H_out, bias=False)

model_file = 'LSTMCell.onnx'

# Only the first time step is given as the example input, so the cell is
# exported without an explicit (h, c) state argument.
torch.onnx.export(
    lstm_cell,
    input[0],
    model_file,
    export_params=True,
    opset_version=16,
)

# Re-save the model with inferred tensor shapes attached, so the viewer can
# show dimension information on the graph edges.
onnx_model = onnx.load(model_file)
onnx.save(onnx.shape_inference.infer_shapes(onnx_model), model_file)
print("输出完成")

输出完成


# 参考资料
[Understanding LSTM Networks](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)