In [1]:
import torch
import torch.nn as nn

## DNN -> RNNCell

In [2]:
# embedding 的作用是将离散的词汇映射到连续的向量空间
# 创建了一个嵌入层，其中num_embeddings=100表示词汇表大小为100，embedding_dim=5表示每个词汇的嵌入向量维度为5
embed_layer = nn.Embedding(num_embeddings=100, embedding_dim=5)
fc_layer = nn.Linear(5, 6, bias=False)
# 原始输入 --> 原始输入一般输入的是token id列表，[N,T]表示N个文本，每个文本T个token
x = torch.randint(100, size=(4, 16))
# embedding将x的16个特征的每一个特征映射到5维
print(x.shape)
x = embed_layer(x)  # [N,T] -> [N,T,E]
print(x.shape)
# 在PyTorch中，nn.Linear层会自动对最后一个维度进行矩阵乘法，所以这里的操作实际上是对每个序列的每个token的5维向量进行线性变换，得到6维向量
x2 = fc_layer(x)  # dot([N,T,E], [E,E2]) --> [N,T,E2]
print(x2.shape)
print(x2[0][:3])

torch.Size([4, 16])
torch.Size([4, 16, 5])
torch.Size([4, 16, 6])
tensor([[-1.1337,  0.3365,  0.3753,  1.1819,  1.0518,  0.1465],
        [ 0.5635,  0.1306,  0.5549, -0.4253, -0.2323,  0.0867],
        [-0.8078, -0.6334, -0.0954, -0.5659,  0.3392, -0.0654]],
       grad_fn=<SliceBackward0>)


In [3]:
fc_layer.weight.T.shape

torch.Size([5, 6])

In [6]:
N, T, E = x.shape   # [4, 16, 5]
for i in range(N):
    # matmul进行矩阵乘法
    xi = torch.matmul(x[i], fc_layer.weight.T)
    print(xi[:3])  # xi  [16,6]   # 第i个样本的前3个时刻
    break

print ("="*20)
for j in range(T):
    xj = torch.matmul(x[:, j, :], fc_layer.weight.T)
    print(xj[0])  # 第0个样本的第j个时刻的值
    if j >= 2:
        break

tensor([[ 0.5323,  0.0053, -0.7381, -0.7990, -0.3872, -0.2328],
        [ 0.4266, -0.1517, -0.7670,  0.2292,  0.2249,  1.0018],
        [ 0.4354,  0.2249,  0.4269,  0.4689, -0.1023, -0.3638]],
       grad_fn=<SliceBackward0>)
tensor([ 0.5323,  0.0053, -0.7381, -0.7990, -0.3872, -0.2328],
       grad_fn=<SelectBackward0>)
tensor([ 0.4266, -0.1517, -0.7670,  0.2292,  0.2249,  1.0018],
       grad_fn=<SelectBackward0>)
tensor([ 0.4354,  0.2249,  0.4269,  0.4689, -0.1023, -0.3638],
       grad_fn=<SelectBackward0>)


## 测试1

无记忆信息

In [7]:
embed_layer = nn.Embedding(num_embeddings=100, embedding_dim=5)
w1 = nn.Parameter(torch.randn(5, 6))
# 原始输入 --> 原始输入一般输入的是token id列表，[N,T]表示N个文本，每个文本T个token
x = torch.randint(100, size=(4, 16))
print(x.shape)
x = embed_layer(x)  # [N,T] -> [N,T,E]
print(x.shape)

torch.Size([4, 16])
torch.Size([4, 16, 5])


In [13]:
N, T, E = x.shape
outputs = []
for j in range(T):
    xj = torch.matmul(x[:, j, :], w1)  # 所有样本第j个时刻的特征值
    if j < 2:
        print(xj[0])  # 第0个样本的第j个时刻的值
    # print (xj.shape)
    xj = torch.unsqueeze(xj, dim=1)
    # print (xj.shape)
    outputs.append(xj)
print(outputs[0].shape)
outputs = torch.concat(outputs, dim=1)
print(outputs.shape)

outputs2 = torch.matmul(x, w1)  # 对应的Linear的计算
print(outputs2.shape)

print(torch.mean((outputs2 - outputs)))

tensor([ 2.4380,  0.2844,  3.5782, -0.7446,  0.0423, -1.0942],
       grad_fn=<SelectBackward0>)
tensor([-0.6224, -5.1465, -7.7115,  0.5931, -0.6324, -2.6013],
       grad_fn=<SelectBackward0>)
torch.Size([4, 1, 6])
torch.Size([4, 16, 6])
torch.Size([4, 16, 6])
tensor(0., grad_fn=<MeanBackward0>)


## 测试2

有记忆信息

In [14]:
embed_layer = nn.Embedding(num_embeddings=100, embedding_dim=5)
w1 = nn.Parameter(torch.randn(5, 6))
w2 = nn.Parameter(torch.randn(6, 6))
# 原始输入 --> 原始输入一般输入的是token id列表，[N,T]表示N个文本，每个文本T个token
x = torch.randint(100, size=(4, 16))
print(x.shape)
x = embed_layer(x)  # [N,T] -> [N,T,E]
print(x.shape)

torch.Size([4, 16])
torch.Size([4, 16, 5])


In [15]:
N, T, E = x.shape
states = torch.zeros(size=(N, w1.shape[1]))  # 记忆的状态信息
outputs = []
for j in range(T):
    # 分步进行矩阵运算
    xj = torch.matmul(x[:, j, :], w1)  # 所有样本第j个时刻的当前特征值 [N,6]
    pre_states = torch.matmul(states, w2)  # 上一个时刻的，每个样本的记忆信息 [N,6]
    xj = xj + pre_states  # 将当前的特征和之前的序列特征合并到一起
    states = xj  # 当前的输入作为当前时刻的记忆信息

    if j < 2:
        print(xj[0])  # 第0个样本的第j个时刻的值
    xj = torch.unsqueeze(xj, dim=1)
    outputs.append(xj)
outputs1 = torch.concat(outputs, dim=1)
print(outputs1.shape)

tensor([-1.5002,  0.8570,  0.9604, -2.7841, -0.2044, -0.7272],
       grad_fn=<SelectBackward0>)
tensor([ 2.6479,  1.7846, -3.7785,  7.6299,  2.6714, -2.5300],
       grad_fn=<SelectBackward0>)
torch.Size([4, 16, 6])


In [18]:
states = torch.zeros(size=(N, w1.shape[1]))  # 记忆的状态信息
outputs = []
w3 = torch.concat([w1, w2], dim=0)  # [5+6, 6]
for j in range(T):
    # 参数concat，一步矩阵运算
    xj = torch.concat([x[:, j, :], states], dim=1)  # [N,5+6]
    xj = torch.matmul(xj, w3)  # 当前时刻的特征信息
    states = xj  # 当前的输入作为当前时刻的记忆信息

    if j < 2:
        print(xj[0])  # 第0个样本的第j个时刻的值
    xj = torch.unsqueeze(xj, dim=1)   # [N,6] -> [N,1,6]
    outputs.append(xj)
outputs2 = torch.concat(outputs, dim=1)
print(outputs2.shape)
print(torch.mean(outputs1 - outputs2))

tensor([-1.5002,  0.8570,  0.9604, -2.7841, -0.2044, -0.7272],
       grad_fn=<SelectBackward0>)
tensor([ 2.6479,  1.7846, -3.7785,  7.6299,  2.6714, -2.5300],
       grad_fn=<SelectBackward0>)
torch.Size([4, 16, 6])
tensor(-2.7506e-05, grad_fn=<MeanBackward0>)


## RNN参数理解

In [4]:
rnn = nn.RNN(
    input_size=5,
    hidden_size=6,
    num_layers=2,  # 有多少层rnn
    nonlinearity='tanh',  # 激活函数 tanh、relu，默认不支持的
    batch_first=True,  # 输入数据的维度格式: true->[N,T,E] 或者 false->[T,N,E]
    bidirectional=False  # 是否是双向RNN
)
print(len(list(rnn.parameters())))
for name, param in list(rnn.named_parameters()):
    print(f"{name} --- {param.shape}")

x = torch.randn(4, 16, 5)
# 返回的是一个Tuple，
# 第一个元素是每个时刻的输出信息[N,T,hidden_size*(2 if bidirectional else 1)]
# 第二个元素是最后一个时刻的状态信息
outputs, hn = rnn(x)
print(outputs.shape)  # [N,T,hidden_size*(2 if bidirectional else 1)]
print(hn.shape)  # [(2 if bidirectional else 1) * num_layers, N, hidden_size]


8
weight_ih_l0 --- torch.Size([6, 5])
weight_hh_l0 --- torch.Size([6, 6])
bias_ih_l0 --- torch.Size([6])
bias_hh_l0 --- torch.Size([6])
weight_ih_l1 --- torch.Size([6, 6])
weight_hh_l1 --- torch.Size([6, 6])
bias_ih_l1 --- torch.Size([6])
bias_hh_l1 --- torch.Size([6])
torch.Size([4, 16, 6])
torch.Size([2, 4, 6])


In [22]:
outputs[:,-1,:]

tensor([[-0.6009,  0.3361,  0.4664,  0.6237,  0.1764,  0.5827],
        [-0.1840,  0.3164,  0.0512, -0.2778,  0.7838, -0.2781],
        [-0.5195,  0.4852, -0.3609,  0.1783,  0.5618, -0.1561],
        [-0.3937,  0.0829,  0.1113,  0.3195,  0.6565,  0.0419]],
       grad_fn=<SliceBackward0>)

In [23]:
hn[-1,:,:]

tensor([[-0.6009,  0.3361,  0.4664,  0.6237,  0.1764,  0.5827],
        [-0.1840,  0.3164,  0.0512, -0.2778,  0.7838, -0.2781],
        [-0.5195,  0.4852, -0.3609,  0.1783,  0.5618, -0.1561],
        [-0.3937,  0.0829,  0.1113,  0.3195,  0.6565,  0.0419]],
       grad_fn=<SliceBackward0>)

## 多层RNN



In [24]:
rnn = nn.RNN(5, 6, num_layers=2, batch_first=True)   # 2层RNN

rnn1 = nn.RNN(5, 6, num_layers=1, batch_first=True)  # 1层RNN
rnn.weight_ih_l0 = rnn1.weight_ih_l0
rnn.weight_hh_l0 = rnn1.weight_hh_l0
rnn.bias_ih_l0 = rnn1.bias_ih_l0
rnn.bias_hh_l0 = rnn1.bias_hh_l0

rnn2 = nn.RNN(6, 6, num_layers=1, batch_first=True)  # 1层RNN
rnn.weight_ih_l1 = rnn2.weight_ih_l0
rnn.weight_hh_l1 = rnn2.weight_hh_l0
rnn.bias_ih_l1 = rnn2.bias_ih_l0
rnn.bias_hh_l1 = rnn2.bias_hh_l0

# 手动多层RNN
x = torch.randn(4, 16, 5)
z1, z1_hn = rnn1(x)  # [4,16,5] -> [4,16,6]
z2, z2_hn = rnn2(z1)  # [4,16,6] -> [4,16,6]
print(x.shape, z1.shape, z2.shape)

# 一次性多层RNN
z, z_hn = rnn(x)
# 第一个元素是每个时刻的输出信息[N,T,hidden_size*(2 if bidirectional else 1)]
# 第二个元素是最后一个时刻的状态信息（所有层）
print(z.shape)
print(torch.mean(z - z2))
print(torch.mean(z_hn[0] - z1_hn[0]))
print(torch.mean(z_hn[1] - z2_hn[0]))

torch.Size([4, 16, 5]) torch.Size([4, 16, 6]) torch.Size([4, 16, 6])
torch.Size([4, 16, 6])
tensor(0., grad_fn=<MeanBackward0>)
tensor(0., grad_fn=<MeanBackward0>)
tensor(0., grad_fn=<MeanBackward0>)


## RNN与手动RNN

In [28]:
rnn1 = nn.RNN(5, 6, num_layers=1, batch_first=True, bidirectional=False)

x = torch.randn(2, 4, 5)
n, t, e = x.shape
z1 = rnn1(x)  # [2,4,5] -> [2,4,6]
print(z1)

# NOTE: 从rnn1中提取参数，然后基于rnn的结构，自己基于matmul进行矩阵运算
# print(list(rnn1.named_parameters()))
for name, param in list(rnn1.named_parameters()):
    print(f"{name} --- {param.shape}")

(tensor([[[-0.2392, -0.3527, -0.1397, -0.0224,  0.8324,  0.4148],
         [-0.5996, -0.6327, -0.4335,  0.8206, -0.6344, -0.9057],
         [ 0.3178, -0.4827, -0.8100,  0.7526, -0.4380,  0.3839],
         [ 0.2191,  0.0669, -0.5125, -0.0014,  0.6231, -0.3698]],

        [[-0.0683, -0.6586,  0.2314,  0.1382, -0.0557, -0.6852],
         [-0.0458,  0.1359, -0.9235,  0.4663,  0.1652,  0.7605],
         [-0.2023, -0.2630,  0.3109,  0.4793,  0.8942,  0.1332],
         [-0.4330,  0.3109, -0.6638, -0.4971,  0.8337,  0.0388]]],
       grad_fn=<TransposeBackward1>), tensor([[[ 0.2191,  0.0669, -0.5125, -0.0014,  0.6231, -0.3698],
         [-0.4330,  0.3109, -0.6638, -0.4971,  0.8337,  0.0388]]],
       grad_fn=<StackBackward0>))
weight_ih_l0 --- torch.Size([6, 5])
weight_hh_l0 --- torch.Size([6, 6])
bias_ih_l0 --- torch.Size([6])
bias_hh_l0 --- torch.Size([6])


In [34]:
states = torch.zeros((1, n, 6))
outputs = []
# 针对每个时刻进行遍历
for j in range(t):
    # 针对当前时刻的输入
    z_t = torch.matmul(x[:, j, :], rnn1.weight_ih_l0.T) + rnn1.bias_ih_l0
    # 针对上一个时刻的状态信息的转换
    h_pt = torch.matmul(states[0], rnn1.weight_hh_l0.T) + rnn1.bias_hh_l0
    # 将当前时刻的输入提取特征和上一个时刻的特征合并
    
    zh = z_t + h_pt
    # 做一个激活函数
    zh = torch.tanh(zh)
    # 当前时刻的最终输出以及状态信息的保存
    outputs.append(torch.unsqueeze(zh, dim=1))  # 输出的特征信息
    states[0] = zh  # 当前时刻的状态信息
print(states.shape)
outputs = torch.concat(outputs, dim=1)
print(outputs.shape)
print(torch.mean(torch.abs(outputs - z1[0])))
print(torch.mean(torch.abs(states - z1[1])))

output_proj = nn.Linear(6, 3)
outputs = output_proj(outputs)
print(outputs.shape)

torch.Size([1, 2, 6])
torch.Size([2, 4, 6])
tensor(0., grad_fn=<MeanBackward0>)
tensor(0., grad_fn=<MeanBackward0>)
torch.Size([2, 4, 3])


In [32]:
z_t.shape, h_pt.shape

(torch.Size([2, 6]), torch.Size([2, 6]))

## RNN ~ bidirectional=True

In [35]:
rnn = nn.RNN(5, 6, num_layers=1, batch_first=True, bidirectional=True)

x = torch.randn(3, 4, 5)
ho, hn = rnn(x)  # ho: 每个时刻的输出， hn：每个rnn的最后一个时刻的状态
print(ho.shape)
print(hn.shape)
print(dict(rnn.named_parameters()).keys())

torch.Size([3, 4, 12])
torch.Size([2, 3, 6])
dict_keys(['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0', 'weight_ih_l0_reverse', 'weight_hh_l0_reverse', 'bias_ih_l0_reverse', 'bias_hh_l0_reverse'])


In [37]:
rnn1 = nn.RNN(5, 6, num_layers=1, batch_first=True, bidirectional=False)  # 正向rnn
rnn1.weight_ih_l0 = rnn.weight_ih_l0
rnn1.weight_hh_l0 = rnn.weight_hh_l0
rnn1.bias_ih_l0 = rnn.bias_ih_l0
rnn1.bias_hh_l0 = rnn.bias_hh_l0
ho1, hn1 = rnn1(x)
print (ho1.shape, hn1.shape)

torch.Size([3, 4, 6]) torch.Size([1, 3, 6])


In [38]:
rnn2 = nn.RNN(5, 6, num_layers=1, batch_first=True, bidirectional=False)  # 反向rnn
rnn2.weight_ih_l0 = rnn.weight_ih_l0_reverse
rnn2.weight_hh_l0 = rnn.weight_hh_l0_reverse
rnn2.bias_ih_l0 = rnn.bias_ih_l0_reverse
rnn2.bias_hh_l0 = rnn.bias_hh_l0_reverse
ho2, hn2 = rnn2(torch.flip(x, dims=[1]))
ho2 = torch.flip(ho2, dims=[1])  # 还原按照实际的顺序进行排列
print(x)
print(torch.flip(x, dims=[1]))

tensor([[[ 0.6355, -1.2483,  0.1849,  0.4790,  0.2085],
         [ 0.7573,  0.8759,  1.2106,  0.0069, -0.8169],
         [ 3.6274,  1.7327,  1.2932, -1.1668,  0.9719],
         [-1.0654,  0.5295,  0.2408, -0.7390,  0.0983]],

        [[-0.1030,  0.9930,  0.3306, -0.5867, -0.6893],
         [ 0.1868, -1.5110, -0.3317,  0.8243,  0.8216],
         [-0.6317,  0.1180, -1.5435, -0.9352, -1.3807],
         [ 1.4175, -1.0238, -2.0363, -0.0244, -0.2260]],

        [[ 0.8135,  0.6997,  0.0050, -0.8873,  0.6621],
         [ 1.9415, -0.1520, -0.1533, -0.7482,  1.7013],
         [-2.4631,  0.8586,  0.7060,  0.5539,  0.1218],
         [-1.5974,  1.9678,  0.3687,  0.6994,  0.8923]]])
tensor([[[-1.0654,  0.5295,  0.2408, -0.7390,  0.0983],
         [ 3.6274,  1.7327,  1.2932, -1.1668,  0.9719],
         [ 0.7573,  0.8759,  1.2106,  0.0069, -0.8169],
         [ 0.6355, -1.2483,  0.1849,  0.4790,  0.2085]],

        [[ 1.4175, -1.0238, -2.0363, -0.0244, -0.2260],
         [-0.6317,  0.1180, -1.5435, -0.

In [39]:
ho_ = torch.concat([ho1, ho2], dim=2)
hn_ = torch.concat([hn1, hn2], dim=0)
print("=" * 100)
print(ho)
print(ho_)
print(ho - ho_)
print("=" * 100)
print(hn)
print(hn_)
print(hn - hn_)
print(torch.mean(torch.abs(ho - ho_)))
print(torch.mean(torch.abs(hn - hn_)))

tensor([[[ 2.4997e-01, -6.5079e-01, -1.0831e-01, -3.4363e-01,  6.5327e-01,
           3.4949e-01, -9.0123e-01, -1.3707e-01,  5.2284e-01, -7.3734e-01,
           8.2482e-01,  8.3302e-01],
         [-1.1522e-01, -5.8889e-01, -2.6241e-01,  2.7714e-01, -2.3205e-01,
          -3.0220e-01, -9.2499e-01,  4.1108e-01,  8.9493e-01, -1.2808e-01,
           7.1807e-01,  7.1384e-01],
         [-6.7210e-01, -8.6936e-01, -2.9957e-01,  8.6866e-02,  7.8730e-01,
          -2.9405e-01, -8.7535e-01,  9.8309e-01,  9.6490e-01,  9.5614e-01,
          -3.3722e-01,  2.2426e-01],
         [-6.2781e-01, -2.7431e-01,  2.7192e-01, -1.6900e-02, -6.9228e-01,
          -6.3565e-01, -1.6193e-01,  4.1156e-02,  1.8112e-02, -1.5733e-01,
           7.7520e-02,  5.9341e-01]],

        [[-3.5992e-01, -1.0978e-01,  1.6988e-01,  7.1630e-02, -6.1059e-01,
          -2.1763e-02, -1.6360e-01, -1.2046e-01,  5.7042e-01,  3.2563e-01,
          -7.2687e-02,  2.4515e-01],
         [ 3.0524e-04, -5.3361e-01, -2.7804e-01, -6.6308e-01,  

In [40]:
rnn = nn.RNN(5, hidden_size=6, num_layers=4, batch_first=True, bidirectional=True)
for name, param in rnn.named_parameters():
    print(name, param.shape)
x = torch.rand(3, 4, 5)
ho, hs = rnn(x)
print(ho.shape)  # 最后一层每个时刻的输出值[n,t,2*hidden_size]
print(hs.shape)  # 每一层每个方向的RNN的最后一个时刻的状态信息# [(2 if bidirectional else 1) * num_layers, N, hidden_size]


weight_ih_l0 torch.Size([6, 5])
weight_hh_l0 torch.Size([6, 6])
bias_ih_l0 torch.Size([6])
bias_hh_l0 torch.Size([6])
weight_ih_l0_reverse torch.Size([6, 5])
weight_hh_l0_reverse torch.Size([6, 6])
bias_ih_l0_reverse torch.Size([6])
bias_hh_l0_reverse torch.Size([6])
weight_ih_l1 torch.Size([6, 12])
weight_hh_l1 torch.Size([6, 6])
bias_ih_l1 torch.Size([6])
bias_hh_l1 torch.Size([6])
weight_ih_l1_reverse torch.Size([6, 12])
weight_hh_l1_reverse torch.Size([6, 6])
bias_ih_l1_reverse torch.Size([6])
bias_hh_l1_reverse torch.Size([6])
weight_ih_l2 torch.Size([6, 12])
weight_hh_l2 torch.Size([6, 6])
bias_ih_l2 torch.Size([6])
bias_hh_l2 torch.Size([6])
weight_ih_l2_reverse torch.Size([6, 12])
weight_hh_l2_reverse torch.Size([6, 6])
bias_ih_l2_reverse torch.Size([6])
bias_hh_l2_reverse torch.Size([6])
weight_ih_l3 torch.Size([6, 12])
weight_hh_l3 torch.Size([6, 6])
bias_ih_l3 torch.Size([6])
bias_hh_l3 torch.Size([6])
weight_ih_l3_reverse torch.Size([6, 12])
weight_hh_l3_reverse torch.Size(