[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/itmorn/AI.handbook/blob/main/DL/torch/nn/Transformer/TransformerEncoderLayer.ipynb)

# TransformerEncoderLayer
TransformerEncoderLayer是Transformer模型的组成部分之一，是用于处理输入序列的编码器层。它由多个子层组成，包括多头自注意力层（Multi-Head Attention Layer）、前馈全连接层（Feed-Forward Layer）、残差连接（Residual Connection）和层归一化（Layer Normalization）等。

具体来说，TransformerEncoderLayer将输入序列作为其输入，并将其经过多头自注意力层进行编码。在自注意力层中，每个位置的编码向量会同时参与计算所有位置的编码向量，从而捕捉序列中的全局信息。然后，编码向量会通过前馈全连接层进行进一步处理，以便提取更高级别的特征。最后，残差连接被应用于多头自注意力层和前馈全连接层，使得模型更容易训练和优化。

由于TransformerEncoderLayer可以被堆叠起来形成多层编码器，因此它是自然语言处理和其他序列建模任务（比如翻译、文本分类等）中最常用的基础模块之一。


**定义**：  
`torch.nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=<function relu>, layer_norm_eps=1e-05, batch_first=False, norm_first=False, device=None, dtype=None)`

**参数**:  
- d_model (int) – the number of expected features in the input (required).  输入中特征维度(必需的)。

- nhead (int) – the number of heads in the multiheadattention models (required).  多头注意力模型中的头数(必需的)。

- dim_feedforward (int) – the dimension of the feedforward network model (default=2048).  前馈网络模型的维度(默认=2048)。

- dropout (float) – the dropout value (default=0.1).  dropout值(默认=0.1)。

- activation (Union[str, Callable[[Tensor], Tensor]]) – the activation function of the intermediate layer, can be a string (“relu” or “gelu”) or a unary callable. Default: relu  中间层的激活函数，可以是一个字符串("relu"或"gelu")或一个一元的可调用对象。默认值:relu

- layer_norm_eps (float) – the eps value in layer normalization components (default=1e-5).  层归一化组件中的eps值(默认=1e-5)。

- batch_first (bool) – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False (seq, batch, feature).  如果为True，则输入和输出张量将作为(batch, seq, feature)提供。默认值:False (seq, batch, feature)。

- norm_first (bool) – if True, layer norm is done prior to attention and feedforward operations, respectively. Otherwise it’s done after. Default: False (after).  如果为True，则层归一化分别在注意力操作和前馈操作之前进行；否则在它们之后进行。默认值:False(之后)。


# 图解TransformerEncoderLayer
<p align="center">
<img src="./ChannelShuffle.svg"
    width="1000" /></p>

In [1]:
# Minimal example: push a single sample through one encoder layer.
import torch
from torch import nn

torch.manual_seed(666)  # seed the RNG before any weights or inputs are drawn
d_model = 4
# Input layout is (seq_len, batch, d_model), i.e. the batch_first=False default.
src_shape = (3, 1, d_model)
encoder_layer = nn.TransformerEncoderLayer(
    d_model=d_model,
    nhead=2,
    dim_feedforward=2048,
)
src = torch.rand(src_shape)
out = encoder_layer(src)
print("out:\n", out, "\n")


out:
 tensor([[[-0.1235,  1.6189, -1.0983, -0.3970]],

        [[-1.5583,  1.2149,  0.0358,  0.3076]],

        [[ 1.4580,  0.3129, -0.5627, -1.2082]]],
       grad_fn=<NativeLayerNormBackward0>) 



In [23]:
import torch

# One input row with four features.
a = torch.tensor([[0.8778, 0.5302, 0.5404, 0.2252]])
# Weight matrix with 12 rows and 4 columns.
w = torch.tensor([
    [-0.2898, -0.1611,  0.0042,  0.5008],
    [-0.1465,  0.3110, -0.1026, -0.6014],
    [-0.5311,  0.3810, -0.2857, -0.5387],
    [-0.2353, -0.2591,  0.4796,  0.1370],
    [-0.1174, -0.0520, -0.1187, -0.5790],
    [ 0.5493, -0.0131, -0.3745, -0.5730],
    [-0.5524,  0.0478,  0.0745,  0.0216],
    [ 0.3062, -0.0086, -0.0529, -0.3167],
    [-0.1256, -0.2572, -0.1987, -0.4491],
    [-0.0539,  0.3361, -0.1509,  0.2473],
    [ 0.0713,  0.4271, -0.1720,  0.4674],
    [ 0.1613,  0.2357, -0.4649,  0.5364],
])
# Linear projection y = x @ W^T; no bias term is added in this cell.
torch.mm(a, w.t())

tensor([[-0.2248, -0.1546, -0.5399, -0.0539, -0.3252,  0.1438, -0.4144,  0.1643,
         -0.4551,  0.1050,  0.3013,  0.1361]])

In [12]:
import torch

# Dot product of two 2-element vectors: multiply element-wise, then sum.
a = torch.tensor([-0.0965, 0.0869])
b = torch.tensor([-0.1063, 0.0400])
torch.sum(a * b)

tensor(0.0137)

In [24]:
import torch

# Dot product of two 2-element vectors, divided by sqrt(2) afterwards.
a = torch.tensor([-0.1365, 0.1229])
b = torch.tensor([-0.3252, 0.1438])
scale = 2 ** 0.5
(a * b).sum() / scale

tensor(0.0439)

In [28]:
# Softmax over three scores, written out by hand: exp(x_i) / sum_j exp(x_j).
x = torch.tensor([0.0137, 0.0139, 0.0439])
exp_x = torch.exp(x)
exp_x / torch.sum(exp_x)

tensor([0.3299, 0.3300, 0.3401])

In [29]:
# Weighted sum of two 2-element vectors: scale each by its scalar weight, then add.
weighted_first = 0.3666 * torch.tensor([-0.2409, 0.2068])
weighted_second = 0.3778 * torch.tensor([-0.4551, 0.1050])
weighted_first + weighted_second

tensor([-0.2603,  0.1155])

In [32]:
import torch

# One input row with four features.
a = torch.tensor([[-0.2603, 0.1155, 0.4382, 0.2123]])
# Square 4x4 weight matrix.
w = torch.tensor([
    [-0.1881, -0.2299, -0.3882, -0.3988],
    [-0.3123, -0.4819, -0.1683, -0.4154],
    [ 0.0732, -0.4921, -0.2480,  0.0518],
    [ 0.3785,  0.0281, -0.0039,  0.4791],
])
# Linear projection y = x @ W^T; no bias term is added in this cell.
torch.mm(a, w.t())

tensor([[-0.2324, -0.1363, -0.1736,  0.0047]])

In [34]:
# Divide a (3, 1, 4) tensor element-wise by 0.9.
# NOTE(review): looks like inverted-dropout rescaling 1 / (1 - p) with p = 0.1 — confirm.
rows = torch.tensor([
    [[-0.2324, -0.1363, -0.1736,  0.0048]],
    [[-0.2081, -0.1121, -0.2520, -0.0917]],
    [[-0.2075, -0.1110, -0.2529, -0.0937]],
])
rows / 0.9


tensor([[[-0.2582, -0.1514, -0.1929,  0.0053]],

        [[-0.2312, -0.1246, -0.2800, -0.1019]],

        [[-0.2306, -0.1233, -0.2810, -0.1041]]])

In [59]:
# 单个样本简单举例
import torch
import torch.nn as nn

torch.manual_seed(666)
d_model = 4
encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=2)
src = torch.rand(3, 1, d_model)  # Time, N, D
out = encoder_layer(src)
print("out:\n", out, "\n")


out:
 tensor([[[-0.1235,  1.6189, -1.0983, -0.3970]],

        [[-1.5583,  1.2149,  0.0358,  0.3076]],

        [[ 1.4580,  0.3129, -0.5627, -1.2082]]],
       grad_fn=<NativeLayerNormBackward0>) 



In [63]:
import torch

# Step-by-step layer normalization of one row of four values.
#
# Three candidate rows are kept together; only the first one is normalized
# below (the original cell assigned all three and then immediately overwrote
# them with the first row).
# NOTE(review): presumably these are the per-position activations entering the
# encoder layer's LayerNorm — confirm against the cells above.
all_rows = torch.tensor([[[[ 0.2180,  0.4969, -0.0965,  0.0667]],
                          [[ 0.1420,  0.7860,  0.4493,  0.4992]],
                          [[ 0.6472,  0.4069,  0.2594,  0.1210]]]])
# Slice (rather than index) so the result keeps the 4-D shape (1, 1, 1, 4).
input1 = all_rows[:, :1]

print("input1:\n", input1, "\n")

# Step 1: mean and biased (population) variance over every dim except dim 0.
# Both results have shape (1,) and broadcast against input1 below.
VarX, EX = torch.var_mean(input1, dim=(1, 2, 3), unbiased=False)
print("Ex:\n", EX, "\n")
print("VarX:\n", VarX, "\n")

# Step 2: subtract the mean.
result2 = input1 - EX
print("input1-Ex:\n", result2, "\n")

# Step 3: compute sqrt(VarX + eps); eps keeps the denominator away from zero.
eps = 1e-5
result3 = torch.sqrt(VarX + eps)
print("sqrt(VarX+eps):\n", result3, "\n")

# Step 4: divide step 2 by step 3 to finish normalizing the values.
result4 = result2 / result3
print("(input1-Ex)/sqrt(VarX+eps):\n", result4, "\n")

# Step 5: apply the affine re-scaling with gamma = 1 and beta = 0 (a no-op).
gamma = 1
beta = 0
result5 = result4 * gamma + beta
print("[(input1-Ex)/sqrt(VarX+eps)] * γ + β:\n", result5, "\n")


input1:
 tensor([[[[ 0.2180,  0.4969, -0.0965,  0.0667]]]]) 

Ex:
 tensor([0.1713]) 

VarX:
 tensor([0.0477]) 

input1-Ex:
 tensor([[[[ 0.0467,  0.3256, -0.2678, -0.1046]]]]) 

sqrt(VarX+eps):
 tensor([0.2185]) 

(input1-Ex)/sqrt(VarX+eps):
 tensor([[[[ 0.2139,  1.4906, -1.2258, -0.4787]]]]) 

[(input1-Ex)/sqrt(VarX+eps)] * γ + β:
 tensor([[[[ 0.2139,  1.4906, -1.2258, -0.4787]]]]) 

