# Transformer pytorch
![](https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fpic2.zhimg.com%2Fv2-c2cb536d843168c1f44c39b8e0ec41d4_1440w.jpg%3Fsource%3D172ae18b&refer=http%3A%2F%2Fpic2.zhimg.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=auto?sec=1657722307&t=19f124823ccd6e8bb467e1b6c3eb9703)

## 2. Encoder
### 2.1 Positional Encoding
使用 sin、cos 函数表示序列中的位置信息，将其**加上**（非拼接）原始词嵌入。

$$
\begin{aligned}
P E_{(p o s, 2 i)} &=\sin \left(p o s / 10000^{2 i / d_{\text {model }}}\right) \\
P E_{(p o s, 2 i+1)} &=\cos \left(p o s / 10000^{2 i / d_{\text {model }}}\right)
\end{aligned}
$$
$pos$: token 位置，`[0, seq_len)`;
$i$: Embedding向量维度序号 `[0, embedding_dim/2)`;
$d_{\text{model}}$: 嵌入维度，`embedding_dim`；



In [37]:
import torch
from torch import nn

In [38]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The encoding table is computed once at construction time:
    ``pe[0, pos, 2i]   = sin(pos / 10000^(2i / d_model))`` and
    ``pe[0, pos, 2i+1] = cos(pos / 10000^(2i / d_model))``.

    Args:
        d_model: Feature (embedding) dimension. May be odd.
        dropout: Dropout probability applied after the addition. Default 0.1.
        max_len: Maximum supported sequence length. Default 5000.

    Inputs:
        x: (batch_size, seq_len, embedding_dim), with ``seq_len <= max_len``
            and ``embedding_dim == d_model``.

    Returns:
        Tensor with the same shape as ``x``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.pow(10000, torch.arange(0, d_model, 2, dtype=torch.float32) / d_model)
        angles = position / div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(1, max_len, d_model)
        pe[:, :, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angle table to the number of odd columns.
        pe[:, :, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so the table follows module.to()/cuda()/half().
        # persistent=False keeps state_dict keys identical to the previous
        # plain-attribute implementation, so old checkpoints still load.
        self.register_buffer("pe", pe, persistent=False)

    def forward(self, x):
        # .to(x.device) is kept so that a module left on the CPU still works
        # with inputs on another device; it is a no-op when devices match.
        x = x + self.pe[:, :x.size(1), :].to(x.device)
        return self.dropout(x)


In [39]:
# Shape check: adding the position encoding must not change the input shape.
batch_size, seq_len, d_model = 32, 10, 512
PE = PositionalEncoding(d_model, dropout=0)
x = torch.zeros((batch_size, seq_len, d_model))
y = PE(x)
print(y.shape)


torch.Size([32, 10, 512])


### 2.2 ScaledDotProductAttention
- 步骤：
    1. 计算`Q` 和 `K` 的分数`scores`。
        $
        \mathbf{scores} = \mathbf{Q} \mathbf{K}^{\top} / \sqrt{d_{k}}
        $

    2. 掩蔽`scores`中的`<PAD>`位置后，做`softmax` 得到 `attn`
        $
            \mathbf{attn}=softmax(mask(\mathbf{scores}))
        $

    3. 计算`attn` 和 `V` 得到`context`。
        $
            \mathbf{context}= \mathbf{attn} \cdot \mathbf{V}
        $





In [40]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(mask(Q K^T / sqrt(d_k))) V``.

    Args:
        d_k: Per-head query/key dimension used to scale the scores.
        dropout: Dropout probability applied to the attention weights.
            Default 0.1.

    Inputs:
        q: (batch_size, n_heads, len_q, d_k)
        k: (batch_size, n_heads, len_k, d_k)
        v: (batch_size, n_heads, len_v, d_v), with len_v == len_k
        attn_mask: broadcastable to (batch_size, n_heads, len_q, len_k), or
            None to disable masking. Positions equal to 0 are masked out.

    Returns:
        context: (batch_size, n_heads, len_q, d_v)
        attn: (batch_size, n_heads, len_q, len_k)
    """

    def __init__(self, d_k, dropout=0.1):
        super(ScaledDotProductAttention, self).__init__()
        self.d_k = d_k
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, q, k, v, attn_mask):
        scores = torch.matmul(q, k.transpose(-1, -2)) / (self.d_k ** 0.5)
        if attn_mask is not None:
            # masked_fill is out-of-place: the result must be assigned back,
            # otherwise the mask silently has no effect.
            scores = scores.masked_fill(attn_mask == 0, -1e9)
        # Dropout on the attention weights; identity in eval mode or with p=0.
        attn = self.dropout(torch.softmax(scores, dim=-1))
        context = torch.matmul(attn, v)
        return context, attn

### 2.3 MultiHeadAttention

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention sub-layer with residual connection and LayerNorm.

    Projects q/k/v into ``n_heads`` heads, runs scaled dot-product attention
    per head, concatenates the heads, projects back to ``d_model``, applies
    dropout, adds the residual (the original ``q``) and layer-normalizes.

    Args:
        n_heads: Number of attention heads.
        d_model: Model (embedding) dimension of the inputs and the output.
        d_k: Per-head query/key dimension.
        d_v: Per-head value dimension.
        dropout: Dropout probability applied to the projected output.
            Default 0.1.

    Inputs:
        q: (batch_size, len_q, d_model)
        k: (batch_size, len_k, d_model)
        v: (batch_size, len_v, d_model), with len_v == len_k
        attn_mask: (batch_size, len_q, len_k), shared by all heads, or
            (batch_size, n_heads or 1, len_q, len_k), or None for no masking.
            Positions equal to 0 are masked out.

    Returns:
        output: (batch_size, len_q, d_model)
        attn: (batch_size, n_heads, len_q, len_k)
    """

    def __init__(self, n_heads, d_model, d_k, d_v, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        self.n_heads = n_heads
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_heads * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_heads * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_heads * d_v, bias=False)
        self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)

        self.attention = ScaledDotProductAttention(d_k=d_k)
        self.dropout = nn.Dropout(p=dropout)
        self.layernorm = nn.LayerNorm(d_model)

    def forward(self, q, k, v, attn_mask=None):
        d_k, d_v, n_heads = self.d_k, self.d_v, self.n_heads
        batch_size, len_q, _ = q.shape
        len_k, len_v = k.shape[1], v.shape[1]

        # Kept for the residual ("Add & Norm") connection below.
        residual = q

        # Project, split into heads, and move the head axis forward:
        # (batch_size, len, n_heads * d) -> (batch_size, n_heads, len, d)
        q = self.w_qs(q).view(batch_size, len_q, n_heads, d_k).permute(0, 2, 1, 3)
        k = self.w_ks(k).view(batch_size, len_k, n_heads, d_k).permute(0, 2, 1, 3)
        v = self.w_vs(v).view(batch_size, len_v, n_heads, d_v).permute(0, 2, 1, 3)

        if attn_mask is not None:
            if attn_mask.dim() == 3:
                # Insert the head axis for a mask shared by all heads.
                attn_mask = attn_mask.unsqueeze(1)
            # Broadcast over heads without copying the mask.
            attn_mask = attn_mask.expand(-1, n_heads, -1, -1)

        context, attn = self.attention(q, k, v, attn_mask)
        #   context: (batch_size, n_heads, len_q, d_v)
        #   attn: (batch_size, n_heads, len_q, len_k)

        # permute() returns a non-contiguous tensor, on which view() raises;
        # reshape() copies when needed.
        context = context.permute(0, 2, 1, 3).reshape(batch_size, len_q, n_heads * d_v)

        output = self.dropout(self.fc(context))
        # output : (batch_size, len_q, d_model)

        output = self.layernorm(output + residual)

        return output, attn



In [55]:
# LayerNorm demo: a 4-element vector is normalized to zero mean, unit variance.
x = torch.arange(1, 5, dtype=torch.float)
y = nn.LayerNorm(4)(x)
y

tensor([-1.3416, -0.4472,  0.4472,  1.3416],
       grad_fn=<NativeLayerNormBackward0>)