## Transformer 模型结构
![image.png](../add_pic/transformer结构.png)

In [150]:
import math
import torch
import torch.nn as nn
from typing import List, Optional
from torch import optim
import torch.nn.functional as F

In [151]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network with an optional gated variant.

    Computes ``layer2(dropout(activation(layer1(x))))``. When ``is_gated`` is
    true, the activated hidden state is multiplied element-wise by a second
    linear projection of the input (``linear_v``) before dropout is applied.

    Args:
        d_model: Size of the input and output feature dimension.
        d_ff: Size of the hidden feature dimension.
        dropout: Dropout probability applied to the hidden state.
        activation: Callable applied to the output of the first linear layer.
        is_gated: Whether to create and apply the gate projection.
        bias1: Whether ``layer1`` has a bias term.
        bias2: Whether ``layer2`` has a bias term.
        bias_gate: Whether the gate projection ``linear_v`` has a bias term.
    """

    def __init__(self, d_model: int, d_ff: int,
                 dropout: float = 0.1,
                 activation=nn.ReLU(),
                 is_gated: bool = False,
                 bias1: bool = True,
                 bias2: bool = True,
                 bias_gate: bool = True):
        super().__init__()
        # First linear layer: d_model -> d_ff.
        self.layer1 = nn.Linear(d_model, d_ff, bias=bias1)
        # Second linear layer: d_ff -> d_model.
        self.layer2 = nn.Linear(d_ff, d_model, bias=bias2)
        # Dropout applied to the hidden state.
        self.dropout = nn.Dropout(dropout)
        # Non-linearity applied after the first layer.
        self.activation = activation
        # Whether the gated variant is enabled.
        self.is_gated = is_gated
        if is_gated:
            # Gate projection; only exists when gating is enabled.
            self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the feed-forward network to ``x`` along its last dimension."""
        hidden = self.activation(self.layer1(x))
        if self.is_gated:
            # Element-wise gate computed from the same input.
            hidden = hidden * self.linear_v(x)
        return self.layer2(self.dropout(hidden))


![image.png](../add_pic/transformer多头注意力机制1.png)
![image.png](../add_pic/transformer多头注意力机制2.png)

In [152]:
class PrepareForMultiHeadAttention(nn.Module):
    """Linearly project the last dimension and split it into attention heads.

    The last dimension of the input (``d_model``) is mapped to
    ``heads * d_k`` features, which are then viewed as ``heads`` separate
    vectors of size ``d_k``. All leading dimensions are preserved.
    """

    def __init__(self, d_model: int, heads: int, d_k: int, bias: bool = True):
        super().__init__()
        # Head count and per-head width, needed to reshape in forward().
        self.heads = heads
        self.d_k = d_k
        # A single projection produces the features of every head at once.
        self.linear = nn.Linear(d_model, heads * d_k, bias=bias)

    def forward(self, x: torch.Tensor):
        """Return ``x`` projected and reshaped to ``[..., heads, d_k]``."""
        # Remember every dimension except the feature one; the original
        # comment describes the input as [seq_len, batch_size, d_model].
        leading_dims = x.shape[:-1]
        projected = self.linear(x)
        # Split the projected features into (heads, d_k).
        return projected.view(*leading_dims, self.heads, self.d_k)

In [153]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    ``forward`` takes ``query``, ``key`` and ``value`` laid out as
    ``[seq_len, batch_size, d_model]`` and returns a tensor of shape
    ``[query_seq_len, batch_size, d_model]``. Internally the attention scores
    are kept as ``[batch_size, heads, query_seq_len, key_seq_len]``, which is
    the layout the mask produced by ``preper_mask`` broadcasts against.

    Args:
        d_model: Feature size of the inputs and of the output.
        heads: Number of attention heads; must divide ``d_model``.
        d_k: Accepted for signature compatibility only. The per-head size is
            always ``d_model // heads`` so that the concatenated heads match
            the ``d_model``-wide output projection.
        dropout_prob: Dropout probability applied to the attention weights.
        bias: Whether the query/key/value/output projections use a bias.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, d_model: int, heads: int, d_k: int, dropout_prob: float = 0.1, bias: bool = True):
        super().__init__()
        if d_model % heads != 0:
            raise ValueError("d_model must be divisible by heads.")
        # Per-head feature size.
        self.d_k = d_model // heads
        # Number of attention heads.
        self.heads = heads

        # Projections that map the inputs to per-head query/key/value vectors.
        self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias)
        self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias)
        self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias)

        # Scores are [batch, heads, q_len, k_len]; normalise over the key axis.
        self.softmax = nn.Softmax(dim=-1)

        # Output projection applied to the concatenated heads.
        self.output = nn.Linear(d_model, d_model, bias=bias)

        # Dropout on the attention weights.
        self.dropout = nn.Dropout(dropout_prob)

        # Scaling factor 1 / sqrt(d_k) applied to the raw scores.
        self.scale = 1 / math.sqrt(self.d_k)

        # Most recent attention weights (detached), set by forward().
        self.attn = None

    def get_scores(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor:
        """Return the unscaled dot-product scores between queries and keys.

        Args:
            query: Tensor of shape ``[q_len, batch, heads, d_k]``.
            key: Tensor of shape ``[k_len, batch, heads, d_k]``.

        Returns:
            Tensor of shape ``[batch, heads, q_len, k_len]``.
        """
        return torch.einsum("qbhd,kbhd->bhqk", query, key)

    def preper_mask(self, mask: torch.Tensor, query_shape: List[int], key_shape: List[int]):
        """Reshape ``mask`` so it broadcasts against the attention scores.

        Args:
            mask: Tensor of shape ``[q_len, k_len]`` or
                ``[batch, q_len, k_len]``.
            query_shape: Shape of the query; element 0 is the query length.
            key_shape: Shape of the key; element 0 is the key length.

        Returns:
            The mask with shape ``[1 or batch, 1, q_len, k_len]``; the
            inserted axis broadcasts over the heads.

        Raises:
            ValueError: If the mask is not 2-D/3-D or its last two dimensions
                do not match the query and key lengths.
        """
        if mask.dim() == 2:
            # A 2-D mask is shared by every batch element.
            mask = mask.unsqueeze(0)
        if mask.dim() != 3 or mask.size(1) != query_shape[0] or mask.size(2) != key_shape[0]:
            raise ValueError("Mask shape is not compatible with query and key shape.")
        # Insert the heads axis.
        return mask.unsqueeze(1)

    def forward(self, *, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Compute multi-head attention.

        Args:
            query: Tensor of shape ``[q_len, batch, d_model]``.
            key: Tensor of shape ``[k_len, batch, d_model]``.
            value: Tensor of shape ``[k_len, batch, d_model]``.
            mask: Optional mask accepted by ``preper_mask``; positions where
                the mask equals 0 are excluded from attention.

        Returns:
            Tensor of shape ``[q_len, batch, d_model]``.
        """
        # Query sequence length and batch size, used to merge the heads later.
        seq_len, batch_size = query.size(0), query.size(1)

        # The mask must be validated against the un-projected input shapes.
        if mask is not None:
            mask = self.preper_mask(mask, query.shape, key.shape)

        # Project and split into heads: [len, batch, heads, d_k].
        query = self.query(query)
        key = self.key(key)
        value = self.value(value)

        # Scaled dot-product scores: [batch, heads, q_len, k_len].
        scores = self.get_scores(query, key) * self.scale

        # Masked positions get a large negative score so softmax ignores them.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # Attention weights over the key axis, followed by dropout.
        attn = self.dropout(self.softmax(scores))

        # Weighted sum of the values, back in [q_len, batch, heads, d_k].
        output = torch.einsum("bhqk,kbhd->qbhd", attn, value)

        # Keep the weights for inspection without holding the graph.
        self.attn = attn.detach()

        # Concatenate the heads; reshape copes with a non-contiguous result.
        output = output.reshape(seq_len, batch_size, -1)

        return self.output(output)


![image.png](../add_pic/transformer位置编码和嵌入.png)

In [154]:
def get_positional_encoding(d_model: int, max_len: int = 5000) -> torch.Tensor:
    """Build the fixed sinusoidal positional-encoding table.

    Even feature indices hold ``sin(pos / 10000^(2i / d_model))`` and odd
    indices hold the matching cosine.

    Args:
        d_model: Feature size of each encoding vector.
        max_len: Number of positions to generate.

    Returns:
        Tensor of shape ``[max_len, 1, d_model]``; the middle axis broadcasts
        over the batch dimension.
    """
    # Position indices as a column: [max_len, 1].
    position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
    # Even feature indices 2i: [ceil(d_model / 2)].
    two_i = torch.arange(0, d_model, 2, dtype=torch.float32)
    # 10000^(-2i / d_model), computed in log space.
    div_term = torch.exp(two_i * (-math.log(10000.0) / d_model))
    # Angle for every (position, frequency) pair: [max_len, ceil(d_model / 2)].
    angles = position * div_term

    encodings = torch.zeros(max_len, d_model)
    encodings[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns.
    encodings[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    # Add the broadcastable batch axis.
    return encodings.unsqueeze(1)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to an input, then dropout.

    Args:
        d_model: Feature size of the input.
        dropout_prob: Dropout probability applied after adding the encodings.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, dropout_prob: float, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout_prob)
        # Non-persistent buffer: follows the module across devices but is
        # not written to the state dict.
        self.register_buffer(
            "position_encodings",
            self.get_positional_encoding(d_model, max_len),
            persistent=False,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``dropout(x + pe)``; dimension 0 of ``x`` indexes positions."""
        # Take the encodings for the first x.shape[0] positions.
        pe = self.position_encodings[: x.shape[0]]
        return self.dropout(x + pe)

    def get_positional_encoding(self, d_model: int, max_len: int = 5000) -> torch.Tensor:
        """Return the ``[max_len, 1, d_model]`` sinusoidal encoding table."""
        return get_positional_encoding(d_model, max_len)


class EmbeddingsWithPositionalEncoding(nn.Module):
    """Token embeddings scaled by ``sqrt(d_model)`` plus fixed encodings.

    Args:
        d_model: Embedding size.
        n_vocab: Vocabulary size.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
        super().__init__()
        # Maps token ids to d_model-sized vectors.
        self.embedding = nn.Embedding(n_vocab, d_model)
        self.d_model = d_model
        # Fixed sinusoidal table stored as a buffer.
        self.register_buffer("position_encodings", get_positional_encoding(d_model, max_len))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed token ids ``x``; dimension 0 of ``x`` indexes positions."""
        pe = self.position_encodings[: x.shape[0]]
        return self.embedding(x) * math.sqrt(self.d_model) + pe


class EmbeddingsWithLearnedEncoding(nn.Module):
    """Token embeddings scaled by ``sqrt(d_model)`` plus learned encodings.

    Args:
        d_model: Embedding size.
        n_vocab: Vocabulary size.
        max_len: Number of positions that have a learned encoding.
    """

    def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
        super().__init__()
        # Maps token ids to d_model-sized vectors.
        self.embedding = nn.Embedding(n_vocab, d_model)
        self.d_model = d_model
        # Trainable positional table, initialised to zeros.
        self.position_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model), requires_grad=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed token ids ``x``; dimension 0 of ``x`` indexes positions."""
        pe = self.position_encodings[: x.shape[0]]
        return self.embedding(x) * math.sqrt(self.d_model) + pe


# The embedding classes were previously nested inside PositionalEncoding;
# keep that access path working for existing callers.
PositionalEncoding.EmbeddingsWithPositionalEncoding = EmbeddingsWithPositionalEncoding
PositionalEncoding.EmbeddingsWithLearnedEncoding = EmbeddingsWithLearnedEncoding