In [97]:
import torch
import torch.nn as nn
from math import sqrt
import copy
import math

In [98]:
def clones(module,N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

def generate_causal_mask(seq_len):
    # 生成上三角矩阵（对角线及以上为 1，其余为 0）
    mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
    return mask  # 形状: (seq_len, seq_len)

class SublayerConnection(nn.Module):
    def __init__(self,size,dropout):
        super(SublayerConnection,self).__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x,sublayer):
        return self.dropout(self.norm(x+sublayer(x)))

def attention(queries,keys,values,causal_mask = None,padding_mask = None):
    d_k = queries.size(-1)
    scores = torch.matmul(queries,keys.transpose(-2,-1)) / sqrt(d_k)
    if causal_mask is not None:
        scores = scores.masked_fill(causal_mask == 1, -1e9)

    if padding_mask is not None:
        scores = scores.masked_fill(padding_mask == 0, -1e9)
    attention_scores = torch.softmax(scores,dim=-1)
    result = torch.matmul(attention_scores,values)
    return result

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, heads, dropout):
        super(MultiHeadAttention, self).__init__()
        assert d_model % heads == 0, "d_model must be divisible by heads"
        self.d_model = d_model
        self.d_k = d_model // heads
        self.heads = heads
        self.linears = clones(nn.Linear(d_model, d_model), 4)  # Q, K, V, 输出投影
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, padding_mask=None,causal_mask=None):

        batch_size = query.size(0)
        # 线性变换并拆分为多头
        q, k, v = [lin(x).view(batch_size, -1, self.heads, self.d_k).transpose(1, 2)
                   for lin, x in zip(self.linears[:3], (query, key, value))]
        # 计算注意力
        atten = attention(q, k, v, causal_mask, padding_mask=padding_mask)
        # 合并多头
        x = atten.transpose(1, 2).contiguous().reshape(batch_size, -1, self.d_model)
        # 输出投影
        out = self.linears[-1](x)

        return out

In [99]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        sin_div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        cos_div_term = torch.exp(torch.arange(1, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * sin_div_term)
        pe[:, 1::2] = torch.cos(position * cos_div_term)

        self.register_buffer('pe', pe.unsqueeze(0))  # shape: (1, max_len, d_model)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]  # 动态截取位置编码长度

In [100]:
class FFN(nn.Module):
    def __init__(self,d_model,d_ff):
        super(FFN,self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.linears = nn.Sequential(nn.Linear(d_model,d_ff),nn.ReLU(),nn.Linear(d_ff,d_model))

    def forward(self,x):
        return self.linears(x)

class EncoderLayer(nn.Module):
    def __init__(self,d_model,d_ff,heads,dropout):
        super(EncoderLayer,self).__init__()
        self.sublayers = clones(SublayerConnection(d_model,dropout),2)
        self.atten = MultiHeadAttention(d_model,heads,dropout)
        self.ffn = FFN(d_model,d_ff)

    def forward(self, x,src_mask=None):
        x = self.sublayers[0](x,lambda x :self.atten(x,x,x,src_mask))
        x = self.sublayers[1](x, self.ffn)
        return x

class DecoderLayer(nn.Module):
    def __init__(self,d_model,d_ff,heads,dropout):
        super(DecoderLayer,self).__init__()
        self.sublayers = clones(SublayerConnection(d_model,dropout),3)
        self.atten_s = MultiHeadAttention(d_model,heads,dropout)
        self.atten_c = MultiHeadAttention(d_model,heads,dropout)
        self.ffn = FFN(d_model,d_ff)


    def forward(self,memory,tgt,src_mask,tgt_mask,causal_mask):
        x = self.sublayers[0](tgt,lambda x:self.atten_s(tgt,tgt,tgt,tgt_mask,causal_mask))
        x = self.sublayers[1](x,lambda  x: self.atten_c(x,memory,memory,src_mask))
        x = self.sublayers[2](x,self.ffn)
        return x

class OutputLayer(nn.Module):
    def __init__(self,d_model,vocab_size):
        super(OutputLayer,self).__init__()
        self.layer = nn.Sequential(nn.Linear(d_model,vocab_size),nn.Softmax(dim=-1))

    def forward(self,x):
        return self.layer(x)

In [101]:
class Transformer(nn.Module):
    def __init__(self, d_model, d_ff, heads, dropout, vocab_size, block_number=6):
        super(Transformer, self).__init__()
        self.encoder_layers = clones(EncoderLayer(d_model, d_ff, heads, dropout), block_number)
        self.decoder_layers = clones(DecoderLayer(d_model, d_ff, heads, dropout), block_number)
        self.position_enc = PositionalEncoding(d_model)
        self.out = OutputLayer(d_model, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # 添加位置编码
        src = self.position_enc(src)
        tgt = self.position_enc(tgt)

        #现场生成因果掩码
        seq_len = src.shape[1]
        causal_mask = generate_causal_mask(seq_len)

        # Encoder
        for layer in self.encoder_layers:
            src = layer(src, src_mask)

        # Decoder
        for layer in self.decoder_layers:
            tgt = layer(src,tgt, src_mask, tgt_mask,causal_mask=causal_mask)

        return self.out(tgt)



In [103]:
d_model = 512
d_ff = 2048
heads = 8
dropout = 0.2
vocab_size = 5
src = torch.randn(1,20,d_model)
tgt = torch.randn(1,20,d_model)
model = Transformer(d_model,d_ff,heads,dropout,vocab_size)
y = model.generate(src)
print(y.size())

RuntimeError: The size of tensor a (2) must match the size of tensor b (512) at non-singleton dimension 2