In [2]:
import torch
from torch import nn
from torch.nn import functional as F
import math

In [3]:
X = torch.randn(128,64,512)  # (batch_size, seq_len, embedding_dim)
print(X.shape)

torch.Size([128, 64, 512])


In [4]:
d_model = 512
n_heads = 8

#### 多头注意力

In [5]:
class multi_head_attention(nn.Module):
    def __init__(self,d_model,n_heads):
        super().__init__()

        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_combine = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)
        self.d_k = d_model // n_heads

    def forward(self, q, k, v, mask=None):
        batch_size, seq_len, _ = q.shape
        q, k ,v = self.w_q(q), self.w_k(k), self.w_v(v)
        q = q.view(batch_size, seq_len, self.n_heads, self.d_k).permute(0, 2, 1, 3)  # (batch_size, n_heads, seq_len, d_k)
        k = k.view(batch_size, seq_len, self.n_heads, self.d_k).permute(0, 2, 1, 3)
        v = v.view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1,2)

        score = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)  # (batch_size, n_heads, seq_len, seq_len)
        if mask is not None:
            # mask = torch.tril(torch.ones(seq_len, seq_len,dtype=bool))
            score = score.masked_fill(mask == 0, float('-inf'))  # 只看前面的
        score = self.softmax(score) @ v
        score = score.transpose(1,2).contiguous().view(batch_size, seq_len, self.d_model)
        output = self.w_combine(score)
        return output
    
mha = multi_head_attention(d_model, n_heads)
output = mha(X, X, X)
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)



output--->tensor([[[ 0.0367,  0.1196,  0.0331,  ...,  0.0248, -0.0792, -0.0767],
         [ 0.0482,  0.1216,  0.0219,  ...,  0.0257, -0.0672, -0.0462],
         [ 0.0415,  0.1056,  0.0198,  ...,  0.0270, -0.0822, -0.0797],
         ...,
         [ 0.0553,  0.1517,  0.0564,  ..., -0.0017, -0.0722, -0.0655],
         [ 0.0440,  0.1149,  0.0349,  ...,  0.0269, -0.0449, -0.0280],
         [ 0.0472,  0.1310,  0.0410,  ..., -0.0052, -0.0670, -0.0557]],

        [[ 0.0391,  0.0544, -0.0012,  ...,  0.0549, -0.0275, -0.0609],
         [ 0.0023,  0.0430,  0.0061,  ...,  0.0462, -0.0289, -0.0527],
         [ 0.0221,  0.0493,  0.0106,  ...,  0.0086, -0.0431, -0.0728],
         ...,
         [ 0.0563,  0.0281, -0.0107,  ...,  0.0307, -0.0307, -0.0553],
         [ 0.0227,  0.0617, -0.0195,  ...,  0.0318, -0.0325, -0.0421],
         [-0.0034,  0.0391, -0.0054,  ...,  0.0455, -0.0302, -0.0466]],

        [[-0.0370,  0.0946, -0.0143,  ...,  0.0410, -0.0995, -0.0281],
         [-0.0353,  0.1263,  0.0035

#### Token Embedding

In [6]:
class TokenEmbedding(nn.Embedding):
    def __init__(self,vocab_size, d_model):
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=1)


#### Post Embedding


In [7]:
class PositionEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.encoding = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        _2i = torch.arange(0, d_model, 2).float()
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
    def forward(self, x):
        return x + self.encoding[:x.size(1), :].to(x.device)
    
pos_enc = PositionEncoding(d_model)
output = pos_enc(X)
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)

output--->tensor([[[-0.9353,  0.8711,  0.7479,  ...,  1.0006, -0.1504,  2.0920],
         [ 1.8241,  1.1223,  1.4230,  ...,  0.2347, -1.7227,  2.1554],
         [ 0.1815, -1.1800,  2.5106,  ...,  0.2161,  1.9291,  0.7145],
         ...,
         [-1.7381,  0.3854,  2.0538,  ...,  2.3049,  0.7007,  1.2737],
         [-0.5405, -0.0754,  0.2926,  ...,  1.3374, -0.2684,  0.3385],
         [-0.1564,  1.8902, -1.6382,  ...,  0.5211, -0.3646,  0.3770]],

        [[-0.1989, -0.5804,  0.9845,  ...,  0.8966,  2.1996,  1.3870],
         [ 2.3472,  0.1405, -0.3111,  ...,  1.8604, -2.4045, -0.5477],
         [ 0.9121, -1.6780,  0.9404,  ...,  1.7230, -0.2655,  2.0115],
         ...,
         [ 0.0090, -1.0691,  1.5351,  ...,  1.1705, -1.4624,  1.2464],
         [ 0.2031,  1.7874,  1.5162,  ...,  1.3028, -0.4745,  1.0997],
         [ 0.7151,  0.2676, -1.2182,  ...,  1.3420, -0.3032,  0.5342]],

        [[ 1.2044,  0.7288,  0.3654,  ...,  0.9930, -1.4187,  0.0511],
         [ 2.0304,  0.5797,  1.4508

#### Total Embedding

In [8]:
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, dropout=0.1):
        super(TransformerEmbedding, self).__init__()
        self.token_embedding = TokenEmbedding(vocab_size, d_model)
        self.position_encoding = PositionEncoding(d_model, max_len)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        token_emb = self.token_embedding(x)
        pos_emb = self.position_encoding(token_emb)
        return pos_emb+ token_emb
    
transformer_emb = TransformerEmbedding(vocab_size=10000, d_model=d_model, max_len=5000)
output = transformer_emb(torch.randint(0, 10000, (128, 64)))
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)

output--->tensor([[[-2.6721e+00,  2.0651e+00, -1.3672e-01,  ...,  7.1631e-01,
          -4.8644e-01, -3.1298e+00],
         [ 2.9003e+00,  1.6272e+00,  7.7190e-01,  ...,  4.5478e+00,
          -7.8348e-01,  2.5305e+00],
         [ 1.6634e+00, -6.4350e-01, -1.5827e+00,  ...,  8.6007e-01,
           8.3142e-01,  1.2924e+00],
         ...,
         [-3.8266e+00, -3.0057e+00, -1.1582e+00,  ...,  2.4564e+00,
           2.5342e+00,  3.6748e-01],
         [ 2.5169e+00,  4.8155e+00, -4.1446e+00,  ...,  1.7320e+00,
          -5.1551e+00,  3.2759e+00],
         [ 4.6657e-01,  5.1061e+00, -2.8093e+00,  ..., -1.1008e-01,
          -3.3807e+00, -1.8950e+00]],

        [[ 7.3553e-01, -2.6307e-01, -7.9150e-02,  ...,  1.1460e+00,
           2.1650e+00,  5.4773e-01],
         [ 1.7295e+00,  3.7172e-01,  3.1760e+00,  ...,  4.0051e+00,
          -3.2645e-01, -6.7654e-01],
         [ 1.3691e+00, -3.7171e+00,  3.3594e+00,  ..., -7.5549e-01,
          -1.6906e+00,  2.7740e+00],
         ...,
         [-3.06

#### LayerNorm

In [9]:
class LayerNorm(nn.Module):
    def __init__(self,d_model, eps = 1e-6):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))

    def forward(self,x):
        mean = x.mean(dim=-1,keepdim=True)
        std = x.std(dim=-1,keepdim=True)
        x = (x-mean) / (std + self.eps)
        return self.gamma * x + self.beta

ln = LayerNorm(d_model)
output = ln(X)
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)
    

output--->tensor([[[-8.2814e-01, -1.1455e-02,  8.7660e-01,  ...,  1.1970e-01,
          -3.3195e-02,  1.2250e+00],
         [ 8.5700e-01,  4.7555e-01,  4.9383e-01,  ..., -8.0727e-01,
          -1.7190e+00,  1.0216e+00],
         [-6.5263e-01, -6.8685e-01,  1.5315e+00,  ..., -7.0587e-01,
           1.8680e+00, -2.3293e-01],
         ...,
         [-8.3752e-01,  6.6420e-01,  1.3662e+00,  ...,  1.3659e+00,
           7.1810e-01,  2.7189e-01],
         [ 1.7974e-01, -7.7588e-01,  3.9393e-01,  ...,  3.1958e-01,
          -2.9778e-01, -6.8774e-01],
         [-2.9427e-01,  9.3563e-01, -7.2572e-01,  ..., -4.4955e-01,
          -3.4167e-01, -5.9390e-01]],

        [[-1.7947e-01, -1.5378e+00,  9.8403e-01,  ..., -8.5618e-02,
           2.1788e+00,  3.9657e-01],
         [ 1.5262e+00, -3.1626e-01, -1.0251e+00,  ...,  9.0219e-01,
          -2.2546e+00, -1.4261e+00],
         [-5.4766e-02, -1.3056e+00, -5.3590e-02,  ...,  6.5764e-01,
          -3.2030e-01,  9.4295e-01],
         ...,
         [ 9.83

#### FFN

In [10]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(F.relu(self.w_1(x)))
        x = self.w_2(x)
        return x
    
ffn = PositionwiseFeedForward(d_model)
output = ffn(X)
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)

output--->tensor([[[-0.1287,  0.2055, -0.0140,  ...,  0.1522, -0.3233,  0.0281],
         [-0.0809,  0.3722,  0.3491,  ...,  0.3314, -0.4500, -0.0104],
         [ 0.1162, -0.2394,  0.3441,  ..., -0.2541, -0.1235,  0.3885],
         ...,
         [-0.0984, -0.0346,  0.3007,  ..., -0.0445, -0.4175, -0.0134],
         [-0.1214,  0.0887,  0.2027,  ...,  0.1193, -0.2498,  0.1965],
         [-0.1386, -0.2615,  0.0962,  ...,  0.4148, -0.1505,  0.1376]],

        [[ 0.1689, -0.2057, -0.0895,  ..., -0.3432, -0.5748, -0.2058],
         [-0.1668, -0.3674, -0.3573,  ...,  0.0699, -0.5165,  0.1554],
         [ 0.2569, -0.7297, -0.0336,  ..., -0.2967, -0.1650, -0.0086],
         ...,
         [-0.1440, -0.0537,  0.0292,  ..., -0.0751, -0.6167, -0.1168],
         [-0.2178, -0.1165, -0.3610,  ...,  0.4018, -0.4244,  0.4999],
         [ 0.1494, -0.0570,  0.3570,  ...,  0.4982, -0.0237,  0.1025]],

        [[-0.1903,  0.0579,  0.1035,  ..., -0.0476, -0.3276,  0.1784],
         [ 0.0147, -0.2066, -0.3353

#### EncoderLayer

In [11]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff=2048, dropout=0.1):
        super().__init__()
        self.self_attn = multi_head_attention(d_model, n_heads)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x+self.dropout(attn_output))
        ffn_output = self.ffn(x)
        x = self.norm2(x+self.dropout(ffn_output))
        return x
    
encoder_layer = EncoderLayer(d_model, n_heads)
output = encoder_layer(X)
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)

output--->tensor([[[-7.0603e-01,  5.7553e-01,  7.5261e-01,  ..., -2.0858e-01,
           5.8189e-02,  1.1278e+00],
         [ 1.1365e+00,  6.0335e-01,  8.4254e-01,  ..., -5.2876e-01,
          -1.6772e+00,  1.2292e+00],
         [-7.5794e-02, -6.9841e-01,  1.2713e+00,  ..., -7.7817e-01,
           2.0499e+00, -2.0246e-01],
         ...,
         [-3.7518e-01,  2.6045e-01,  1.3156e+00,  ...,  1.0443e+00,
           2.9703e-01,  5.3704e-03],
         [ 1.6703e-01, -1.0393e+00,  4.4509e-01,  ..., -3.8164e-02,
           9.4134e-02, -3.0491e-01],
         [-7.9923e-04,  7.0837e-01, -6.2037e-01,  ..., -4.2584e-01,
          -3.6852e-01, -6.1785e-01]],

        [[-1.7826e-01, -1.5271e+00,  1.4500e+00,  ..., -5.7538e-01,
           2.3379e+00,  5.8379e-01],
         [ 1.6827e+00, -2.9564e-01, -1.0193e+00,  ...,  1.0930e+00,
          -2.0300e+00, -1.1677e+00],
         [-5.4246e-02, -1.3351e+00, -1.5155e-02,  ...,  7.9079e-01,
          -7.9953e-01,  6.9293e-01],
         ...,
         [ 1.01

#### DecoderLayer

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff=2048, dropout=0.1):
        super().__init__()
        self.self_attn = multi_head_attention(d_model, n_heads)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.cross_attn = multi_head_attention(d_model, n_heads)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        self_attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout1(self_attn_output))
        cross_attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout2(cross_attn_output))
        ffn_output = self.ffn(x)
        x = self.norm3(x + self.dropout3(ffn_output))
        return x
        
decoder_layer = DecoderLayer(d_model, n_heads)
output = decoder_layer(X, X)
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)

output--->tensor([[[-1.1896e+00, -7.0887e-02,  6.6363e-01,  ...,  4.0739e-01,
           3.4183e-01,  1.0413e+00],
         [ 8.8248e-01,  1.7339e-01,  3.7401e-01,  ..., -1.0262e+00,
          -1.1995e+00,  8.1794e-01],
         [-3.6680e-01, -9.5613e-01,  1.3482e+00,  ..., -5.8089e-01,
           2.3191e+00, -3.3271e-01],
         ...,
         [-4.5736e-01,  5.4850e-01,  8.2544e-01,  ...,  1.3540e+00,
           9.9084e-01, -1.6764e-01],
         [ 3.3166e-01, -7.0236e-01, -3.0535e-01,  ...,  3.0848e-01,
           2.1121e-01, -6.5877e-01],
         [-1.7814e-01,  1.0016e+00, -6.4724e-01,  ..., -4.5331e-01,
           1.9896e-02, -7.8978e-01]],

        [[-1.1991e-02, -1.8435e+00,  8.8692e-01,  ...,  1.8697e-01,
           2.3730e+00,  2.4594e-01],
         [ 1.9889e+00, -5.6041e-01, -1.2818e+00,  ...,  8.6363e-01,
          -1.7861e+00, -1.7457e+00],
         [-4.4614e-02, -1.6247e+00, -4.2892e-01,  ...,  8.1105e-01,
          -3.1793e-01,  8.5236e-01],
         ...,
         [ 1.25

#### Encoder

In [13]:
class Encoder(nn.Module):
    def __init__(self,vocab_size, d_model, n_layers, n_heads,  max_len, d_ff=2048, dropout=0.1):
        super().__init__()
        self.embedding = TransformerEmbedding(vocab_size, d_model, max_len)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)])
        self.norm = LayerNorm(d_model)

    def forward(self, x, mask=None):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, mask)
        x = self.norm(x)
        return x

encoder = Encoder(vocab_size=10000, d_model=d_model, n_layers=6, n_heads=n_heads, max_len=5000)
output = encoder(torch.randint(0, 10000, (128, 64)))
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)

output--->tensor([[[ 9.5672e-01, -5.5659e-01, -5.1308e-01,  ...,  2.9756e-01,
          -1.8059e-01,  1.3197e-01],
         [-9.7916e-02, -1.3802e+00, -1.0517e-01,  ..., -6.2186e-01,
           2.8005e-02, -4.2974e-01],
         [ 3.7342e-01, -6.5561e-01,  1.4070e+00,  ..., -2.7220e-01,
          -5.0496e-01,  8.1798e-01],
         ...,
         [ 8.7133e-01, -1.0461e+00,  2.0067e+00,  ..., -4.6159e-02,
           1.2420e+00, -3.7341e-01],
         [ 8.6260e-01, -2.6430e-01,  1.7339e+00,  ...,  6.5361e-02,
          -5.2534e-01, -3.2787e-01],
         [ 2.1095e+00,  4.9862e-01, -9.9248e-01,  ...,  1.4241e-01,
          -3.5422e-01,  1.1507e+00]],

        [[ 3.1536e-01, -4.4696e-01,  5.3404e-01,  ...,  5.9221e-01,
          -6.4635e-03,  9.9375e-01],
         [ 1.0559e+00,  1.1605e+00, -5.2131e-01,  ...,  1.3782e+00,
          -1.4143e+00,  3.3445e-01],
         [ 6.3440e-01, -9.9827e-01, -1.0822e+00,  ..., -8.4854e-01,
          -5.7946e-01,  7.5415e-01],
         ...,
         [ 1.16

#### Decoder

In [14]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, max_len, d_ff=2048, dropout=0.1):
        super().__init__()
        self.embedding = TransformerEmbedding(vocab_size, d_model, max_len)
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)])
        self.norm = LayerNorm(d_model)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, enc_output, src_mask, tgt_mask)
        x = self.norm(x)
        return x

decoder = Decoder(vocab_size=10000, d_model=d_model, n_layers=6, n_heads=n_heads, max_len=5000)
output = decoder(torch.randint(0, 10000, (128, 64)), torch.randn(128, 64, d_model))
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, d_model)

output--->tensor([[[ 0.4541, -0.5923,  0.6336,  ...,  1.5332, -0.6194,  2.1135],
         [-1.4333,  0.8145,  0.3547,  ...,  0.1490,  0.3329,  0.6539],
         [ 0.1714,  0.6863,  1.1653,  ..., -0.4941, -0.6235, -0.2031],
         ...,
         [-0.6739, -1.6603,  0.9431,  ...,  0.6502,  0.8397,  1.0983],
         [ 0.6853, -1.3998, -0.3372,  ...,  0.7246,  0.3279,  0.2709],
         [ 0.5900, -0.4388,  0.1294,  ..., -1.4263,  0.3875,  1.5885]],

        [[-0.5721, -0.1229,  1.3940,  ..., -0.0215, -0.4064,  1.3293],
         [ 0.4503, -0.8469,  0.6139,  ..., -0.6272,  0.4465,  0.9701],
         [ 0.5181, -2.0120,  0.4316,  ..., -0.0662,  0.9880, -0.7793],
         ...,
         [-0.8555, -1.5487,  1.0834,  ...,  1.7692,  1.0356,  0.4158],
         [-1.3909, -0.9398, -0.2683,  ..., -1.1808, -0.8694, -1.2517],
         [ 0.9656,  0.2940, -1.3547,  ...,  0.6702,  0.6983,  0.2181]],

        [[ 0.5427,  0.0380,  1.0362,  ..., -0.2925,  0.6481,  0.8463],
         [ 0.0268, -1.9165, -0.3009

#### Transformer

In [15]:
class Transformer(nn.Module):
    def __init__(self,src_pad_idx, tgt_pad_idx, vocab_size, d_model, n_layers, n_heads, max_len, d_ff=2048, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(vocab_size, d_model, n_layers, n_heads, max_len, d_ff, dropout)
        self.decoder = Decoder(vocab_size, d_model, n_layers, n_heads, max_len, d_ff, dropout)
        self.src_pad_idx = src_pad_idx
        self.tgt_pad_idx = tgt_pad_idx
        self.output_layer = nn.Linear(d_model, vocab_size)

    def make_pad_mask(self, q, k, pad_idx_q, pad_idx_k):
        len_q, len_k = q.size(1), k.size(1)
        q = q.ne(pad_idx_q).unsqueeze(1).unsqueeze(3)
        q = q.repeat(1,1,1,len_k)

        k = k.ne(pad_idx_k).unsqueeze(1).unsqueeze(2)
        k = k.repeat(1,1,len_q,1)

        mask = q & k
        return mask



    def mask_casual_mask(self, q, k):
        len_q, len_k = q.size(1), k.size(1)
        mask = torch.tril(torch.ones(len_q, len_k, dtype=torch.bool))
        return mask
    
    def forward(self, src, tgt):
        src_mask = self.make_pad_mask(src, src, self.src_pad_idx, self.src_pad_idx)
        tgt_mask = self.make_pad_mask(tgt, tgt, self.tgt_pad_idx, self.tgt_pad_idx) & self.mask_casual_mask(tgt, tgt)
        src_trg_mask = self.make_pad_mask(src, tgt, self.src_pad_idx, self.tgt_pad_idx)

        enc_output = self.encoder(src, src_mask)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        output = self.output_layer(dec_output)
        return output
    
transformer = Transformer(src_pad_idx=1, tgt_pad_idx=1, vocab_size=10000, d_model=d_model, n_layers=6, n_heads=n_heads, max_len=5000)
src = torch.randint(0, 10000, (128, 64))
tgt = torch.randint(0, 10000, (128, 64))
output = transformer(src, tgt)
print(f"output--->{output},\noutput.shape--->{output.shape}")  # (batch_size, seq_len, vocab_size)





output--->tensor([[[ 0.0704, -0.7916,  0.0345,  ...,  0.9713,  0.5251, -0.2577],
         [ 0.4158,  0.3712, -0.6179,  ...,  0.2595, -0.3644,  0.4199],
         [ 0.2551, -0.7157, -0.1395,  ...,  0.3235, -0.1678, -0.2280],
         ...,
         [ 1.0497, -1.1774,  0.8196,  ..., -0.1789, -0.4988, -0.0934],
         [-0.0275, -0.8394,  0.3046,  ...,  0.3125, -0.4127, -1.0884],
         [ 0.8755, -0.2182, -0.3787,  ...,  0.8447, -0.1265, -0.6728]],

        [[-0.0675,  0.5254,  0.4661,  ...,  1.0474, -0.7141,  0.2792],
         [-0.1281, -0.6991, -0.2546,  ...,  0.2783,  0.1304,  0.6848],
         [ 0.5277, -0.5158, -0.2010,  ...,  0.3533, -0.2300, -0.1879],
         ...,
         [ 0.3603,  0.0540, -0.0661,  ...,  0.3685, -0.1012, -0.1499],
         [ 0.5997, -0.2729,  0.6627,  ...,  0.0234, -0.4179, -0.3762],
         [ 0.1777, -0.1046, -0.2325,  ..., -0.1315, -0.9080, -0.1761]],

        [[ 0.5476, -1.6461,  0.6430,  ...,  0.7594, -1.1301,  0.1436],
         [ 0.8264, -0.7490,  0.9204