In [17]:
import torch
from torch import nn
import torch.nn.functional as F
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import Image
# default: 100
mpl.rcParams['figure.dpi'] = 150

In [18]:
import os#环境代理设置
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"

In [19]:
model_ckpt = 'bert-base-uncased'
model = AutoModel.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained(model_ckpt)

In [20]:
token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
text = 'time flies like an arrow'
model_inputs = tokenizer(text, return_tensors='pt', add_special_tokens=False)
input_embeddings = token_embedding(model_inputs['input_ids'])
input_embeddings.shape

torch.Size([1, 5, 768])

In [21]:
import math
q = k = v = input_embeddings
# (1, 5, 768) * (1, 768, 5) => (1, 5, 5)
scores = torch.bmm(q, k.transpose(1,2))/math.sqrt(k.size(-1))
scores

tensor([[[27.1610,  0.7026,  0.1613, -0.9768, -1.0512],
         [ 0.7026, 29.9493, -0.9095,  1.1273, -1.4273],
         [ 0.1613, -0.9095, 27.4205,  0.1608,  0.5826],
         [-0.9768,  1.1273,  0.1608, 29.4293,  1.8717],
         [-1.0512, -1.4273,  0.5826,  1.8717, 30.7920]]],
       grad_fn=<DivBackward0>)

In [22]:
seq_len = model_inputs['input_ids'].size(-1)
# triangular lower（上三角的话，torch.triu，upper triangular）
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0)
mask

tensor([[[1., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 1.]]])

In [23]:
scores.masked_fill(mask==0, -float('inf'))

tensor([[[27.1610,    -inf,    -inf,    -inf,    -inf],
         [ 0.7026, 29.9493,    -inf,    -inf,    -inf],
         [ 0.1613, -0.9095, 27.4205,    -inf,    -inf],
         [-0.9768,  1.1273,  0.1608, 29.4293,    -inf],
         [-1.0512, -1.4273,  0.5826,  1.8717, 30.7920]]],
       grad_fn=<MaskedFillBackward0>)

In [24]:
torch.exp(torch.tensor(-float('inf')))

tensor(0.)

In [25]:
weight = F.softmax(scores, dim=-1)
weight

tensor([[[1.0000e+00, 3.2302e-12, 1.8800e-12, 6.0240e-13, 5.5924e-13],
         [1.9874e-13, 1.0000e+00, 3.9645e-14, 3.0392e-13, 2.3620e-14],
         [1.4503e-12, 4.9711e-13, 1.0000e+00, 1.4497e-12, 2.2102e-12],
         [6.2345e-14, 5.1123e-13, 1.9448e-13, 1.0000e+00, 1.0763e-12],
         [1.4815e-14, 1.0170e-14, 7.5897e-14, 2.7549e-13, 1.0000e+00]]],
       grad_fn=<SoftmaxBackward0>)

(masked)self attention

In [28]:
def scaled_dot_product_attn(q, k, v, maske=None):
    dim_k = k.size(-1)
    attn_scores = torch.bmm(q, k.transpose(1,2))/math.sqrt(dim_k)
    if mask is not None:
        scores.masked_fill(mask==0, -float('inf'))
    attn_weights = F.softmax(attn_scores, dim=-1)
    print(attn_weights)
    return torch.bmm(attn_weights,v)

In [29]:
scaled_dot_product_attn(q, k, v, mask)

tensor([[[1.0000e+00, 3.2302e-12, 1.8800e-12, 6.0240e-13, 5.5924e-13],
         [1.9874e-13, 1.0000e+00, 3.9645e-14, 3.0392e-13, 2.3620e-14],
         [1.4503e-12, 4.9711e-13, 1.0000e+00, 1.4497e-12, 2.2102e-12],
         [6.2345e-14, 5.1123e-13, 1.9448e-13, 1.0000e+00, 1.0763e-12],
         [1.4815e-14, 1.0170e-14, 7.5897e-14, 2.7549e-13, 1.0000e+00]]],
       grad_fn=<SoftmaxBackward0>)


tensor([[[-1.3221,  0.5982, -0.6843,  ..., -0.7314, -0.3412, -1.0361],
         [-0.0584,  1.5743,  0.1616,  ..., -0.1830,  0.6903,  0.4571],
         [-0.0127, -0.0299, -0.6004,  ...,  0.5157, -0.1864, -0.2521],
         [-0.5243, -0.5876,  0.5193,  ...,  1.9051,  0.3476,  1.1640],
         [ 1.6745,  0.0067,  0.0389,  ..., -0.0784,  0.3701,  1.2834]]],
       grad_fn=<BmmBackward0>)

源码分析

In [None]:
decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
# from encoder output
# (seq_len, batch_size, hidden_dim)
memory = torch.rand(10,32,512)
# 因为seq2seq的输入输出长度可能不同，如中英翻译
target = torch.rand(20,32,512)

In [32]:
decoder_layer

TransformerDecoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
  )
  (linear1): Linear(in_features=512, out_features=2048, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=2048, out_features=512, bias=True)
  (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
  (dropout3): Dropout(p=0.1, inplace=False)
)