In [None]:
# Input: tokenize one example sentence with the pretrained tokenizer
from transformers import AutoTokenizer

model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

text = "time flies like an arrow"
# Special tokens are switched off so that only the sentence's own tokens are encoded.
inputs = tokenizer(text, add_special_tokens=False, return_tensors="pt")
inputs["input_ids"]  # 5 token ids once special tokens are excluded (per the original note)

In [None]:
# Mask matrix (lower triangular)
import torch

# One row/column per input token.
seq_len = inputs.input_ids.shape[-1]
# Ones on and below the diagonal, zeros above; a leading axis of size 1 is added in front.
mask = torch.ones(seq_len, seq_len).tril()[None]
mask[0]

In [None]:
# Embeddings and scaled dot-product scores
from math import sqrt

from torch import nn
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_ckpt)
# NOTE: this embedding layer is freshly constructed (not loaded from the checkpoint),
# so its weights are randomly initialised and the numbers below differ between fresh runs.
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
inputs_embeds = token_emb(inputs.input_ids)
# A bare expression in the middle of a cell is never displayed, so the shape is checked
# explicitly instead (expected [1, 5, 768] for this input, per the original note).
assert inputs_embeds.shape == (*inputs.input_ids.shape, config.hidden_size)

# Self-attention: queries, keys and values are all the same embeddings.
query = key = value = inputs_embeds
dim_k = key.size(-1)
# Pairwise dot products between tokens, divided by sqrt(dim_k).
scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
scores

In [None]:
# Apply the attention mask
# masked_fill_ works in place: `scores` itself is overwritten with -inf wherever the mask is 0.
scores.masked_fill_(mask.eq(0), float("-inf"))

In [None]:
# スケール化ドット積アテンションを調整
import torch.nn.functional as F
def scaled_dot_product_attention(query, key, value, mask=None):
  """Compute scaled dot-product attention, optionally masked.

  The last two dimensions of each tensor are treated as (sequence, feature);
  any leading dimensions are batch dimensions and are broadcast by
  ``torch.matmul``. 3-D inputs behave exactly as with ``torch.bmm``, and
  tensors with extra leading dimensions (e.g. a heads axis) are accepted too.

  Args:
    query: Tensor of shape (..., seq_q, dim_k).
    key: Tensor of shape (..., seq_k, dim_k).
    value: Tensor of shape (..., seq_k, dim_v).
    mask: Optional tensor broadcastable to (..., seq_q, seq_k). Positions
      where ``mask == 0`` are excluded from attention.

  Returns:
    Tensor of shape (..., seq_q, dim_v): the attention-weighted sum of
    ``value``.

  Note:
    If a query row is masked at every key position, its softmax is taken
    over all ``-inf`` values and the corresponding output row is NaN.
  """
  dim_k = query.size(-1)
  # Transpose only the last two axes so leading batch axes are left untouched.
  scores = torch.matmul(query, key.transpose(-2, -1)) / sqrt(dim_k)
  if mask is not None:
    # -inf scores become exactly zero weight after the softmax.
    scores = scores.masked_fill(mask == 0, float("-inf"))
  weights = F.softmax(scores, dim=-1)
  return torch.matmul(weights, value)