In [17]:
import torch
import math
import torch.nn as nn
class selfAttentionV1(nn.Module):
  """Single-head scaled dot-product self-attention with separate Q/K/V projections.

  Args:
    hidden_dim: size of the last dimension of the input; also used as the
      output size of each projection and as the softmax scaling factor.
  """

  def __init__(self, hidden_dim:int = 728) ->None:
    super().__init__()
    self.hidden_dim = hidden_dim
    # One independent linear projection each for queries, keys and values.
    self.W_q = nn.Linear(hidden_dim, hidden_dim)
    self.W_k = nn.Linear(hidden_dim, hidden_dim)
    self.W_v = nn.Linear(hidden_dim, hidden_dim)

  def forward(self, X):
    """Apply self-attention to X and return the attended values.

    X must have hidden_dim as its last dimension, e.g.
    (batch_size, seq_len, hidden_dim); the result has the same shape.
    The attention weights are printed as a side effect.
    """
    # Project the input three ways; each result keeps the shape of X.
    queries, keys, values = self.W_q(X), self.W_k(X), self.W_v(X)

    # Pairwise scores Q @ K^T over the last two dims: (..., seq_len, seq_len).
    raw_scores = queries @ keys.transpose(-2, -1)

    # Scale by sqrt(hidden_dim), then normalise along the last dimension so
    # that every row of the score matrix sums to 1.
    scaled_scores = raw_scores / math.sqrt(self.hidden_dim)
    attention_weight = torch.softmax(scaled_scores, dim=-1)
    print(attention_weight)

    # Weighted sum of values: (..., seq_len, seq_len) @ (..., seq_len, hidden_dim).
    return attention_weight @ values


In [13]:
# Random demo input laid out as (batch_size=3, seq_len=2, hidden_dim=4).
X = torch.randn((3, 2, 4))
X

tensor([[[ 0.3802,  0.4722,  0.1428,  0.1132],
         [ 0.5309,  0.4243,  0.3981, -0.0057]],

        [[ 0.5544,  1.2497, -0.9296,  1.0535],
         [ 0.2638,  0.4064,  0.1934,  2.0676]],

        [[ 1.9152,  0.3164,  0.6095,  0.5894],
         [ 1.9456,  1.9285,  1.0518,  0.9114]]])

In [18]:
# hidden_dim must equal the last dimension of X (4 here).
self_atn = selfAttentionV1(hidden_dim=4)
# Per sample the score matrix is (2 x 4) @ (4 x 2) -> (2 x 2).
self_atn(X)

tensor([[[0.4917, 0.5083],
         [0.4919, 0.5081]],

        [[0.5943, 0.4057],
         [0.5051, 0.4949]],

        [[0.4151, 0.5849],
         [0.3315, 0.6685]]], grad_fn=<SoftmaxBackward0>)


tensor([[[ 0.9084,  0.6090,  0.0637, -0.0840],
         [ 0.9084,  0.6089,  0.0637, -0.0840]],

        [[ 0.8165,  0.7386, -0.3221, -1.1168],
         [ 0.7677,  0.7507, -0.3931, -1.1077]],

        [[ 1.9613,  1.4878,  0.4216,  0.0239],
         [ 2.0266,  1.5347,  0.4132, -0.0323]]], grad_fn=<UnsafeViewBackward0>)

In [19]:
# dropout
# attention_mask
# output 矩阵映射
class selfAttentionV3(nn.Module):
  """Single-head self-attention with a fused QKV projection, optional mask,
  dropout on the attention weights and a final output projection.

  Args:
    hidden_dim: size of the last dimension of the input and of the output.
    dropout_rate: drop probability applied to the attention weights.
    *args, **kwargs: forwarded unchanged to ``nn.Module.__init__``.
  """

  def __init__(self, hidden_dim:int = 728,dropout_rate = 0.1,*args,**kwargs) ->None:
    super().__init__(*args, **kwargs)
    self.hidden_dim = hidden_dim
    # A single linear layer produces Q, K and V at once (3 * hidden_dim wide).
    self.proj = nn.Linear(hidden_dim, hidden_dim*3)
    self.attention_dropout = nn.Dropout(dropout_rate)
    self.output_proj = nn.Linear(hidden_dim, hidden_dim)

  def forward(self, X ,attention_mask = None):
    """Run masked self-attention over X and return the projected result.

    Args:
      X: tensor whose last dimension is hidden_dim, e.g.
        (batch_size, seq_len, hidden_dim).
      attention_mask: optional tensor broadcastable to the score matrix
        (..., seq_len, seq_len). Positions equal to 0 are hidden from
        attention; all other positions are kept.

    Returns:
      Tensor with the same shape as X. The pre-softmax scores are printed
      as a side effect.
    """
    QKV = self.proj(X)
    # Cut the fused projection into three hidden_dim-wide chunks.
    Q,K,V = torch.split(QKV, self.hidden_dim, dim=-1)
    attention_weight = Q @ K.transpose(-1, -2) / math.sqrt(self.hidden_dim)
    if attention_mask is not None:
      # A very large negative score makes the softmax weight of hidden
      # positions vanish.
      attention_weight = attention_weight.masked_fill(
        attention_mask==0,
        float("-1e20")
      )
    # Shows the (masked) scores before normalisation.
    print(attention_weight)
    attention_weight = torch.softmax(
      attention_weight, dim=-1
    )
    attention_weight = self.attention_dropout(attention_weight)
    attention_result = attention_weight @ V
    output = self.output_proj(attention_result)
    # Fix: the result was computed but never returned, so callers got None.
    return output
    

In [None]:
# Demo input laid out as (batch_size=3, seq_len=4, hidden_dim=4).
# seq_len has to be 4 so that it matches the 4-token mask built below.
X = torch.randn(3,4,4)
# One row per sample: 1 marks a visible token, 0 a hidden (padding) token.
mask = torch.tensor(
  [
    [1,1,1,0],
    [1,1,0,0],
    [1,0,0,0]
  ]
)
# (3, 4) -> (3, 1, 4) -> (3, 4, 4): every query position of a sample gets the
# same key mask, which matches the (batch, seq_len, seq_len) score matrix.
mask = mask.unsqueeze(dim=1).repeat(1,4,1)
# f-string plus a real call to size(); previously the placeholder text
# "{mask.size}" was printed literally.
print(f"repeat shape is:{mask.size()}")

In [None]:
# Learn about some useful PyTorch functions
'''
Example: take an input matrix
[
  1,2,3       
  4,5,6   
  7,8,9
]
mask matrix 
[
  1 1 1
  0 1 1
  0 0 1
]
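Multiplying element-wise by the mask sets every position where the mask is 0 (False) to 0.
Note: Tensor.masked_fill(mask, value) has the opposite polarity - it overwrites the positions where the mask is True.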
[
  1 2 3
  0 5 6
  0 0 9
]
'''

In [22]:
import torch

# 4 x 4 integer grid holding 0..15 in row-major order.
X = torch.arange(16).view(4, 4)
# Boolean identity matrix: True on the main diagonal, False elsewhere.
mask = torch.eye(n=4, dtype=torch.bool)
print(mask)
# masked_fill overwrites the positions where the mask is True, so the
# diagonal becomes 0 and every other entry is kept.
masked_X = X.masked_fill(mask, value=0)
masked_X

tensor([[ True, False, False, False],
        [False,  True, False, False],
        [False, False,  True, False],
        [False, False, False,  True]])


tensor([[ 0,  1,  2,  3],
        [ 4,  0,  6,  7],
        [ 8,  9,  0, 11],
        [12, 13, 14,  0]])

In [24]:
# Dropout is a common regularisation technique: it randomly sets some
# activations to 0.
m = nn.Dropout(0.2)  # p is the probability of dropping each element
input_tensor = torch.randn((5, 5))
output_tensor = m(input_tensor)
print("Input Tensor:\n", input_tensor)
print("Output Tensor after Dropout:\n", output_tensor)
# Each element is zeroed independently with probability 0.2, so roughly 20%
# of the entries become 0.
# The surviving entries are divided by 0.8 (= 1 - p), so their magnitude grows.

Input Tensor:
 tensor([[ 1.0157,  1.3349,  0.2062, -1.1796, -0.5577],
        [-0.4872, -1.5037, -1.1942,  0.0972,  0.1060],
        [ 0.6175, -0.3676, -0.3595,  1.3189, -0.2613],
        [ 2.2171, -0.8076,  1.6101,  0.1074,  1.2273],
        [-0.0859,  0.1804, -1.2247,  1.0781, -0.7553]])
Output Tensor after Dropout:
 tensor([[ 1.2696,  1.6687,  0.2577, -1.4745, -0.6972],
        [-0.6090, -1.8797, -1.4928,  0.1215,  0.1325],
        [ 0.0000, -0.4595, -0.4493,  0.0000, -0.0000],
        [ 2.7714, -0.0000,  2.0126,  0.0000,  1.5341],
        [-0.1073,  0.0000, -1.5309,  1.3476, -0.9441]])


In [None]:
# Two integer matrices with compatible inner dimensions: (2, 3) and (3, 2).
tensor1 = torch.tensor([
  [1, 2, 3],
  [4, 5, 6],
])
tensor2 = torch.tensor([[4, 5]] * 3)
# torch.matmul supports broadcasting, so it also handles matrix products of
# higher-dimensional (batched) tensors.
