## 注意力机制
注意力机制就是建模相似度  
```
mapping a query and a set of key-value pairs to an output
```
**output** computed as a weighted sum of the values  
**weight** compatibility between query and key

In [None]:
# 导个包先
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

### Attention is all you need
Transformer 架构的开山之作，最开始用于神经机器翻译（NMT）领域  

#### 相关概念介绍
`embedding`: 把输入编码成等长的向量 从现实空间 -> 向量空间   
`Q for query`: 在这个任务里面，Q 就是一句话的 embedding，即 n_q 个长为 d_q 的向量 `[n_q, d_q]`   
`KV for key and value`: 我们这里是计算自注意力 所以key和value也是由同样的输入算出来的   

#### 网络结构
![](../../Source/transformer.png)

#### 注意力公式
$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

#### 流程分解
1. matmul  U = Q @ K^T
2. scale   U = U / sqrt(d_k)
3. mask    U = U.masked_fill(mask, -inf)
4. softmax A = softmax(U)
5. matmul  O = A @ V

In [None]:
# Smoke test: run the attention module on small random inputs.
batch = 1
n_q = 2            # number of query vectors
n_k = n_v = 4      # number of key vectors / value vectors
d_q = d_k = 128    # feature size of each query / key vector
d_v = 64           # feature size of each value vector

# The three tensors are drawn in q, k, v order.
q = torch.randn(batch, n_q, d_q)
k = torch.randn(batch, n_k, d_k)
v = torch.randn(batch, n_v, d_v)

# All-False boolean mask with one entry per (query, key) pair.
mask = torch.zeros(batch, n_q, n_k, dtype=torch.bool)

# NOTE(review): `Attention` is defined in a separate cell; that cell has to
# be executed before this one.
attention = Attention()
attn, output = attention(q, k, v, mask=mask)

In [None]:
class Attention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    The module holds no learnable parameters.
    """

    def __init__(self):
        super().__init__()

    def forward(self, q, k, v, mask=None):
        """Mix the values using query/key compatibility as weights.

        Args:
            q: Query tensor of shape ``[batch, n_q, d_k]``.
            k: Key tensor of shape ``[batch, n_k, d_k]``.
            v: Value tensor of shape ``[batch, n_k, d_v]``.
            mask: Optional boolean tensor broadcastable to
                ``[batch, n_q, n_k]``. Positions that are ``True`` are
                filled with ``-inf`` before the softmax and therefore get
                zero weight. A row that is masked everywhere yields NaN.

        Returns:
            A tuple ``(attn, output)``: the attention weights of shape
            ``[batch, n_q, n_k]`` and the weighted values of shape
            ``[batch, n_q, d_v]``.
        """
        # ``shape`` is an attribute, not a method; only the feature size of
        # the keys is needed for the scaling step.
        d_k = k.shape[-1]

        # 1. matmul + 2. scale. A plain Python power is used because
        #    ``torch.sqrt`` only accepts tensors.
        u = torch.bmm(q, k.transpose(1, 2)) / d_k ** 0.5

        # 3. mask
        if mask is not None:
            u = u.masked_fill(mask, float("-inf"))

        # 4. softmax over the key axis, 5. matmul with the values.
        attn = F.softmax(u, dim=-1)
        return attn, torch.bmm(attn, v)

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention followed by a residual connection and LayerNorm.

    Args:
        dim: Feature size of the incoming query/key/value vectors and of the
            output.
        nums_heads: Number of attention heads.
        d_k: Total projected size of queries and keys (split across heads).
            Defaults to ``dim``.
        d_v: Total projected size of values (split across heads).
            Defaults to ``dim``.

    Raises:
        ValueError: If ``d_k`` or ``d_v`` is not divisible by ``nums_heads``.
    """

    def __init__(self, dim=768, nums_heads=8, d_k=None, d_v=None):
        super().__init__()

        # Projection sizes are explicit arguments instead of being read from
        # notebook-level globals.
        d_k = dim if d_k is None else d_k
        d_v = dim if d_v is None else d_v
        if d_k % nums_heads != 0 or d_v % nums_heads != 0:
            raise ValueError(
                f"d_k ({d_k}) and d_v ({d_v}) must be divisible by "
                f"nums_heads ({nums_heads})"
            )

        self.W_Q = nn.Linear(dim, d_k)
        self.W_K = nn.Linear(dim, d_k)
        self.W_V = nn.Linear(dim, d_v)
        self.fc = nn.Linear(d_v, dim)
        self.ln = nn.LayerNorm(dim)

        self.nums_heads = nums_heads
        self.head_dim = d_k // nums_heads  # per-head query/key size

    def forward(self, q, k, v):
        """Compute multi-head attention.

        Args:
            q: Queries of shape ``[batch, n_q, dim]``.
            k: Keys of shape ``[batch, n_k, dim]``.
            v: Values of shape ``[batch, n_k, dim]``.

        Returns:
            Tensor of shape ``[batch, n_q, dim]``: the layer-normalised sum
            of the projected attention output and the original queries.
        """
        residual = q
        batch, n_q = q.shape[0], q.shape[1]
        n_k, n_v = k.shape[1], v.shape[1]
        heads = self.nums_heads

        # [batch, n, heads * d] -> [batch, heads, n, d]
        q = self.W_Q(q).reshape(batch, n_q, heads, -1).transpose(1, 2)
        k = self.W_K(k).reshape(batch, n_k, heads, -1).transpose(1, 2)
        v = self.W_V(v).reshape(batch, n_v, heads, -1).transpose(1, 2)

        # ``@`` broadcasts over the leading batch/head axes; ``torch.bmm``
        # only accepts 3-D inputs.
        attn = q @ k.transpose(2, 3)  # [batch, heads, n_q, n_k]
        attn = F.softmax(attn / self.head_dim ** 0.5, dim=-1)

        # Merge the heads back: [batch, heads, n_q, d] -> [batch, n_q, heads * d]
        output = (attn @ v).transpose(1, 2).reshape(batch, n_q, -1)
        output = self.fc(output)

        return self.ln(output + residual)

In [None]:
class MultiHeadSelfAttention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection.

    Args:
        dim: Feature size of the input and output tokens.
        num_heads: Number of attention heads; ``dim`` is split evenly
            across them.
    """

    def __init__(self, dim, num_heads=8):
        super().__init__()
        self.qkv = nn.Linear(in_features=dim, out_features=dim * 3)
        self.proj = nn.Linear(in_features=dim, out_features=dim)
        self.num_heads = num_heads

    def forward(self, x):
        """Apply self-attention to ``x`` of shape ``[B, N, C]``.

        Returns:
            Tensor of shape ``[B, N, C]``.
        """
        B, N, C = x.shape
        head_dim = C // self.num_heads

        # [B, N, 3 * C] -> [B, N, 3, heads, head_dim] -> [3, B, heads, N, head_dim]
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
        # Each of q, k, v: [B, heads, N, head_dim]
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Scale by the per-head key size (the d_k of the attention formula),
        # not by the full embedding size C.
        attn = (q @ k.transpose(-2, -1)) * head_dim ** -0.5
        attn = attn.softmax(dim=-1)

        # [B, heads, N, head_dim] -> [B, N, heads, head_dim] -> [B, N, C]
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        return x

### An image is worth 16×16 words
Vision Transformer 开山之作 把transformer引入到了CV领域

#### 相关概念介绍
这里只是把单词换成了16×16的图像块 一张图像作为一个句子来处理

#### 注意力公式
$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

#### 流程分解

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Shape constants for the ViT walkthrough:
# batch size, number of patches, channels / embedding dim.
B, N, C = 1, 196, 768

# One random 3-channel 224x224 input batch.
image = torch.randn(1, 3, 224, 224)

PatchEmbed实际上利用kernel_size和stride都等于patch_size的二维卷积将图像分块  
```python
                [1, 3, 224, 224]
Conv2d      ->  [1, 768, 14, 14]
flatten     ->  [1, 768, 196]
transpose   ->  [1, 196, 768]
```

In [None]:
class PatchEmbed(nn.Module):
    """Convert an image batch into a sequence of patch embeddings.

    A 2-D convolution whose kernel size and stride both equal ``patch_size``
    projects every patch to ``embed_dim`` channels.

    Args:
        img_size: Accepted but not used by this implementation.
        patch_size: Kernel size and stride of the projection convolution.
        in_c: Number of input channels.
        embed_dim: Number of output channels of the projection.
    """

    def __init__(self, img_size=224, patch_size=16, in_c=3, embed_dim=768):
        super().__init__()
        self.proj = nn.Conv2d(in_c, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        """Map ``[B, in_c, H, W]`` to ``[B, num_patches, embed_dim]``."""
        feature_map = self.proj(x)                  # [B, embed_dim, H', W']
        tokens = feature_map.flatten(start_dim=2)   # [B, embed_dim, H' * W']
        return tokens.permute(0, 2, 1)              # [B, H' * W', embed_dim]

# Embed the image with default PatchEmbed settings and show the result shape.
patchembed = PatchEmbed()
image = patchembed(image)
print(image.size())

拆分成多头计算注意力然后拼起来, patch_num应该是196+1, 有一个cls_token  
一个线性层计算出qkv, 然后就是常规的注意力操作
```python
                [1, 196, 768]
linear      ->  [1, 196, 2304]
reshape     ->  [1, 196, 3, 8, 96]
permute     ->  [3, 1, 8, 196, 96]
q @ k       ->  [1, 8, 196, 196]
attn @ v    ->  [1, 8, 196, 96]
transpose   ->  [1, 196, 8, 96]
reshape     ->  [1, 196, 768]
```

In [None]:
class Attention(nn.Module):
    """Multi-head self-attention over a sequence of patch tokens.

    Args:
        dim: Feature size of the input and output tokens.
        num_heads: Number of attention heads; ``dim`` is split evenly
            across them.
    """

    def __init__(self, dim, num_heads=8):
        super().__init__()
        self.qkv = nn.Linear(in_features=dim, out_features=dim * 3)
        self.proj = nn.Linear(in_features=dim, out_features=dim)
        self.num_heads = num_heads

    def forward(self, x):
        """Apply self-attention to ``x`` of shape ``[B, N, C]``.

        Returns:
            Tensor of shape ``[B, N, C]``.
        """
        B, N, C = x.shape
        head_dim = C // self.num_heads

        # [batch, blocks, 3, heads, dim // heads] -> [3, batch, heads, blocks, dim // heads]
        # The q/k/v axis must come first so that indexing axis 0 below
        # selects queries, keys and values rather than individual patches.
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
        # Each of q, k, v: [batch, heads, blocks, dim // heads]
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Scale by the per-head key size (the d_k of the attention formula).
        attn = (q @ k.transpose(-2, -1)) * head_dim ** -0.5
        attn = attn.softmax(dim=-1)

        # [batch, heads, blocks, d] -> [batch, blocks, heads, d] -> [batch, blocks, C]
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        return x
    
# Run the patch tokens through one self-attention layer and show the shape.
attention = Attention(C)
image = attention(image)
print(image.size())

In [None]:
class mlp(nn.Module):
    """Position-wise feed-forward layer with residual connection and LayerNorm.

    Args:
        dim: Feature size of the input and output; the hidden layer is
            four times as wide.
    """

    def __init__(self, dim) -> None:
        super().__init__()

        self.fc = nn.Sequential(
            nn.Linear(dim, dim * 4),
            nn.ReLU(),
            # Project back down to ``dim`` so the residual sum is well-formed.
            nn.Linear(dim * 4, dim),
        )
        # Normalisation applied after the residual sum in ``forward``.
        self.ln = nn.LayerNorm(dim)

    def forward(self, x):
        """Return ``LayerNorm(fc(x) + x)``; the last axis of ``x`` has size ``dim``."""
        residual = x
        output = self.fc(x)

        return self.ln(output + residual)

BatchNorm 是在一个 batch 的各个样本之间、对每个通道做归一化，可学习参数大小是 2×C（每个通道一组 γ 和 β）  
LayerNorm 则是在单个样本内部的特征维度上做归一化，可学习参数大小是 2×normalized_shape，与 batch size 无关。

可以看看这个，LayerNorm就是加在最后D个维度上，被称为normalized_shape  
https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html

为什么对于图像是`nn.LayerNorm([C, H, W])`而对于NLP是`nn.LayerNorm(embedding_dim)`  
考虑单独的实例，一张图像的组成就是[C, H, W]， 而一个单词只是[N]维的向量，这里做LN是在单词维度上，而不是句子，所以只需要最后一个维度。

In [None]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last axis with a learnable scale and shift.

    Args:
        dim: Size of the last axis of the input.
        eps: Constant added to the variance before taking the square root.
    """

    def __init__(self, dim, eps=1e-12) -> None:
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(dim))
        # ``nn.Parameter`` is the class; lowercase ``nn.parameter`` is the
        # module that contains it and cannot be called.
        self.beta = nn.Parameter(torch.zeros(dim))
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last axis and apply ``gamma`` / ``beta``."""
        # Statistics are computed per instance over the feature axis only
        # (the last one), using the biased variance.
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, unbiased=False, keepdim=True)

        out = (x - mean) / torch.sqrt(var + self.eps)  # zero mean, unit variance
        # Per-feature affine transform, which is why gamma and beta have
        # ``dim`` entries.
        out = self.gamma * out + self.beta

        return out

In [None]:
class Block(nn.Module):
    """Pre-norm Transformer encoder block: self-attention then an MLP.

    Both sub-layers are wrapped in a residual connection with drop path.

    Args:
        dim: Feature size of the tokens.
        num_heads: Number of attention heads.
        mlp_dim: Hidden size of the feed-forward sub-layer.
        dropout_rate: Dropout probability inside the feed-forward sub-layer;
            also used as the drop-path probability.
    """

    def __init__(self, dim, num_heads, mlp_dim, dropout_rate=0.1):
        super(Block, self).__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.attn = Attention(dim, num_heads)
        # The feed-forward sub-layer is built inline because no ``MLP`` class
        # taking (dim, mlp_dim, dropout_rate) is defined in this file.
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(mlp_dim, dim),
            nn.Dropout(dropout_rate),
        )
        self.drop_path_prob = dropout_rate  # probability of dropping a residual branch

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Self-attention sub-layer with residual connection.
        attention_output = self.attn(self.norm1(x))
        x = x + self.drop_path(attention_output)

        # Feed-forward sub-layer with residual connection.
        mlp_output = self.mlp(self.norm2(x))
        x = x + self.drop_path(mlp_output)

        return x

    def drop_path(self, x):
        """Randomly zero whole samples of ``x`` during training.

        Each sample along axis 0 is kept with probability
        ``1 - drop_path_prob`` and rescaled by the inverse of that
        probability. In eval mode, or when the probability is not positive,
        ``x`` is returned unchanged.
        """
        if not self.training or self.drop_path_prob <= 0:
            return x

        keep_prob = 1.0 - self.drop_path_prob
        if keep_prob <= 0:
            # Everything is dropped; avoid dividing by zero.
            return torch.zeros_like(x)

        # One Bernoulli draw per sample, with singleton axes matching the
        # rank of ``x`` so the product keeps the shape of ``x``.
        mask_shape = (x.shape[0],) + (1,) * (x.dim() - 1)
        mask = torch.rand(mask_shape, device=x.device) < keep_prob
        return x / keep_prob * mask  # rescale the kept paths

In [None]:
class VisionTransformer(nn.Module):
    """Vision Transformer encoder that returns the class-token features.

    Args:
        image_size: Side length of the (square) input images.
        patch_size: Side length of each (square) patch.
        num_classes: Accepted for interface compatibility; no classification
            head is built from it in this implementation.
        embed_dim: Feature size of every token.
        depth: Number of stacked encoder blocks.
        num_heads: Number of attention heads per block.
    """

    def __init__(self, image_size, patch_size, num_classes, embed_dim, depth, num_heads):
        super(VisionTransformer, self).__init__()

        # Number of patches per image.
        self.num_patches = (image_size // patch_size) ** 2

        # Patch embedding; PatchEmbed names its size argument ``img_size``.
        self.patch_embed = PatchEmbed(img_size=image_size, patch_size=patch_size, embed_dim=embed_dim)

        # Learnable class token, shared across the batch.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # Learnable position embedding: one slot per patch plus the class token.
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_dim))

        # Stack of encoder blocks. Block takes ``dim`` (not ``embed_dim``) and
        # has no ``depth`` argument; the depth is the number of blocks.
        self.blocks = nn.Sequential(
            *[
                Block(dim=embed_dim, num_heads=num_heads, mlp_dim=4 * embed_dim)
                for _ in range(depth)
            ]
        )

    def forward(self, x):
        """Encode an image batch and return the class-token output.

        Args:
            x: Image batch; assumed to be ``[B, 3, image_size, image_size]``
                so that the patch count matches ``pos_embed``.

        Returns:
            Tensor of shape ``[B, embed_dim]``.
        """
        # Patch embedding.
        x = self.patch_embed(x)

        # Broadcast the class token over the batch (no copy is made).
        cls_token = self.cls_token.expand(x.shape[0], -1, -1)

        # Prepend the class token to the patch tokens.
        x = torch.cat((cls_token, x), dim=1)

        # Add the position embedding.
        x = x + self.pos_embed

        # Run the encoder blocks.
        x = self.blocks(x)

        # Return the class token.
        return x[:, 0]

# Hyperparameters for the VisionTransformer instance.
image_size, patch_size = 224, 16
num_classes = 1000
embed_dim = 768
depth = num_heads = 12

# Build the model.
model = VisionTransformer(
    image_size=image_size,
    patch_size=patch_size,
    num_classes=num_classes,
    embed_dim=embed_dim,
    depth=depth,
    num_heads=num_heads,
)

# Example input: one random 3-channel image.
input_data = torch.randn(1, 3, image_size, image_size)

# Forward pass.
output = model(input_data)

In [10]:
import torch
# Expand a single [1, 1, 768] class token across a batch of 10 and show the shape.
cls_token = torch.nn.Parameter(torch.zeros(1, 1, 768)).expand(10, -1, -1)
cls_token.shape

torch.Size([10, 1, 768])