### 小题1：概念与公式阐述

1. **词嵌入（Word Embedding）**：
   - 请解释词嵌入的定义及其作用。
   - 说明词嵌入如何解决传统词表示方法的局限性。
   - 举例说明一种常见的词嵌入模型及其特点。
2. **多头自注意力（Multi-Head Self-Attention）**：
   - 说明多头自注意力的核心思想。
   - 写出缩放点积注意力（Scaled Dot-Product Attention）的计算公式，并解释公式中各参数的含义。
   
### 回答1：
- 定义：词嵌入是一种将自然语言中的词语表示为低维、稠密的连续向量的方法，这些向量在空间中能够捕捉词语之间的语义和语法关系。作用：（1）让计算机能够“理解”词的语义（相似的词有相近的向量）；（2）将原本离散的符号（词）转换为模型可计算的数值形式；（3）降低特征维度，提升训练效率和泛化能力。
- 传统方法主要是 One-Hot 编码和词袋模型（Bag of Words, BoW），它们的局限性在于：（1）维度过高：词表中有多少个词，向量就有多长（通常上万或更多）；（2）稀疏性强：大多数维度都是 0，不利于模型学习；（3）缺乏语义信息：例如 One-Hot 中，“猫”和“狗”的向量完全正交，看不出它们的相似性。词嵌入用低维稠密向量表示词，并通过在语料上训练使语义相近的词在向量空间中彼此靠近，从而同时缓解了高维、稀疏和缺乏语义这三个问题。
- Word2Vec（Google，2013）特点：训练高效，能在大规模语料上快速学习。能捕捉词之间的语义关系，向量空间结构具有良好的可解释性。训练得到的词向量可以迁移到下游任务中作为预训练词向量使用。

### 回答2：
- 将输入的向量通过不同的线性映射拆分成 多个子空间（多个头，heads）；每个头在子空间上独立计算注意力（即学习不同的相关性模式，比如语法关系、语义关系等）；最后将所有头的输出拼接（concatenate）起来，再做一次线性变换得到最终结果。
- $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

其中：$Q$（Query）为查询矩阵，$K$（Key）为键矩阵，$V$（Value）为值矩阵；$d_k$ 是键（和查询）向量的维度，$\sqrt{d_k}$ 为缩放因子，除以它可以避免点积随维度增大而过大、使 softmax 进入梯度极小的饱和区；softmax 沿键的维度对每一行得分做归一化，得到和为 1 的注意力权重。

In [23]:
import numpy as np

# Seed NumPy's global random generator so later np.random draws are
# repeatable from run to run.
np.random.seed(114514)

def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute batched scaled dot-product attention.

    Evaluates ``softmax(Q @ K^T / sqrt(d_k)) @ V`` independently for every
    sample in the batch, using a numerically stable softmax.

    Args:
        query: Array-like of shape ``(N, L_q, d_k)``.
        key: Array-like of shape ``(N, L_k, d_k)``.
        value: Array-like of shape ``(N, L_k, d_v)``.
        mask: Optional array whose leading axis is the batch axis and whose
            remaining axes broadcast against ``(L_q, L_k)``. Truthy entries
            are kept; falsy entries have their score replaced by ``-1e9``
            before the softmax.

    Returns:
        A tuple ``(output, attention_weights)`` where ``output`` has shape
        ``(N, L_q, d_v)`` and ``attention_weights`` is a float64 array of
        shape ``(N, L_q, L_k)`` whose rows sum to 1.

    Raises:
        ValueError: If ``query`` is not 3-dimensional.
    """
    query = np.asarray(query)
    key = np.asarray(key)
    value = np.asarray(value)
    _, _, d_k = query.shape  # also enforces the 3-D (N, L_q, d_k) layout

    # One batched matmul replaces the per-sample Python loop.
    scores = query @ np.swapaxes(key, -1, -2) / np.sqrt(d_k)  # (N, L_q, L_k)

    if mask is not None:
        keep = np.asarray(mask, dtype=bool)
        if keep.ndim < 3:
            # The first axis indexes the batch; pad the missing axes right
            # after it so the rest broadcasts over (L_q, L_k) per sample.
            keep = keep.reshape(
                keep.shape[:1] + (1,) * (3 - keep.ndim) + keep.shape[1:]
            )
        scores = np.where(keep, scores, -1e9)

    # Subtracting the row maximum keeps exp() from overflowing.
    shifted = scores - scores.max(axis=-1, keepdims=True)
    attention_weights = np.exp(shifted)
    attention_weights /= attention_weights.sum(axis=-1, keepdims=True)
    attention_weights = attention_weights.astype(np.float64, copy=False)

    # The result takes its shape from `value` (so d_v may differ from d_k and
    # L_k from L_q) and is always floating point, so integer queries no
    # longer truncate it.
    if np.issubdtype(query.dtype, np.floating):
        out_dtype = query.dtype
    else:
        out_dtype = np.float64
    output = (attention_weights @ value).astype(out_dtype, copy=False)

    return output, attention_weights

In [24]:
def multi_head_attention(embed_size, num_heads, x, mask=None):
    """Apply multi-head self-attention to ``x`` with random projections.

    Four ``(D, D)`` projection matrices (query, key, value, output) are drawn
    from ``np.random.randn`` on every call. The projected input is split into
    ``num_heads`` slices of the feature axis, each slice is run through
    ``scaled_dot_product_attention``, and the concatenated head outputs are
    multiplied by the output projection.

    Args:
        embed_size: Accepted for interface compatibility but not read; the
            model width ``D`` is taken from ``x.shape[-1]``.
        num_heads: Number of attention heads; must divide ``D``.
        x: Array-like of shape ``(N, L, D)``.
        mask: Optional boolean array broadcastable to ``(N, L, L)`` (for
            example ``(L, L)`` shared by all samples, or ``(N, L, L)``). It is
            forwarded to ``scaled_dot_product_attention``, where truthy
            entries are kept and falsy entries are suppressed.

    Returns:
        A tuple ``(output, all_weights)`` with shapes ``(N, L, D)`` and
        ``(N, num_heads, L, L)``.

    Raises:
        ValueError: If ``D`` is not divisible by ``num_heads``.
    """
    x = np.asarray(x)
    N, seq_len, D = x.shape
    # Validate before deriving head_dim, and with an exception that is not
    # stripped under `python -O`.
    if D % num_heads != 0:
        raise ValueError(
            f"feature size {D} is not divisible by num_heads={num_heads}"
        )
    head_dim = D // num_heads

    # The draw order (Q, K, V, O) is kept so seeded runs give the same
    # matrices as before.
    # NOTE(review): unscaled N(0, 1) weights produce very large scores, which
    # is why the printed attention weights in this notebook are nearly
    # one-hot; consider scaling the initialisation.
    W_Q = np.random.randn(D, D)
    W_K = np.random.randn(D, D)
    W_V = np.random.randn(D, D)
    W_O = np.random.randn(D, D)

    def split_heads(t):
        # (N, L, D) -> (N * num_heads, L, head_dim); head h owns feature
        # columns [h * head_dim, (h + 1) * head_dim).
        t = t.reshape(N, seq_len, num_heads, head_dim)
        return t.transpose(0, 2, 1, 3).reshape(N * num_heads, seq_len, head_dim)

    head_mask = None
    if mask is not None:
        # Give every (sample, head) pair its own sample's mask. Previously
        # the whole mask was handed to a batch-of-one call, which always
        # picked mask[0] regardless of the sample being processed.
        per_sample = np.broadcast_to(
            np.asarray(mask, dtype=bool), (N, seq_len, seq_len)
        )
        head_mask = np.repeat(per_sample, num_heads, axis=0)

    # Heads are folded into the batch axis so a single call covers them all.
    heads_out, heads_attn = scaled_dot_product_attention(
        split_heads(x @ W_Q),
        split_heads(x @ W_K),
        split_heads(x @ W_V),
        head_mask,
    )

    # Undo the head split: (N * H, L, head_dim) -> (N, L, D), which is the
    # concatenation of the head outputs along the feature axis.
    merged = heads_out.reshape(N, num_heads, seq_len, head_dim)
    merged = merged.transpose(0, 2, 1, 3).reshape(N, seq_len, D)

    output = merged @ W_O
    all_weights = heads_attn.reshape(N, num_heads, seq_len, seq_len)

    return output, all_weights

In [28]:
# Smoke test: run the NumPy multi-head attention on random input and print
# the result shapes plus the first few output values and attention weights.
batch_size = 10
seq_len = 20
embed_size = 128
num_heads = 8
# Named `inputs` rather than `input` so the built-in input() is not shadowed.
inputs = np.random.randn(batch_size, seq_len, embed_size)
output, weights = multi_head_attention(embed_size, num_heads, inputs)

print(output.shape, weights.shape)
print(output[0][0][:10], weights[0][0][0][:10])

(10, 20, 128) (10, 8, 20, 20)
[ -64.9601117   -38.18690465  -49.38122126    1.33917629  -55.27862891
 -121.93890998 -107.96320283 -141.02273922   33.95973751  -29.02877682] [5.20961722e-145 2.71639176e-113 1.79900964e-097 2.92622563e-101
 1.92231371e-060 8.65088333e-081 1.56849882e-122 8.85718800e-066
 3.92946177e-186 1.73323736e-033]


In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    """Multi-head attention implemented with PyTorch.

    The query, key and value inputs are each passed through a learned linear
    projection, split into ``num_heads`` heads of width
    ``embed_size // num_heads``, attended with scaled dot-product attention,
    concatenated, and passed through a final linear projection.

    Args:
        embed_size: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``embed_size``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``embed_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_size, num_heads, dropout=0.1):
        super().__init__()
        if embed_size % num_heads != 0:
            raise ValueError(
                f"embed_size={embed_size} is not divisible by "
                f"num_heads={num_heads}"
            )
        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads
        self.dropout = nn.Dropout(dropout)

        self.W_Q = nn.Linear(embed_size, embed_size)
        self.W_K = nn.Linear(embed_size, embed_size)
        self.W_V = nn.Linear(embed_size, embed_size)

        self.W_O = nn.Linear(embed_size, embed_size)

    def _split_heads(self, t):
        """Reshape ``(N, L, embed_size)`` to ``(N, num_heads, L, head_dim)``."""
        batch = t.size(0)
        return t.view(batch, -1, self.num_heads, self.head_dim).transpose(1, 2)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V`` over the last two axes.

        Args:
            Q: Tensor of shape ``(..., L_q, d_k)``.
            K: Tensor of shape ``(..., L_k, d_k)``.
            V: Tensor of shape ``(..., L_k, d_v)``.
            mask: Optional mask broadcastable to the score shape
                ``(..., L_q, L_k)``. Positions where the mask is True are
                filled with ``-1e9`` before the softmax, i.e. True means
                "masked out". Note this is the opposite polarity of the NumPy
                ``scaled_dot_product_attention`` in this file, where truthy
                entries are kept. A 3-D ``(N, L_q, L_k)`` mask is expanded
                over the head axis when the scores are 4-D.

        Returns:
            A tuple ``(y, weights)``: the attended values and the normalised
            attention weights (before dropout).
        """
        d_k = Q.size(-1)
        # matmul broadcasts over any leading axes, so both (N, L, d) and
        # (N, heads, L, d) inputs work; bmm only accepts 3-D tensors.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)

        if mask is not None:
            mask = mask.to(device=scores.device, dtype=torch.bool)
            if mask.dim() == 3 and scores.dim() == 4:
                mask = mask.unsqueeze(1)  # add the head axis
            scores = scores.masked_fill(mask, -1e9)

        weights_softmax = F.softmax(scores, dim=-1)
        # Dropout acts on the weights used for aggregation; the weights
        # returned to the caller stay normalised.
        y = torch.matmul(self.dropout(weights_softmax), V)

        return y, weights_softmax

    def forward(self, query, key, value, mask=None):
        """Run multi-head attention.

        Args:
            query: Tensor of shape ``(N, L_q, embed_size)``.
            key: Tensor of shape ``(N, L_k, embed_size)``.
            value: Tensor of shape ``(N, L_k, embed_size)``.
            mask: Optional mask, True = masked out; see
                ``scaled_dot_product_attention`` for accepted shapes.

        Returns:
            A tuple ``(out, attention)`` with shapes ``(N, L_q, embed_size)``
            and ``(N, num_heads, L_q, L_k)``.
        """
        batch, q_len, _ = query.shape

        Q = self._split_heads(self.W_Q(query))  # (N, heads, L_q, head_dim)
        K = self._split_heads(self.W_K(key))    # (N, heads, L_k, head_dim)
        V = self._split_heads(self.W_V(value))  # (N, heads, L_k, head_dim)

        out, attention = self.scaled_dot_product_attention(Q, K, V, mask)
        # Merge the heads back into the feature axis.
        out = out.transpose(1, 2).contiguous().view(batch, q_len, self.embed_size)

        out = self.W_O(out)
        return out, attention

In [33]:
# NOTE(review): this cell calls the NumPy multi_head_attention function again;
# the PyTorch MultiHeadAttention module defined in this notebook is not
# exercised here.
batch_size = 10
seq_len = 20
embed_size = 128
num_heads = 8
# Named `inputs` rather than `input` so the built-in input() is not shadowed.
inputs = np.random.randn(batch_size, seq_len, embed_size)
output, weights = multi_head_attention(embed_size, num_heads, inputs)

print(output.shape, weights.shape)
print(output[0][0][:10], weights[0][0][0][:10])

(10, 20, 128) (10, 8, 20, 20)
[-196.87236493  130.3874899  -112.67200377  111.43094804   65.09261902
   24.64296057 -162.82767893  147.82880116  -29.39549941  210.36298327] [7.86194491e-103 2.09418313e-038 4.83344845e-079 6.58217747e-033
 1.48395612e-013 9.99990519e-001 3.20118328e-034 1.82026292e-068
 7.31896795e-107 1.15190036e-085]
