In [7]:
import torch
import torch.nn as nn
import math

Token Embedding + Positional Encoding:
Token Embedding: 把token转换为vector
Positional Encoding:让模型知道词和词的顺序

Positional Encoding的作用是给每个token一个在句子中处于第几个的信息
`Z = PE(X)`，即 $z_i = x_i + p_i$，其中$x_i$是第$i$个词的embedding，$p_i$是第$i$个位置的encoding。

Sinusoidal Positional Encoding的原理：
原文的说法是
>we must inject some information about the relative or absolute position of the
tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the
bottoms of the encoder and decoder stacks. The positional encodings have the same dimension $d_{\text{model}}$
as the embeddings, so that the two can be summed.

也就是通过把多个不同频率的$\sin$/$\cos$组合在一起来编码position信息，使得这个PE同时带有相对和绝对（我个人理解为局部和整体）的position信息。
对于位置$pos \in \{0,1,2,\dots\}$，维度对的下标$i \in \{0,1,\dots,d_{\text{model}}/2-1\}$（第$i$对占据第$2i$和第$2i+1$两个维度）：
$$
\begin{aligned}
\text{PE}_{pos,\,2i} &= \sin\left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right) \\
\text{PE}_{pos,\,2i+1} &= \cos\left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right)
\end{aligned}
$$

最后输出$z_i = x_i + p_i$
个人理解：
假设d_model = 512:
说明对于一个token embedding来说，有一个512维的PE encoding，包含256组(pair) sin cos函数：
而这256对sin/cos函数，每一对的frequency都不一样（同一pair内的sin和cos共用同一个frequency）。也就是说，这相当于对position做了一组傅里叶特征(Fourier feature)编码。
先说为什么要一对sin cos:
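sin、cos在一个完整周期上是orthogonal的（推导见下文），二者不包含重复、冗余的信息；此外，同一frequency的一对sin/cos使得$\text{PE}_{pos+k}$可以写成$\text{PE}_{pos}$的线性变换（一个只依赖于$k$的旋转），这也是原论文给出的选择sin/cos的理由。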
再说说为什么要很多对不同的函数：
利用多频率正交信号，使得每一对 sin/cos 捕捉一种尺度下的位置信息
多尺度并联就构成了“序列的空间坐标系” → 模型可以在这个坐标系中，通过点积、加权感知 token 的相对位置


sin(x),cos(x)关于orthogonal的推导：
$$
\begin{aligned}
&\text{定义函数空间中的内积：} \quad \langle f, g \rangle = \int_a^b f(x)\, g(x) \, dx \\
&\text{设} \ f(x) = \sin(x),\ g(x) = \cos(x),\ \text{区间为} [0, 2\pi] \\
&\Rightarrow \langle \sin(x), \cos(x) \rangle = \int_0^{2\pi} \sin(x) \cos(x) \, dx \\
&\text{利用三角恒等式：} \quad \sin(x) \cos(x) = \frac{1}{2} \sin(2x) \\
&\Rightarrow \int_0^{2\pi} \sin(x) \cos(x) \, dx = \frac{1}{2} \int_0^{2\pi} \sin(2x) \, dx = 0 \\
&\therefore \quad \sin(x) \perp \cos(x)
\end{aligned}
$$


In [8]:
class TokenEmbedding(nn.Module):
    """Learned lookup table that turns integer token ids into d_model-sized vectors.

    Args:
        vocab_size: Number of rows in the embedding table (one per token id).
        d_model: Length of the vector stored for each token id.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Look up the vector for every id in ``x``.

        For ids of shape (batch_size, seq_len) the result has shape
        (batch_size, seq_len, d_model).
        """
        token_vectors = self.embedding(x)
        return token_vectors
    

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding that is added to token embeddings.

    For position ``pos`` and pair index ``i`` the precomputed table holds::

        pe[pos, 2i]     = sin(pos / 10000^(2i / d_model))
        pe[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))

    The table is built once for ``max_len`` positions and registered as a
    buffer named ``pe`` with shape (1, max_len, d_model); it is not a
    trainable parameter.

    Args:
        d_model: Size of the last dimension of the embeddings. Odd values are
            accepted: the final (even-indexed) column then holds a sine with
            no matching cosine.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()  # (max_len, 1)
        # exp(2i * -ln(10000) / d_model) == 1 / 10000^(2i / d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)  # even-indexed columns 0, 2, 4, ...
        # Odd-indexed columns 1, 3, 5, ... -- there are only d_model // 2 of
        # them, so drop the surplus frequency when d_model is odd (assigning
        # the full `angles` would fail with a shape mismatch in that case).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model), broadcastable over the batch

        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of positions 0..seq_len-1 to ``x``.

        ``x`` is expected to be (batch_size, seq_len, d_model) with seq_len
        not exceeding ``max_len``; the returned tensor has the same shape.
        """
        seq_len = x.size(1)
        # Take the first seq_len rows of the table, move the slice to x's
        # device, and let the leading size-1 dimension broadcast over the batch.
        return x + self.pe[:, :seq_len].to(x.device)


In [9]:
# Scratch check of the div_term exponent for a toy d_model of 32.
even_dims = torch.arange(0, 32, 2).float()   # even dimension indices 0, 2, ..., 30
neg_log_base = -math.log(10000.0)

print(even_dims)
print(even_dims * neg_log_base)              # scaled by -ln(10000), not yet divided by d_model
print(even_dims * (neg_log_base / 32))       # -ln(10000) * 2i / d_model: the exponent fed to exp() for div_term

# Drop the scratch names so this cell leaves the namespace as it found it.
del even_dims, neg_log_base

tensor([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18., 20., 22., 24., 26.,
        28., 30.])
tensor([  -0.0000,  -18.4207,  -36.8414,  -55.2620,  -73.6827,  -92.1034,
        -110.5241, -128.9448, -147.3654, -165.7861, -184.2068, -202.6275,
        -221.0482, -239.4689, -257.8895, -276.3102])
tensor([-0.0000, -0.5756, -1.1513, -1.7269, -2.3026, -2.8782, -3.4539, -4.0295,
        -4.6052, -5.1808, -5.7565, -6.3321, -6.9078, -7.4834, -8.0590, -8.6347])


In [10]:
# Scratch check: which axis does each slicing form cut on a (1, max_len, d_model) tensor?
pe_test = torch.zeros(1, 5000, 512)
for sliced in (
    pe_test[:, :3],      # slices axis 1 (positions); trailing axis kept whole
    pe_test[:, :3, :],   # same result, trailing axis written out explicitly
    pe_test[:, :, :3],   # slices axis 2 (features) instead
):
    print(sliced.shape)
# Release the dummy tensor and the last view that still references it.
del pe_test, sliced

torch.Size([1, 3, 512])
torch.Size([1, 3, 512])
torch.Size([1, 5000, 3])


In [11]:
class InputEmbedding(nn.Module):
    """Token embedding followed by positional encoding.

    Args:
        vocab_size: Forwarded to ``TokenEmbedding``.
        d_model: Forwarded to both ``TokenEmbedding`` and ``PositionalEncoding``.
        max_len: Forwarded to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, d_model, max_len=5000):
        super().__init__()
        self.token_embedding = TokenEmbedding(vocab_size=vocab_size, d_model=d_model)
        self.pos_encoding = PositionalEncoding(d_model=d_model, max_len=max_len)

    def forward(self, x):
        """Embed token ids and add positional information.

        ``x`` holds token ids of shape (batch_size, seq_len); both stages
        produce (batch_size, seq_len, d_model).
        """
        return self.pos_encoding(self.token_embedding(x))


In [12]:
def test_input_embedding():
    """Smoke-test InputEmbedding on a small random batch.

    Prints the input and output shapes and asserts that token ids of shape
    (batch_size, seq_len) come out as (batch_size, seq_len, d_model).

    Raises:
        AssertionError: If either shape differs from the expected one.
    """
    vocab_size = 1000
    d_model = 512
    seq_len = 10
    batch_size = 2

    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
    # max_len == seq_len: the positional table is exactly as long as the input.
    embedder = InputEmbedding(vocab_size, d_model, max_len=seq_len)

    out = embedder(input_ids)
    print("Input IDs shape:", input_ids.shape)         # (2, 10)
    print("Output embedding shape:", out.shape)        # (2, 10, 512)

    # Fail loudly instead of relying on someone reading the printed shapes.
    assert input_ids.shape == (batch_size, seq_len), input_ids.shape
    assert out.shape == (batch_size, seq_len, d_model), out.shape


test_input_embedding()

Input IDs shape: torch.Size([2, 10])
Output embedding shape: torch.Size([2, 10, 512])
