## 导入包

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

torch.manual_seed(24)

<torch._C.Generator at 0x20f5fd9b3b0>

## word2vec

## nn.Linear

In [2]:
linear = nn.Linear(in_features=4, out_features=5, bias=False)
# 权重矩阵的行数等于输出特征数（5），列数等于输入特征数（4）
print (f"linear weight shape: {linear.weight.shape}")   # [5, 4]
x = torch.rand(2, 4)  # [2, 4]
y1 = linear(x)   # [2, 4]*[4,5] - > [2,5]
print(y1.shape)
"""
x代表两个文本 文本各有四个特征
linear代表全连接 将四个特征投射到五个特征
特征就是特定的文本
"""

linear weight shape: torch.Size([5, 4])
torch.Size([2, 5])


'\nx代表两个文本 文本各有四个特征\nlinear代表全连接 将四个特征投射到五个特征\n特征就是特定的文本\n'

## torch.matmul

In [13]:
# y2 = x @ linear.weight.T
y2 = torch.matmul(x, linear.weight.T) # [2,4]*[4,5] --> [2,5]
print(y2.shape)

print(torch.mean(y1 - y2).item())

torch.Size([2, 5])
0.0


## x_idx & x

In [14]:
# 3个文档，每个文档2个词
x_idx = [
    [0, 1],  # 类别序号/下标， 每篇文章有相同数量的单词，切割或者填充到指定长度
    [0, 3],  # 若单词的种类很多但是标签大的单词数量又少 此时onehot就不是很好用了
    [1, 2]
]
# onehot
x = [
    [1, 1, 0, 0],  # 文本1
    [1, 0, 0, 1],  # 文本2
    [0, 1, 1, 0]  # 文本3
]

In [20]:
x = torch.tensor(x, dtype=torch.float32)  # [n,m] n个样本，m维大小(词表的大小)
y1 = linear(x)  # [n,m]*[m,o] -> [n,o] o表示输出特征向量大小 --> 计算量n*m*o
print(y1)   # 文档集的表征
print(linear.weight.T[0] + linear.weight.T[1] )   # 第一篇文章的表征=第一行+第二行
print(linear.weight.T[0] + linear.weight.T[3] )   # 第二篇文章的表征=第一行+第四行
print(linear.weight.T[1] + linear.weight.T[2] )   # 第三篇文章的表征=第二行+第三行

tensor([[ 0.1395,  0.2430, -0.2605, -0.5502,  0.2602],
        [ 0.2952,  0.8570, -0.0513, -0.5145,  0.3064],
        [-0.5499, -0.3858, -0.2917,  0.3419, -0.3963]], grad_fn=<MmBackward0>)
tensor([ 0.1395,  0.2430, -0.2605, -0.5502,  0.2602], grad_fn=<AddBackward0>)
tensor([ 0.2952,  0.8570, -0.0513, -0.5145,  0.3064], grad_fn=<AddBackward0>)


  x = torch.tensor(x, dtype=torch.float32)  # [n,m] n个样本，m维大小(词表的大小)


In [16]:
w = linear.weight.T  # [4, 5]  在这里，w就表示每个单词对应一个稠密的特征向量
print(w.shape)
print(w)
print (np.reshape(x_idx, -1))

torch.Size([4, 5])
tensor([[ 0.2644,  0.4660, -0.0696, -0.4073,  0.4554],
        [-0.1249, -0.2230, -0.1910, -0.1429, -0.1952],
        [-0.4249, -0.1628, -0.1007,  0.4848, -0.2011],
        [ 0.0308,  0.3910,  0.0183, -0.1072, -0.1490]],
       grad_fn=<PermuteBackward0>)
[0 1 0 3 1 2]


In [17]:
w[np.reshape(x_idx, -1)]

tensor([[ 0.2644,  0.4660, -0.0696, -0.4073,  0.4554],
        [-0.1249, -0.2230, -0.1910, -0.1429, -0.1952],
        [ 0.2644,  0.4660, -0.0696, -0.4073,  0.4554],
        [ 0.0308,  0.3910,  0.0183, -0.1072, -0.1490],
        [-0.1249, -0.2230, -0.1910, -0.1429, -0.1952],
        [-0.4249, -0.1628, -0.1007,  0.4848, -0.2011]],
       grad_fn=<IndexBackward0>)

In [19]:
r = w[np.reshape(x_idx, -1)].reshape(-1, 2, 5) # 每个单词对应一个横向的向量
print(r.shape)
print(r)

torch.Size([3, 2, 5])
tensor([[[ 0.2644,  0.4660, -0.0696, -0.4073,  0.4554],
         [-0.1249, -0.2230, -0.1910, -0.1429, -0.1952]],

        [[ 0.2644,  0.4660, -0.0696, -0.4073,  0.4554],
         [ 0.0308,  0.3910,  0.0183, -0.1072, -0.1490]],

        [[-0.1249, -0.2230, -0.1910, -0.1429, -0.1952],
         [-0.4249, -0.1628, -0.1007,  0.4848, -0.2011]]],
       grad_fn=<ViewBackward0>)


In [11]:
r = r.sum(dim=1)  # 合并到一起，得到每个文本对应一个向量 即为两个单词对应向量之和
print(r.shape)
print(r)

torch.Size([3, 5])
tensor([[ 0.1395,  0.2430, -0.2605, -0.5502,  0.2602],
        [ 0.2952,  0.8570, -0.0513, -0.5145,  0.3064],
        [-0.5499, -0.3858, -0.2917,  0.3419, -0.3963]], grad_fn=<SumBackward1>)


## nn.Embedding

nn.Embedding用于将离散的ID映射到连续的向量，而nn.Linear用于将连续的输入特征映射到连续的输出特征。

In [4]:
torch.tensor(x_idx, dtype=torch.int64)

NameError: name 'x_idx' is not defined

In [3]:
embed = nn.Embedding(num_embeddings=4, embedding_dim=5, _weight=w)
print(w)
r2 = embed(torch.tensor(x_idx, dtype=torch.int64))
print(r2.shape)
print(r2)

NameError: name 'w' is not defined

### sum(dim=2)

In [14]:
sum2 = r2.sum(dim=2)
print (sum2.shape)
sum2

torch.Size([3, 2])


tensor([[ 0.7090, -0.8770],
        [ 0.7090,  0.1839],
        [-0.8770, -0.4047]], grad_fn=<SumBackward1>)

In [15]:
sum(r2[0,0,:])

tensor(0.7090, grad_fn=<AddBackward0>)

In [16]:
0.2644+0.4660-0.0696-0.4073+0.4554

0.7089000000000001

### sum(dim=1)

In [17]:
sum2 = r2.sum(dim=1)
print (sum2.shape)
sum2

torch.Size([3, 5])


tensor([[ 0.1395,  0.2430, -0.2605, -0.5502,  0.2602],
        [ 0.2952,  0.8570, -0.0513, -0.5145,  0.3064],
        [-0.5499, -0.3858, -0.2917,  0.3419, -0.3963]], grad_fn=<SumBackward1>)

In [18]:
sum(r2[0,:,0])

tensor(0.1395, grad_fn=<AddBackward0>)

In [19]:
0.2644-0.1249

0.1395

### sum(dim=0)

In [20]:
sum2 = r2.sum(dim=0)
print (sum2.shape)
sum2

torch.Size([2, 5])


tensor([[ 0.4039,  0.7090, -0.3301, -0.9576,  0.7156],
        [-0.5191,  0.0052, -0.2734,  0.2347, -0.5454]], grad_fn=<SumBackward1>)

In [21]:
sum(r2[:,0,0])

tensor(0.4039, grad_fn=<AddBackward0>)

In [22]:
0.2644+0.2644-0.1249

0.40390000000000004

### runtime: linear vs Embedding

In [24]:
import time
n = 100000
tx_idx = torch.tensor(x_idx, dtype=torch.int64)

t1 = time.time()
for i in range(n):
    linear(x)

t2 = time.time()
for i in range(n):
    embed(tx_idx).sum(dim=1)  # 得到文本向量
t3 = time.time()
print(t3 - t2, t2 - t1)

1.0124552249908447 0.5925133228302002


## nn.Linear & nn.Embedding

In [25]:
vocab_size = 10000  # 10000个单词
# 将每个单词映射到一个128维的向量空间中
# 这里的“128”指的是向量的维度，而不是类别的数量。每个单词都被映射到这个高维空间中的一个点，而这个点的坐标（即向量的值）是通过模型学习得到的，
# 能够反映单词的语义信息和上下文信息。不同单词的向量可以用于计算它们之间的相似度，或者作为机器学习模型的输入特征。
linear = nn.Linear(in_features=vocab_size, out_features=128, bias=False)  # 输入特征数为单词的数量10000 输出为128
embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=128, _weight=linear.weight.T)

In [26]:
# x = torch.randint(0, vocab_size, size=(16,))  # [16,]
x = torch.randint(0, vocab_size, size=(16,4))  # [16,] x将是一个形状为16x4的张量，其中的每个元素都是从0到9999之间的一个随机整数。
print (x)
x_onehot = F.one_hot(x, num_classes=vocab_size)  # [16,10000]
x_onehot = x_onehot.to(torch.float32)
x_onehot.shape

tensor([[5201, 5015, 8614, 8790],
        [5308, 1772, 2039, 6459],
        [4190, 2291, 4415, 6457],
        [9011, 3615, 3342, 6473],
        [ 570,  799, 7698, 7682],
        [3532, 2425, 6052, 2040],
        [ 310, 9880, 1542,  375],
        [2362, 1892, 1865,  612],
        [8985, 9904, 9304, 6305],
        [4370, 6500, 8489, 6536],
        [1830, 9250, 4491, 8797],
        [1799, 2333, 5285,  727],
        [2347, 1495, 2387,  255],
        [1695, 5885, 8576, 3007],
        [4671, 3274, 7818, 4286],
        [1352,  801, 5166,  442]])


torch.Size([16, 4, 10000])

In [29]:
y1 = linear(x_onehot)
y2 = embed(x)
print(y1)
print(y2)
print(torch.mean(torch.abs(y1 - y2)))

torch.Size([16, 4, 128])
tensor([[[-9.6898e-03, -5.0855e-03,  6.2885e-03,  ...,  8.3103e-03,
          -5.6621e-03,  3.1911e-03],
         [ 3.9277e-03,  4.8748e-03, -9.7823e-03,  ..., -8.5736e-03,
           6.1870e-04,  9.3422e-03],
         [ 2.2417e-03, -8.7106e-03,  6.9648e-03,  ...,  9.5045e-03,
          -2.7521e-03,  4.2019e-03],
         [-9.2161e-03,  8.3791e-06, -1.7729e-03,  ...,  3.7920e-03,
           5.9289e-03,  6.9248e-03]],

        [[ 5.8255e-03,  7.9945e-04, -7.2156e-03,  ...,  1.9155e-03,
          -7.1027e-03,  9.1937e-03],
         [-1.6206e-04, -3.4390e-04,  7.2426e-03,  ...,  3.1221e-03,
           3.2408e-03, -6.7443e-03],
         [ 7.6800e-03, -7.6522e-03,  9.9162e-03,  ..., -6.8345e-03,
          -9.3088e-03,  2.6958e-03],
         [ 5.0229e-03,  7.8355e-03, -2.4284e-03,  ..., -4.2366e-04,
          -8.4012e-03, -9.5087e-03]],

        [[ 6.4452e-03, -1.9305e-03, -3.9011e-03,  ...,  6.8925e-03,
          -2.2651e-03, -5.5599e-03],
         [-3.2580e-03,  2.

In [27]:
y1.shape

torch.Size([16, 4, 128])