In [3]:
import torch
from torch import nn 
from torch.nn import functional as F

In [None]:
'''
1 embedding 的计算过程

nn.Embedding 实现了什么?
在起前向过程中实现了一个查表

表的形式是怎么样的? 
matrix (embedding.weight, learnable parameters)
martix.shape (v, h)
    - v: vocabulary size
    - h: hidden dimension

查表的过程是如何实现的? 
input: (b, s)
    - b: batch size
    - s: seq len
(b,s) 和 (v,h) =>? (b,s,h)
one-hot + 矩阵乘法
'''

In [6]:
embedding = nn.Embedding(10, 3)
print(embedding.weight)
print(embedding.weight.shape)
print(embedding.weight.dtype)

input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
print(input.dtype)

print(embedding(input))

Parameter containing:
tensor([[-0.4147,  0.9515, -0.0327],
        [-0.0169,  0.1072,  1.0722],
        [ 1.2686,  0.7146,  1.2683],
        [ 0.1776,  0.7265, -0.8443],
        [ 0.6266,  0.5764,  0.8150],
        [ 0.1367,  2.0113,  0.3402],
        [ 0.3035,  0.1572, -0.3762],
        [ 0.9478,  2.1515,  0.9349],
        [ 0.5556,  0.2157,  0.6702],
        [ 1.4950,  1.4584, -0.1224]], requires_grad=True)
torch.Size([10, 3])
torch.float32
torch.int64
tensor([[[-0.0169,  0.1072,  1.0722],
         [ 1.2686,  0.7146,  1.2683],
         [ 0.6266,  0.5764,  0.8150],
         [ 0.1367,  2.0113,  0.3402]],

        [[ 0.6266,  0.5764,  0.8150],
         [ 0.1776,  0.7265, -0.8443],
         [ 1.2686,  0.7146,  1.2683],
         [ 1.4950,  1.4584, -0.1224]]], grad_fn=<EmbeddingBackward0>)


In [14]:
input_onehot = F.one_hot(input, num_classes=10)
print(input_onehot)
print(input_onehot.shape)
print(input_onehot.dtype)

tensor([[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]])
torch.Size([2, 4, 10])
torch.int64


In [19]:
# input_one.shape: (b, s, v)
# embedding.weight.shape: (v, h)
torch.matmul(input_onehot.type(torch.float32), embedding.weight)

tensor([[[-0.5273,  0.8237,  0.8261],
         [-1.0407, -0.3688,  1.2705],
         [-0.2796,  0.6942, -0.6633],
         [ 0.3586,  1.0373, -2.2510]],

        [[-0.2796,  0.6942, -0.6633],
         [-0.5626, -0.2823,  0.1150],
         [-1.0407, -0.3688,  1.2705],
         [ 1.8176,  0.7679, -1.3234]]], grad_fn=<UnsafeViewBackward0>)

In [21]:
'''
(b,s) 和 (v,h) ->? (b,s,h)
(b,s) 经过 one-hot => (b, s, v)
(b, s, v) @ (v, h) => (b, s, h)
'''

'\n(b,s) 和 (v,h) ->? (b,s,h)\n(b,s) 经过 one-hot => (b, s, v)\n(b, s, v) @ (v, h) => (b, s, h)\n'

In [None]:
'''
2 max_norm
max_norm的作用是什么?


In [26]:
# 不设置 max_norm, max_norm == False 时, 特征粒度(所有元素粒度)的范数
# 范数: 向量的长度. 将一个向量中的每个元素取绝对值后，再将这些绝对值求和并开方
embedding = nn.Embedding(3, 5)
print(embedding.weight.mean())
print(embedding.weight.std())
# 01高斯分布采样得到

print(embedding.weight)
print(torch.norm(embedding.weight, dim=1))
# 行的粒度上的norm是随机的

input = torch.tensor([0, 1, 2])
print(input.shape)
output = embedding(input)
print(output)
print(torch.norm(embedding.weight, dim=1)) # 计算L2范数
'''
Frobenius范数: 将矩阵中所有元素平方后相加再开方
L1范数: 将向量中所有元素的平方相加再开方
L2范数: 正则化
'''


tensor(0.0982, grad_fn=<MeanBackward0>)
tensor(1.0048, grad_fn=<StdBackward0>)
Parameter containing:
tensor([[ 0.8402, -0.2582,  0.5306,  1.2442,  0.9572],
        [ 2.4128, -0.6782, -0.1286,  0.1715,  0.1777],
        [-0.5644, -0.9521, -1.0977,  0.2060, -1.3885]], requires_grad=True)
tensor([1.8758, 2.5217, 2.0977], grad_fn=<NormBackward1>)
torch.Size([3])
tensor([[ 0.8402, -0.2582,  0.5306,  1.2442,  0.9572],
        [ 2.4128, -0.6782, -0.1286,  0.1715,  0.1777],
        [-0.5644, -0.9521, -1.0977,  0.2060, -1.3885]],
       grad_fn=<EmbeddingBackward0>)
tensor([1.8758, 2.5217, 2.0977], grad_fn=<NormBackward1>)


In [27]:
# max_norm == True ==> max_norm == 1
embedding = nn.Embedding(3, 5, max_norm=True)
print(embedding.weight.mean())
print(embedding.weight.std())

print(embedding.weight)
print(torch.norm(embedding.weight, dim=1))

input = torch.tensor([0, 1, 2])
print(input.shape)
output = embedding(input)
print(output)
print(torch.norm(embedding.weight, dim=1))
# max_norm == True 会对输出的结果进行一次norm

tensor(0.1618, grad_fn=<MeanBackward0>)
tensor(1.2941, grad_fn=<StdBackward0>)
Parameter containing:
tensor([[-0.7501, -1.5564, -0.1468,  0.4527,  0.9336],
        [ 0.8889, -1.0233,  0.4655, -0.2099, -2.0676],
        [ 1.0238,  1.6944, -0.8625,  2.8629,  0.7222]], requires_grad=True)
tensor([2.0207, 2.5245, 3.6580], grad_fn=<NormBackward1>)
torch.Size([3])
tensor([[-0.3712, -0.7702, -0.0727,  0.2241,  0.4620],
        [ 0.3521, -0.4054,  0.1844, -0.0832, -0.8190],
        [ 0.2799,  0.4632, -0.2358,  0.7826,  0.1974]],
       grad_fn=<EmbeddingBackward0>)
tensor([1.0000, 1.0000, 1.0000], grad_fn=<NormBackward1>)
