In [54]:
import numpy as np
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [55]:
# Report whether PyTorch can use a CUDA device in this environment.
torch.cuda.is_available()

True

In [56]:
# Record the PyTorch version this notebook was executed with.
print(torch.__version__)

1.1.0


In [57]:
# Size of the toy vocabulary; it is also passed as the one-hot dimension below.
vocabulary_size = 5
# Word ids are simply 0 .. vocabulary_size - 1.
words = range(0, vocabulary_size)

In [58]:
# Toy batch of sequences: every integer is a word's index in the vocabulary.
data = [[1, 2, 3], [0, 0, 1, 2, 4], [1, 0, 0, 4], [4, 4, 2]]
# Order the batch longest-first. The sort is stable, so sequences of equal
# length keep their original relative order.
data.sort(key=len, reverse=True)

In [59]:
# Length of every sequence, in batch order.
lengths = [len(seq) for seq in data]
# Padded (maximum) sequence length. Taking the max of the lengths, rather than
# the length of the first element, keeps this correct even if `data` is not
# sorted longest-first.
T = max(lengths)
# Batch size: the number of sequences.
B = len(data)

In [60]:
# Show the sequences after sorting by descending length.
data

[[0, 0, 1, 2, 4], [1, 0, 0, 4], [1, 2, 3], [4, 4, 2]]

In [61]:
def seq2onehot(sequence, T, N, dtype=np.float64):
    """Encode a sequence of class indices as a zero-padded one-hot matrix.

    Args:
        sequence: Sequence of integer indices, one per time step.
        T: Number of rows in the result (the padded sequence length).
        N: Number of columns in the result (the one-hot dimension).
        dtype: NumPy dtype of the returned array. Defaults to float64, which
            is what ``np.zeros`` produces when no dtype is given.

    Returns:
        A ``(T, N)`` array in which row ``t`` has a 1 in column ``sequence[t]``
        for every ``t < len(sequence)``; all remaining rows are zeros.

    Raises:
        IndexError: If ``len(sequence) > T`` or an index is out of bounds for
            ``N`` columns.
    """
    onehot = np.zeros((T, N), dtype=dtype)
    # Build explicit integer index arrays. Letting NumPy infer the dtype from
    # an empty sequence yields float64, which is rejected as an index.
    rows = np.arange(len(sequence))
    columns = np.asarray(sequence, dtype=np.intp)
    # Pairwise (row, column) assignment: onehot[t, sequence[t]] = 1.
    onehot[rows, columns] = 1
    return onehot

In [62]:
# Encode every sequence as a (T, vocabulary_size) one-hot matrix.
# The result is a plain Python list of NumPy arrays, not yet a single array.
data = list(map(lambda seq: seq2onehot(seq, T, vocabulary_size), data))

In [73]:
# NOTE(review): this cell carries execution count 73, so the output shown
# below was captured after `data` had already been converted to a tensor by a
# later cell. On a fresh top-to-bottom run `data` is still a Python list of
# NumPy arrays at this point.
data

tensor([[[1., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1.]],

        [[0., 1., 0., 0., 0.],
         [1., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0.],
         [0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 1.],
         [0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]], dtype=torch.float64)

In [64]:
# 4 sequences, each padded to 5 words, each word a 5-dimensional one-hot
# vector. At this point the container is still a Python list.
type(data)

list

In [65]:
# Stack the per-sequence one-hot matrices into a single NumPy array, then
# hand that array to PyTorch as a tensor.
onehot_array = np.array(data)
data = torch.from_numpy(onehot_array)

In [66]:
# Pack the padded batch so that only the real (non-padding) time steps remain.
# batch_first=True because `data` is laid out as (batch, time, feature).
sequence = pack_padded_sequence(
    input=data,
    lengths=lengths,
    batch_first=True,
)

In [67]:
# Inspect the packed result: its flattened data and the per-step batch sizes.
sequence

PackedSequence(data=tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1.]], dtype=torch.float64), batch_sizes=tensor([4, 4, 4, 2, 1]), sorted_indices=None, unsorted_indices=None)

In [71]:
# Number of sequences that still have a real token at each time step.
sequence.batch_sizes

tensor([4, 4, 4, 2, 1])

In [72]:
# Packed data has one row per real token: sum(lengths) rows by
# vocabulary_size columns.
sequence.data.shape

torch.Size([15, 5])