### 이 튜토리얼에서는 RNN / LSTM 계열의 모델에서 sequence batch를 잘 활용할 수 있는 PackedSequence와 PaddedSequence를 만드는 법을 배워보겠습니다.

PyTorch 라이브러리 안에는 다음 4가지 함수들이 주어집니다.

pad_sequence, pack_sequence, pack_padded_sequence, pad_packed_sequence

하지만 함수 이름만 봐서는 상당히 헷갈릴 수 있기 때문에 다음 그림을 참고하시면 이해하기 편하실 것 같습니다.

<img src="

In [1]:
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence, pad_packed_sequence

In [2]:
# Sample words taken from a random word generator
data = [
    'hello world',
    'midnight',
    'calculation',
    'path',
    'short circuit',
]

# Build the vocabulary: the pad token is placed first (index 0), followed by
# every distinct character that occurs in the data.
# NOTE: the characters come from a set, so their order (and therefore every
# index other than 0) is arbitrary and may differ between runs.
char_set = ['<pad>', *{ch for word in data for ch in word}]
# Construct the character-to-index dictionary from each entry's position in char_set
char2idx = dict(zip(char_set, range(len(char_set))))
print('char_set:', char_set)
print('char_set length:', len(char_set))

char_set: ['<pad>', 'p', 'd', 'h', 'i', 'c', 'w', 'n', 't', 'r', 's', 'g', 'l', 'o', 'u', ' ', 'm', 'e', 'a']
char_set length: 19


In [3]:
# Display the character-to-index dictionary (a bare expression is echoed as the cell output)
char2idx

{'<pad>': 0,
 'p': 1,
 'd': 2,
 'h': 3,
 'i': 4,
 'c': 5,
 'w': 6,
 'n': 7,
 't': 8,
 'r': 9,
 's': 10,
 'g': 11,
 'l': 12,
 'o': 13,
 'u': 14,
 ' ': 15,
 'm': 16,
 'e': 17,
 'a': 18}

In [4]:
# Encode each string in data as a LongTensor of character indices,
# producing one tensor per string (lengths differ from string to string)
X = [
    torch.LongTensor([char2idx[symbol] for symbol in text])
    for text in data
]

# Print every encoded tensor on its own line to verify the conversion
for sequence in X:
    print(sequence)

tensor([ 3, 17, 12, 12, 13, 15,  6, 13,  9, 12,  2])
tensor([16,  4,  2,  7,  4, 11,  3,  8])
tensor([ 5, 18, 12,  5, 14, 12, 18,  8,  4, 13,  7])
tensor([ 1, 18,  8,  3])
tensor([10,  3, 13,  9,  8, 15,  5,  4,  9,  5, 14,  4,  8])


In [5]:
# Display the whole list of index tensors at once (echoed as the cell output)
X

[tensor([ 3, 17, 12, 12, 13, 15,  6, 13,  9, 12,  2]),
 tensor([16,  4,  2,  7,  4, 11,  3,  8]),
 tensor([ 5, 18, 12,  5, 14, 12, 18,  8,  4, 13,  7]),
 tensor([ 1, 18,  8,  3]),
 tensor([10,  3, 13,  9,  8, 15,  5,  4,  9,  5, 14,  4,  8])]