# Data Load

In [3]:
import sentencepiece as spm
import torch

# 'en' tokenizer: create an empty processor, then load the model file into it.
en_vocab_file = "../en.model"
en_vocab = spm.SentencePieceProcessor()
en_vocab.load(en_vocab_file)

# 'de' tokenizer, set up the same way. The final load() call is the cell's
# last expression, so its return value is what the notebook displays.
de_vocab_file = "../de.model"
de_vocab = spm.SentencePieceProcessor()
de_vocab.load(de_vocab_file)

True

In [7]:
# Example: encode two sentences with the 'en' tokenizer and batch them.

lines = [
  "summer is leavning",
  "winter is coming"
]

inputs = []
for line in lines:
  # Same sentence in both views: integer ids and the sub-word pieces.
  ids = en_vocab.encode_as_ids(line)
  pieces = en_vocab.encode_as_pieces(line)
  print('vocab pieces: ', pieces)
  print('encode_as_ids: ', ids)
  inputs.append(torch.tensor(ids))

# Right-pad every sequence with 0 up to the longest sequence in the list.
# NOTE(review): this assumes id 0 is safe to use as padding for this
# SentencePiece model - confirm against how the model was trained.
inputs = torch.nn.utils.rnn.pad_sequence(
  inputs,
  batch_first=True,
  padding_value=0,
)

print('tensor: ',inputs)

vocab pieces:  ['▁summer', '▁is', '▁le', 'av', 'ning']
encode_as_ids:  [3299, 54, 293, 638, 548]
vocab pieces:  ['▁winter', '▁is', '▁coming']
encode_as_ids:  [5026, 54, 1079]
tensor:  tensor([[3299,   54,  293,  638,  548],
        [5026,   54, 1079,    0,    0]])


In [8]:
import pandas as pd

# Load the training table from CSV, then display the 'en' value stored
# under row label 0 as a quick sanity check.
train_df = pd.read_csv('train_df.csv')
train_df.loc[0, 'en']


'Thank you so much, Chris.'

In [10]:
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler


class en2deDataset(Dataset):
    """Map-style dataset yielding (source_ids, target_ids) tensor pairs.

    Each item corresponds to one row of ``dataframe``: the sentence in the
    ``src_lang`` column is encoded with ``src_vocab`` and the sentence in the
    ``tgt_lang`` column is encoded with ``tgt_vocab``.

    Args:
        dataframe: table with one sentence pair per row; must contain the
            columns named by ``src_lang`` and ``tgt_lang``.
        src_vocab: tokenizer for the source side; must provide
            ``encode_as_ids(sentence)`` returning a list of ints.
        tgt_vocab: tokenizer for the target side, same interface.
        src_lang: name of the source-sentence column (e.g. 'en').
        tgt_lang: name of the target-sentence column (e.g. 'de').
    """

    def __init__(self, dataframe, src_vocab, tgt_vocab, src_lang, tgt_lang):
        # The frame has to be kept on the instance: both __len__ and
        # __getitem__ read from it.
        self.dataframe = dataframe
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.src_lang = src_lang  # 'en'
        self.tgt_lang = tgt_lang  # 'de'

    def __len__(self):
        """Return the number of sentence pairs (rows in the frame)."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D int64 tensors of token ids.

        ``idx`` is a position (0 .. len-1), not an index label, so the lookup
        also works on frames whose index is not a fresh 0..n-1 range
        (for example after filtering or a train/validation split).
        """
        src_sentence = self.dataframe[self.src_lang].iloc[idx]
        tgt_sentence = self.dataframe[self.tgt_lang].iloc[idx]

        # encode_as_ids turns a sentence into a sequence of integer ids.
        # Each side is encoded from its own sentence with its own vocabulary.
        src_enc = self.src_vocab.encode_as_ids(src_sentence)
        tgt_enc = self.tgt_vocab.encode_as_ids(tgt_sentence)

        # Pin the dtype so an empty encoding still yields an integer tensor
        # (torch.tensor([]) would otherwise default to float32).
        return (
            torch.tensor(src_enc, dtype=torch.long),
            torch.tensor(tgt_enc, dtype=torch.long),
        )


In [13]:
# Batch size handed to the data loader wrapper built later in this cell.
BATCH_SIZE = 32


class en2deDataLoader:
    """Factory for a ``DataLoader`` that pads variable-length sentence pairs.

    Args:
        dataset: dataset whose items are ``(src_tensor, tgt_tensor)`` pairs of
            1-D tensors.
        batch_size: number of pairs per batch.
        padding_value: value used to right-pad shorter sequences in a batch.
            Defaults to 0, the value that was previously hard-coded.
        shuffle: whether the returned loader reshuffles the data every epoch.
            Defaults to False, matching the previous behaviour.
    """

    def __init__(self, dataset, batch_size, padding_value=0, shuffle=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.padding_value = padding_value
        self.shuffle = shuffle

    def collate_fn(self, batch):
        """Pad a list of ``(src, tgt)`` pairs into two batch-first tensors.

        Args:
            batch: sequence of ``(src_tensor, tgt_tensor)`` pairs.

        Returns:
            ``(src_padded, tgt_padded)``. Each side is padded independently to
            the length of its own longest sequence in the batch.
        """
        # zip(*batch) regroups the pairs into all sources and all targets.
        src_batch, tgt_batch = zip(*batch)
        src_padded = torch.nn.utils.rnn.pad_sequence(
            src_batch, batch_first=True, padding_value=self.padding_value
        )
        tgt_padded = torch.nn.utils.rnn.pad_sequence(
            tgt_batch, batch_first=True, padding_value=self.padding_value
        )

        return src_padded, tgt_padded

    def get_data_loader(self):
        """Build the ``DataLoader`` that batches items through ``collate_fn``."""
        return DataLoader(
            self.dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            collate_fn=self.collate_fn,
        )


# Wire everything together: wrap the training frame in the dataset, then ask
# the loader wrapper for a DataLoader producing padded batches.
dataset = en2deDataset(
    dataframe=train_df,
    src_vocab=en_vocab,
    tgt_vocab=de_vocab,
    src_lang='en',
    tgt_lang='de',
)
data_loader = en2deDataLoader(dataset=dataset, batch_size=BATCH_SIZE)
train_loader = data_loader.get_data_loader()