In [1]:
import pandas as pd

In [4]:
train_df = pd.read_csv("data/sample/train.csv")
train_df["korean"].to_csv("data/sample/train.ko", index=False)
train_df["english"].to_csv("data/sample/train.en", index=False)

In [5]:
valid_df = pd.read_csv("data/sample/valid.csv")
valid_df["korean"].to_csv("data/sample/valid.ko", index=False)
valid_df["english"].to_csv("data/sample/valid.en", index=False)

In [3]:
from nlp.datasets.data_helper import create_or_load_tokenizer

In [4]:
ko_vocab = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample",
    language="ko",
    vocab_size=8000,
    tokenizer_type="unigram"
)

In [7]:
print(ko_vocab.GetPieceSize())
text = "안녕하세요 저는 Estsoft의 정환석입니다."
idx_lst = ko_vocab.EncodeAsIds(text)
print(idx_lst + [4] * (50 - len(idx_lst)))
print(len(idx_lst + [4] * (50 - len(idx_lst))))
print(ko_vocab.EncodeAsPieces(text))
print(ko_vocab.DecodeIds(idx_lst))

8000
[592, 82, 2408, 4977, 1002, 7499, 1019, 10, 351, 2555, 605, 25, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
50
['▁안녕하세요', '▁저는', '▁E', 'st', 's', 'of', 't', '의', '▁정', '환', '석', '입니다', '.']
안녕하세요 저는 Estsoft의 정환석입니다.


In [None]:
en_vocab = create_or_load_tokenizer(
    file_path="data/sample/train.en",
    save_path="dictionary/sample",
    language="en",
    vocab_size=8000,
    tokenizer_type="unigram"
)

In [16]:
print(en_vocab.GetPieceSize())
text = "Hello my name is 정환석"
idx_lst = en_vocab.EncodeAsIds(text)
print(idx_lst)
print(en_vocab.EncodeAsPieces(text))
print(en_vocab.DecodeIds(idx_lst))

8000
[952, 69, 408, 17, 23, 2]
['▁Hello', '▁my', '▁name', '▁is', '▁', '정환석']
Hello my name is  ⁇ 


In [None]:
ko_vocab_bpe = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample_bpe",
    language="ko",
    vocab_size=8000,
    tokenizer_type="bpe"
)

In [18]:
print(ko_vocab_bpe.GetPieceSize())
text = "안녕하세요 저는 Estsoft의 정환석입니다."
idx_lst = ko_vocab_bpe.EncodeAsIds(text)
print(idx_lst)
print(ko_vocab_bpe.EncodeAsPieces(text))
print(ko_vocab_bpe.DecodeIds(idx_lst))

8000
[844, 207, 1781, 5080, 7230, 7107, 7490, 7158, 6736, 48, 6948, 7014, 18, 6717]
['▁안녕하세요', '▁저는', '▁E', 'st', 's', 'o', 'f', 't', '의', '▁정', '환', '석', '입니다', '.']
안녕하세요 저는 Estsoft의 정환석입니다.


In [19]:
ko_vocab_char = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample_char",
    language="ko",
    vocab_size=8000,
    tokenizer_type="char"
)

In [20]:
print(ko_vocab_char.GetPieceSize())
text = "안녕하세요 저는 Estsoft의 정환석입니다."
idx_lst = ko_vocab_char.EncodeAsIds(text)
print(idx_lst)
print(ko_vocab_char.EncodeAsPieces(text))
print(ko_vocab_char.DecodeIds(idx_lst))

1288
[4, 76, 289, 10, 73, 17, 4, 41, 11, 4, 507, 518, 446, 518, 395, 778, 446, 24, 4, 56, 236, 302, 50, 7, 6, 5]
['▁', '안', '녕', '하', '세', '요', '▁', '저', '는', '▁', 'E', 's', 't', 's', 'o', 'f', 't', '의', '▁', '정', '환', '석', '입', '니', '다', '.']
안녕하세요 저는 Estsoft의 정환석입니다.


In [21]:
ko_vocab_word = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample_word",
    language="ko",
    vocab_size=8000,
    tokenizer_type="word"
)

In [22]:
print(ko_vocab_word.GetPieceSize())
text = "안녕하세요 저는 Estsoft의 정환석입니다."
idx_lst = ko_vocab_word.EncodeAsIds(text)
print(idx_lst)
print(ko_vocab_word.EncodeAsPieces(text))
print(ko_vocab_word.DecodeIds(idx_lst))

8000
[720, 23, 2]
['▁안녕하세요', '▁저는', '▁Estsoft의▁정환석입니다.']
안녕하세요 저는 ⁇ 


In [1]:
from nlp.datasets.data_helper import create_or_load_tokenizer

ko_vocab = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample",
    language="ko",
    vocab_size=8000,
    tokenizer_type="unigram"
)

en_vocab = create_or_load_tokenizer(
    file_path="data/sample/train.en",
    save_path="dictionary/sample",
    language="en",
    vocab_size=8000,
    tokenizer_type="unigram"
)

In [12]:
from nlp.datasets.data_helper import TrainDataset
from torch.utils.data import DataLoader, RandomSampler

In [3]:
dataset = TrainDataset(
        x_path="data/sample/train.ko",
        src_vocab=ko_vocab,
        y_path="data/sample/train.en",
        trg_vocab=en_vocab,
        max_sequence_size=50
    )

In [14]:
sampler = RandomSampler(dataset)
loader = DataLoader(dataset=dataset, batch_size=1, sampler=sampler)
idx =0
for i in loader:
    encoder_input, decoder_input, decoder_output = i
    print(encoder_input)
    print(ko_vocab.DecodeIds(encoder_input[0].tolist()))
    print(decoder_input)
    print(en_vocab.DecodeIds(decoder_input[0].tolist()))
    print(decoder_output)
    print(en_vocab.DecodeIds(decoder_output[0].tolist()))
    idx += 1
    if idx == 2:
        break

tensor([[   8,  446, 1946,   15,  169, 3318, 1110, 1946,  529,    4,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3]])
>저 머리가 진짜 멋있는 머리인데.
tensor([[   0,   19,  177,  473,  156,   30,   12,  120,  705,  473, 3443,    4,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3]])
>That hair should be a really cool hairstyle.
tensor([[  19,  177,  473,  156,   30,   12,  120,  705,  473, 3443,    4,    1,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
            3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    3,    