In [31]:
import pandas as pd

In [32]:
train_df = pd.read_csv("data/sample/train.csv")
train_df["korean"].to_csv("data/sample/train.ko", index=False)
train_df["english"].to_csv("data/sample/train.en", index=False)

In [33]:
valid_df = pd.read_csv("data/sample/valid.csv")
valid_df["korean"].to_csv("data/sample/valid.ko", index=False)
valid_df["english"].to_csv("data/sample/valid.en", index=False)


In [34]:
from nlp.datasets.data_helper import create_or_load_tokenizer


In [35]:
ko_vocab = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample",
    language="ko",
    vocab_size=8000,
    tokenizer_type="unigram"
)
print(ko_vocab.GetPieceSize())
text = "안녕하세요 저는 Estsoft의 정환석입니다."
idx_lst = ko_vocab.EncodeAsIds(text)
print(idx_lst + [4] * (50 - len(idx_lst)))
print(len(idx_lst + [4] * (50 - len(idx_lst))))
print(ko_vocab.EncodeAsPieces(text))
print(ko_vocab.DecodeIds(idx_lst))


8000
[592, 82, 26, 1748, 1673, 858, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
50
['▁안녕하세요', '▁저는', '▁이', '승', '채', '▁입니다', '.', '.']
안녕하세요 저는 이승채 입니다..


In [36]:
en_vocab = create_or_load_tokenizer(
    file_path="data/sample/train.en",
    save_path="dictionary/sample",
    language="en",
    vocab_size=8000,
    tokenizer_type="unigram"
)

In [37]:
print(en_vocab.GetPieceSize())
text = "Hello my name is 이승채"
idx_lst = en_vocab.EncodeAsIds(text)
print(idx_lst)
print(en_vocab.EncodeAsPieces(text))
print(en_vocab.DecodeIds(idx_lst))


8000
[952, 69, 408, 17, 23, 2]
['▁Hello', '▁my', '▁name', '▁is', '▁', '이승채']
Hello my name is  ⁇ 


In [38]:
ko_vocab_bpe = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample_bpe",
    language="ko",
    vocab_size=8000,
    tokenizer_type="bpe"
)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=data/sample/train.ko --model_prefix=ko_corpus_8000 --model_type=bpe --vocab_size=8000 --bos_id=0 --eos_id=1 --unk_id=2 --pad_id=3
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: data/sample/train.ko
  input_format: 
  model_prefix: ko_corpus_8000
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 2
  bos_id: 0
  eos_id: 1

In [39]:
print(ko_vocab_bpe.GetPieceSize())
text = "안녕하세요 저는 이승채 입니다."
idx_lst = ko_vocab_bpe.EncodeAsIds(text)
print(idx_lst)
print(ko_vocab_bpe.EncodeAsPieces(text))
print(ko_vocab_bpe.DecodeIds(idx_lst))

8000
[844, 207, 10, 7180, 7121, 781, 6717]
['▁안녕하세요', '▁저는', '▁이', '승', '채', '▁입니다', '.']
안녕하세요 저는 이승채 입니다.


In [40]:
ko_vocab_char = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample_char",
    language="ko",
    vocab_size=8000,
    tokenizer_type="char"
)

In [41]:
print(ko_vocab_char.GetPieceSize())
text = "안녕하세요 저는 Estsoft의 정환석입니다."
idx_lst = ko_vocab_char.EncodeAsIds(text)
print(idx_lst)
print(ko_vocab_char.EncodeAsPieces(text))
print(ko_vocab_char.DecodeIds(idx_lst))

1288
[4, 76, 289, 10, 73, 17, 4, 41, 11, 4, 8, 468, 409, 4, 50, 7, 6, 5]
['▁', '안', '녕', '하', '세', '요', '▁', '저', '는', '▁', '이', '승', '채', '▁', '입', '니', '다', '.']
안녕하세요 저는 이승채 입니다.


In [42]:
ko_vocab_word = create_or_load_tokenizer(
    file_path="data/sample/train.ko",
    save_path="dictionary/sample_word",
    language="ko",
    vocab_size=8000,
    tokenizer_type="word"
)

In [43]:
print(ko_vocab_word.GetPieceSize())
text = "안녕하세요 저는 이승채 입니다.."
idx_lst = ko_vocab_word.EncodeAsIds(text)
print(idx_lst)
print(ko_vocab_word.EncodeAsPieces(text))
print(ko_vocab_word.DecodeIds(idx_lst))

8000
[720, 23, 2]
['▁안녕하세요', '▁저는', '▁이승채▁입니다..']
안녕하세요 저는 ⁇ 


In [48]:
from nlp.datasets.data_helper import TrainDataset
from torch.utils.data import DataLoader, RandomSampler

In [49]:
dataset = TrainDataset(
    x_path='data/sample/train_ko',
    src_vocab=ko_vocab,
    y_path='data/sample/train_en',
    trg_vocab=en_vocab,

IndexError: not equal src_data, trg_data line size

In [50]:
from torch.utils.data import DataLoader, RandomSampler
sampler = RandomSampler(dataset)
loader = DataLoader(dataset=dataset , batch_size=1,sampler=sampler)
for i in loader:
    print(i)


NameError: name 'loader' is not defined