In [1]:
import sys

In [2]:
# load local modules
# Make the project's script directory importable so that `load_dataset` and
# `tokenizer` (project-local modules) can be imported below.
# NOTE(review): this is an absolute path into one user's home directory, so the
# notebook will not run on another machine as-is — consider deriving it from a
# path relative to the notebook or from configuration.
script_abs_path = '/home/sasatake/repos/python/transformer/script'
sys.path.append(script_abs_path)

# These imports only resolve after the sys.path tweak above.
from load_dataset import Multi30k
from tokenizer import Tokenizer

# load dataset

In [3]:
# load dataset
# Instantiate the project-local Multi30k wrapper. Later cells read its
# `datasets` attribute with keys such as 'texts_src_de_train' and
# 'texts_tgt_en_valid'.
multi30 = Multi30k()

# tokenize and add index

In [4]:
# make vocab
tokenizer = Tokenizer()
# The reference implementation does not merge the datasets, but the word
# 'Eisfischerhütte' appears only in the validation split, so converting the
# validation text to indices fails when the vocabulary is built from the
# training split alone. The vocabulary is therefore built from train + valid.
all_texts_de = multi30.datasets['texts_src_de_train'] + multi30.datasets['texts_src_de_valid']
all_texts_en = multi30.datasets['texts_tgt_en_train'] + multi30.datasets['texts_tgt_en_valid']
# build_vocab returns three objects per language: a token -> index mapping,
# an index -> token mapping, and a third value bound to counter_*
# (presumably token frequencies — confirm against tokenizer.py).
token2idx_de, idx2token_de, counter_de = tokenizer.build_vocab(lang='de', texts=all_texts_de)
token2idx_en, idx2token_en, counter_en = tokenizer.build_vocab(lang='en', texts=all_texts_en)

In [5]:
# Report the number of entries in each index -> token vocabulary, one per line.
print(
    f'german vocab size : {len(idx2token_de)}',
    f'english vocab size : {len(idx2token_en)}',
    sep='\n',
)

german vocab size : 19620
english vocab size : 11010


# result : german

In [6]:
# token to index
# Preview the first entries of the German token -> index vocabulary.
# The keys of token2idx_de are tokens and the values are their integer
# indices, so each printed value is labelled accordingly.
idx_limit = 10
for idx, (token, index) in enumerate(token2idx_de.items()):
    print(f'token = {token}\t\tindex = {index}')
    # idx starts at 0, so the loop stops after idx_limit + 1 entries are shown.
    if idx == idx_limit:
        break

index = <unk>		token = 0
index = <pad>		token = 1
index = <start>		token = 2
index = <end>		token = 3
index = .		token = 4
index = Ein		token = 5
index = einem		token = 6
index = in		token = 7
index = und		token = 8
index = ,		token = 9
index = mit		token = 10


In [7]:
# index to token
# Preview the first entries of the German index -> token vocabulary.
idx_limit = 10
for idx, (key, token) in enumerate(idx2token_de.items()):
    # Show the index stored as the dictionary key rather than the loop
    # counter, so the display stays truthful even if the keys are not
    # exactly 0, 1, 2, ... in iteration order.
    print(f'index = {key}\t\ttoken = {token}')
    # idx starts at 0, so the loop stops after idx_limit + 1 entries are shown.
    if idx == idx_limit:
        break

index = 0		token = <unk>
index = 1		token = <pad>
index = 2		token = <start>
index = 3		token = <end>
index = 4		token = .
index = 5		token = Ein
index = 6		token = einem
index = 7		token = in
index = 8		token = und
index = 9		token = ,
index = 10		token = mit


# result : english

In [8]:
# token to index
# Preview the first entries of the English token -> index vocabulary.
# The keys of token2idx_en are tokens and the values are their integer
# indices, so each printed value is labelled accordingly.
idx_limit = 10
for idx, (token, index) in enumerate(token2idx_en.items()):
    print(f'token = {token}\t\tindex = {index}')
    # idx starts at 0, so the loop stops after idx_limit + 1 entries are shown.
    if idx == idx_limit:
        break

index = <unk>		token = 0
index = <pad>		token = 1
index = <start>		token = 2
index = <end>		token = 3
index = a		token = 4
index = .		token = 5
index = A		token = 6
index = in		token = 7
index = the		token = 8
index = on		token = 9
index = is		token = 10


In [9]:
# index to token
# Preview the first entries of the English index -> token vocabulary.
idx_limit = 10
for idx, (key, token) in enumerate(idx2token_en.items()):
    # Show the index stored as the dictionary key rather than the loop
    # counter, so the display stays truthful even if the keys are not
    # exactly 0, 1, 2, ... in iteration order.
    print(f'index = {key}\t\ttoken = {token}')
    # idx starts at 0, so the loop stops after idx_limit + 1 entries are shown.
    if idx == idx_limit:
        break

index = 0		token = <unk>
index = 1		token = <pad>
index = 2		token = <start>
index = 3		token = <end>
index = 4		token = a
index = 5		token = .
index = 6		token = A
index = 7		token = in
index = 8		token = the
index = 9		token = on
index = 10		token = is


# text indexing

In [10]:
import torch

In [11]:
def convert_text_to_indices(text, vocab, tokenizer, lang):
    """Convert one sentence into a list of vocabulary indices.

    Leading and trailing newline characters are stripped from ``text``, the
    result is tokenized with ``tokenizer.tokenize(lang, text)``, and the token
    indices are wrapped in the ``<start>`` and ``<end>`` marker indices.

    Args:
        text: Raw sentence to encode.
        vocab: Token -> index mapping. Must contain ``'<start>'`` and
            ``'<end>'``; ``'<unk>'`` is looked up only when a token is not
            present in the mapping.
        tokenizer: Object exposing ``tokenize(lang, text)`` that returns an
            iterable of tokens.
        lang: Language code forwarded to the tokenizer.

    Returns:
        list: ``[vocab['<start>'], <token indices...>, vocab['<end>']]``.

    Raises:
        KeyError: If a required special token is missing from ``vocab``.
    """
    tokens = tokenizer.tokenize(lang, text.strip('\n'))
    # Map out-of-vocabulary tokens to '<unk>' instead of raising KeyError, so
    # text that was not part of the vocabulary-building corpus can be encoded.
    token_indices = [
        vocab[token] if token in vocab else vocab['<unk>']
        for token in tokens
    ]
    return [vocab['<start>']] + token_indices + [vocab['<end>']]

In [12]:
def dataset_to_indices(texts_src, texts_tgt, vocab_src, vocab_tgt, tokenizer_src, tokenizer_tgt,
                       lang_src='de', lang_tgt='en'):
    """Convert parallel source/target sentences into pairs of index tensors.

    Sentences are paired with ``zip``, so if the two sequences differ in
    length the surplus items of the longer one are ignored.

    Args:
        texts_src: Iterable of source-language sentences.
        texts_tgt: Iterable of target-language sentences, aligned with
            ``texts_src``.
        vocab_src: Token -> index mapping for the source language.
        vocab_tgt: Token -> index mapping for the target language.
        tokenizer_src: Tokenizer used for the source sentences.
        tokenizer_tgt: Tokenizer used for the target sentences.
        lang_src: Language code passed to the source tokenizer. Defaults to
            ``'de'``, the value that was previously hard-coded.
        lang_tgt: Language code passed to the target tokenizer. Defaults to
            ``'en'``, the value that was previously hard-coded.

    Returns:
        list: One ``(src_tensor, tgt_tensor)`` tuple per sentence pair, each a
        1-D ``torch.long`` tensor of indices.
    """
    data = []
    for src, tgt in zip(texts_src, texts_tgt):
        src_indices = convert_text_to_indices(
            text=src, vocab=vocab_src, tokenizer=tokenizer_src, lang=lang_src)
        tgt_indices = convert_text_to_indices(
            text=tgt, vocab=vocab_tgt, tokenizer=tokenizer_tgt, lang=lang_tgt)
        data.append((
            torch.tensor(src_indices, dtype=torch.long),
            torch.tensor(tgt_indices, dtype=torch.long),
        ))

    return data

In [13]:
# train dataset
# Encode the training split as (German tensor, English tensor) pairs.
# The same Tokenizer instance serves both languages; the language is chosen
# inside dataset_to_indices.
train_data = dataset_to_indices(texts_src=multi30.datasets['texts_src_de_train'],
                                texts_tgt=multi30.datasets['texts_tgt_en_train'],
                                vocab_src=token2idx_de,
                                vocab_tgt=token2idx_en,
                                tokenizer_src=tokenizer,
                                tokenizer_tgt=tokenizer)

In [14]:
# validation dataset
# Encode the validation split with the same vocabularies and tokenizer that
# were used for the training split.
valid_data = dataset_to_indices(texts_src=multi30.datasets['texts_src_de_valid'],
                                texts_tgt=multi30.datasets['texts_tgt_en_valid'],
                                vocab_src=token2idx_de,
                                vocab_tgt=token2idx_en,
                                tokenizer_src=tokenizer,
                                tokenizer_tgt=tokenizer)

In [15]:
# Because the training and validation data were merged when building the
# vocabulary, the following word can now be converted to an index.
# It is unclear whether the model can infer a word that never appears in the
# training data, but it will be interesting to find out.
token2idx_de['Eisfischerhütte']

19219

# result of tokenized dataset

In [16]:
# Show the first training pair as raw index tensors: element 0 is the German
# source, element 1 is the English target.
print('indexed text')
print('Input(de) {}'.format(train_data[0][0]))
print('Output(en) {}'.format(train_data[0][1]))
print()

indexed text
Input(de) tensor([   2,   21,   85,  256,   31,   86,   22,   93,    7,   16,  114, 5645,
        3245,    3])
Output(en) tensor([   2,   19,   25,   15, 1197,  817,   17,   58,   84,  332, 1319,    3])



In [17]:
# Decode the first training pair back to tokens through the index -> token
# vocabularies to confirm the encoding round-trips.
print('text from index')
print('Input(de) ' + ' '.join(idx2token_de[i] for i in train_data[0][0].tolist()))
print('Output(en) ' + ' '.join(idx2token_en[i] for i in train_data[0][1].tolist()))

text from index
Input(de) <start> Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche <end>
Output(en) <start> Two young , White males are outside near many bushes <end>


# save tokenized dataset

In [21]:
import pickle as pkl
import os

# save tokenized dataset
# The output directory is relative, so it is resolved against the kernel's
# current working directory.
directory_path = '../../data/processed/tokenized_data/'
# exist_ok=True keeps the cell re-runnable when the directory already exists.
os.makedirs(directory_path, exist_ok=True)
train_data_path = os.path.join(directory_path, 'train_data.pkl')
valid_data_path = os.path.join(directory_path, 'valid_data.pkl')

# Each split is pickled as a list of (src_tensor, tgt_tensor) tuples.
# NOTE: pickle files must only be loaded back from trusted locations, since
# unpickling can execute arbitrary code.
with open(train_data_path, 'wb') as f:
    pkl.dump(train_data, f)
with open(valid_data_path, 'wb') as f:
    pkl.dump(valid_data, f)

torch.Tensor