In [1]:
from tokenizers import BertWordPieceTokenizer 
from transformers import BertTokenizer
import pandas as pd
import os

In [2]:
train_df = pd.read_csv('train_preprocess.csv')
train_df

Unnamed: 0,src,tar
0,This message is for Tom.,Ce message est pour Tom.
1,Tom locked himself in his room and cried.,Tom s'est enfermé dans sa chambre et a pleuré.
2,I thought that Tom was in Australia.,Je croyais que Tom était en Australie.
3,Don't you think it's a bad thing?,Tu ne penses pas que c'est une mauvaise chose?
4,I often slept on that bench when I was homeless.,J'ai souvent dormi sur ce banc quand j'étais s...
...,...,...
209457,I got a lot of mail this morning.,Ce matin j'ai beaucoup de courrier.
209458,What time is your plane landing?,À quelle heure votre avion atterrit-il?
209459,There's so much I want to show you.,Il y a tant que je veuille te montrer !
209460,I want a chair.,Je désire une chaise.


In [3]:
with open('english.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(train_df['src']))

In [4]:
src_length = train_df['src'].apply(lambda x: len(x.split(' ')))
src_length

0          5
1          8
2          7
3          7
4         10
          ..
209457     8
209458     6
209459     8
209460     4
209461     5
Name: src, Length: 209462, dtype: int64

In [5]:
src_length.describe()

count    209462.00000
mean          6.08043
std           2.48125
min           1.00000
25%           4.00000
50%           6.00000
75%           7.00000
max          55.00000
Name: src, dtype: float64

In [6]:
tokenizer = BertWordPieceTokenizer(lowercase=False, strip_accents=False)

In [7]:
data_file = 'english.txt'
vocab_size = 30000
limit_alphabet = 6000
min_frequency = 5

tokenizer.train(files=data_file,
                vocab_size=vocab_size,
                limit_alphabet=limit_alphabet,
                min_frequency=min_frequency,
                wordpieces_prefix='##',
                # special_tokens = ['<s>', '</s>', '<pad>']
                )
tokenizer.save_model('./', 'eng-tokenizer')

['./eng-tokenizer-vocab.txt']

In [8]:
tokenizer = BertTokenizer.from_pretrained('./eng-tokenizer-vocab.txt', local_files_only=True, lowercase=False, strip_accents=False)
tokenizer



BertTokenizer(name_or_path='./eng-tokenizer-vocab.txt', vocab_size=11564, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [9]:
encoded = tokenizer.encode(train_df['src'][50])
print('원본 :', train_df['src'][50])
print('토큰화 결과 :',tokenizer.tokenize(train_df['src'][50]))
print('정수 인코딩 :',encoded)
print('디코딩 :',tokenizer.decode(encoded))

원본 : Their deep love for each other was unequivocal.
토큰화 결과 : ['their', 'deep', 'love', 'for', 'each', 'other', 'was', 'une', '##qu', '##iv', '##oc', '##al', '.']
정수 인코딩 : [2, 828, 2035, 556, 208, 1016, 633, 200, 8290, 885, 1885, 1485, 231, 14, 3]
디코딩 : [CLS] their deep love for each other was unequivocal. [SEP]


In [10]:
encoded = tokenizer(train_df['src'][50], add_special_tokens=False)
encoded['input_ids']

[828, 2035, 556, 208, 1016, 633, 200, 8290, 885, 1885, 1486, 231, 14]

In [11]:
tokenizer.decode(encoded['input_ids'])

'their deep love for each other was unequivocal.'

In [12]:
with open('fra.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(train_df['tar']))

In [13]:
tokenizer_fra = BertWordPieceTokenizer(lowercase=True, strip_accents=False)

In [14]:
data_file = 'fra.txt'
vocab_size = 30000
limit_alphabet = 6000
min_frequency = 5

tokenizer_fra.train(files=data_file,
                vocab_size=vocab_size,
                limit_alphabet=limit_alphabet,
                min_frequency=min_frequency,
                wordpieces_prefix='##',
                # special_tokens = ['<s>', '</s>', '<pad>']
                )
tokenizer_fra.save_model('./', 'fra-tokenizer')

['./fra-tokenizer-vocab.txt']

In [15]:
tokenizer = BertTokenizer.from_pretrained('./fra-tokenizer-vocab.txt', local_files_only=True, lowercase=False, strip_accents=False)
tokenizer

BertTokenizer(name_or_path='./fra-tokenizer-vocab.txt', vocab_size=15182, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [16]:
encoded = tokenizer.encode(train_df['tar'][50])
print('원본 :', train_df['tar'][50])
print('토큰화 결과 :',tokenizer.tokenize(train_df['tar'][50]))
print('정수 인코딩 :',encoded)
print('디코딩 :',tokenizer.decode(encoded))

원본 : Leur profond amour l'un pour l'autre était sans équivoque.
토큰화 결과 : ['leur', 'profond', 'amour', 'l', "'", 'un', 'pour', 'l', "'", 'autre', 'était', 'sans', 'équiv', '##oque', '.']
정수 인코딩 : [2, 710, 3539, 2078, 42, 10, 157, 185, 42, 10, 443, 272, 659, 14356, 2248, 16, 3]
디코딩 : [CLS] leur profond amour l'un pour l'autre était sans équivoque. [SEP]
