In [26]:
##################################################################################
# XLM-Roberta 신규 vocab 추가하기
# - tokenizer.add_tokens(new_vocab) 처럼 add_tokens 함수를 이용함
#   => add_tokens 함수를 이용하면 중복제거 로직 필요 없음.
#
# - XLM-Roberta는 GPT-2와 같은 ByteLevelTokenizer 이용함.
# - 신규 vocab 추가하면 added_tokens.json에 추가됨.
# - 신규 vocab 들은 sentencepiece 형식이어야 함. (즉 word=_ 추가, subword=그대로)
# - sentencepiece 만드는 방법은 'tokenizer_sample/make_mecab_vocab.ipynb' 소스 참조

##################################################################################
import torch
import torch.nn.functional as F
import os
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, RobertaTokenizer, RobertaTokenizerFast, RobertaConfig, RobertaModel, RobertaForMaskedLM
import sys
sys.path.append('..')
from myutils import seed_everything, GPU_info, mlogging

device = GPU_info()
print(device)

#seed 설정
seed_everything(333)

False
device: cpu
cpu


In [25]:
# 기존 tokenizer를 불러옴
vocab_path = '../../../model/xml-roberta-base'
#vocab_path = 'xlm-roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(vocab_path)

print(f'*len:{len(tokenizer)}')

# tokenizer 테스트 
sentence = '인공지능에서 가장큰 문제점은 데이터 쉬프트이다'
output = tokenizer.encode(sentence)
print(output)

decode_list=[]
for out in output:
    decode_list.append(tokenizer.decode(out))
    
print(decode_list)
    

*len:250002
[0, 212233, 1180, 13968, 201539, 205473, 697, 74168, 6, 48637, 10068, 3659, 5769, 2]
['<s>', '인공지능', '에서', '가장', '큰', '문제점', '은', '데이터', '', '쉬', '프', '트', '이다', '</s>']


In [16]:
encoded_dict = tokenizer.encode_plus(
            sentence,                
            add_special_tokens = True,
            max_length = 128,     
            pad_to_max_length = True,
            return_attention_mask = True,  
            return_tensors = 'pt' # return pytorch tensors
       )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
encoded_dict

{'input_ids': tensor([[     0, 212233,   1180,  13968, 201539, 205473,    697,  74168,      6,
          48637,  10068,   3659,   5769,      2,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              

In [3]:
# 신규 vocab 파일을 불러옴
# => 신규 vocab 파일은 sentencepiece 방식에 vocab이어야 함
# => tokenizer_sample/make_mecab_vocab.ipynb 소스 참조

new_vocab = []
new_vocab_path = '../../../korpora/moco-corpus-kowiki2022-nouns-32000-sp.txt'

with open(new_vocab_path, 'r', encoding='utf-8') as f:
    data = f.read().split('\n')
    
for vocab in tqdm(data):
     new_vocab.append(vocab)

print(f'*len:{len(new_vocab)}')
print(new_vocab[1111:1115])


  0%|          | 0/32001 [00:00<?, ?it/s]

*len:32001
['▁투입', '▁번의', '▁소개', '▁동의']


In [4]:
# 신규 tokenize 추가 
#new_vocab = ['모코엠시스', '엠파워', '보안파일서버'] # 추가할 vocab 들
new_tokenizer = tokenizer.add_tokens(new_vocab)

# 신규 추가한 tokenzier를 저장함
# => 저장후에는 해당 폴더에 added_tokenis.json 파일 생성됨.
new_tokenizer_path = '../../../model/xml-roberta-base/moco-corpus-kowiki2022'
os.makedirs(new_tokenizer_path, exist_ok=True)
tokenizer.save_pretrained(new_tokenizer_path)

('../../../model/xml-roberta-base/moco-corpus-kowiki2022\\tokenizer_config.json',
 '../../../model/xml-roberta-base/moco-corpus-kowiki2022\\special_tokens_map.json',
 '../../../model/xml-roberta-base/moco-corpus-kowiki2022\\unigram.json',
 '../../../model/xml-roberta-base/moco-corpus-kowiki2022\\added_tokens.json',
 '../../../model/xml-roberta-base/moco-corpus-kowiki2022\\tokenizer.json')

In [11]:
# 추가한 tokenier 불러와봄.
new_tokenizer_path = '../../../model/xml-roberta-base/moco-corpus-kowiki2022'
new_tokenizer = RobertaTokenizerFast.from_pretrained(new_tokenizer_path, do_lower_case=False)

print(f'*len:{len(new_tokenizer)}')

# tokenizer 테스트 
sentence = "인공지능에서 가장큰 문제점은 데이터 쉬프트이다"
output = new_tokenizer.encode(sentence)
print(output)

decode_list=[]
for out in output:
    decode_list.append(new_tokenizer.decode(out))
    
print(decode_list)

*len:278325
[0, 276048, 6, 1180, 272437, 17626, 205473, 697, 252934, 87237, 277231, 31599, 5769, 2]
['<s>', '인공지능', '', '에서', '가장', '큰', '문제점', '은', '데이', '터', '쉬프', '트', '이다', '</s>']


In [18]:
encoded_dict = new_tokenizer.encode_plus(
            sentence,                
            add_special_tokens = True,
            max_length = 128,     
            pad_to_max_length = True,
            return_attention_mask = True,  
            return_tensors = 'pt' # return pytorch tensors
       )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [20]:
encoded_dict

{'input_ids': tensor([[     0, 276048,      6,   1180, 272437,  17626, 205473,    697, 252934,
          87237, 277231,  31599,   5769,      2,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              