In [1]:
from transformers import BertTokenizer,AlbertTokenizer
from konlpy.tag import Mecab

wordpiece_info = {"vocab_path" : "tokenizer/model/wordpiece/version_1.4"}

wordpiece_mecab_info = {"vocab_path" : "/data/bowon_ko/wordpiece/version_1.9"}

sentencepiece_mecab_info = {"vocab_path" : "tokenizer/model/sentencepiece_mecab/version_0.1/version_0.1.model"}

# def load_tokenizer(model_name="wp-mecab"):
    


def make_tokens(text, model_name="wp-mecab"):
    if model_name == "sp-mecab":
        tokenizer_path = sentencepiece_mecab_info["vocab_path"]
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_path,
                                                do_lower_case=False,
                                                unk_token='<unk>',
                                                sep_token='</s>',
                                                pad_token='<pad>',
                                                cls_token='<s>',
                                                mask_token='<mask>',
                                                use_fast=True)
    elif "wp" in model_name:
        if model_name == "wp-mecab":
            tokenizer_path = wordpiece_mecab_info["vocab_path"]
        elif model_name == "wp":
            tokenizer_path = wordpiece_info["vocab_path"]
        tokenizer = BertTokenizer.from_pretrained(tokenizer_path, 
                                              do_lower_case=False,
                                              unk_token='<unk>',
                                              sep_token='</s>',
                                              pad_token='<pad>',
                                              cls_token='<s>',
                                              mask_token='<mask>',
                                              use_fast=True)
    
    if "mecab" in model_name or model_name=="mecab":
        mecab = Mecab()
        #text를 형태소로 분절 및 결합
        morphs = mecab.morphs(text)
        text = " ".join(morphs)
        if model_name == "mecab":
            return text
        
    #텍스트를 토크나이즈
    tokens = tokenizer.encode(text)
    
    return tokens
    

In [4]:
tokenizer_path = wordpiece_mecab_info["vocab_path"]
tokenizer = BertTokenizer.from_pretrained("/data/bowon_ko/wordpiece/version_1.9", 
                                              do_lower_case=False,
                                              unk_token='<unk>',
                                              sep_token='</s>',
                                              pad_token='<pad>',
                                              cls_token='<s>',
                                              mask_token='<mask>',
                                              use_fast=True)

file /data/bowon_ko/wordpiece/version_1.9/config.json not found


In [6]:
tokenizer = BertTokenizer.from_pretrained("/data/bowon_ko/wordpiece/version_1.9")

file /data/bowon_ko/wordpiece/version_1.9/config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
tokenizer.special_tokens_map.items()

dict_items([('unk_token', '[UNK]'), ('sep_token', '[SEP]'), ('pad_token', '[PAD]'), ('cls_token', '[CLS]'), ('mask_token', '[MASK]')])

In [3]:
output = tokenizer.encode("ㆍ정의화 number일 직권상정여 쟁점법안 연계 처리 불투명")
print(output)

[0, 3, 274, 16772, 24512, 6549, 18085, 38598, 6501, 22120, 6242, 6318, 15182, 11557, 2774, 6724, 6562, 2]


In [7]:
tokenizer.decode(output)

'<s> <unk> number일 직권상정여 쟁점법안 연계 처리 불투명 </s>'

In [8]:
tokenizer.special_tokens_map.items()

dict_items([('unk_token', '<unk>'), ('sep_token', '</s>'), ('pad_token', '<pad>'), ('cls_token', '<s>'), ('mask_token', '<mask>')])

In [12]:
tokenizer.all_special_tokens

['<unk>', '</s>', '<pad>', '<s>', '<mask>']

In [20]:
token_id = {}
for k, v in tokenizer.special_tokens_map.items():
    idx = tokenizer.all_special_tokens.index(v)
    print(k,idx)
    token_id[k] = idx

unk_token 0
sep_token 1
pad_token 2
cls_token 3
mask_token 4


In [22]:
token_id.items()

dict_items([('unk_token', 0), ('sep_token', 1), ('pad_token', 2), ('cls_token', 3), ('mask_token', 4)])

In [23]:
token_id.values()

dict_values([0, 1, 2, 3, 4])

In [11]:
tokenizer.max_model_input_sizes

{'bert-base-uncased': 512,
 'bert-large-uncased': 512,
 'bert-base-cased': 512,
 'bert-large-cased': 512,
 'bert-base-multilingual-uncased': 512,
 'bert-base-multilingual-cased': 512,
 'bert-base-chinese': 512,
 'bert-base-german-cased': 512,
 'bert-large-uncased-whole-word-masking': 512,
 'bert-large-cased-whole-word-masking': 512,
 'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
 'bert-large-cased-whole-word-masking-finetuned-squad': 512,
 'bert-base-cased-finetuned-mrpc': 512,
 'bert-base-german-dbmdz-cased': 512,
 'bert-base-german-dbmdz-uncased': 512,
 'TurkuNLP/bert-base-finnish-cased-v1': 512,
 'TurkuNLP/bert-base-finnish-uncased-v1': 512,
 'wietsedv/bert-base-dutch-cased': 512}