In [2]:
#=========================================================
# Tests of the different Hugging Face tokenizer call styles
#
# 1. encode()                => list: returns input_ids only
# 2. encode_plus()           => dict: returns input_ids, token_type_ids, attention_mask
# 3. tokenizer(...) directly => dict: same as 2. encode_plus
# 4. convert_tokens_to_ids() => list: token ids only, without [CLS]=101 and [SEP]=102
# 5. batch_encode_plus()     => dict: input_ids, token_type_ids, attention_mask grouped as a batch
#=========================================================

from transformers import BertTokenizer
# Alternative kept for reference: load by model name instead of a local directory.
# NOTE(review): confirm the exact model id before enabling this line.
# vocab_path = "bert-multilingual-cased"
# Path is relative, so it resolves against the notebook's working directory
# (presumably the directory holding the tokenizer files — verify locally).
vocab_path = "../../model/bert/bert-multilingual-cased"

# Shared tokenizer instance used by every cell below.
tokenizer = BertTokenizer.from_pretrained(vocab_path)

In [3]:
# Sample sentence that the single-text tokenizer demos below encode.
text = "hello world!"

In [23]:
# 1. encode(): returns only the input_ids (a plain Python list by default).
token_ids = tokenizer.encode(text)
print(token_ids)
print('\n')

# Fixed-length variant: produce exactly 128 ids as a PyTorch tensor.
# padding="max_length" only pads short inputs; truncation=True is required so
# that inputs longer than max_length are also cut down to 128.
token_ids = tokenizer.encode(text, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
print(token_ids.shape)
print(token_ids)

[101, 61694, 10133, 11356, 106, 102]


torch.Size([1, 128])
tensor([[  101, 61694, 10133, 11356,   106,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,

In [21]:
# 2. encode_plus(): returns a dict with input_ids, token_type_ids and attention_mask.
token_ids = tokenizer.encode_plus(text)
print(token_ids)
print('\n')

# Fixed-length variant: every field is padded to 128 and returned as a PyTorch tensor.
# truncation=True is set explicitly because padding="max_length" alone does not
# shorten inputs that exceed max_length.
token_ids = tokenizer.encode_plus(text, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
print(token_ids)

{'input_ids': [101, 61694, 10133, 11356, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


{'input_ids': tensor([[  101, 61694, 10133, 11356,   106,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,  

In [25]:
# 3. Calling the tokenizer directly: same result as encode_plus
#    (dict with input_ids, token_type_ids and attention_mask).
token_ids = tokenizer(text)
print(token_ids)

print('\n')

# Fixed-length variant: every field is padded to 128 and returned as a PyTorch tensor.
# truncation=True is set explicitly because padding="max_length" alone does not
# shorten inputs that exceed max_length.
token_ids = tokenizer(text, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
print(token_ids)


{'input_ids': [101, 61694, 10133, 11356, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


{'input_ids': tensor([[  101, 61694, 10133, 11356,   106,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,  

In [10]:
# 4. tokenize() + convert_tokens_to_ids(): yields only the ids of the word
#    pieces themselves; the special tokens [CLS]=101 and [SEP]=102 are not added.
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the word pieces first, then their ids, one per line.
print(tokens, token_ids, sep="\n")

['hell', '##o', 'world', '!']
[61694, 10133, 11356, 106]


In [9]:
# 5. batch_encode_plus(): takes a list of texts and returns a dict whose
#    input_ids, token_type_ids and attention_mask each hold one inner list
#    per input text (here a batch containing a single sentence).
token_ids = tokenizer.batch_encode_plus([text])
print(token_ids)

{'input_ids': [[101, 61694, 10133, 11356, 106, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1]]}


In [29]:
# Calling the tokenizer directly on a list of texts gives the same result as
# batch_encode_plus.
text_list = ["hello world!", "hellow mars!", "hellow sum!"]

# truncation=True keeps every row at exactly max_length ids. Without it a text
# longer than 8 tokens would stay longer than the padded rows, and the batch
# could not be stacked into a single tensor.
token_ids = tokenizer.batch_encode_plus(text_list, max_length=8, padding="max_length", truncation=True, return_tensors="pt")
print(token_ids)
print('\n')

token_ids = tokenizer(text_list, max_length=8, padding="max_length", truncation=True, return_tensors="pt")
print(token_ids)

{'input_ids': tensor([[  101, 61694, 10133, 11356,   106,   102,     0,     0],
        [  101, 61694, 16602, 11438,   106,   102,     0,     0],
        [  101, 61694, 16602, 28439,   106,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0]])}


{'input_ids': tensor([[  101, 61694, 10133, 11356,   106,   102,     0,     0],
        [  101, 61694, 16602, 11438,   106,   102,     0,     0],
        [  101, 61694, 16602, 28439,   106,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0]])}


In [30]:
# IPython introspection: the trailing `?` displays the signature and docstring
# of batch_encode_plus.
# NOTE(review): interactive helper only — consider removing it from the final notebook.
tokenizer.batch_encode_plus?

[0;31mSignature:[0m
[0mtokenizer[0m[0;34m.[0m[0mbatch_encode_plus[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mbatch_text_or_text_pairs[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mTuple[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mTuple[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mTuple[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mint[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0madd_special_to