In [1]:
import torch
import gluonnlp as nlp                  # GluonNLP는 버트를 간단하게 로딩하는 인터페이스를 제공하는 API 임
import numpy as np
from transformers import BertTokenizerFast

In [2]:
# KoBERT SentencePiece model file; it backs both the vocabulary and the tokenizer.
vocab_file = "Tokenizer/kobert/kobert_news_wiki_ko_cased-ae5711deb3.spiece"
vocab = nlp.vocab.BERTVocab.from_sentencepiece(
    vocab_file,
    padding_token="[PAD]",
)
# Case-preserving tokenizer (lower=False).
tok = nlp.data.BERTSPTokenizer(vocab_file, vocab, lower=False)

# Single-sentence (pair=False) transform, padded to a sequence length of 128.
transform = nlp.data.BERTSentenceTransform(
    tok,
    max_seq_length=128,
    pad=True,
    pair=False,
)

In [3]:
# Sample Korean sentence, wrapped in a one-element list; later cells feed
# test_sentence[0] to both tokenizers.
test_sentence = ["식당에 가서 밥을 배 부르게 먹고 문서 중앙화 낙시배를 타고 고기 잡고 요트배를 타고 관광을 해야 겠다"]

In [4]:
# Run the GluonNLP transform on the sample sentence; the result is kept in a
# one-element list, i.e. a batch of size one.
transform_data = [transform([test_sentence[0]])]
# The transform output is indexed as (token ids, valid length, segment ids).
token_ids, valid_length, segment_ids = transform_data[0][:3]

# Map every id (padding positions included) back to its vocabulary token.
test_sentence_list = [vocab.idx_to_token[ids] for ids in token_ids]

print(f"sentece:\r\n{test_sentence_list}")
print(f"token_ids:\r\n{token_ids}")
print(f"valid_length:\r\n{valid_length}")
print(f"segment_ids:\r\n{segment_ids}")


sentece:
['[CLS]', '▁식당', '에', '▁', '가', '서', '▁밥', '을', '▁배', '▁부르', '게', '▁먹고', '▁문서', '▁중앙', '화', '▁낙', '시', '배', '를', '▁타고', '▁고', '기', '▁잡고', '▁요', '트', '배', '를', '▁타고', '▁관광', '을', '▁해야', '▁', '겠다', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P

In [15]:
# Look up the token stored at index 8001 of the GluonNLP vocabulary.
vocab.idx_to_token[8001]

'힙'

In [7]:
# HuggingFace fast BERT tokenizer built from a local vocab.txt, with casing
# preserved (do_lower_case=False).
# `model_max_length` is the current keyword for the maximum sequence length;
# the previous spelling `max_len` is only a backward-compatibility alias.
# NOTE: this only records the limit -- inputs are truncated to it only when
# the tokenizer is called with truncation enabled.
tokenizer = BertTokenizerFast(
    vocab_file='my_data1/special_0214/vocab.txt',
    model_max_length=128,
    do_lower_case=False,
)

In [8]:
# Disabled experiment: MeCab morpheme segmentation of the test sentence.
# The code is kept inside a bare string literal so nothing in it executes;
# the cell merely echoes the string itself as its output.
'''
from konlpy.tag import Mecab
mecab = Mecab()

test_mecab = mecab.morphs(test_sentence[0])
print(test_mecab)
print(test_mecab[1])
print(type(test_mecab))

data = ' '.join(test_mecab)
print(data)
'''

"\nfrom konlpy.tag import Mecab\nmecab = Mecab()\n\ntest_mecab = mecab.morphs(test_sentence[0])\nprint(test_mecab)\nprint(test_mecab[1])\nprint(type(test_mecab))\n\ndata = ' '.join(test_mecab)\nprint(data)\n"

In [9]:
# Encode the sample sentence with the HuggingFace tokenizer, requesting
# PyTorch tensors ("pt") for the returned fields.
tokenized_input = tokenizer(test_sentence[0], return_tensors="pt")

In [17]:
# Look up id 8001 in the HuggingFace tokenizer's vocabulary (the same index
# that is inspected in the GluonNLP vocabulary elsewhere in this notebook).
tokenizer.convert_ids_to_tokens(8001)

'##힙'

In [11]:
# Unpack the encoding into plain Python lists. Each result is wrapped in an
# outer list, so it keeps a leading batch dimension of one.

# token ids (each token converted to its vocabulary index)
token_ids = [tokenized_input["input_ids"].tolist()[0]]
# token strings, obtained by mapping every id back to its token
token_str = [[tokenizer.convert_ids_to_tokens(token_id) for token_id in token_ids[0]]]
# attention mask (1 marks a token that matters)
token_attention_mask = [tokenized_input["attention_mask"].tolist()[0]]
# segment ids (0: first sentence, 1: following sentence)
token_type_ids = [tokenized_input["token_type_ids"].tolist()[0]]

# One value per output line, in the order: tokens, ids, mask, segment ids.
print(token_str, token_ids, token_attention_mask, token_type_ids, sep="\n")

# Spot-check a single vocabulary entry.
print(tokenizer.convert_ids_to_tokens(6235))


[['[CLS]', '식당', '##에', '[UNK]', '밥', '##을', '배', '부르', '##게', '먹고', '문서', '중앙', '##화', '낙', '##시', '##배', '##를', '타고', '고', '##기', '잡고', '요', '##트', '##배', '##를', '타고', '관광', '##을', '해야', '[UNK]', '[SEP]']]
[[2, 3007, 6896, 0, 2266, 7088, 2287, 2432, 5400, 2011, 2121, 4269, 7941, 1404, 6705, 6312, 6116, 4700, 993, 5561, 3951, 3480, 7659, 6312, 6116, 4700, 1080, 7088, 5010, 0, 3]]
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
##문을


In [None]:
# Re-encode the sample sentence and derive every field from this same
# encoding. Reading some fields from the earlier `tokenized_input` instead
# would mix two encodings and make the cell depend on another cell having
# been run first.
tokenized_input1 = tokenizer(test_sentence[0], return_tensors="pt")

# token ids (each token converted to its vocabulary index)
token_ids = [tokenized_input1['input_ids'].tolist()[0]]
# token strings, obtained by mapping every id back to its token
token_str = [[tokenizer.convert_ids_to_tokens(s) for s in token_ids[0]]]
# attention mask (1 marks a token that matters)
token_attention_mask = [tokenized_input1['attention_mask'].tolist()[0]]
# segment ids (0: first sentence, 1: following sentence)
token_type_ids = [tokenized_input1['token_type_ids'].tolist()[0]]

print(token_str)
print(token_ids)
print(token_attention_mask)
print(token_type_ids)