In [None]:
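# Using sentencepiece_model_pb2 required installing google-api-python-client here (NOTE(review): the module itself needs the protobuf runtime; installing `protobuf` directly may be enough - confirm)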
#!pip install --upgrade google-api-python-client

In [1]:
import sentencepiece as spm
import sentencepiece.sentencepiece_model_pb2 as model

In [2]:
# Existing SentencePiece model (the KoBERT tokenizer model) used as the starting point.
old_spmodel_fpath = "Tokenizer/kobert/kobert_news_wiki_ko_cased-ae5711deb3.spiece"

# Output path for the KoBERT model that will contain the newly added piece.
new_spmodel_fpath = "Tokenizer/kobert/kobert_new_0208_1.model"

In [3]:
# Parse the original SentencePiece model file into an editable ModelProto.
m = model.ModelProto()
# Read through a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open(old_spmodel_fpath, 'rb') as model_file:
    serialized_model = model_file.read()
# Kept as the last expression so the cell still displays the value returned
# by ParseFromString.
m.ParseFromString(serialized_model)

371427

In [7]:
# Inspect the first vocabulary entry and the protobuf class used for pieces.
first_piece = m.pieces[0]
print(first_piece)
print(type(first_piece))

piece: "[UNK]"
score: 0.0
type: UNKNOWN

<class 'sentencepiece_model_pb2.SentencePiece'>


In [8]:
# Show a handful of entries (indices 500-504) to see what piece/score pairs look like.
pieces_500_to_504 = m.pieces[500:505]
print(pieces_500_to_504)

[piece: "\342\200\231,"
score: -9.645369529724121
, piece: "\342\200\234"
score: -9.81615161895752
, piece: "\342\200\235"
score: -5.978498935699463
, piece: "\342\200\235,"
score: -9.58655834197998
, piece: "\342\200\262"
score: -10.174786567687988
]


In [9]:
# Append a custom piece to the vocabulary held in the loaded ModelProto.
new_piece_text = "문서중앙화"  # piece to add

# Re-running this cell must not append the same piece a second time:
# a duplicated piece would end up in the saved model
# (NOTE(review): SentencePiece is expected to reject duplicate pieces on load - confirm).
matching_pieces = [p for p in m.pieces if p.piece == new_piece_text]
if matching_pieces:
    # Already present (cell was executed before); reuse the existing entry.
    new_piece = matching_pieces[0]
else:
    # add() appends an empty SentencePiece message to the repeated field
    # and returns it so its fields can be filled in.
    new_piece = m.pieces.add()
    new_piece.piece = new_piece_text
    # Author's note: with score = 0.0 the text is always split into this token
    # (the scores printed earlier in this notebook are all negative).
    new_piece.score = 0.0
    # Same value as the original literal 1, spelled with the enum name.
    new_piece.type = model.ModelProto.SentencePiece.NORMAL

In [10]:
# Save the modified ModelProto as the new SentencePiece model file.
with open(new_spmodel_fpath, "wb") as out_file:
    serialized_proto = m.SerializeToString()
    out_file.write(serialized_proto)

In [22]:
# Load the saved model and tokenize a sentence that contains the added piece.
sp = spm.SentencePieceProcessor(model_file=new_spmodel_fpath)
encoded_pieces = sp.encode("문서중앙화 생성 하기", out_type=str)
print(encoded_pieces)

['▁', '문서중앙화', '▁생', '성', '▁하기']


In [21]:
# Sanity-check the extended model: vocabulary size and the id <-> piece mapping
# of the added piece.
added_piece = '문서중앙화'
# The piece was appended to the end of the pieces list, so it is expected at
# the last id. Deriving the id from the vocabulary size avoids a hard-coded
# 8002 that is only valid for this particular base model.
last_piece_id = sp.GetPieceSize() - 1

print('new_model size: {}'.format(sp.GetPieceSize()))
print('PiecetoId:{}'.format(sp.PieceToId(added_piece)))
print('IdtoPiece: {}'.format(sp.IdToPiece(last_piece_id)))

new_model size: 8003
PiecetoId:8002
IdtoPiece: 문서중앙화


In [13]:
# 새로운 모델을 테스트 해봄 
import torch
import gluonnlp as nlp                  # GluonNLP는 버트를 간단하게 로딩하는 인터페이스를 제공하는 API 임
import numpy as np
from transformers import BertTokenizerFast

In [14]:
# Build the GluonNLP vocabulary, tokenizer and sentence transform from the
# extended SentencePiece model.
# (The original KoBERT vocab file is "Tokenizer/kobert/kobert_news_wiki_ko_cased-ae5711deb3.spiece".)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(
    new_spmodel_fpath,
    padding_token="[PAD]",
)
tok = nlp.data.BERTSPTokenizer(new_spmodel_fpath, vocab, lower=False)
transform = nlp.data.BERTSentenceTransform(
    tok,
    max_seq_length=128,
    pad=True,
    pair=False,
)

In [15]:
# One-element list holding the sentence used for the tokenizer check;
# the sentence contains the added piece "문서중앙화".
test_sentence = [
    "식당에 가서 밥을 배 부르게 먹고 문서중앙화 낙시배를 타고 고기 잡고 요트배를 타고 관광을 해야 겠다",
]

In [16]:
# Run each test sentence through the BERT sentence transform. The transform is
# called with a one-element list because it was built with pair=False.
transform_data = [transform([sentence]) for sentence in test_sentence]

# Unpack the result for the first sentence: token ids, valid length and
# segment ids, in the order the transform returns them.
token_ids, valid_length, segment_ids = transform_data[0]

# Map every token id back to its vocabulary token for a readable view.
test_sentence_list = [vocab.idx_to_token[token_id] for token_id in token_ids]

print("sentece:\r\n{}".format(test_sentence_list))
print("token_ids:\r\n{}".format(token_ids))
print("valid_length:\r\n{}".format(valid_length))
print("segment_ids:\r\n{}".format(segment_ids))

sentece:
['[CLS]', '▁식당', '에', '▁', '가', '서', '▁밥', '을', '▁배', '▁부르', '게', '▁먹고', '▁', '문서중앙화', '▁낙', '시', '배', '를', '▁타고', '▁고', '기', '▁잡고', '▁요', '트', '배', '를', '▁타고', '▁관광', '을', '▁해야', '▁', '겠다', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',