# SentencePiece VS Huggingface tokenizer

한국어 서브워드 분절 알고리즘 실습&비교

201229

고우영

| 학습시간 비교(s) | 8000   | 16000 | 32000 | 64000 | 128000|
|------|------|------|------|------|------|
|   Sentencepiece  | 11| 22 |48| 110 |282|
|   hugging_face  | 10| 11 |11| 12 |12|

| 추론시간 비교(s) | 8000| 128000|
|------|------|------|
|   Sentencepiece  | 4.5| 4.93 |
|   hugging_face  | 4.9| 4.97 |

# NSMC 데이터셋 로드
## 15만 문장, 113만 word(띄어쓰기 기준), 평균 7.5word/sentence

In [1]:
%%time
# NSMC 데이터 로드
import pandas as pd
f_train = pd.read_csv('data/nsmc.txt', sep='\t')
train_pair = [(row[1], row[2]) for _, row in f_train.iterrows() if type(row[1]) == str]  # nan 제거

#  문장 및 라벨 데이터 추출
train_data  = [pair[0] for pair in train_pair]
train_label = [pair[1] for pair in train_pair]
print('data loading done!')
print('문장: %s' %(train_data[:3]))
print('라벨: %s' %(train_label[:3]))

# subword 학습을 위해 문장만 따로 저장
with open('data/train_tokenizer.txt', 'w', encoding='utf-8') as f:
    for line in train_data:
        f.write(line+'\n')

# subword 학습을 위해 문장만 따로 저장
with open('data/train_tokenizer.txt', 'r', encoding='utf-8') as f:
    test_tokenizer = f.read().split('\n')
print(test_tokenizer[:3])

num_word_list = [len(sentence.split()) for sentence in test_tokenizer]
print('\n코퍼스 문장수/평균/총 단어 갯수 : %d, %.1f / %d' % (len(num_word_list), sum(num_word_list)/len(num_word_list), sum(num_word_list)))

data loading done!
문장: ['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '너무재밓었다그래서보는것을추천한다']
라벨: [0, 1, 0]
['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '너무재밓었다그래서보는것을추천한다']

코퍼스 문장수/평균/총 단어 갯수 : 149996, 7.6 / 1137736
Wall time: 27.2 s


# SentencePiece 학습

In [5]:
%%time

# spm_train --input=data/train_tokenizer.txt  --model_prefix=sentencepiece/sp --vocab_size=32000 character_coverage=1.0 --model_type="unigram"

import sentencepiece as spm
import os

input_file = 'data/train_tokenizer.txt'
vocab_size = 32000
sp_model_root='sentencepiece'
if not os.path.isdir(sp_model_root):
    os.mkdir(sp_model_root)
sp_model_name = 'tokenizer_%d' % (vocab_size)
sp_model_path = os.path.join(sp_model_root, sp_model_name)
model_type = 'unigram'  # bpe
character_coverage  = 1.0  # 0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99], [unused100],[unused101],[unused102],[unused103],[unused104],[unused105],[unused106],[unused107],[unused108],[unused109],[unused110],[unused111],[unused112],[unused113],[unused114],[unused115],[unused116],[unused117],[unused118],[unused119],[unused120],[unused121],[unused122],[unused123],[unused124],[unused125],[unused126],[unused127],[unused128],[unused129],[unused130],[unused131],[unused132],[unused133],[unused134],[unused135],[unused136],[unused137],[unused138],[unused139],[unused140],[unused141],[unused142],[unused143],[unused144],[unused145],[unused146],[unused147],[unused148],[unused149],[unused150],[unused151],[unused152],[unused153],[unused154],[unused155],[unused156],[unused157],[unused158],[unused159],[unused160],[unused161],[unused162],[unused163],[unused164],[unused165],[unused166],[unused167],[unused168],[unused169],[unused170],[unused171],[unused172],[unused173],[unused174],[unused175],[unused176],[unused177],[unused178],[unused179],[unused180],[unused181],[unused182],[unused183],[unused184],[unused185],[unused186],[unused187],[unused188],[unused189],[unused190],[unused191],[unused192],[unused193],[unused194],[unused195],[unused196],[unused197],[unused198],[unused199]'
input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument%(input_file, sp_model_path, vocab_size, user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)
print('train done')

train done
Wall time: 22.6 s


In [6]:
## check
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format(sp_model_path))

tokens = sp.encode_as_pieces('나는 오늘 아침밥을 먹었다.')
ids = sp.encode_as_ids('나는 오늘 아침밥을 먹었다.')

print(ids)
print(tokens)

tokens = sp.decode_pieces(tokens)
ids = sp.decode_ids(ids)

print(ids)
print(tokens)

[426, 658, 6112, 3260, 131, 8423, 140, 120]
['▁나는', '▁오늘', '▁아침', '밥', '을', '▁먹었', '다', '.']
나는 오늘 아침밥을 먹었다.
나는 오늘 아침밥을 먹었다.


# Huggingface tokenizer

## 1. Huggingface setup

## 2. Huggingface train

In [13]:
%%time
from tokenizers import SentencePieceBPETokenizer, BertWordPieceTokenizer

how_to_tokenize = BertWordPieceTokenizer
# how_to_tokenize = SentencePieceBPETokenizer

# Initialize a tokenizer
if str(how_to_tokenize) == str(BertWordPieceTokenizer):
    print('BertWordPieceTokenizer')
    tokenizer = how_to_tokenize(
        strip_accents=False,  # Must be False if cased model
        lowercase=False,
    )
elif str(how_to_tokenize) == str(SentencePieceBPETokenizer):
    print('SentencePieceBPETokenizer')
    tokenizer = how_to_tokenize()
else:
    assert('select right tokenizer')

#########################################
corpus_file   = ['data/train_tokenizer.txt']  # data path
vocab_size    = 32000
limit_alphabet= 6000
output_path   = 'hugging_%d'%(vocab_size)

hf_model_root='huggingface'
if not os.path.isdir(hf_model_root):
    os.mkdir(hf_model_root)
hf_model_name = 'tokenizer_%d.json' % (vocab_size)
hf_model_path = os.path.join(hf_model_root, hf_model_name)

min_frequency = 5

special_tokens=['[BOS]', '[EOS]', '[UNK0]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]', '[UNK5]', '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]',
                '[unused0]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]',
                '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]',
                '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]',
                '[unused30]', '[unused31]', '[unused32]', '[unused33]', '[unused34]', '[unused35]', '[unused36]', '[unused37]', '[unused38]', '[unused39]',
                '[unused40]', '[unused41]', '[unused42]', '[unused43]', '[unused44]', '[unused45]', '[unused46]', '[unused47]', '[unused48]', '[unused49]',
                '[unused50]', '[unused51]', '[unused52]', '[unused53]', '[unused54]', '[unused55]', '[unused56]', '[unused57]', '[unused58]', '[unused59]',
                '[unused60]', '[unused61]', '[unused62]', '[unused63]', '[unused64]', '[unused65]', '[unused66]', '[unused67]', '[unused68]', '[unused69]',
                '[unused70]', '[unused71]', '[unused72]', '[unused73]', '[unused74]', '[unused75]', '[unused76]', '[unused77]', '[unused78]', '[unused79]',
                '[unused80]', '[unused81]', '[unused82]', '[unused83]', '[unused84]', '[unused85]', '[unused86]', '[unused87]', '[unused88]', '[unused89]',
                '[unused90]', '[unused91]', '[unused92]', '[unused93]', '[unused94]', '[unused95]', '[unused96]', '[unused97]', '[unused98]', '[unused99]',
                '[unused100]', '[unused101]', '[unused102]', '[unused103]', '[unused104]', '[unused105]', '[unused106]', '[unused107]', '[unused108]', '[unused109]',
                '[unused110]', '[unused111]', '[unused112]', '[unused113]', '[unused114]', '[unused115]', '[unused116]', '[unused117]', '[unused118]', '[unused119]',
                '[unused120]', '[unused121]', '[unused122]', '[unused123]', '[unused124]', '[unused125]', '[unused126]', '[unused127]', '[unused128]', '[unused129]',
                '[unused130]', '[unused131]', '[unused132]', '[unused133]', '[unused134]', '[unused135]', '[unused136]', '[unused137]', '[unused138]', '[unused139]',
                '[unused140]', '[unused141]', '[unused142]', '[unused143]', '[unused144]', '[unused145]', '[unused146]', '[unused147]', '[unused148]', '[unused149]',
                '[unused150]', '[unused151]', '[unused152]', '[unused153]', '[unused154]', '[unused155]', '[unused156]', '[unused157]', '[unused158]', '[unused159]',
                '[unused160]', '[unused161]', '[unused162]', '[unused163]', '[unused164]', '[unused165]', '[unused166]', '[unused167]', '[unused168]', '[unused169]',
                '[unused170]', '[unused171]', '[unused172]', '[unused173]', '[unused174]', '[unused175]', '[unused176]', '[unused177]', '[unused178]', '[unused179]',
                '[unused180]', '[unused181]', '[unused182]', '[unused183]', '[unused184]', '[unused185]', '[unused186]', '[unused187]', '[unused188]', '[unused189]',
                '[unused190]', '[unused191]', '[unused192]', '[unused193]', '[unused194]', '[unused195]', '[unused196]', '[unused197]', '[unused198]', '[unused199]'
               ]  # 스페셜 토큰

# Then train it!
tokenizer.train(files=corpus_file,
               vocab_size=vocab_size,
               min_frequency=min_frequency,  # 단어의 최소 발생 빈도, 5
               limit_alphabet=limit_alphabet,
               show_progress=True,
               special_tokens=special_tokens
               )

# And finally save it somewhere
tokenizer.save(hf_model_path)

output = tokenizer.encode("나는 오늘 아침밥을 먹었다.")
print('idx   : %s'%output.ids)
print('tokens: %s'%output.tokens)
print('offset: %s'%output.offsets)

output = tokenizer.encode("교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정")
print('idx   : %s'%output.ids)
print('tokens: %s'%output.tokens)
print('offset: %s'%output.offsets)

BertWordPieceTokenizer
idx   : [6046, 6200, 7718, 4283, 3269, 1567, 5690, 225]
tokens: ['나는', '오늘', '아침', '##밥', '##을', '먹', '##었다', '.']
offset: [(0, 2), (3, 5), (6, 8), (8, 9), (9, 10), (11, 12), (12, 14), (14, 15)]
idx   : [26007, 5786, 12757, 225, 225, 5916, 6576, 5754, 225, 225, 5672, 13705]
tokens: ['교도소', '이야기', '##구먼', '.', '.', '솔직히', '재미는', '없다', '.', '.', '평점', '조정']
offset: [(0, 3), (4, 7), (7, 9), (10, 11), (11, 12), (12, 15), (16, 19), (20, 22), (22, 23), (23, 24), (24, 26), (27, 29)]
Wall time: 6.53 s


## 3. Huggingface Tokenize test

In [14]:
from tokenizers import Tokenizer

how_to_tokenize = BertWordPieceTokenizer
# how_to_tokenize = SentencePieceBPETokenizer
vocab_size    = 32000

tokenizer = Tokenizer.from_file(hf_model_path)


output = tokenizer.encode("나는 오늘 아침밥을 먹었다.")
print('idx   : %s'%output.ids)
print('tokens: %s'%output.tokens)
print('offset: %s'%output.offsets)

output = tokenizer.encode("교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정")
print('idx   : %s'%output.ids)
print('tokens: %s'%output.tokens)
print('offset: %s'%output.offsets)

idx   : [6046, 6200, 7718, 4283, 3269, 1567, 5690, 225]
tokens: ['나는', '오늘', '아침', '##밥', '##을', '먹', '##었다', '.']
offset: [(0, 2), (3, 5), (6, 8), (8, 9), (9, 10), (11, 12), (12, 14), (14, 15)]
idx   : [26007, 5786, 12757, 225, 225, 5916, 6576, 5754, 225, 225, 5672, 13705]
tokens: ['교도소', '이야기', '##구먼', '.', '.', '솔직히', '재미는', '없다', '.', '.', '평점', '조정']
offset: [(0, 3), (4, 7), (7, 9), (10, 11), (11, 12), (12, 15), (16, 19), (20, 22), (22, 23), (23, 24), (24, 26), (27, 29)]


# Tokenze usage

## 1. SentencePiece Usage, load & 분절

In [32]:
%%time
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format(sp_model_path))
sentencepiece_tokenizer = sp.encode_as_pieces

result_tokenized_sentencepiece = [sentencepiece_tokenizer(_tmp) for _tmp in test_tokenizer[:3]]
    
for tmp in result_tokenized_sentencepiece[:3]:
    print(tmp)

['▁아', '▁더빙', '..', '▁진짜', '▁짜증나네요', '▁목소리']
['▁흠', '...', '포스터보고', '▁초딩영화', '줄', '....', '오버', '연기', '조차', '▁가볍지', '▁않', '구나']
['▁너무', '재', '밓', '었다', '그래서', '보는것', '을', '추천', '한다']
Wall time: 80 ms


## 2. Huggingface Usage, load & 분절

In [33]:
%%time
from tokenizers import Tokenizer

how_to_tokenize = BertWordPieceTokenizer
# how_to_tokenize = SentencePieceBPETokenizer
vocab_size    = 32000

tokenizer = Tokenizer.from_file(hf_model_path)

result_tokenized_sentencepiece = [tokenizer.encode(_tmp).tokens for _tmp in test_tokenizer[:3]]
for tmp in result_tokenized_sentencepiece[:3]:
    print(tmp)


['아', '더빙', '.', '.', '진짜', '짜증나네요', '목소리']
['흠', '.', '.', '.', '포스터보고', '초딩영화', '##줄', '.', '.', '.', '.', '오버', '##연기', '##조차', '가볍지', '않', '##구나']
['너무', '##재', '##밓', '##었다', '##그래서', '##보는', '##것을', '##추천', '##한다']
Wall time: 285 ms
