In [1]:
#================================================================================
# xlm-roberta-base 모델은 tokenizer가 unigram임.
# => 여기서는 SentencePieceBPETokenizer 를 이용하여 word 추출하는 방법에 대해 설명함
# => ** SentencePieceBPETokenizer를 이용하여 추출된 vocab들은 
#    'tokenizer_sample/make_mecab_vocab.ipynb' 방식 보다 명사 추출이 떨어지는것 같음
#
# -출처: Unigram tokenizer 사용방법: https://towardsdatascience.com/training-bpe-wordpiece-and-unigram-tokenizers-from-scratch-using-hugging-face-3dd174850713
# -출처 : mecab 사용한 형태소 분석 : https://keep-steady.tistory.com/37
# -출처 : mecab 윈도우 설치 방법 : https://uwgdqo.tistory.com/363
#================================================================================

import sys

import torch
import numpy as np
import pandas as pd
import tokenizers
from tokenizers import (ByteLevelBPETokenizer,
                        CharBPETokenizer,
                        SentencePieceBPETokenizer,
                        BertWordPieceTokenizer)

import konlpy
from konlpy.tag import Mecab
from tqdm.notebook import tqdm

# 윈도우일때 사용 법
# 설치방법은 아래 참조
# 참조 : https://uwgdqo.tistory.com/363
#mecab = Mecab(dicpath=r"C:/mecab/mecab-ko-dic")
#mecab.pos('아버지가방에들어가신다')

In [2]:
# Read the Korean corpus that the tokenizer will be trained on.
corpus_path = '../../../korpora/kowiki/moco-corpus.txt'
# Alternative corpus path (disabled): '../../../korpora/bong_eval.txt'

# The whole file is read at once and cut on '\n', giving one list entry per
# line; note that split('\n') keeps a trailing empty entry when the file ends
# with a newline.
with open(corpus_path, mode='r', encoding='utf-8') as corpus_file:
    data = corpus_file.read().split('\n')

# Quick sanity check: show the first three lines.
print(data[:3])

['Refer to the V$SYSTEM_EVENT view for time waited and average waits for thefollowing actions:', 'To estimate the time waited for reads incurred by rereading data blocks that had tobe written to disk because of a request from another instance, multiply the statistic(for example, the time waited for db ﬁle sequential reads) by the percentage of readI/O caused by previous cache ﬂushes as shown in this formula:', 'Where "lock buffers for read" is the value for lock converts from N to S derived fromV$LOCK_ACTIVITY and "physical reads" is from the V$SYSSTAT view.']


In [3]:
#===================================================================
# Morphological analysis of the corpus with MeCab (through KoNLPy)
# - Reference (morphological analysis with mecab): https://keep-steady.tistory.com/37
# - Reference (installing mecab on Windows): https://uwgdqo.tistory.com/363
#
# Per the original notes for this cell: on Linux the tokenizer is created
# with Mecab(), on Windows with an explicit dictionary path. The platform is
# detected here so the cell no longer has to be edited by hand per machine.
#===================================================================
WINDOWS_MECAB_DICPATH = r"C:/mecab/mecab-ko-dic"
if sys.platform.startswith('win'):
    mecab_tokenizer = Mecab(dicpath=WINDOWS_MECAB_DICPATH)
else:
    mecab_tokenizer = Mecab()

# e.g. '어릴때' -> '어릴', '때' in the normal case
print('*mecab 형태소 분석==>')
# morphs() splits a sentence into all of its morphemes (not only nouns);
# one token list is stored per input line, in input order.
total_morph = [mecab_tokenizer.morphs(sentence) for sentence in tqdm(data)]

# Sanity check: first three analysed sentences and the number of sentences.
print(total_morph[:3])
print(len(total_morph))

# Save the MeCab-analysed corpus.
print('*mecab 분석 데이터 저장==>')
# Alternative output path (disabled): '../../../korpora/kowiki/mecab-kowiki-202206-nlp-corpus.txt'
mecab_corpus_path = '../../../korpora/kowiki/moco-corpus-mecab.txt'

# One output line per sentence, morphemes joined by single spaces,
# e.g. '어릴 때 보 고 지금 다시 봐도 재밌 어요 ㅋㅋ'
with open(mecab_corpus_path, 'w', encoding='utf-8') as f:
    f.writelines(' '.join(morphs) + '\n' for morphs in tqdm(total_morph))

mecab 형태소 분석==>


  0%|          | 0/3291463 [00:00<?, ?it/s]

[['Refer', 'to', 'the', 'V', '$', 'SYSTEM', '_', 'EVENT', 'view', 'for', 'time', 'waited', 'and', 'average', 'waits', 'for', 'thefollowing', 'actions', ':'], ['To', 'estimate', 'the', 'time', 'waited', 'for', 'reads', 'incurred', 'by', 'rereading', 'data', 'blocks', 'that', 'had', 'tobe', 'written', 'to', 'disk', 'because', 'of', 'a', 'request', 'from', 'another', 'instance', ',', 'multiply', 'the', 'statistic', '(', 'for', 'example', ',', 'the', 'time', 'waited', 'for', 'db', 'ﬁ', 'le', 'sequential', 'reads', ')', 'by', 'the', 'percentage', 'of', 'readI', '/', 'O', 'caused', 'by', 'previous', 'cache', 'ﬂ', 'ushes', 'as', 'shown', 'in', 'this', 'formula', ':'], ['Where', '"', 'lock', 'buffers', 'for', 'read', '"', 'is', 'the', 'value', 'for', 'lock', 'converts', 'from', 'N', 'to', 'S', 'derived', 'fromV', '$', 'LOCK', '_', 'ACTIVITY', 'and', '"', 'physical', 'reads', '"', 'is', 'from', 'the', 'V', '$', 'SYSSTAT', 'view', '.']]
3291463
mecab 분석 데이터 저장==>


  0%|          | 0/3291463 [00:00<?, ?it/s]

In [None]:
# 1. Define a SentencePieceBPETokenizer and train it.
# Path of the MeCab-analysed corpus used as training input.
# NOTE(review): this path differs from the file written by the MeCab cell
# earlier in the notebook ('../../../korpora/kowiki/moco-corpus-mecab.txt')
# — confirm this is the corpus that is meant to be trained on.
mecab_corpus_path = '../../../korpora/kowiki-202206-nlp-corpus-mecab.txt'

stokenizer = SentencePieceBPETokenizer(add_prefix_space=True)

# Training inputs, gathered up front and handed to train() below.
training_files = [mecab_corpus_path]
special_tokens = ["<cls>", "<eos>", "<mask>", "<unk>", "<pad>"]  # special tokens to register

# Train
stokenizer.train(
    files=training_files,
    vocab_size=32000,               # maximum vocabulary size
    special_tokens=special_tokens,
    min_frequency=100,              # frequency threshold
    show_progress=True,
    # limit_alphabet=10000 is left disabled
)

In [None]:
# Fetch the trained vocabulary (a token -> value mapping), report its size,
# and list the tokens ordered by their mapped value.
vocab = stokenizer.get_vocab()
print(f'vcoab 길이:{len(vocab)}')
sort_vocab = [token for token, _ in sorted(vocab.items(), key=lambda entry: entry[1])]
print(sort_vocab[:100])  # show the first 100 tokens of the sorted vocabulary

In [None]:
# Save the sorted vocabulary (`sort_vocab`) to a text file, one token per line.
from tqdm.notebook import tqdm

vocab_out = '../../../korpora/kowiki-202206-nlp-corpus-mecab-vocab-32000.txt'
with open(vocab_out, mode='w', encoding='utf-8') as vocab_file:
    # tqdm only wraps the iteration to display progress while lines are written.
    vocab_file.writelines(token + '\n' for token in tqdm(sort_vocab))

In [None]:
# This cell is disabled: its entire body is a triple-quoted string literal,
# so none of the statements inside it are executed.
# The quoted code would import PreTrainedTokenizerFast from transformers and
# wrap the trained `stokenizer` with it (passed as `tokenizer_object`),
# binding the result to `transforer_tokenizer`.
'''
# 2. 훈련한 SetnecePieceBPETokenzer 를 PreTrainedTokenizerFast 와 연동
from transformers import PreTrainedTokenizerFast
transforer_tokenizer = PreTrainedTokenizerFast(tokenizer_object=stokenizer)
'''

In [None]:
# This cell is disabled: its entire body is a triple-quoted string literal,
# so none of the statements inside it are executed.
# The quoted code would create the OUT_PATH directory (no error if it already
# exists) and call save_pretrained(OUT_PATH) on `transforer_tokenizer`.
# That name is only assigned inside the other string-literal (disabled) cell,
# so both cells would have to be re-enabled together.
'''
# PreTrainedTokenizerFast tokenizer 저장
import os
OUT_PATH = '../../../korpora/kowiki-202206-nlp-corpus-vocab'
os.makedirs(OUT_PATH, exist_ok=True)
transforer_tokenizer.save_pretrained(OUT_PATH)
'''