# **Create a Vocab File Using Google SentencePiece**
Data: download 'pages-articles.xml.bz2' from [kowiki](https://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8D%B0%EC%9D%B4%ED%84%B0%EB%B2%A0%EC%9D%B4%EC%8A%A4_%EB%8B%A4%EC%9A%B4%EB%A1%9C%EB%93%9C) and process it with [wikiextractor](https://github.com/attardi/wikiextractor).  
The 'kowiki.txt' file used here is the resulting processed data.

In [1]:
# Install the sentencepiece package (imported as `spm` in a later cell).
!pip install sentencepiece



In [2]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Drive folder that later cells read kowiki.txt from and copy the trained
# model/vocab files into.
path = "/content/drive/My Drive/Data/ML tutorial"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import sentencepiece as spm
import shutil

In [4]:
# Copy the corpus from Drive into the working directory; the trainer cell
# refers to it by its bare local filename.
shutil.copy(src=f"{path}/kowiki.txt", dst="kowiki.txt")

'kowiki.txt'

In [5]:
# Show the contents of the working directory, one entry per line.
for entry in os.listdir('.'):
  print(entry)

.config
kowiki.txt
drive
sample_data


In [7]:
# Create Vocab
# Train a BPE SentencePiece model on the local corpus, then copy the two
# resulting artifacts (<prefix>.model and <prefix>.vocab) to the Drive folder.
corpus = 'kowiki.txt'
prefix = 'kowiki'
vocab_size = 8000

# Pieces reserved on top of `vocab_size`: the four control pieces with fixed
# ids (pad/unk/bos/eos) plus the user-defined symbols below.
user_symbols = ["[SEP]", "[CLS]", "[MASK]"]
num_reserved = 4 + len(user_symbols)  # 7 in total

# The trainer receives one space-separated flag string; the flags are kept in
# a list so each one can carry its own comment.
train_flags = [
    f"--input={corpus}",
    f"--model_prefix={prefix}",
    f"--vocab_size={vocab_size + num_reserved}",
    "--model_type=bpe",
    "--max_sentence_length=999999",  # maximum sentence length
    "--pad_id=0 --pad_piece=[PAD]",  # pad (0)
    "--unk_id=1 --unk_piece=[UNK]",  # unknown (1)
    "--bos_id=2 --bos_piece=[BOS]",  # begin of sequence (2)
    "--eos_id=3 --eos_piece=[EOS]",  # end of sequence (3)
    f"--user_defined_symbols={','.join(user_symbols)}",  # user-defined tokens
]
spm.SentencePieceTrainer.train(" ".join(train_flags))

# List the Drive folder as it is before the new files are copied in.
for f in os.listdir(path):
  print(f)
# Artifact names are derived from `prefix` so they always match what the
# trainer was asked to write.
shutil.copy(f"{prefix}.model", f"{path}/{prefix}.model")
shutil.copy(f"{prefix}.vocab", f"{path}/{prefix}.vocab")

kowiki.txt


'/content/drive/My Drive/Data/ML tutorial/kowiki.vocab'

In [8]:
# Test Vocab
vocab_file = f'{path}/kowiki.model'
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)

lines = [
         "따뜻한 핫초코 한 잔 정도는 괜찮잖아?",
         "봄이 오고 있나봐요, 봄바람 냄새가 나요.",
         "햇빛이 선명하게 나뭇잎을 핥고 있었다.",
]
for line in lines:
  pieces = vocab.encode_as_pieces(line)
  ids = vocab.encode_as_ids(line)
  print(line)
  print(pieces)
  print(ids)
  print()

따뜻한 핫초코 한 잔 정도는 괜찮잖아?
['▁따', '뜻', '한', '▁', '핫', '초', '코', '▁한', '▁잔', '▁정', '도는', '▁', '괜', '찮', '잖', '아', '?']
[146, 4016, 3603, 3587, 4803, 3788, 3786, 34, 1738, 35, 799, 3587, 5728, 5319, 5941, 3621, 4245]

봄이 오고 있나봐요, 봄바람 냄새가 나요.
['▁봄', '이', '▁오', '고', '▁있', '나', '봐', '요', ',', '▁봄', '바', '람', '▁', '냄', '새', '가', '▁나', '요', '.']
[3253, 3588, 62, 3600, 11, 3628, 4805, 3760, 3595, 3253, 3739, 3894, 3587, 4911, 3933, 3599, 57, 3760, 3590]

햇빛이 선명하게 나뭇잎을 핥고 있었다.
['▁', '햇', '빛', '이', '▁선', '명', '하게', '▁나', '뭇', '잎', '을', '▁', '핥', '고', '▁있었다', '.']
[3587, 5045, 4298, 3588, 55, 3692, 174, 57, 5190, 4384, 3598, 3587, 6718, 3600, 350, 3590]

