In [1]:
import os
import re
import numpy as np

from pororo import Pororo
from jamo import h2j, j2hcj

from io import StringIO
from contextlib import redirect_stdout
from koparadigm import Paradigm, prettify

from pkg.util import read_problemsheet, write_problemsheet
from pkg.parse import *
from pkg.words import *
from pkg.dataset import ProblemDataset

from problems import *

import torch, torch.nn as nn, torch.nn.functional as F
import time

import sentencepiece as spm


# Initial vocabulary size. The tokenizer-training cells further down reassign
# vocab_size (to 512 and 768) before they call spm_train.
vocab_size = 51

# Directory holding the tokenizer artefacts; also the base of spm_train's default prefix.
dir_token = 'tokenization'
# Text file the text-generation cell writes its generated problem texts to.
path_text = os.path.join(dir_token, 'text4token.txt')


In [43]:
def spm_train(corpus, vocab_size, prefix=dir_token + '/prob', lowercase=False,
              add_space_token=False, user_defined_symbols='<cls>'):
    """Train a unigram SentencePiece model and write it under a derived prefix.

    The model prefix handed to SentencePiece is ``prefix`` followed by
    ``_<vocab_size>``, then ``_lower`` when ``lowercase`` is true, then one more
    ``_`` when ``add_space_token`` is true.

    Args:
        corpus: forwarded as ``--input``.
        vocab_size: forwarded as ``--vocab_size``; also part of the prefix.
        prefix: base of ``--model_prefix`` before the suffixes described above.
        lowercase: if true, the normalization rule is ``nmt_nfkc_cf`` instead
            of ``nmt_nfkc``.
        add_space_token: if true, ``▁`` is appended to the user-defined symbols.
        user_defined_symbols: comma-separated string forwarded as
            ``--user_defined_symbols``.
    """
    model_prefix = prefix + '_' + str(vocab_size)
    normalization_rule = 'nmt_nfkc'
    symbols = user_defined_symbols

    if lowercase:
        normalization_rule = normalization_rule + '_cf'
        model_prefix = model_prefix + '_lower'
    if add_space_token:
        model_prefix = model_prefix + '_'
        symbols = symbols + ',▁'

    flags = [
        f"--input={corpus}",
        f"--model_prefix={model_prefix}",
        f"--vocab_size={vocab_size}",
        "--model_type=unigram",
        "--max_sentence_length=999999",            # maximum sentence length
        "--pad_id=0", "--pad_piece=<pad>",         # padding            -> id 0
        "--bos_id=1", "--bos_piece=<bos>",         # begin of sequence  -> id 1
        "--eos_id=2", "--eos_piece=<eos>",         # end of sequence    -> id 2
        "--unk_id=3", "--unk_piece=<unk>",         # unknown            -> id 3
        "--train_extremely_large_corpus=true",
        f"--normalization_rule_name={normalization_rule}",
        f"--user_defined_symbols={symbols}",       # user-defined tokens
    ]
    # The trainer receives one space-separated argument string.
    spm.SentencePieceTrainer.train(' '.join(flags))

In [53]:
vocab_size = 512

# Words registered as SentencePiece user-defined symbols. Per the output printed
# below this cell, these are person names, the (가)...(하) labels, the letters A-Z,
# the Korean numerals in pos_mdn, and the integers 0-99 as strings.
# Other word lists (ws_vehicle, ws_location, ws_stationary, ws_food, ws_flower,
# ws_animal, ws_unit, ws_subject, ws_color, ...) were commented out in this cell
# and are not included.
_ws_user_defined = []
_ws_user_defined += ws_person + ws_gname + ws_variable
_ws_user_defined += pos_mdn
_ws_user_defined += [str(i) for i in range(100)]
print('ws_predifined', len(_ws_user_defined))

# spm_train expects the symbols as a single comma-separated string.
user_defined_symbols = '<cls>'
user_defined_symbols += ',' + ','.join(_ws_user_defined)

print(user_defined_symbols)

# `corpus` was never defined in any visible cell, so this call raised NameError
# on a fresh kernel. The training input is now path_text, the file written by
# the text-generation cell.
# NOTE(review): confirm path_text is the corpus the original run trained on.
# Two models are trained: with and without the explicit space token.
for add_space_token in (True, False):
    spm_train(path_text, vocab_size, lowercase=False, add_space_token=add_space_token,
              user_defined_symbols=user_defined_symbols)

ws_predifined 162
<cls>,남준,석진,윤기,호석,지민,태형,정국,민영,유정,은지,유나,(가),(나),(다),(라),(마),(바),(사),(아),(자),(차),(카),(타),(파),(하),A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,영,한,두,세,네,다섯,여섯,일곱,여덟,아홉,열,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99


In [61]:
vocab_size = 768

# Words registered as SentencePiece user-defined symbols. Per the output printed
# below this cell, these are person names, the (가)...(하) labels, the letters A-Z,
# the unit words in ws_unit, the Korean numerals in pos_mdn, and the integers
# 0-99 as strings.
_ws_user_defined = []
_ws_user_defined += ws_person + ws_gname + ws_variable
_ws_user_defined += ws_unit
_ws_user_defined += pos_mdn
_ws_user_defined += [str(i) for i in range(100)]
print('ws_predifined', len(_ws_user_defined))

# spm_train expects the symbols as a single comma-separated string.
user_defined_symbols = '<cls>'
user_defined_symbols += ',' + ','.join(_ws_user_defined)

print(user_defined_symbols)

# `corpus` was never defined in any visible cell, so this call raised NameError
# on a fresh kernel. The training input is now path_text, the file written by
# the text-generation cell.
# NOTE(review): confirm path_text is the corpus the original run trained on.
# Two models are trained: with and without the explicit space token.
for add_space_token in (True, False):
    spm_train(path_text, vocab_size, lowercase=False, add_space_token=add_space_token,
              user_defined_symbols=user_defined_symbols)

ws_predifined 188
<cls>,남준,석진,윤기,호석,지민,태형,정국,민영,유정,은지,유나,(가),(나),(다),(라),(마),(바),(사),(아),(자),(차),(카),(타),(파),(하),A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,km,m,cm,mm,kg,g,ton,ml,liter,마리,송이,대,그루,권,자루,쪽,살,켤레,줄,벌,타,조각,리터,점,점수,가지,영,한,두,세,네,다섯,여섯,일곱,여덟,아홉,열,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99


In [None]:
# NOTE(review): this line is not valid Python, so executing the cell fails and a
# "Run All" stops here. Presumably that is deliberate, to keep the cells below
# from running unattended (the recorded text-generation run took about 4490 s).
# Confirm the intent before removing or replacing it.
intentional error

# Generate Text Data for Tokenization

In [36]:
# Pororo is already imported in the first cell; the import is repeated here.
from pororo import Pororo
# Load the Korean "gec" task. The log printed below this cell reports the
# charbert.base.ko.spacing model and says it corrects spacing errors.
# NOTE(review): the result is unpacked into two values (gec, gec_factory), which
# presumably relies on a locally modified pororo package — confirm.
gec, gec_factory = Pororo(task="gec", lang="kr")

Pororo Factory Base, task_config TaskConfig(task='gec', lang='ko', n_model='charbert.base.ko.spacing')
Init PororoGecFactory
gec
ko
None
As of now, this beta model tries to correct spacing errors in Korean text.


In [37]:
# The 44 problem definitions handed to ProblemDataset, grouped one chapter
# (P1 ... P9) per row or pair of rows.
# NOTE(review): P2_2_2 is listed but P2_2_1 is not, unlike the other chapters
# that list a *_2_1 entry — confirm this is intended.
problems = [P1_1_1, P1_1_2, P1_1_3, P1_1_4, P1_1_5, P1_1_6, P1_1_7, P1_1_8, P1_1_9, P1_1_10, P1_1_11, P1_1_12, 
            P1_2_1, P1_2_2, P1_3_1, P1_4_1, 
            P2_1_1, P2_2_2, P2_3_1, 
            P3_1_1, P3_2_1, P3_2_2, P3_3_1, 
            P4_1_1, P4_2_1, P4_2_2, P4_3_1, 
            P5_1_1, P5_2_1, P5_3_1,
            P6_1_1, P6_3_1, P6_4_1,
            P7_1_1, P7_1_2, P7_3_1,
            P8_1_1, P8_2_1, P8_3_1, 
            P9_1_1, P9_2_1, P9_2_2, P9_3_1, P9_3_2]

In [38]:
# Dataset over the problem list above; the gec object loaded earlier is passed in.
# The generation cell reads the 'text' field of each batch via ds[j].
ds = ProblemDataset(problems, batch_size=64, gec=gec)

In [39]:
# Build the text file at path_text by sweeping the whole dataset NUM_PASSES times.
# Presumably ProblemDataset produces fresh problem texts on every access, which
# is why the sweep is repeated — TODO confirm.
# The recorded run below took about 4490 s and produced 1,408,000 texts.
NUM_PASSES = 500

# Create the output directory up front so a missing directory fails immediately
# rather than after the long generation loop.
os.makedirs(os.path.dirname(path_text) or '.', exist_ok=True)

texts = []
start = time.time()
for i in range(NUM_PASSES):
    for j in range(len(ds)):
        batch = ds[j]
        texts.extend(batch['text'])
# texts starts empty and only grows by each batch's texts, so its length is the
# same total the original running counter accumulated.
count = len(texts)
print(time.time() - start)
print('num of lines', count)
# Write explicitly as UTF-8: the texts are Korean and SentencePiece reads its
# training input as UTF-8, whereas the platform default encoding may differ.
with open(path_text, 'w', encoding='utf-8') as f:
    f.write('\n'.join(texts))

4490.60777592659
num of lines 1408000
