In [1]:
!pip install sentencepiece



In [2]:
!wget https://dumps.wikimedia.org/other/cirrussearch/20190422/jawiki-20190422-cirrussearch-content.json.gz

--2019-04-27 02:31:42--  https://dumps.wikimedia.org/other/cirrussearch/20190422/jawiki-20190422-cirrussearch-content.json.gz
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:4:208:80:155:106, 208.80.155.106
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:4:208:80:155:106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7333820534 (6.8G) [application/octet-stream]
Saving to: ‘jawiki-20190422-cirrussearch-content.json.gz’


2019-04-27 03:30:26 (1.99 MB/s) - ‘jawiki-20190422-cirrussearch-content.json.gz’ saved [7333820534/7333820534]



In [10]:
import json
import gzip
import itertools
import re

In [20]:
def extract_text(input_path, output_path, block_size=100000):
    """
    text processes
    1. lower
    2. (split by newlines)
    3. strip
    """
    
    def flush(text_lines):
        with open(output_path, 'a') as f:
            ret = f.write('\n'.join(filter(
                    lambda _:len(_)>0, 
                    (l.strip() for l in itertools.chain.from_iterable(text_lines))
                )))
            print('write:%s'%(ret))
        text_lines.clear()

    with gzip.open(input_path) as f:
        text_lines = []
        for line in f:
            json_line = json.loads(line)
            if "text" in json_line:
                text = json_line["text"]
                text_lines.append(text.lower().split('\n'))
                if len(text_lines) > block_size:
                    flush(text_lines)
        if len(text_lines) > 0:
            flush(text_lines)
    print('done')

In [22]:
extract_text('jawiki-20190422-cirrussearch-content.json.gz', 'jawiki.txt')

write:206537272
write:183202756
write:186570109
write:181526538
write:198664898
write:192250534
write:234313888
write:225161454
write:229207931
write:212864148
write:223760169
write:113536913
done


In [1]:
!shuf -n800000 jawiki.txt > jawiki_800000.txt

In [1]:
import sentencepiece as sp

vocab_size=30000
vocab_reserved=16
vocab_reserved_used=3 # account for pad, unk, space
vocab_reserved_unused=vocab_reserved - vocab_reserved_used
model_prefix='sp_uncase_ja_%d'%(vocab_size)
_input='jawiki_800000.txt'

command = ' '.join((
    '--pad_id=0',
    '--unk_id=1',
    '--bos_id=-1', 
    '--eos_id=-1',
    '--add_dummy_prefix=False',
    '--user_defined_symbols=\u2581',
    '--input_sentence_size=500000',
    '--vocab_size=%d'%(vocab_size-vocab_reserved_unused), 
    '--model_prefix=%s'%(model_prefix),
    '--input=%s'%(_input),
))
# train ran on python interpreter
# sp.SentencePieceTrainer.Train(command)
"""
sentencepiece_trainer.cc(116) LOG(INFO) Running command: --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --add_dummy_prefix=False --user_defined_symbols=▁ --input_sentence_size=500000 --vocab_size=29987 --model_prefix=sp_uncase_ja_30000 --input=jawiki_800000.txt
sentencepiece_trainer.cc(49) LOG(INFO) Starts training with :
TrainerSpec {
  input: jawiki_800000.txt
  input_format:
  model_prefix: sp_uncase_ja_30000
  model_type: UNIGRAM
  vocab_size: 29987
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 500000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  treat_whitespace_as_suffix: 0
  user_defined_symbols: ▁
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: -1
  eos_id: -1
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇
}
NormalizerSpec {
  name: nmt_nfkc
  add_dummy_prefix: 0
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv:
}

trainer_interface.cc(267) LOG(INFO) Loading corpus: jawiki_800000.txt
trainer_interface.cc(287) LOG(WARNING) Found too long line (5029 > 4192).
trainer_interface.cc(289) LOG(WARNING) Too long lines are skipped in the training.
trainer_interface.cc(290) LOG(WARNING) The maximum length can be changed with --max_sentence_length=<size> flag.
trainer_interface.cc(317) LOG(INFO) Sampled 500000 sentences from 566490 sentences.
trainer_interface.cc(321) LOG(INFO) Skipped 233510 too long sentences.
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <pad>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: ▁
trainer_interface.cc(335) LOG(INFO) Normalizing sentences...
trainer_interface.cc(384) LOG(INFO) all chars count=356559894
trainer_interface.cc(392) LOG(INFO) Done: 99.95% characters are covered.
trainer_interface.cc(402) LOG(INFO) Alphabet size=4792
trainer_interface.cc(403) LOG(INFO) Final character coverage=0.9995
trainer_interface.cc(435) LOG(INFO) Done! preprocessed 500000 sentences.
unigram_model_trainer.cc(129) LOG(INFO) Making suffix array...
unigram_model_trainer.cc(133) LOG(INFO) Extracting frequent sub strings...
unigram_model_trainer.cc(184) LOG(INFO) Initialized 1000000 seed sentencepieces
trainer_interface.cc(441) LOG(INFO) Tokenizing input sentences with whitespace: 500000
trainer_interface.cc(451) LOG(INFO) Done! 499927
unigram_model_trainer.cc(470) LOG(INFO) Using 499927 sentences for EM training
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=754660 obj=3846.39 num_tokens=150019983 num_tokens/piece=198.791
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=634831 obj=3751.48 num_tokens=152882454 num_tokens/piece=240.824
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=476004 obj=3751.04 num_tokens=154629416 num_tokens/piece=324.849
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=475258 obj=3747.82 num_tokens=154974599 num_tokens/piece=326.085
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=356442 obj=3693.68 num_tokens=155843177 num_tokens/piece=437.219
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=356430 obj=3727.13 num_tokens=155941246 num_tokens/piece=437.509
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=267319 obj=3705.3 num_tokens=157876067 num_tokens/piece=590.591
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=267318 obj=3758.53 num_tokens=157895542 num_tokens/piece=590.666
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=200487 obj=3726.27 num_tokens=160790735 num_tokens/piece=802.001
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=200487 obj=3718.87 num_tokens=160796597 num_tokens/piece=802.03
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=150365 obj=3756.64 num_tokens=164323475 num_tokens/piece=1092.83
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=150365 obj=3748.22 num_tokens=164331323 num_tokens/piece=1092.88
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=112773 obj=3794.45 num_tokens=168342289 num_tokens/piece=1492.75
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=112773 obj=3785.19 num_tokens=168347417 num_tokens/piece=1492.8
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=84579 obj=3838.9 num_tokens=172792702 num_tokens/piece=2042.97
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=84579 obj=3828.82 num_tokens=172805805 num_tokens/piece=2043.13
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=63434 obj=3887.71 num_tokens=177664984 num_tokens/piece=2800.78
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=63434 obj=3876.88 num_tokens=177668862 num_tokens/piece=2800.85
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=47575 obj=3942.01 num_tokens=183113762 num_tokens/piece=3848.95
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=47575 obj=3930.28 num_tokens=183112525 num_tokens/piece=3848.92
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=35681 obj=4001.54 num_tokens=189085471 num_tokens/piece=5299.33
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=35681 obj=3988.38 num_tokens=189084926 num_tokens/piece=5299.32
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=32985 obj=4007.85 num_tokens=190813142 num_tokens/piece=5784.85
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=32985 obj=4004.1 num_tokens=190819107 num_tokens/piece=5785.03
trainer_interface.cc(507) LOG(INFO) Saving model: sp_uncase_ja_30000.model
trainer_interface.cc(531) LOG(INFO) Saving vocabs: sp_uncase_ja_30000.vocab
"""
True

True

In [2]:
# Modification of .vocab file to append reserved place holders
with open(model_prefix + '.vocab') as f:
    lines = [_ for _ in f.read().split('\n') if len(_) > 0]

import shutil
shutil.copy(model_prefix + '.vocab', model_prefix + '.old.vocab')
    
lines_reserved = lines[:vocab_reserved_used]
lines_unused = ['unused_%d\t0'%i for i in range(0, vocab_reserved_unused)]
lines_normal = lines[vocab_reserved_used:]
lines = lines_reserved + lines_unused + lines_normal
assert len(lines) == vocab_size, 'len(lines)=%d vs vocab_size=%d'%(len(lines), vocab_size)

with open(model_prefix + '.vocab', 'w') as f:
    f.write('\n'.join(lines))