In [1]:
# Install the SentencePiece Python package, imported later in this notebook
# as `sp` for tokenizer training.
!pip install sentencepiece



In [2]:
# Download the jawiki CirrusSearch content dump (snapshot 20190422) into the
# current directory; the file is a gzipped JSON-lines export (~6.8G).
!wget https://dumps.wikimedia.org/other/cirrussearch/20190422/jawiki-20190422-cirrussearch-content.json.gz

--2019-04-27 02:31:42--  https://dumps.wikimedia.org/other/cirrussearch/20190422/jawiki-20190422-cirrussearch-content.json.gz
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:4:208:80:155:106, 208.80.155.106
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:4:208:80:155:106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7333820534 (6.8G) [application/octet-stream]
Saving to: ‘jawiki-20190422-cirrussearch-content.json.gz’


2019-04-27 03:30:26 (1.99 MB/s) - ‘jawiki-20190422-cirrussearch-content.json.gz’ saved [7333820534/7333820534]



In [7]:
# Download the enwiki CirrusSearch content dump (snapshot 20190422) into the
# current directory; the file is a gzipped JSON-lines export (~26G).
!wget https://dumps.wikimedia.org/other/cirrussearch/20190422/enwiki-20190422-cirrussearch-content.json.gz

--2019-04-27 04:09:59--  https://dumps.wikimedia.org/other/cirrussearch/20190422/enwiki-20190422-cirrussearch-content.json.gz
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:4:208:80:155:106, 208.80.155.106
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:4:208:80:155:106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28155129067 (26G) [application/octet-stream]
Saving to: ‘enwiki-20190422-cirrussearch-content.json.gz’


2019-04-27 07:59:13 (1.95 MB/s) - ‘enwiki-20190422-cirrussearch-content.json.gz’ saved [28155129067/28155129067]



In [10]:
import json
import gzip
import itertools
import re

In [20]:
def extract_text(input_path, output_path, block_size=100000):
    """
    Extract the "text" field of every record in a gzipped JSON-lines file
    and append it to a plain-text file, one output line per text line.

    text processes
    1. lower
    2. (split by newlines)
    3. strip
    Lines that are empty after stripping are dropped.

    Args:
        input_path: path of the gzip-compressed input; each line must be one
            JSON object. Objects without a "text" key are skipped.
        output_path: path of the UTF-8 text file to write. The file is opened
            in append mode, so existing content is kept; remove the file
            before re-running if a fresh extraction is wanted.
        block_size: number of documents buffered in memory between writes.
            A write happens once the buffer holds more than this many
            documents, and once more at the end for the remainder.

    Prints ``write:<n>`` (characters written, newlines included) after each
    block and ``done`` at the end. Returns None.
    """

    def flush(text_lines):
        # Every line gets its own trailing newline. Joining with '\n' left
        # each block unterminated, so the first line of the next appended
        # block was glued onto the last line of the previous one.
        cleaned = (l.strip() for l in itertools.chain.from_iterable(text_lines))
        with open(output_path, 'a', encoding='utf-8') as f:
            ret = sum(f.write(l + '\n') for l in cleaned if len(l) > 0)
        print('write:%s'%(ret))
        text_lines.clear()

    # Explicit UTF-8 on both sides so the result does not depend on the
    # locale's default encoding (the corpus contains non-ASCII text).
    with gzip.open(input_path, 'rt', encoding='utf-8') as f:
        text_lines = []
        for line in f:
            json_line = json.loads(line)
            if "text" in json_line:
                text = json_line["text"]
                text_lines.append(text.lower().split('\n'))
                if len(text_lines) > block_size:
                    flush(text_lines)
        if len(text_lines) > 0:
            flush(text_lines)
    print('done')

In [21]:
# Extract the text of the enwiki dump into enwiki.txt; each "write:" line
# printed below reports the size of one flushed block.
extract_text('enwiki-20190422-cirrussearch-content.json.gz', 'enwiki.txt')

write:369702687
write:358255001
write:339906391
write:329798261
write:317134541
write:325287199
write:314865612
write:303227519
write:313747929
write:297361084
write:271422697
write:262897437
write:289770085
write:302961892
write:279494464
write:245401108
write:234794586
write:232699464
write:260179880
write:268285622
write:243880294
write:283496239
write:230539175
write:232695503
write:318426960
write:307303468
write:253428083
write:264312258
write:338572474
write:336930229
write:289385204
write:228614831
write:260443451
write:230449676
write:233468237
write:218666297
write:222052512
write:217431745
write:220272896
write:218042917
write:235347744
write:331820823
write:334924592
write:358884553
write:465671752
write:544773966
write:521944186
write:598539592
write:513487035
write:652594881
write:621448485
write:590982259
write:575114906
write:490271620
write:391809125
write:536767348
write:760846210
write:786657312
write:350909358
done


In [22]:
# Extract the text of the jawiki dump into jawiki.txt; each "write:" line
# printed below reports the size of one flushed block.
extract_text('jawiki-20190422-cirrussearch-content.json.gz', 'jawiki.txt')

write:206537272
write:183202756
write:186570109
write:181526538
write:198664898
write:192250534
write:234313888
write:225161454
write:229207931
write:212864148
write:223760169
write:113536913
done


In [2]:
# Training uses 600,000 input sentences because of memory capacity.
# Sample 600,000 lines from each of the English and Japanese corpora and
# concatenate them into inputs.txt, so that enough candidates remain after
# sentences that are too long have been omitted.
# NOTE(review): shuf is called without a fixed random source, so the sample
# presumably differs between runs — pin one if reproducibility matters.
!shuf -n600000 enwiki.txt > enwiki_600000.txt
!shuf -n600000 jawiki.txt > jawiki_600000.txt
!cat enwiki_600000.txt jawiki_600000.txt > inputs.txt

In [None]:
import sentencepiece as sp

# Vocabulary layout: total size, the number of reserved slots, and how many
# of those reserved slots are already taken.
vocab_size = 40000
vocab_reserved = 16
# account for pad, unk, space
vocab_reserved_used = 3
vocab_reserved_unused = vocab_reserved - vocab_reserved_used
model_prefix = 'sp_uncase_en_ja_{}'.format(vocab_size)
_input = 'inputs.txt'

# Trainer flags as (name, value) pairs, rendered below into one
# space-separated "--name=value" argument string. The trained vocabulary is
# made smaller by the number of still-unused reserved slots.
trainer_options = (
    ('pad_id', '0'),
    ('unk_id', '1'),
    ('bos_id', '-1'),
    ('eos_id', '-1'),
    ('add_dummy_prefix', 'False'),
    ('user_defined_symbols', '\u2581'),
    ('input_sentence_size', '600000'),
    ('vocab_size', str(vocab_size - vocab_reserved_unused)),
    ('model_prefix', model_prefix),
    ('input', _input),
)
command = ' '.join(
    '--{}={}'.format(flag, value) for flag, value in trainer_options
)
#sp.SentencePieceTrainer.Train(command)

# The Train call above was run in a local shell rather than in this cell so that its logs could be captured; they are reproduced below.
"""
sentencepiece_trainer.cc(116) LOG(INFO) Running command: --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --add_dummy_prefix=False --user_defined_symbols=? --input_sentence_size=600000 --vocab_size=39987 --model_prefix=sp_uncase_en_ja_40000 --input=inputs.txt
sentencepiece_trainer.cc(49) LOG(INFO) Starts training with :
TrainerSpec {
  input: inputs.txt
  input_format:
  model_prefix: sp_uncase_en_ja_40000
  model_type: UNIGRAM
  vocab_size: 39987
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 600000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  treat_whitespace_as_suffix: 0
  user_defined_symbols: ?
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: -1
  eos_id: -1
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ?
}
NormalizerSpec {
  name: nmt_nfkc
  add_dummy_prefix: 0
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv:
}

trainer_interface.cc(267) LOG(INFO) Loading corpus: inputs.txt
trainer_interface.cc(287) LOG(WARNING) Found too long line (14095 > 4192).
trainer_interface.cc(289) LOG(WARNING) Too long lines are skipped in the training.
trainer_interface.cc(290) LOG(WARNING) The maximum length can be changed with --max_sentence_length=<size> flag.
trainer_interface.cc(317) LOG(INFO) Sampled 600000 sentences from 893819 sentences.
trainer_interface.cc(321) LOG(INFO) Skipped 306181 too long sentences.
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <pad>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: ?
trainer_interface.cc(335) LOG(INFO) Normalizing sentences...
trainer_interface.cc(384) LOG(INFO) all chars count=620184844
trainer_interface.cc(392) LOG(INFO) Done: 99.95% characters are covered.
trainer_interface.cc(402) LOG(INFO) Alphabet size=3843
trainer_interface.cc(403) LOG(INFO) Final character coverage=0.9995
trainer_interface.cc(435) LOG(INFO) Done! preprocessed 600000 sentences.
unigram_model_trainer.cc(129) LOG(INFO) Making suffix array...
unigram_model_trainer.cc(133) LOG(INFO) Extracting frequent sub strings...
unigram_model_trainer.cc(184) LOG(INFO) Initialized 1000000 seed sentencepieces
trainer_interface.cc(441) LOG(INFO) Tokenizing input sentences with whitespace: 600000
trainer_interface.cc(451) LOG(INFO) Done! 599929
unigram_model_trainer.cc(470) LOG(INFO) Using 599929 sentences for EM training
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=739264 obj=6257.83 num_tokens=237723486 num_tokens/piece=321.568
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=620449 obj=6522.22 num_tokens=239478461 num_tokens/piece=385.976
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=465317 obj=6504.44 num_tokens=240315495 num_tokens/piece=516.455
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=465197 obj=6516.07 num_tokens=240666115 num_tokens/piece=517.342
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=348896 obj=6349.2 num_tokens=241613356 num_tokens/piece=692.508
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=348892 obj=6517.09 num_tokens=241672250 num_tokens/piece=692.685
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=261668 obj=6354.92 num_tokens=243677636 num_tokens/piece=931.247
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=261666 obj=6359.98 num_tokens=243686607 num_tokens/piece=931.289
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=196248 obj=6380.03 num_tokens=246591897 num_tokens/piece=1256.53
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=196248 obj=6374.55 num_tokens=246589702 num_tokens/piece=1256.52
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=147186 obj=6401.31 num_tokens=250098677 num_tokens/piece=1699.2
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=147186 obj=6398.36 num_tokens=250103244 num_tokens/piece=1699.23
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=110389 obj=6434.43 num_tokens=254137323 num_tokens/piece=2302.2
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=110389 obj=6435.06 num_tokens=254149810 num_tokens/piece=2302.31
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=82791 obj=6479 num_tokens=258699951 num_tokens/piece=3124.74
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=82791 obj=6470.84 num_tokens=258698437 num_tokens/piece=3124.72
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=62093 obj=6517.19 num_tokens=263779515 num_tokens/piece=4248.14
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=62093 obj=6510.74 num_tokens=263786381 num_tokens/piece=4248.25
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=46569 obj=6564.45 num_tokens=269611399 num_tokens/piece=5789.5
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=46569 obj=6554.53 num_tokens=269610046 num_tokens/piece=5789.47
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=43985 obj=6570.5 num_tokens=270864362 num_tokens/piece=6158.11
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=43985 obj=6564.9 num_tokens=270866642 num_tokens/piece=6158.16
trainer_interface.cc(507) LOG(INFO) Saving model: sp_uncase_en_ja_40000.model
trainer_interface.cc(531) LOG(INFO) Saving vocabs: sp_uncase_en_ja_40000.vocab
True
"""

In [14]:
# Modification of .vocab file to append reserved place holders
# Insert `vocab_reserved_unused` placeholder entries ("unused_<i>\t0") into the
# .vocab file right after the first `vocab_reserved_used` entries, so that the
# file ends up with exactly `vocab_size` lines. The untouched file is kept as
# <model_prefix>.old.vocab.
#
# The cell is safe to re-run: an already-patched file is detected and left
# alone, and the line count is validated BEFORE anything is copied or written.
# Previously the backup was made first, so a second run replaced the pristine
# backup with the patched file and only then failed the assertion.
#
# NOTE(review): only the text .vocab file is rewritten here; the .model file
# is not modified by this cell — confirm that consumers account for the
# shifted ids.
import shutil

vocab_path = model_prefix + '.vocab'
backup_path = model_prefix + '.old.vocab'

# Explicit UTF-8: the vocabulary contains non-ASCII pieces, so the locale's
# default encoding must not be relied on.
with open(vocab_path, encoding='utf-8') as f:
    lines = [_ for _ in f.read().split('\n') if len(_) > 0]

lines_unused = ['unused_%d\t0'%i for i in range(0, vocab_reserved_unused)]
unused_end = vocab_reserved_used + vocab_reserved_unused
# True when the placeholder block is already sitting in its final position.
already_patched = lines[vocab_reserved_used:unused_end] == lines_unused

lines_reserved = lines[:vocab_reserved_used]
if already_patched:
    lines_normal = lines[unused_end:]
else:
    lines_normal = lines[vocab_reserved_used:]
lines = lines_reserved + lines_unused + lines_normal
assert len(lines) == vocab_size, 'len(lines)=%d vs vocab_size=%d'%(len(lines), vocab_size)

if not already_patched:
    # Back up only a file that is known to be unpatched, then overwrite it.
    shutil.copy(vocab_path, backup_path)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))