In [1]:
!pip install sentencepiece



In [7]:
!wget https://dumps.wikimedia.org/other/cirrussearch/20190422/enwiki-20190422-cirrussearch-content.json.gz

--2019-04-27 04:09:59--  https://dumps.wikimedia.org/other/cirrussearch/20190422/enwiki-20190422-cirrussearch-content.json.gz
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:4:208:80:155:106, 208.80.155.106
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:4:208:80:155:106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28155129067 (26G) [application/octet-stream]
Saving to: ‘enwiki-20190422-cirrussearch-content.json.gz’


2019-04-27 07:59:13 (1.95 MB/s) - ‘enwiki-20190422-cirrussearch-content.json.gz’ saved [28155129067/28155129067]



In [10]:
import json
import gzip
import itertools
import re

In [20]:
def extract_text(input_path, output_path, block_size=100000):
    """
    text processes
    1. lower
    2. (split by newlines)
    3. strip
    """
    
    def flush(text_lines):
        with open(output_path, 'a') as f:
            ret = f.write('\n'.join(filter(
                    lambda _:len(_)>0, 
                    (l.strip() for l in itertools.chain.from_iterable(text_lines))
                )))
            print('write:%s'%(ret))
        text_lines.clear()

    with gzip.open(input_path) as f:
        text_lines = []
        for line in f:
            json_line = json.loads(line)
            if "text" in json_line:
                text = json_line["text"]
                text_lines.append(text.lower().split('\n'))
                if len(text_lines) > block_size:
                    flush(text_lines)
        if len(text_lines) > 0:
            flush(text_lines)
    print('done')

In [21]:
extract_text('enwiki-20190422-cirrussearch-content.json.gz', 'enwiki.txt')

write:369702687
write:358255001
write:339906391
write:329798261
write:317134541
write:325287199
write:314865612
write:303227519
write:313747929
write:297361084
write:271422697
write:262897437
write:289770085
write:302961892
write:279494464
write:245401108
write:234794586
write:232699464
write:260179880
write:268285622
write:243880294
write:283496239
write:230539175
write:232695503
write:318426960
write:307303468
write:253428083
write:264312258
write:338572474
write:336930229
write:289385204
write:228614831
write:260443451
write:230449676
write:233468237
write:218666297
write:222052512
write:217431745
write:220272896
write:218042917
write:235347744
write:331820823
write:334924592
write:358884553
write:465671752
write:544773966
write:521944186
write:598539592
write:513487035
write:652594881
write:621448485
write:590982259
write:575114906
write:490271620
write:391809125
write:536767348
write:760846210
write:786657312
write:350909358
done


In [3]:
!shuf -n800000 enwiki.txt > enwiki_800000.txt

In [5]:
import sentencepiece as sp

vocab_size=30000
vocab_reserved=16
vocab_reserved_used=2 # account for pad, unk
vocab_reserved_unused=vocab_reserved - vocab_reserved_used
model_prefix='sp_uncase_en_%d'%(vocab_size)
_input='enwiki_800000.txt'

command = ' '.join((
    '--pad_id=0',
    '--unk_id=1',
    '--bos_id=-1', 
    '--eos_id=-1',
    '--add_dummy_prefix=True',
    '--input_sentence_size=500000',
    '--max_sentence_length=4192',
    '--vocab_size=%d'%(vocab_size-vocab_reserved_unused), 
    '--model_prefix=%s'%(model_prefix),
    '--input=%s'%(_input),
))
#sp.SentencePieceTrainer.Train(command)

# Train method in this cell ran in a local shell to show logs.
"""
sentencepiece_trainer.cc(116) LOG(INFO) Running command: --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --add_dummy_prefix=True --input_sentence_size=500000 --max_sentence_length=4192 --vocab_size=29986 --model_prefix=sp_uncase_en_30000 --input=enwiki_800000.txt
sentencepiece_trainer.cc(49) LOG(INFO) Starts training with :
TrainerSpec {
  input: enwiki_800000.txt
  input_format:
  model_prefix: sp_uncase_en_30000
  model_type: UNIGRAM
  vocab_size: 29986
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 500000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  treat_whitespace_as_suffix: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: -1
  eos_id: -1
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇
}
NormalizerSpec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv:
}

trainer_interface.cc(267) LOG(INFO) Loading corpus: enwiki_800000.txt
trainer_interface.cc(287) LOG(WARNING) Found too long line (8115 > 4192).
trainer_interface.cc(289) LOG(WARNING) Too long lines are skipped in the training.
trainer_interface.cc(290) LOG(WARNING) The maximum length can be changed with --max_sentence_length=<size> flag.
trainer_interface.cc(317) LOG(INFO) Sampled 500000 sentences from 625649 sentences.
trainer_interface.cc(321) LOG(INFO) Skipped 174351 too long sentences.
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <pad>
trainer_interface.cc(330) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(335) LOG(INFO) Normalizing sentences...
trainer_interface.cc(384) LOG(INFO) all chars count=661312249
trainer_interface.cc(392) LOG(INFO) Done: 99.95% characters are covered.
trainer_interface.cc(402) LOG(INFO) Alphabet size=691
trainer_interface.cc(403) LOG(INFO) Final character coverage=0.9995
trainer_interface.cc(435) LOG(INFO) Done! preprocessed 500000 sentences.
unigram_model_trainer.cc(129) LOG(INFO) Making suffix array...
unigram_model_trainer.cc(133) LOG(INFO) Extracting frequent sub strings...
unigram_model_trainer.cc(184) LOG(INFO) Initialized 1000000 seed sentencepieces
trainer_interface.cc(441) LOG(INFO) Tokenizing input sentences with whitespace: 500000
trainer_interface.cc(451) LOG(INFO) Done! 4468864
unigram_model_trainer.cc(470) LOG(INFO) Using 4468864 sentences for EM training
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=656359 obj=13.1717 num_tokens=14744373 num_tokens/piece=22.4639
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=552683 obj=10.726 num_tokens=14702348 num_tokens/piece=26.6018
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=414484 obj=10.6869 num_tokens=14813221 num_tokens/piece=35.7389
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=414289 obj=10.6779 num_tokens=14831138 num_tokens/piece=35.799
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=310716 obj=10.6906 num_tokens=15115184 num_tokens/piece=48.6463
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=310712 obj=10.6859 num_tokens=15118548 num_tokens/piece=48.6578
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=233034 obj=10.7225 num_tokens=15530424 num_tokens/piece=66.6445
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=233032 obj=10.7176 num_tokens=15530594 num_tokens/piece=66.6458
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=174774 obj=10.7758 num_tokens=16018682 num_tokens/piece=91.6537
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=174774 obj=10.7568 num_tokens=16017777 num_tokens/piece=91.6485
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=131080 obj=10.8469 num_tokens=16489249 num_tokens/piece=125.795
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=131079 obj=10.8303 num_tokens=16487205 num_tokens/piece=125.781
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=98309 obj=10.9328 num_tokens=16969299 num_tokens/piece=172.612
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=98309 obj=10.9103 num_tokens=16968886 num_tokens/piece=172.608
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=73731 obj=11.0365 num_tokens=17483639 num_tokens/piece=237.127
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=73731 obj=11.0113 num_tokens=17483325 num_tokens/piece=237.123
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=55298 obj=11.1549 num_tokens=18057591 num_tokens/piece=326.551
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=55298 obj=11.1257 num_tokens=18059618 num_tokens/piece=326.587
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=41467 obj=11.2863 num_tokens=18667518 num_tokens/piece=450.178
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=41451 obj=11.2634 num_tokens=18667016 num_tokens/piece=450.339
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=0 size=32984 obj=11.3992 num_tokens=19193520 num_tokens/piece=581.904
unigram_model_trainer.cc(486) LOG(INFO) EM sub_iter=1 size=32984 obj=11.3677 num_tokens=19197473 num_tokens/piece=582.024
trainer_interface.cc(507) LOG(INFO) Saving model: sp_uncase_en_30000.model
trainer_interface.cc(531) LOG(INFO) Saving vocabs: sp_uncase_en_30000.vocab
True
"""
True

True

In [6]:
# Modification of .vocab file to append reserved place holders
with open(model_prefix + '.vocab') as f:
    lines = [_ for _ in f.read().split('\n') if len(_) > 0]

import shutil
shutil.copy(model_prefix + '.vocab', model_prefix + '.old.vocab')
    
lines_reserved = lines[:vocab_reserved_used]
lines_unused = ['unused_%d\t0'%i for i in range(0, vocab_reserved_unused)]
lines_normal = lines[vocab_reserved_used:]
lines = lines_reserved + lines_unused + lines_normal
assert len(lines) == vocab_size, 'len(lines)=%d vs vocab_size=%d'%(len(lines), vocab_size)

with open(model_prefix + '.vocab', 'w') as f:
    f.write('\n'.join(lines))