In [1]:
import os
import numpy as np
import sentencepiece as spm

from tqdm import tqdm

In [2]:
# Show the current working directory: every path used in this notebook is relative to it.
!pwd

/home/imanol/languini-kitchen/languini/vocabs


In [3]:
# Both paths are relative, so they are resolved against the notebook's working directory.
# Root folder of the raw books3 files (walked below to locate each book on disk).
BOOKS3 = "../../data/books3"
# Folder that contains file_list_train.npy, the list of training files loaded below.
DATA_PATH = "../../data/books"

In [4]:
# Load the training file list and keep only column 0 of its first 10 rows,
# i.e. a small sample of entries to build the vocabulary from.
file_list_path = os.path.join(DATA_PATH, "file_list_train.npy")
file_list = np.load(file_list_path)
train_files = file_list[:10, 0]
train_files.shape

(10,)

In [5]:
# Reduce every entry to its base name without the last extension; these names
# are the keys used to look the books up in the raw books3 folder.
base_names = map(os.path.basename, train_files)
names = [os.path.splitext(base_name)[0] for base_name in base_names]
names[:5]

['Roots, Radicals and Rockers - Billy Bragg.epub',
 'A History of the World Cup_ 1930-2014 - Clemente A. Lisi (retail).epub',
 'Gently Where She Lay - Alan Hunter.epub',
 'The Weirdness - Jeremy P. Bushnell.epub',
 'Dog Dish of Doom--An Agent to the Paws Mys - E.J. Copperman.epub']

In [6]:
def get_books3_file_paths(books3_dir, file_names):
    """
    Walks through the raw books3 folder and resolves each file name to its path.

    Files are matched on their base name with the last extension removed, so
    the entries of ``file_names`` must be given without that extension. If the
    same stripped name occurs in several sub-folders, the one visited last by
    ``os.walk`` wins.

    Args:
        books3_dir (str): Path to the books3 root folder.
        file_names (list): List of the file names (without the last extension).

    Returns:
        list: Paths of the matching files, in the same order as ``file_names``.

    Raises:
        ValueError: If a file name is not found.
    """
    # Index every file below books3_dir by its extension-less base name.
    # The walk is materialised into a list so that tqdm knows the total number of folders.
    all_files = {}
    for root, _dirs, files in tqdm(list(os.walk(books3_dir))):
        for file_name in files:
            all_files[os.path.splitext(file_name)[0]] = os.path.join(root, file_name)

    books3_paths = []
    for fn in file_names:
        # Fail fast on the first unknown name instead of silently dropping it.
        if fn not in all_files:
            raise ValueError(f"Filename {fn} not found when walking through {books3_dir}")
        books3_paths.append(all_files[fn])

    return books3_paths

In [7]:
# Resolve every selected book name to its file inside the raw books3 folder.
book3_paths = get_books3_file_paths(books3_dir=BOOKS3, file_names=names)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 53.25it/s]


In [8]:
def get_contents(file_absolute_paths):
    """
    Reads the given files as raw bytes and normalises each one.

    Every file has its "\\r\\n" line endings replaced by "\\n" and its leading
    and trailing whitespace removed. The number of loaded files is printed.

    Args:
        file_absolute_paths (list): Paths of the files to read.

    Returns:
        list: One ``bytes`` object per input file, in input order.
    """
    def _load_normalised(path):
        # Binary mode: the content is kept as bytes, no decoding takes place.
        with open(path, "rb") as handle:
            raw = handle.read()
        # Unify Windows line endings first, then trim the surrounding whitespace.
        return raw.replace(b"\r\n", b"\n").strip()

    contents = [_load_normalised(path) for path in file_absolute_paths]
    print(f"Loaded {len(contents)} files!")
    return contents

In [9]:
# Read and normalise the raw bytes of every selected book.
contents = get_contents(file_absolute_paths=book3_paths)

Loaded 10 files!


In [10]:
# Document separator: it is placed in front of the first book and between
# every pair of consecutive books in the concatenated corpus.
NEW_DOC_TOKEN = b"<D>"
concat_contents = b"".join((NEW_DOC_TOKEN, NEW_DOC_TOKEN.join(contents)))

In [11]:
# Report how many bytes go into the vocabulary training.
total_bytes = len(concat_contents)
print(f"Total size of the contents that will be used for vocab generation is {total_bytes:,} bytes.")

Total size of the contents that will be used for vocab generation is 6,309,760 bytes.


In [12]:
# Write the concatenated corpus to a temporary file that is later passed to the trainer as its input
TMP_FILENAME = ".tmp_concat_books3.txt"

# Drop a stale copy left over from a previous run before writing the new one.
stale_copy_present = os.path.exists(TMP_FILENAME)
if stale_copy_present:
    os.remove(TMP_FILENAME)

with open(TMP_FILENAME, mode='wb') as tmp_file:
    tmp_file.write(concat_contents)

In [13]:
# this will take a while for large vocabs

# Single source of truth for the vocabulary size: it is used both for the
# --vocab_size flag and for the name of the generated model files.
VOCAB_SIZE = 2048
SPM_MODEL_PREFIX = f"spm_models/test_{VOCAB_SIZE}"

# The trainer writes its output files next to the given prefix; make sure the
# target folder exists so that saving the model cannot fail on a missing folder.
os.makedirs(os.path.dirname(SPM_MODEL_PREFIX), exist_ok=True)

spm.SentencePieceTrainer.Train(
    f" --input={TMP_FILENAME}"
    f" --model_prefix={SPM_MODEL_PREFIX}"
    " --model_type=bpe"
    f" --vocab_size={VOCAB_SIZE}"
    " --hard_vocab_limit=True"
    " --input_sentence_size=2000000"
    # Special ids: <pad> is 0 and <unk> is 1; -1 disables the BOS/EOS pieces.
    " --unk_id=1"
    " --bos_id=-1"
    " --eos_id=-1"
    " --pad_id=0"
    " --max_sentencepiece_length=99"
    " --split_by_unicode_script=True"
    " --split_by_number=True"
    " --split_by_whitespace=True"
    # No dummy prefix, no whitespace clean-up and identity normalisation, so the
    # input text is not altered before it is tokenised.
    " --add_dummy_prefix=False"
    # Pieces outside the vocabulary fall back to the <0x00>..<0xFF> byte pieces.
    " --byte_fallback=True"
    " --remove_extra_whitespaces=False"
    " --allow_whitespace_only_pieces=True"
    " --normalization_rule_name=identity"
    # Reserve the document separator written into the corpus as a single piece;
    # derived from NEW_DOC_TOKEN so that the two cannot drift apart.
    f" --user_defined_symbols={NEW_DOC_TOKEN.decode('utf-8')}"
    " --split_digits=True"
    " --vocabulary_output_piece_score=False"
)

sentencepiece_trainer.cc(177) LOG(INFO) Running command:  --input=.tmp_concat_books3.txt --model_prefix=spm_models/test_2048 --model_type=bpe --vocab_size=2048 --hard_vocab_limit=True --input_sentence_size=2000000 --unk_id=1 --bos_id=-1 --eos_id=-1 --pad_id=0 --max_sentencepiece_length=99 --split_by_unicode_script=True --split_by_number=True --split_by_whitespace=True --add_dummy_prefix=False --byte_fallback=True --remove_extra_whitespaces=False --allow_whitespace_only_pieces=True --normalization_rule_name=identity --user_defined_symbols=<D> --split_digits=True --vocabulary_output_piece_score=False
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: .tmp_concat_books3.txt
  input_format: 
  model_prefix: spm_models/test_2048
  model_type: BPE
  vocab_size: 2048
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 2000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_len

In [14]:
# Lossless test of the generated SentencePiece model vocab: every sampled
# document is encoded to ids, decoded again and compared with the original bytes.
spm_model_path = 'spm_models/test_2048.model'
sp = spm.SentencePieceProcessor()
print(f'Loading {spm_model_path} spm file...')
sp.Load(spm_model_path)


def _is_lossless(document):
    """Return True if encoding and then decoding `document` reproduces it exactly."""
    token_ids = sp.Encode(document)
    # Decode returns text, so it is turned back into UTF-8 bytes for the comparison.
    round_trip = bytes(sp.Decode(token_ids), 'utf-8')
    return document == round_trip


# this takes a few minutes ...
sample_contents = contents[:1_000]  # Sample only some contents s.t. this doesn't take forever
print(f"Testing lossless reconstruction of {len(sample_contents)} documents...")
res = [_is_lossless(document) for document in tqdm(sample_contents)]

# Report whether every document survived the round trip, and how many did not.
num_failed = res.count(False)
if num_failed == 0:
    print("All docs were successfully reconstructed!")
else:
    print("WARNING: Some docs were not successfully reconstructed!")
    print(f'Number of docs that were not successfully reconstructed: {num_failed}')


rential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy_clipping_threshold: 0
}
normalizer_spec {
  name: identity
  add_dummy_prefix: 0
  remove_extra_whitespaces: 0
  escape_whitespaces: 1
  normalization_rule_tsv: 
}
denormalizer_spec {}
trainer_interface.cc(351) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator.
trainer_interface.cc(183) LOG(INFO) Loading corpus: .tmp_concat_books3.txt
trainer_interface.cc(407) LOG(INFO) Loaded all 34832 sentences
trainer_interface.cc(414) LOG(INFO) Skipped 1 too long sentences.
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <pad>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <D>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x00>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x01>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x02>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x03

Loading spm_models/test_2048.model spm file...
Testing lossless reconstruction of 10 documents...


interface.cc(423) LOG(INFO) Adding meta_piece: <0x40>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x41>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x42>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x43>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x44>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x45>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x46>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x47>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x48>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x49>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x4A>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x4B>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x4C>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x4D>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x4E>
trainer_interface.cc(423) LOG(INFO) Adding meta_piece: <0x4F>
trainer_interfac

All docs were successfully reconstructed!





In [15]:
# Clean up the temporary training corpus. The existence check keeps this cell
# re-runnable: removing a file that is already gone would raise FileNotFoundError.
if os.path.exists(TMP_FILENAME):
    os.remove(TMP_FILENAME)