## Load datasets and convert to sentence files

In [None]:
import datasets
from src.make_datasets import make_sentence_files

# Load the Japanese portion of the CC-100 dataset.
dataset_ja = datasets.load_dataset("cc100", lang="ja")

# Show one sample entry. A bare expression is only displayed by the notebook
# when it is the last statement of a cell, so print it explicitly.
print(dataset_ja["train"][961563])

# Split the training split into sentences, written one sentence per line
# into files of 10M data entries each.
# NOTE(review): no data_dir is passed here, so make_sentence_files falls back
# to its default output directory — presumably the Japanese sentences
# directory; confirm against src.make_datasets.
make_sentence_files(dataset_ja["train"])

In [None]:
import datasets

import cfg
from src.make_datasets import make_sentence_files

# Load the English portion of the CC-100 dataset.
dataset_en = datasets.load_dataset("cc100", lang="en")

# Show one sample entry. A bare expression is only displayed by the notebook
# when it is the last statement of a cell, so print it explicitly.
print(dataset_en["train"][961563])

# Split the training split into sentences, written one sentence per line
# into files of 10M data entries each, under the English sentences directory.
# cfg must be imported in this cell: it is not imported anywhere earlier, so
# a fresh top-to-bottom run would otherwise fail with a NameError here.
make_sentence_files(dataset_en["train"], data_dir=cfg.EN_SENTENCES_DIR)

## Train Japanese tokenizers

In [5]:
import cfg
from src.make_datasets import sample_and_make_tempfile

# Pick num_files sentence files from the Japanese sentences directory and
# merge them into one temporary training file; keep its path for training.
tempfile_path = sample_and_make_tempfile(
    sentences_dir=cfg.JP_SENTENCES_DIR,
    num_files=5,
)

In [None]:
import sentencepiece as spm
import time

# Vocabulary sizes for which a Japanese tokenizer is trained, one model each.
vocab_sizes = [
    8_000,
    16_000,
    32_000,
    48_000,
]

def train_jp(vocab_size, input_path=None, num_threads=60):
    """Train one Japanese SentencePiece tokenizer and print the time taken.

    Args:
        vocab_size: Target vocabulary size. It is also embedded in the model
            prefix, ``cc100_jp_vocab_<vocab_size>``.
        input_path: Sentence file to train on. When omitted, the module-level
            ``tempfile_path`` is used, as it is bound at call time.
        num_threads: Number of threads passed to the SentencePiece trainer.
    """
    # Fall back to the notebook-global sample file. Pass input_path explicitly
    # if tempfile_path may have been rebound by another section's cell.
    if input_path is None:
        input_path = tempfile_path

    model_prefix = f"cc100_jp_vocab_{vocab_size}"

    # perf_counter is monotonic, so the measured duration is not distorted by
    # system clock adjustments during a long training run.
    start = time.perf_counter()
    spm.SentencePieceTrainer.train(
        input=input_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        # Slightly below 1.0; the SentencePiece docs suggest 0.9995 for
        # languages with a large character set such as Japanese.
        character_coverage=0.9995,
        num_threads=num_threads,
        train_extremely_large_corpus=True,
    )
    elapsed = time.perf_counter() - start
    print("Trained {} in {} seconds".format(model_prefix, elapsed))

# Train one Japanese tokenizer per configured vocabulary size, in list order.
for vocab_size in vocab_sizes:
    train_jp(vocab_size=vocab_size)

## Train English tokenizers

In [1]:
import cfg
from src.make_datasets import sample_and_make_tempfile

# Pick num_files sentence files from the English sentences directory and
# merge them into one temporary training file; keep its path for training.
# NOTE(review): the recorded output of this cell lists 10 sampled files
# although num_files is 5 — stale output or different sampling semantics;
# confirm against sample_and_make_tempfile.
tempfile_path = sample_and_make_tempfile(
    sentences_dir=cfg.EN_SENTENCES_DIR,
    num_files=5,
)

['/work/data/en_sentences/sent_9.txt', '/work/data/en_sentences/sent_31.txt', '/work/data/en_sentences/sent_13.txt', '/work/data/en_sentences/sent_18.txt', '/work/data/en_sentences/sent_34.txt', '/work/data/en_sentences/sent_15.txt', '/work/data/en_sentences/sent_40.txt', '/work/data/en_sentences/sent_19.txt', '/work/data/en_sentences/sent_4.txt', '/work/data/en_sentences/sent_7.txt']
number of lines sampled: 73808991


100%|██████████| 73808991/73808991 [13:13<00:00, 93063.17it/s] 


In [None]:
import sentencepiece as spm
import time

# Vocabulary sizes for which an English tokenizer is trained, one model each.
vocab_sizes = [
    8_000,
    16_000,
    32_000,
    48_000,
]

def train_en(vocab_size, input_path=None, num_threads=60):
    """Train one English SentencePiece tokenizer and print the time taken.

    Args:
        vocab_size: Target vocabulary size. It is also embedded in the model
            prefix, ``cc100_en_vocab_<vocab_size>``.
        input_path: Sentence file to train on. When omitted, the module-level
            ``tempfile_path`` is used, as it is bound at call time.
        num_threads: Number of threads passed to the SentencePiece trainer.
    """
    # Fall back to the notebook-global sample file. Pass input_path explicitly
    # if tempfile_path may have been rebound by another section's cell.
    if input_path is None:
        input_path = tempfile_path

    model_prefix = f"cc100_en_vocab_{vocab_size}"

    # perf_counter is monotonic, so the measured duration is not distorted by
    # system clock adjustments during a long training run.
    start = time.perf_counter()
    spm.SentencePieceTrainer.train(
        input=input_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        # Full coverage; the SentencePiece docs suggest 1.0 for languages
        # with a small character set.
        character_coverage=1.0,
        num_threads=num_threads,
        train_extremely_large_corpus=True,
    )
    elapsed = time.perf_counter() - start
    print("Trained {} in {} seconds".format(model_prefix, elapsed))
    
# Train one English tokenizer per configured vocabulary size, in list order.
for vocab_size in vocab_sizes:
    train_en(vocab_size=vocab_size)