In [2]:
# datasets from huggingface
from datasets import load_dataset # huggingface datasets

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# get datasets
dataset_raw  = load_dataset('opus_books', 'en-it') # have to specify language pair
print(dataset_raw)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 32332
    })
})


In [9]:
print(dataset_raw['train'][0])

{'id': '0', 'translation': {'en': 'Source: Project Gutenberg', 'it': 'Source: www.liberliber.it/Audiobook available here'}}


In [10]:
dataset_split = load_dataset('opus_books', 'en-it', split='train')
print(dataset_split)

Dataset({
    features: ['id', 'translation'],
    num_rows: 32332
})


In [6]:
print(dataset_split[0])

{'id': '0', 'translation': {'en': 'Source: Project Gutenberg', 'it': 'Source: www.liberliber.it/Audiobook available here'}}


In [7]:
print(dataset_split[1])

{'id': '1', 'translation': {'en': 'Jane Eyre', 'it': 'Jane Eyre'}}


In [11]:
from tokenizers import Tokenizer # tokenizers
from tokenizers.models import WordLevel # tokenizers models
from tokenizers.trainers import WordLevelTrainer # tokenizers trainers
from tokenizers.pre_tokenizers import Whitespace # tokenizers pre-tokenizers

In [12]:
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]")) # initialize tokenizer with WordLevel model
tokenizer.pre_tokenizer = Whitespace() # set pre-tokenizer to Whitespace
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[EOS]", "[SOS]"]) # initialize trainer with special tokens
# special tokens
# [UNK] - unknown token
# [CLS] - classification token
# [SEP] - separator token
# [PAD] - padding token
# [MASK] - mask token
# [SOS] - start of sentence token
# [EOS] - end of sentence token

In [16]:
def get_all_sentences(dataset, lang):
    for item in dataset:
        yield item['translation'][lang]

In [26]:
# generate iterator for all sentences
sentences = get_all_sentences(dataset_split, 'en')
print(next(sentences))
print(next(sentences))

Source: Project Gutenberg
Jane Eyre


In [27]:
tokenizer.train_from_iterator(sentences, trainer=trainer) # train tokenizer from iterator
tokenizer.save("en-it-wordlevel.json") # save tokenizer

In [30]:
import sys

# delete en-it-wordlevel.json
if sys.platform == "linux":
    !rm en-it-wordlevel.json
elif sys.platform == "win32":
    !del en-it-wordlevel.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
test_sentence = "This is a test sentence."
encoded = tokenizer.encode(test_sentence) # encode sentence
print(encoded.ids) # print encoded sentence

[260, 37, 14, 8498, 1888, 10]


In [29]:
test_sentence = "Morning Seaside Coffe Shop - Relaxing Jazz & Bossa Nova Music - Piano Jazz for Studying, Sleep, Work"
encoded = tokenizer.encode(test_sentence) # encode sentence
print(encoded.ids) # print encoded sentence

[0, 0, 0, 17991, 32, 0, 0, 1857, 0, 17464, 17405, 32, 0, 0, 29, 0, 7, 8767, 7, 10294]
