In [24]:
from tokenizers import Tokenizer
from tokenizers import decoders
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace, Sequence
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import BpeTrainer
import numpy as np

In [34]:
# Initialize a tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]", continuing_subword_prefix="##"))
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    continuing_subword_prefix="##",
    vocab_size=5000,
    limit_alphabet=1000,
    min_frequency=500,
)

tokenizer.pre_tokenizer = Sequence([Whitespace()])
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Then train it!
tokenizer.train([ "decretum.txt", "corpus_thomisticum.txt", "misc_medieval.txt", "cases_training_lines.txt" ], trainer)

tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )

# Now, let's use it:
encoded = tokenizer.encode("domini regis die dominica proxima post festum Sancti Hyllarii duobus annis elapsis, venit predicta Juliana cum forcia sua")

# And finally save it somewhere
tokenizer.save("latin_tokenizer.json")






In [3]:
tokenizer

<tokenizers.Tokenizer at 0x55805e57be80>

In [4]:
print(encoded)

Encoding(num_tokens=36, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [11]:
print(encoded.ids)

[1, 1341, 2133, 1347, 559, 1922, 1829, 159, 708, 68, 393, 288, 4128, 41, 1787, 175, 2907, 2400, 1337, 295, 1064, 159, 400, 295, 14, 2040, 4357, 413, 43, 863, 2311, 416, 503, 4458, 987, 2]


In [16]:
test = list(encoded.ids) + [1922]
tokenizer.decode(test, skip_special_tokens=False)

'[CLS] domini regis die dominica proxima post festum Sancti Hyllarii duobus annis elapsis, venit predicta Juliana cum forcia sua [SEP]ica'

In [15]:
help(tokenizer.decode)

Help on built-in function decode:

decode(self, ids, skip_special_tokens=True) method of tokenizers.Tokenizer instance
    Decode the given list of ids back to a string
    
    This is used to decode anything coming back from a Language Model
    
    Args:
        ids (A :obj:`List/Tuple` of :obj:`int`):
            The list of ids that we want to decode
    
        skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
            Whether the special tokens should be removed from the decoded string
    
    Returns:
        :obj:`str`: The decoded string



In [33]:
#ids = encoded.ids
#print(ids)
#print(tokenizer.token_to_id("[SEP]"))
test.append(2)
print(test.index(tokenizer.token_to_id("[SEP]")))
#print(test)

35


In [28]:
type(ids)

list

In [34]:
with open("decretum.txt") as f:
    decretum_encoded = tokenizer.encode(f.read())
    print(decretum_encoded)

Encoding(num_tokens=416902, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [35]:
decretum_encoded.tokens

['C',
 '##ON',
 '##C',
 '##OR',
 '##D',
 '##IA',
 'D',
 '##IS',
 '##C',
 '##OR',
 '##D',
 '##A',
 '##N',
 '##TI',
 '##UM',
 'CA',
 '##N',
 '##ON',
 '##UM',
 'A',
 '##C',
 'P',
 '##R',
 '##I',
 '##M',
 '##UM',
 'D',
 '##E',
 'I',
 '##UR',
 '##E',
 'N',
 '##A',
 '##T',
 '##UR',
 '##A',
 '##E',
 'E',
 '##T',
 'C',
 '##ON',
 '##S',
 '##TI',
 '##T',
 '##U',
 '##TI',
 '##ON',
 '##IS',
 '(',
 'D',
 '.',
 '1',
 'd',
 '.',
 'a',
 '.',
 'c',
 '.',
 '1',
 ')',
 '¶',
 'H',
 '##um',
 '##anum',
 'genus',
 'duobus',
 're',
 '##gitur',
 ',',
 'naturali',
 'videlicet',
 'iur',
 '##e',
 'et',
 'mor',
 '##ibus',
 '.',
 'I',
 '##us',
 'naturae',
 'est',
 ',',
 'quod',
 'in',
 'lege',
 'et',
 'evang',
 '##eli',
 '##o',
 'continetur',
 ',',
 'quo',
 'quis',
 '##que',
 'iub',
 '##etur',
 'alii',
 'facere',
 ',',
 'quod',
 'sibi',
 'vult',
 'fieri',
 ',',
 'et',
 'prohib',
 '##etur',
 'alii',
 'inf',
 '##erre',
 ',',
 'quod',
 'sibi',
 'nol',
 '##it',
 'fieri',
 '.',
 'Unde',
 'Christus',
 'in',
 'evang',
 '#