In [1]:
import re
import csv, os, sys
import spacy
import pandas as pd

sys.path.append('../')

from src import default
from src.data import download as dl, data_preprocessing as dpp, tokenization as tkn

In [3]:
def tokenizer_examples(tokenizer, raw_tokenizer=True, title='default'):
    """
    Example of a "Raw tokenizer":
        tokenizer = Tokenizer.from_file(tpath)
    Example of "not Raw tokenizer":
        from transformers import PreTrainedTokenizerFast
        fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=tpath)
    """
    title_break = '\n****************************************************************'
    # example text
    text0 = "Hello, y'all! How are you 😁 ?"
    text1 = "Here is some code spaghetti"
    text2 = "configuration interaction (CI) wave functions is examined"
    text3 = "By analogy with the pseudopotential approach for electron-ion interactions"
    text4 = "Welcome to the 🤗 Tokenizers library."
    examples = [text0, text1, text2, text3, text4]

    if raw_tokenizer:
        print('Tokenizer examples (raw_tokenizer=True): %s%s' % (title, title_break))
        for idx, text in enumerate(examples):
            pre = '(Ex %d)' % idx
            print('%s input: %s' % (pre, text))
            output = tokenizer.encode(text)
            print('%s output type & output.tokens: %s, %s' % (pre, type(output), output.tokens))
            print('%s decode(output.ids): %s' % (pre, tokenizer.decode(output.ids)))

            # "use proper decoder" https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
            print('%s decoder on output.ids: %s' % (pre, tokenizer.decode(output.ids)))
            print()
    else:
        print('Tokenizer examples (raw_tokenizer=False): %s%s' % (title, title_break))
        for idx, text in enumerate(examples):
            pre = '(Ex %d)' % idx
            print('%s input: %s' % (idx, text))
            output = tokenizer.encode(text)
            print('%s output type & output: %s, %s' % (pre, type(output), output))
            print('%s decode w/ no cleanup: %s' %
                  (pre, tokenizer.decode(output, clean_up_tokenization_spaces=False)))
            print('%s decode w/ cleanup: %s' %
                  (pre, tokenizer.decode(output, clean_up_tokenization_spaces=True)))
            print()
    return

## Example arxiv data

In [6]:
# download data
filename = dl.arxiv_api( default.RAW_DATA_DIR )
print(f'>> Using {filename} for training <<')

# preprocessing
proc_data = dpp.arxiv_preprocess_abstract(default.RAW_DATA_DIR
                                , default.PROC_DATA_DIR, filename, True )

# convert to list/iterator
arxiv_iter = dpp.arxiv_abstract_iterator( proc_data )
fname_strip_csv = filename[:-4]
arxiv_tknzr = tkn.train_custom_tokenizer('BPE', arxiv_iter, fname_strip_csv, default.TOK_DIR
                                , **default.special_token_lst)

>> Using arxiv_10.csv for training <<


## Example wiki

In [7]:
# download : already happened.
file_dir = 'wikitext-103-raw'
print(f'>> Using {file_dir} for training <<')

# preprocessing : None for now

# convert to list/iterator
wiki_iter = dpp.wiki_iterator( file_dir )
fname_strip = file_dir[:-4]

wiki_tknzr = tkn.train_custom_tokenizer('BPE', wiki_iter, fname_strip, default.TOK_DIR
                                        , **default.special_token_lst)

>> Using wikitext-103-raw for training <<


## Comparing

In [8]:
tokenizer_examples(arxiv_tknzr, raw_tokenizer=True, title='trained arxviv BPE')
tokenizer_examples(wiki_tknzr, raw_tokenizer=True, title='trained wiki BPE')

Tokenizer examples (raw_tokenizer=True): trained arxviv BPE
****************************************************************
(Ex 0) input: Hello, y'all! How are you 😁 ?
(Ex 0) output type & output.tokens: <class 'tokenizers.Encoding'>, ['<s>', 'ĠH', 'el', 'lo', ',', 'Ġ', 'y', '<unk>', 'all', '<unk>', 'ĠH', 'ow', 'Ġare', 'Ġ', 'y', 'o', 'u', 'Ġ', '<unk>', '<unk>', '<unk>', '<unk>', 'Ġ', '<unk>', '<\\s>']
(Ex 0) decode(output.ids): ĠH el lo , Ġ y all ĠH ow Ġare Ġ y o u Ġ Ġ
(Ex 0) WordPiece decoder on output.ids: ĠH el lo, Ġ y all ĠH ow Ġare Ġ y o u Ġ Ġ

(Ex 1) input: Here is some code spaghetti
(Ex 1) output type & output.tokens: <class 'tokenizers.Encoding'>, ['<s>', 'ĠH', 'ere', 'Ġis', 'Ġs', 'om', 'e', 'Ġco', 'de', 'Ġsp', 'a', 'gh', 'et', 't', 'i', '<\\s>']
(Ex 1) decode(output.ids): ĠH ere Ġis Ġs om e Ġco de Ġsp a gh et t i
(Ex 1) WordPiece decoder on output.ids: ĠH ere Ġis Ġs om e Ġco de Ġsp a gh et t i

(Ex 2) input: configuration interaction (CI) wave functions is examined
(Ex 2) ou

## Fast and Bert Tokenizer

In [64]:
from transformers import PreTrainedTokenizerFast



# For now
from tokenizers import BertWordPieceTokenizer


['Electron', 'temperature', 'anisotropies', 'and', 'electron', 'beams', 'are', 'nonthermal', 'features', 'of', 'the', 'observed', 'nonequilibrium', 'electron', 'velocity', 'distributions', 'in', 'the', 'solar', 'wind.', 'In', 'collision-poor', 'plasmas', 'these', 'nonequilibrium', 'distributions', 'are', 'expected', 'to', 'be', 'regulated', 'by', 'kinetic', 'instabilities', 'through', 'wave-particle', 'interactions.', 'This', 'study', 'considers', 'electron', 'instabilities', 'driven', 'by', 'the', 'interplay', 'of', 'core', 'electron', 'temperature', 'anisotropies', 'and', 'the', 'electron', 'beam,', 'and', 'firstly', 'gives', 'a', 'comprehensive', 'analysis', 'of', 'instabilities', 'in', 'arbitrary', 'directions', 'to', 'the', 'background', 'magnetic', 'field.', 'It', 'clarifies', 'the', 'dominant', 'parameter', 'regime', '(e.g.,', 'parallel', 'core', 'electron', 'plasma', 'beta', '$\\beta_{\\mathrm{ec\\parallel}}$,', 'core', 'electron', 'temperature', 'anisotropy', '$A_{\\mathrm{ec}

## Tokenizer

In [78]:
from tokenizers import Tokenizer, normalizers, pre_tokenizers, decoders, processors
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.normalizers import NFD, NFKD, NFC, NFKC, Lowercase, StripAccents
from tokenizers.pre_tokenizers import ByteLevel, Whitespace, WhitespaceSplit, Punctuation, Metaspace,\
                                        CharDelimiterSplit
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordPieceTrainer, WordLevelTrainer
from pathlib import Path

#from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer

In [66]:
bos_token = "<s>"
pad_token = "<pad>"
eos_token = "</s>"
unk_token = "<unk>"
mask_token = "<mask>"

special_token_list = [bos_token, pad_token, eos_token, unk_token, mask_token]

class BPE_token(object):
    def __init__(self):
        # instantiate
        self.tokenizer = Tokenizer(BPE())
        
        # normalization
        self.tokenizer.normalizer = Sequence([
            NFKC()
        ])
        
        # pre-tokenizer
        self.tokenizer.pre_tokenizer = ByteLevel()
        
        # decoder
        self.tokenizer.decoder = ByteLevelDecoder()

    def bpe_train(self, iterator):
        trainer = BpeTrainer(vocab_size=50000, show_progress=True, inital_alphabet=ByteLevel.alphabet()
                                             , special_tokens=special_token_list)
        self.tokenizer.train_from_iterator(trainer=trainer, iterator=iterator) # paths is iterator

    def save_tokenizer(self, location, prefix=None):
        if not os.path.exists(location):
            os.makedirs(location)
        self.tokenizer.model.save(location, prefix)

In [81]:
def train_custom_tokenizer(token_model, data_iterator, token_dir, token_filename, vocab_size=30000, vocab=None
                          , max_input_chars_per_word=None):
    """
    Building a Tokenizer using HuggingFace library. The pipeline seems to be:
    
        - Model           : algorithm that tokenizes, it is a mandatory component. There are
                            only 4 models implemented (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer      : some preprocessing that could happen before, but doesn't necessarily
        - Pre-Tokenizer   : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly seems to be eos
                            , bos tokens)
        - Decoder         : certain previous pipeline steps need to be reversed for proper
                            decoding
        - Trainer         : The corresponding training algorithm for the model
    
    Note : Some pre-processing might need to happen beforehand in previous functions (might
            be easier using pandas)
    
    Input
        token_model              : algorithm to use for tokenization
        data_iterator            : a python iterator that goes through the data to be used for 
                                    training
        token_dir                : directory with tokenizers
        vocab_size               : size of the vocabulary to use
        token_filename           : filename of particular token we want to train. Will overwrite
                                    previously save files.
        vocab                    : models other than BPE can use non-mandatory vocab as input
        max_input_chars_per_word : used for WordPiece
        
    Output
        tokenizer                : huggingFace Tokenizer object, our fully trainer tokenizer
            
    """
    special_token_lst = [unk_token, bos_token, eos_token, pad_token, mask_token]
    
    normalizer_lst = [NFKC()]; pre_tokenizer_lst = [ByteLevel()]; decoder_lst = []
    
    bos_idx = special_token_list.index(bos_token); eos_idx = special_token_list.index(eos_token)
    
    if token_model == 'BPE':
        model   = BPE(unk_token=unk_token) 
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model   = Unigram(vocab=vocab) 
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model   = WordLevel(unk_token=unk_token,vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model   = WordPiece(unk_token=unk_token,vocab=vocab, max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
        decoder_lst.append( decoders.WordPiece())
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in [BPE, Unigram, WordLevel, WordPiece]'
        raise SystemExit(error_msg)       
    
    # instantiation
    tokenizer = Tokenizer(model)
    
    # trainer 
    trainer = Trainer(vocab_size=vocab_size, show_progress=True, special_tokens=special_tokens_lst)
    
    # normalizer
    tokenizer.normalizer = normalizers.Sequence( normalizer_lst )
    
    # pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence( pre_tokenizer_lst )
    
    # post-processing
    tokenizer.post_processor = processors.TemplateProcessing( single=bos_token+" $A "+eos_token
                                                    #, pair=bos_token+" $A "+eos_token" $B:1 "+eos_token+":1"
                                                    , special_tokens=[(bos_token, bos_idx),(eos_token, eos_idx)]
                                                    )
    
    # decoder
    if ByteLevel() in pre_tokenizer_lst: decoder_lst.append( decoders.ByteLevel() )
    if Metaspace() in pre_tokenizer_lst: decoder_lst.append( decoders.Metaspace() ) 
    tokenizer.decoder = decoders.Sequence( decoder_lst )

    tokenizer.train_from_iterator(trainer=trainer, iterator=data_iterator)
    
    if not os.path.exists( token_dir ):
        os.makedirs( token_dir )
    if os.path.exists( token_dir + os.sep + token_filename ):
        print(f"Warning : overwriting previously save tokenizer with same filename ( {token_filename} ).")
    tokenizer.save( token_dir + os.sep + token_filename )
        
    # TODO : Should I add PreTrained and Fast Tokenizer here? Seems like it might be appropriate.
    transformer = False; fast = False
    function_from_transformer_todo = None
    if transformer:
        raise SystemExit("HuggingFace transformers library not yet implemented here!")
        if fast: tokenizer = function_from_transformer_todo
        else: tokenizer = function_from_transformer_todo
                  
    return tokenizer
    
    
def load_custom_tokenizer(token_dir, token_filename, transformer=False, fast=False):
    """
    Input
        token_dir      : directory with tokenizers saved
        token_filename : trained tokenizer that we want to load
        transformer    : (bool) whether to use HuggingFace transformers library implementation
        fast           : (bool) whether to use HuggingFace transformers fast implementation
    Output
        tokenizer      : tokenizer from Tokenizer class to be passed to rest of algorithm
    """
    tokenizer = Tokenizer.from_file(token_dir + os.sep + token_filename)
    
    function_from_transformer_todo = None
    if function_from_transformer != None:
        if transformer:
            raise SystemExit("HuggingFace transformers library not yet implemented here!")
            if fast: tokenizer = function_from_transformer_todo
            else: tokenizer = function_from_transformer_todo
    
    return tokenizer

In [68]:
# the folder 'text' contains all the files
data_iter = iter(raw_data.summary.tolist())

tokenizer = BPE_token()

# train the tokenizer model
tokenizer.bpe_train(data_iter)

# saving the tokenized data in our specified folder 
save_path = 'tokenized_data'
tokenizer.save_tokenizer(save_path)

string_tokenized = tokenizer.tokenizer.encode(bos_token + ex_abstract + eos_token )
decoded = tokenizer.tokenizer.decode(string_tokenized.ids)
print(string_tokenized.ids)
print(string_tokenized.tokens)
print(decoded)

[0, 1536, 244, 594, 115, 94, 745, 201, 1529, 1470, 108, 93, 589, 832, 94, 830, 834, 101, 93, 523, 701, 11, 441, 1460, 10, 947, 1276, 516, 832, 834, 201, 1380, 143, 159, 1211, 203, 827, 564, 809, 383, 10, 735, 333, 11, 730, 822, 1351, 94, 564, 697, 203, 93, 820, 108, 419, 94, 244, 594, 115, 93, 94, 240, 9, 115, 1467, 1200, 88, 826, 1336, 108, 564, 101, 1447, 1401, 143, 93, 815, 1106, 343, 11, 442, 1539, 93, 1399, 1528, 342, 209, 56, 11, 58, 608, 781, 419, 94, 586, 1076, 438, 395, 215, 214, 78, 246, 49, 382, 765, 419, 94, 244, 1304, 188, 26, 215, 214, 78, 246, 569, 1305, 156, 215, 214, 78, 246, 49, 1162, 1188, 43, 215, 214, 78, 246, 49, 382, 765, 115, 94, 240, 830, 188, 45, 215, 214, 78, 623, 1257, 139, 542, 754, 108, 94, 224, 209, 56, 11, 58, 608, 93, 94, 240, 10, 622, 94, 670, 12, 784, 224, 9, 93, 94, 240, 10, 622, 749, 224, 9, 93, 806, 94, 1463, 224, 9, 93, 94, 1479, 224, 9, 93, 94, 1514, 224, 9, 115, 93, 1513, 10, 1425, 224, 839, 442, 772, 189, 93, 94, 240, 323, 1507, 94, 670, 12, 78