In [None]:
import json
import os
import importlib
import time
from transformers import T5Tokenizer
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer, Regex
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import Metaspace, Split, WhitespaceSplit
from huggingface_hub import login

import re
import src.paper_to_equation.Generation.Equation_BaseDataset 


In [13]:
importlib.reload(src.paper_to_equation.Generation.Equation_BaseDataset)
from src.paper_to_equation.Generation.Equation_BaseDataset import BaseDataset

Confirming the default tokenizer is unsuitable

In [37]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

mml = """
<mml:mi>h</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
  """

py = """
h = Symbol('h')
h_g = Symbol('h_g')
h_c = Symbol('h_c')
e = Eq(h, h_g + h_c)"""

tokens = tokenizer.tokenize(mml)
print(tokens)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


['▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'o', '>', '=', '<', '/', 'mm', 'l', ':', 'm', 'o', '>', '▁', '<', 'mm', 'l', ':', 'm', 'row', '>', '▁', '<', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'c', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'o', '>', '+', '<', '/', 'mm', 'l', ':', 'm', 'o', '>', '▁', '<', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'g', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'row', '>']


Dataset class

In [14]:
class TokenizerDataset(BaseDataset):
    def __init__(self, num):
        super().__init__(num)

    def get_columns(self):
        return ["mathml", "python"]

    def map_atomic_tokens(self, dataset):

        tag_map = {"<mml:mo>": "<MO>", "</mml:mo>": "</MO>",
                   "<mml:mi>": "<MI>", "</mml:mi>": "</MI>",
                   "<mml:msub>": "<MSUB>", "</mml:msub>": "</MSUB>",
                   "<mml:msup>": "<MSUP>", "</mml:msup>": "</MSUP>",
                   "<mml:mrow>": "<MROW>", "</mml:mrow>": "</MROW>", 
                   "<mml:mfrac>": "<MFRAC>", "</mml:mfrac>": "</MFRAC>"}
        
        for entry in dataset:
            mathml = entry["mathml"]
            for tag, token in tag_map.items():
                mathml = mathml.replace(tag, token)
            entry["mathml"] = mathml
        
        return dataset
    
    def get_tag_list(self):
        return ["<mml:mo>", "</mml:mo>",
                "<mml:mi>", "</mml:mi>",
                "<mml:msub>", "</mml:msub>",
                "<mml:msup>", "</mml:msup>",
                "<mml:mrow>", "</mml:mrow>",
                "<mml:mfrac>", "</mml:mfrac>",
                "<mml:mtext>", "</mml:mtext>"]
    
    def extract_tags(self, data):
        tags = set()
        for entry in data:
            mathml = entry["mathml"]
            for tag in mathml:
                if tag not in tags:
                    tags.append(tag)
        return tags
        
    def data_iterator(self, batch_size):
        columns = self.get_columns()
        for i in range(0, len(self.dataset), batch_size):
            yield [f"{data[columns[0]]} {data[columns[1]]} "for data in self.dataset[i:i+batch_size]]  

Class method (unsuccessful)

In [3]:
tokenizer = Tokenizer(models.BPE())                  

class MathMLPyTokenizer(pre_tokenizers.PreTokenizer):
    def __init__(self):
        pass
    
    def pre_tokenize(self, data):
        
        patterns = [
            # HTML/XML tags
            r'<[^>]+>',
            # Python string literals
            r'"""[^"]*"""', r"'''[^']*'''", r'"[^"]*"', r"'[^']*'",
            # Python keywords and operators
            r'\bdef\b', r'\bclass\b', r'\bfor\b', r'\bwhile\b', r'\bif\b', r'\belif\b', r'\belse\b',
            r'\breturn\b', r'\bimport\b', r'\bfrom\b', r'\bas\b', r'\bwith\b', r'\btry\b', r'\bexcept\b',
            # Common Python syntax elements
            r'==', r'!=', r'<=', r'>=', r'\+=', r'-=', r'\*=', r'/=', 
            r'=>', r'->',  # Function type hints and lambdas
            r'\bSymbol\b', r'\bEq\b', r'\bexp\b', r'\bsin\b', r'\bcos\b', r'\btan\b', r'\bdiff\b',
            
            # Indentation (important for Python)
            r'^\s+'
        ]

        combined_pattern = '|'.join(f'({p})' for p in patterns)
        regex = re.compile(combined_pattern, re.MULTILINE)
        
        splits = []
        for text, offset in data:
            last_end = 0
            for match in regex.finditer(text):
                start, end = match.span()
                
                if start > last_end:
                    # Add text before the special token
                    splits.append((text[last_end:start], last_end + offset))
                
                # Add the special token as a whole
                splits.append((text[start:end], start + offset))
                last_end = end
            
            if last_end < len(text):
                # Add remaining text
                splits.append((text[last_end:], last_end + offset))
                
        return splits

x = MathMLPyTokenizer()

tokens = tokenizer.tokenize(mml)
print(tokens)


TypeError: No constructor defined

Dataset creation timing

In [17]:
if __name__ == "__main__":
    start1 = time.time()
    td1 = TokenizerDataset(10000)
    td1.create_dataset()
    end1 = time.time()

    start2 = time.time()
    td2 = TokenizerDataset(10000)
    td2.create_dataset_mthread()
    end2 = time.time()

    print("Time 1: ", end1 - start1)
    print("Time 2: ", end2 - start2)

Generating dataset: 100%|██████████| 10000/10000 [01:39<00:00, 100.33it/s]
Generating dataset:  92%|█████████▏| 9223/10000 [01:28<00:07, 104.64it/s] 

Time 1:  99.67798781394958
Time 2:  90.23344993591309





In [None]:
td = TokenizerDataset(10000)
td.create_dataset()
print(len(td.dataset))
data_iterator = td.data_iterator(1000)

Generating dataset: 0it [00:00, ?it/s]

Training

In [54]:
# Create a base tokenizer (BPE, WordPiece, or Unigram)
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Define patterns we want to protect during tokenization
text_patterns = [
            # HTML/XML tags
            r'<[^>]+>',
            # Python string literals
            r'"""[^"]*"""', r"'''[^']*'''", r'"[^"]*"', r"'[^']*'",
            # Python keywords and operators
            r'\bdef\b', r'\bclass\b', r'\bfor\b', r'\bwhile\b', r'\bif\b', r'\belif\b', r'\belse\b',
            r'\breturn\b', r'\bimport\b', r'\bfrom\b', r'\bas\b', r'\bwith\b', r'\btry\b', r'\bexcept\b',
            # Common Python syntax elements
            r'==', r'!=', r'<=', r'>=', r'\+=', r'-=', r'\*=', r'/=', 
            r'=>', r'->',  # Function type hints and lambdas
            r'\bSymbol\b', r'\bEq\b', r'\bexp\b', r'\bsin\b', r'\bcos\b', r'\btan\b', r'\bdiff\b', # MathML
            # Indentation (important for Python)
            r'^\s+',
            r'\s+' # Whitespace
        ]

# Set up the pre-tokenizer using Split with pattern
pattern = '|'.join(text_patterns)
split_pre_tokenizer = Split(pattern=Regex(pattern), behavior="isolated")
tokenizer.pre_tokenizer = split_pre_tokenizer

# Set up other components 
tokenizer.normalizer = Sequence([NFKC()])
tokenizer.decoder = decoders.ByteLevel()

tag_list = td.get_tag_list()
special_tokens = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"] + tag_list

trainer = trainers.BpeTrainer(vocab_size=10000, special_tokens=special_tokens)
tokenizer.train_from_iterator(iterator=data_iterator, trainer=trainer)



In [26]:
hf_login_key = os.environ.get("HF_LOGIN_KEY")
login(token=hf_login_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\kyanj\.cache\huggingface\token
Login successful


In [55]:
tokenizer.save("Tokenizer_Files/mathml-py-tokenizer.json")

Testing

In [None]:
test_text = """
def my_function(x, y): return x + y
class MyClass:
    def method(self): pass
import numpy as np
<math><msup><mi>x</mi><mn>2</mn></msup></math>
"""

check = tokenizer.pre_tokenizer.pre_tokenize_str(mml)
print(check)
output = tokenizer.encode(test_text)
print(output.tokens)
print(output.ids)

In [57]:
output_mathml = tokenizer.encode(td.dataset[0]["mathml"])
output_py = tokenizer.encode(td.dataset[0]["python"])
# print(output_py.tokens)
# print(output_py.tokens)
# print(output_py.ids)
# print(tokenizer.decode(output_py.ids))
output_test = tokenizer.encode(mml)
print(output_test.tokens)

['\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mo>', '=', '</mml:mo>', '\n', '<mml:mrow>', '\n', '<mml:msub>', '\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mi>', 'c', '</mml:mi>', '\n', '</mml:msub>', '\n', '<mml:mo>', '+', '</mml:mo>', '\n', '<mml:msub>', '\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mi>', 'g', '</mml:mi>', '\n', '</mml:msub>', '\n', '</mml:mrow>', '\n', ' ', ' ']
