In [21]:
import json
import importlib
from transformers import T5Tokenizer
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer, Regex
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import Metaspace, Split, WhitespaceSplit

import re
import src.paper_to_equation.Generation.Equation_BaseDataset 


In [2]:
importlib.reload(src.paper_to_equation.Generation.Equation_BaseDataset)
from src.paper_to_equation.Generation.Equation_BaseDataset import BaseDataset

Confirming the default tokenizer is unsuitable

In [25]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

mml = """
<mml:mi>h</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
  """

py = """
h = Symbol('h')
h_g = Symbol('h_g')
h_c = Symbol('h_c')
e = Eq(h, h_g + h_c)"""

tokens = tokenizer.tokenize(mml)
print(tokens)


['▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'o', '>', '=', '<', '/', 'mm', 'l', ':', 'm', 'o', '>', '▁', '<', 'mm', 'l', ':', 'm', 'row', '>', '▁', '<', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'c', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'o', '>', '+', '<', '/', 'mm', 'l', ':', 'm', 'o', '>', '▁', '<', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'g', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'row', '>']


Dataset class

In [9]:
class TokenizerDataset(BaseDataset):
    def __init__(self, num, filepath):
        super().__init__(num, filepath)

    def get_columns(self):
        return ["mathml", "python"]

    def map_atomic_tokens(self, dataset):

        tag_map = {"<mml:mo>": "<MO>", "</mml:mo>": "</MO>",
                   "<mml:mi>": "<MI>", "</mml:mi>": "</MI>",
                   "<mml:msub>": "<MSUB>", "</mml:msub>": "</MSUB>",
                   "<mml:msup>": "<MSUP>", "</mml:msup>": "</MSUP>",
                   "<mml:mrow>": "<MROW>", "</mml:mrow>": "</MROW>", 
                   "<mml:mfrac>": "<MFRAC>", "</mml:mfrac>": "</MFRAC>"}
        
        for entry in dataset:
            mathml = entry["mathml"]
            for tag, token in tag_map.items():
                mathml = mathml.replace(tag, token)
            entry["mathml"] = mathml
        
        return dataset
    
    def json_to_dataset(self):
        with open(self.filepath) as f:
            data = json.load(f)
        
        return data
    
    def get_tag_list():
        return ["<mml:mo>", "</mml:mo>",
                "<mml:mi>", "</mml:mi>",
                "<mml:msub>", "</mml:msub>",
                "<mml:msup>", "</mml:msup>",
                "<mml:mrow>", "</mml:mrow>",
                "<mml:mfrac>", "</mml:mfrac>"
                "<mml:mtext>", "</mml:mtext>"]
    
    
    def extract_tags(self, data):
        tags = set()
        for entry in data:
            mathml = entry["mathml"]
            for tag in mathml:
                if tag not in tags:
                    tags.append(tag)
        return tags
        
    def data_iterator(self, data, batch_size):
        for i in range(0, len(data), batch_size):
            yield [sample["mathml"] for sample in data[i:i+batch_size]]  

In [12]:
td = TokenizerDataset(10, "Data/tokenizer_dataset.json")
# td.create()
data = td.json_to_dataset()
# mapped_data = td.map_atomic_tokens(data)
for entry in td.data_iterator(data, 10):
    print(entry)




['<mml:mi>h</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mrow>\n<mml:msup>\n<mml:mtext>exp</mml:mtext>\n<mml:mrow>\n<mml:mrow>\n<mml:mo>-</mml:mo>\n<mml:mi>Ε</mml:mi>\n</mml:mrow>\n<mml:mo>+</mml:mo>\n<mml:mi>ν</mml:mi>\n</mml:mrow>\n</mml:msup>\n<mml:mo>-</mml:mo>\n<mml:mrow>\n<mml:mi>sin</mml:mi>\n<mml:mfenced>\n<mml:msub>\n<mml:mi>f</mml:mi>\n<mml:mrow>\n<mml:mi>t</mml:mi>\n<mml:mi>S</mml:mi>\n</mml:mrow>\n</mml:msub>\n</mml:mfenced>\n</mml:mrow>\n<mml:mo>+</mml:mo>\n<mml:mrow>\n<mml:mi>tan</mml:mi>\n<mml:mfenced>\n<mml:msub>\n<mml:mi>ο</mml:mi>\n<mml:mrow>\n<mml:mi>T</mml:mi>\n<mml:mi>λ</mml:mi>\n</mml:mrow>\n</mml:msub>\n</mml:mfenced>\n</mml:mrow>\n</mml:mrow>', '<mml:mi>ν</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mrow>\n<mml:munderover>\n<mml:mo>∑</mml:mo>\n<mml:mrow>\n<mml:mi>κ</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mn>9</mml:mn>\n</mml:mrow>\n<mml:msub>\n<mml:mi>y</mml:mi>\n<mml:mi>Δ</mml:mi>\n</mml:msub>\n</mml:munderover>\n<mml:mfenced>\n<mml:mrow>\n<mml:msqrt>\n<mml:mrow>\n<mml:mi>Λ</mml:mi>\n<

In [22]:
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.Replace("``", '"'), normalizers.Replace("''", '"')] # don't lowercase
)
tokenizer.normalizer.normalize_str(py)

tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
tokenizer.pre_tokenizer.pre_tokenize_str(py)



[('▁\nh', (0, 2)),
 ('▁=', (2, 4)),
 ("▁Symbol('h')\nh_g", (4, 20)),
 ('▁=', (20, 22)),
 ("▁Symbol('h_g')\nh_c", (22, 40)),
 ('▁=', (40, 42)),
 ("▁Symbol('h_c')\ne", (42, 58)),
 ('▁=', (58, 60)),
 ('▁Eq(h,', (60, 66)),
 ('▁h_g', (66, 70)),
 ('▁+', (70, 72)),
 ('▁h_c)', (72, 77))]

In [None]:
class MathMLTokenizer(Tokenizer):
    def __init__(self, data):
        self.data = data
       
    def normalise(self):
        pass

In [6]:
tokenizer = Tokenizer(models.BPE())                  

class MathMLPyTokenizer(pre_tokenizers.PreTokenizer):
    def __init__(self):
        pass
    
    def pre_tokenize(self, data):
        
        patterns = [
            # HTML/XML tags
            r'<[^>]+>',
            # Python string literals
            r'"""[^"]*"""', r"'''[^']*'''", r'"[^"]*"', r"'[^']*'",
            # Python keywords and operators
            r'\bdef\b', r'\bclass\b', r'\bfor\b', r'\bwhile\b', r'\bif\b', r'\belif\b', r'\belse\b',
            r'\breturn\b', r'\bimport\b', r'\bfrom\b', r'\bas\b', r'\bwith\b', r'\btry\b', r'\bexcept\b',
            # Common Python syntax elements
            r'==', r'!=', r'<=', r'>=', r'\+=', r'-=', r'\*=', r'/=', 
            r'=>', r'->',  # Function type hints and lambdas
            r'\bSymbol\b', r'\bEq\b', r'\bexp\b', r'\bsin\b', r'\bcos\b', r'\btan\b', r'\bdiff\b',
            
            # Indentation (important for Python)
            r'^\s+'
        ]

        combined_pattern = '|'.join(f'({p})' for p in patterns)
        regex = re.compile(combined_pattern, re.MULTILINE)
        
        splits = []
        for text, offset in data:
            last_end = 0
            for match in regex.finditer(text):
                start, end = match.span()
                
                if start > last_end:
                    # Add text before the special token
                    splits.append((text[last_end:start], last_end + offset))
                
                # Add the special token as a whole
                splits.append((text[start:end], start + offset))
                last_end = end
            
            if last_end < len(text):
                # Add remaining text
                splits.append((text[last_end:], last_end + offset))
                
        return splits

x = MathMLPyTokenizer()

tokens = tokenizer.tokenize(mml)
print(tokens)


TypeError: No constructor defined

In [27]:
# Create a base tokenizer (BPE, WordPiece, or Unigram)
tokenizer = Tokenizer(models.BPE())

# Define patterns we want to protect during tokenization
text_patterns = [
            # HTML/XML tags
            r'<[^>]+>',
            # Python string literals
            r'"""[^"]*"""', r"'''[^']*'''", r'"[^"]*"', r"'[^']*'",
            # Python keywords and operators
            r'\bdef\b', r'\bclass\b', r'\bfor\b', r'\bwhile\b', r'\bif\b', r'\belif\b', r'\belse\b',
            r'\breturn\b', r'\bimport\b', r'\bfrom\b', r'\bas\b', r'\bwith\b', r'\btry\b', r'\bexcept\b',
            # Common Python syntax elements
            r'==', r'!=', r'<=', r'>=', r'\+=', r'-=', r'\*=', r'/=', 
            r'=>', r'->',  # Function type hints and lambdas
            r'\bSymbol\b', r'\bEq\b', r'\bexp\b', r'\bsin\b', r'\bcos\b', r'\btan\b', r'\bdiff\b', # MathML
            # Indentation (important for Python)
            r'^\s+',
            r'\s+' # Whitespace
        ]

# Set up the pre-tokenizer using Split with pattern
pattern = '|'.join(text_patterns)
split_pre_tokenizer = Split(pattern=Regex(pattern), behavior="isolated")
# split_pre_tokenizer = WhitespaceSplit()
tokenizer.pre_tokenizer = split_pre_tokenizer

# Set up other components 
tokenizer.normalizer = Sequence([NFKC()])
tokenizer.decoder = decoders.ByteLevel()

test_text = """
def my_function(x, y): return x + y
class MyClass:
    def method(self): pass
import numpy as np
<math><msup><mi>x</mi><mn>2</mn></msup></math>
"""

check = tokenizer.pre_tokenizer.pre_tokenize_str(mml)
print(check)
output = tokenizer.encode(test_text)
print(output.tokens)
print(output.ids)


[('\n', (0, 1)), ('<mml:mi>', (1, 9)), ('h', (9, 10)), ('</mml:mi>', (10, 19)), ('\n', (19, 20)), ('<mml:mo>', (20, 28)), ('=', (28, 29)), ('</mml:mo>', (29, 38)), ('\n', (38, 39)), ('<mml:mrow>', (39, 49)), ('\n', (49, 50)), ('<mml:msub>', (50, 60)), ('\n', (60, 61)), ('<mml:mi>', (61, 69)), ('h', (69, 70)), ('</mml:mi>', (70, 79)), ('\n', (79, 80)), ('<mml:mi>', (80, 88)), ('c', (88, 89)), ('</mml:mi>', (89, 98)), ('\n', (98, 99)), ('</mml:msub>', (99, 110)), ('\n', (110, 111)), ('<mml:mo>', (111, 119)), ('+', (119, 120)), ('</mml:mo>', (120, 129)), ('\n', (129, 130)), ('<mml:msub>', (130, 140)), ('\n', (140, 141)), ('<mml:mi>', (141, 149)), ('h', (149, 150)), ('</mml:mi>', (150, 159)), ('\n', (159, 160)), ('<mml:mi>', (160, 168)), ('g', (168, 169)), ('</mml:mi>', (169, 178)), ('\n', (178, 179)), ('</mml:msub>', (179, 190)), ('\n', (190, 191)), ('</mml:mrow>', (191, 202)), ('\n  ', (202, 205))]
[]
[]


In [14]:
test_text = """def my_function(x, y): return x + y
class MyClass:
    def method(self): pass
import numpy as np
<math><msup><mi>x</mi><mn>2</mn></msup></math>
"""

# Your combined regex pattern
tag_pattern = r'<[^>]+>'
python_patterns = [
    r'def\s+\w+\([^)]*\)', r'class\s+\w+', r'import\s+\w+', r'from\s+\w+\s+import',
    r'if\s+.*:', r'for\s+.*:', r'while\s+.*:', r'try:', r'except\s+.*:',
]
pattern = '|'.join([tag_pattern] + python_patterns)

# Test regex separately
regex = re.compile(pattern)
matches = regex.findall(test_text)
print("Matches:", matches)


Matches: ['def my_function(x, y)', 'class MyClass', 'def method(self)', 'import numpy', '<math>', '<msup>', '<mi>', '</mi>', '<mn>', '</mn>', '</msup>', '</math>']
