In [41]:
import json
import os
import importlib
import time
import csv
from transformers import T5Tokenizer, T5TokenizerFast, PreTrainedTokenizerFast, AutoTokenizer
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer, Regex
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import Metaspace, Split, WhitespaceSplit
from huggingface_hub import login
from datasets import load_dataset

import re
import src.paper_to_equation.Generation.Equation_BaseDataset 


In [69]:
importlib.reload(src.paper_to_equation.Generation.Equation_BaseDataset)
from src.paper_to_equation.Generation.Equation_BaseDataset import BaseDataset

Confirming the default tokenizer is unsuitable

In [5]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

mml = """
<mml:mi>h</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
  """

py = """
h = Symbol('h')
h_g = Symbol('h_g')
h_c = Symbol('h_c')
e = Eq(h, h_g + h_c)"""

tokens = tokenizer.tokenize(py)
print(tokens)


['▁', 'h', '▁=', '▁', 'Symbol', '(', "'", 'h', "'", ')', '▁', 'h', '_', 'g', '▁=', '▁', 'Symbol', '(', "'", 'h', '_', 'g', "'", ')', '▁', 'h', '_', 'c', '▁=', '▁', 'Symbol', '(', "'", 'h', '_', 'c', "'", ')', '▁', 'e', '▁=', '▁E', 'q', '(', 'h', ',', '▁', 'h', '_', 'g', '▁+', '▁', 'h', '_', 'c', ')']


Dataset class

In [70]:
class TokenizerDataset(BaseDataset):
    def __init__(self, num):
        super().__init__(num)

    def get_columns(self):
        return ["mathml", "python"]

    def map_atomic_tokens(self, dataset):

        tag_map = {"<mml:mo>": "<MO>", "</mml:mo>": "</MO>",
                   "<mml:mi>": "<MI>", "</mml:mi>": "</MI>",
                   "<mml:msub>": "<MSUB>", "</mml:msub>": "</MSUB>",
                   "<mml:msup>": "<MSUP>", "</mml:msup>": "</MSUP>",
                   "<mml:mrow>": "<MROW>", "</mml:mrow>": "</MROW>", 
                   "<mml:mfrac>": "<MFRAC>", "</mml:mfrac>": "</MFRAC>"}
        
        for entry in dataset:
            mathml = entry["mathml"]
            for tag, token in tag_map.items():
                mathml = mathml.replace(tag, token)
            entry["mathml"] = mathml
        
        return dataset
    
    def get_tag_list(self):
        return ["<mml:mo>", "</mml:mo>",
                "<mml:mi>", "</mml:mi>",
                "<mml:msub>", "</mml:msub>",
                "<mml:msup>", "</mml:msup>",
                "<mml:mrow>", "</mml:mrow>",
                "<mml:mfrac>", "</mml:mfrac>",
                "<mml:mtext>", "</mml:mtext>"]
    
    def extract_tags(self, data):
        tags = set()
        for entry in data:
            mathml = entry["mathml"]
            found_tags = re.findall(r"<\s*[/]?[a-zA-Z0-9]+[^>]*>", mathml)  # Extract full tags
            tags.update(found_tags)  # Append to set

        return list(tags)
        
    def data_iterator(self, batch_size):
        columns = self.get_columns()
        for i in range(0, len(self.dataset), batch_size):
            yield [f"{data[columns[0]]} {data[columns[1]]} "for data in self.dataset[i:i+batch_size]]  

Class method (unsuccessful)

In [3]:
tokenizer = Tokenizer(models.BPE())                  

class MathMLPyTokenizer(pre_tokenizers.PreTokenizer):
    def __init__(self):
        pass
    
    def pre_tokenize(self, data):
        
        patterns = [
            # HTML/XML tags
            r'<[^>]+>',
            # Python string literals
            r'"""[^"]*"""', r"'''[^']*'''", r'"[^"]*"', r"'[^']*'",
            # Python keywords and operators
            r'\bdef\b', r'\bclass\b', r'\bfor\b', r'\bwhile\b', r'\bif\b', r'\belif\b', r'\belse\b',
            r'\breturn\b', r'\bimport\b', r'\bfrom\b', r'\bas\b', r'\bwith\b', r'\btry\b', r'\bexcept\b',
            # Common Python syntax elements
            r'==', r'!=', r'<=', r'>=', r'\+=', r'-=', r'\*=', r'/=', 
            r'=>', r'->',  # Function type hints and lambdas
            r'\bSymbol\b', r'\bEq\b', r'\bexp\b', r'\bsin\b', r'\bcos\b', r'\btan\b', r'\bdiff\b',
            
            # Indentation (important for Python)
            r'^\s+'
        ]

        combined_pattern = '|'.join(f'({p})' for p in patterns)
        regex = re.compile(combined_pattern, re.MULTILINE)
        
        splits = []
        for text, offset in data:
            last_end = 0
            for match in regex.finditer(text):
                start, end = match.span()
                
                if start > last_end:
                    # Add text before the special token
                    splits.append((text[last_end:start], last_end + offset))
                
                # Add the special token as a whole
                splits.append((text[start:end], start + offset))
                last_end = end
            
            if last_end < len(text):
                # Add remaining text
                splits.append((text[last_end:], last_end + offset))
                
        return splits

x = MathMLPyTokenizer()

tokens = tokenizer.tokenize(mml)
print(tokens)


TypeError: No constructor defined

Dataset creation timing

In [17]:
if __name__ == "__main__":
    start1 = time.time()
    td1 = TokenizerDataset(10000)
    td1.create_dataset()
    end1 = time.time()

    start2 = time.time()
    td2 = TokenizerDataset(10000)
    td2.create_dataset_mthread()
    end2 = time.time()

    print("Time 1: ", end1 - start1)
    print("Time 2: ", end2 - start2)

Generating dataset: 100%|██████████| 10000/10000 [01:39<00:00, 100.33it/s]
Generating dataset:  92%|█████████▏| 9223/10000 [01:28<00:07, 104.64it/s] 

Time 1:  99.67798781394958
Time 2:  90.23344993591309





In [57]:
td = TokenizerDataset(10000)
td.create(filepath="Tokenizer_Files/TokenizerDataset.csv")
print(len(td.dataset))
data_iterator = td.data_iterator(1000)

Generating dataset: 100%|██████████| 10000/10000 [03:39<00:00, 45.54it/s]


10000


In [92]:
td = TokenizerDataset(10000)
td.load_csv("Tokenizer_Files/TokenizerDataset.csv")
data_iterator = td.data_iterator(1000)
print(len(td.dataset))

10000


Training

In [95]:
# tokenizer = Tokenizer(models.BPE(unk_token="[UNK]")) # BPE tokenizer
tokenizer = Tokenizer(models.Unigram()) # Unigram tokenizer

text_patterns = [ # Patterns to split on, ensuring that the tags are kept intact
            r'<[^>]+>', # MathML tags
            r'"""[^"]*"""', r"'''[^']*'''", r'"[^"]*"', r"'[^']*'", # Python string literals    
            r'\bdef\b', r'\bclass\b', r'\bfor\b', r'\bwhile\b', r'\bif\b', r'\belif\b', r'\belse\b', # Python keywords and operators
            r'\breturn\b', r'\bimport\b', r'\bfrom\b', r'\bas\b', r'\bwith\b', r'\btry\b', r'\bexcept\b',
            r'==', r'!=', r'<=', r'>=', r'\+=', r'-=', r'\*=', r'/=', # Common Python syntax elements 
            r'=>', r'->', # Function type hints and lambdas
            r'\bSymbol\b', r'\bEq\b', r'\bexp\b', r'\bsin\b', r'\bcos\b', r'\btan\b', r'\bdiff\b', # MathML 
            r'^\s+', # Indentation (important for Python)
            r'\s+' # Whitespace
        ]

pattern = '|'.join(text_patterns) # Set up the pre-tokenizer using Split with pattern
split_pre_tokenizer = Split(pattern=Regex(pattern), behavior="isolated")
tokenizer.pre_tokenizer = split_pre_tokenizer

tokenizer.normalizer = Sequence([NFKC()]) # Normalises unicode characters like greek letters
# tokenizer.decoder = decoders.ByteLevel()

tag_list = td.extract_tags(td.dataset)
sympy_tags = ["Symbol", "Eq", "exp", "sin", "cos", "tan", "diff", "log", "Sum", "Derivative", "Integral", "\n", "\r"]
prefix = ["translate", "MathML", "to", "Python", ":"]
# special_tokens = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"] + tag_list
special_tokens = ["<pad>", "<bos>", "<eos>", "<unk>"] + tag_list + sympy_tags + prefix

# trainer = trainers.BpeTrainer(vocab_size=10000, special_tokens=special_tokens) # BPE Trainer
trainer = trainers.UnigramTrainer(vocab_size=30000, special_tokens=special_tokens) # Unigram Trainer
tokenizer.train_from_iterator(iterator=data_iterator, trainer=trainer)


In [77]:
hf_login_key = os.environ.get("HF_LOGIN_KEY")
login(token=hf_login_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\kyanj\.cache\huggingface\token
Login successful


In [94]:
tokenizer.save("Tokenizer_Files/mathml-py-tokenizer-unigram-v2.json")

In [96]:
tokenizer = T5TokenizerFast(
    tokenizer_file="Tokenizer_Files/mathml-py-tokenizer-unigram-v2.json",
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
    additional_special_tokens=tag_list
)

tokenizer.save_pretrained("Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v2")

('Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v2\\tokenizer_config.json',
 'Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v2\\special_tokens_map.json',
 'Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v2\\tokenizer.json')

In [97]:
tokenizer.push_to_hub("mathml-py-tokenizer-unigram-T5wrapped-v2")

CommitInfo(commit_url='https://huggingface.co/kj821/mathml-py-tokenizer-unigram-T5wrapped-v2/commit/38a87bb3d39bccb1f7999b8a2a9758c2f3ee4141', commit_message='Upload tokenizer', commit_description='', oid='38a87bb3d39bccb1f7999b8a2a9758c2f3ee4141', pr_url=None, pr_revision=None, pr_num=None)

Testing

In [76]:
test_text = """
def my_function(x, y): return x + y
class MyClass:
    def method(self): pass
import numpy as np
<math><msup><mi>x</mi><mn>2</mn></msup></math>
"""

check = tokenizer.pre_tokenizer.pre_tokenize_str(td.dataset[0]["python"])
print(check)
output = tokenizer.encode(test_text)
print(output.tokens)
print(output.ids)

[('j_Κτ', (0, 4)), (' ', (4, 5)), ('=', (5, 6)), (' ', (6, 7)), ('Symbol', (7, 13)), ('(', (13, 14)), ("'j_Κτ'", (14, 20)), (')', (20, 21)), ('\n', (21, 22)), ('t', (22, 23)), (' ', (23, 24)), ('=', (24, 25)), (' ', (25, 26)), ('Symbol', (26, 32)), ('(', (32, 33)), ("'t'", (33, 36)), (')', (36, 37)), ('\n', (37, 38)), ('F', (38, 39)), (' ', (39, 40)), ('=', (40, 41)), (' ', (41, 42)), ('Symbol', (42, 48)), ('(', (48, 49)), ("'F'", (49, 52)), (')', (52, 53)), ('\n', (53, 54)), ('Α', (54, 55)), (' ', (55, 56)), ('=', (56, 57)), (' ', (57, 58)), ('Symbol', (58, 64)), ('(', (64, 65)), ("'Α'", (65, 68)), (')', (68, 69)), ('\n', (69, 70)), ('ζ', (70, 71)), (' ', (71, 72)), ('=', (72, 73)), (' ', (73, 74)), ('Symbol', (74, 80)), ('(', (80, 81)), ("'ζ'", (81, 84)), (')', (84, 85)), ('\n', (85, 86)), ('e', (86, 87)), (' ', (87, 88)), ('=', (88, 89)), (' ', (89, 90)), ('Eq', (90, 92)), ('(j_Κτ,', (92, 98)), (' ', (98, 99)), ('Sum(sqrt(Α)*', (99, 111)), ('sin', (111, 114)), ('(ζ', (114, 116)), ('

Exception: Encountered an unknown token but `unk_id` is missing

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v2")

data_files = {"train": "Data/t5_train_2.csv", "validation": "Data/t5_validation_2.csv", "test": "Data/t5_test_2.csv"}
mml_py_dataset = load_dataset("csv", data_files=data_files)

# output_mathml = tokenizer.encode(td.dataset[0]["mathml"])
# output_py = tokenizer.encode(td.dataset[0]["python"])
# print(output_py.tokens)
# print(output_py.tokens)
# print(output_py.ids)
# print(tokenizer.decode(output_py.ids))
# output_test = tokenizer.encode(td)
# print(tokenizer.tokenize(td.dataset[0]["mathml"]))

# print(tokenizer.tokenize(check))
data = mml_py_dataset["train"][0]["MathML"]
# print(check)



# print(repr(data))
# print(repr(check))
# print(tokenizer.encode(data).tokens)
print(tokenizer.tokenize("\\"))


Exception: Encountered an unknown token but `unk_id` is missing