In [67]:
import json
import os
import importlib
import time
import csv
import pandas as pd
import sentencepiece as spm

from transformers import T5Tokenizer, T5TokenizerFast, PreTrainedTokenizerFast, AutoTokenizer
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer, Regex
from tokenizers.normalizers import NFKC, Sequence
from tokenizers.pre_tokenizers import Metaspace, Split, WhitespaceSplit
from tokenizers.processors import TemplateProcessing
from huggingface_hub import login
from datasets import load_dataset

import re
import src.paper_to_equation.Generation.Equation_BaseDataset 


In [2]:
importlib.reload(src.paper_to_equation.Generation.Equation_BaseDataset)
from src.paper_to_equation.Generation.Equation_BaseDataset import BaseDataset

Confirming the default tokenizer is unsuitable

In [20]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

mml = """
<mml:mi>h</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:mrow>
  """

py = """
h = Symbol('h')
h_g = Symbol('h_g')
h_c = Symbol('h_c')
e = Eq(h, h_g + h_c)"""

test = "Hello\nWorld"

tokens = tokenizer.encode(test)
print(tokenizer.decode(tokens))


Hello World</s>


Dataset class

In [3]:
class TokenizerDataset(BaseDataset):
    def __init__(self, num):
        super().__init__(num)

    def get_columns(self):
        return ["mathml", "python"]

    def map_atomic_tokens(self, dataset):

        tag_map = {"<mml:mo>": "<MO>", "</mml:mo>": "</MO>",
                   "<mml:mi>": "<MI>", "</mml:mi>": "</MI>",
                   "<mml:msub>": "<MSUB>", "</mml:msub>": "</MSUB>",
                   "<mml:msup>": "<MSUP>", "</mml:msup>": "</MSUP>",
                   "<mml:mrow>": "<MROW>", "</mml:mrow>": "</MROW>", 
                   "<mml:mfrac>": "<MFRAC>", "</mml:mfrac>": "</MFRAC>"}
        
        for entry in dataset:
            mathml = entry["mathml"]
            for tag, token in tag_map.items():
                mathml = mathml.replace(tag, token)
            entry["mathml"] = mathml
        
        return dataset
    
    def get_tag_list(self):
        return ["<mml:mo>", "</mml:mo>",
                "<mml:mi>", "</mml:mi>",
                "<mml:msub>", "</mml:msub>",
                "<mml:msup>", "</mml:msup>",
                "<mml:mrow>", "</mml:mrow>",
                "<mml:mfrac>", "</mml:mfrac>",
                "<mml:mtext>", "</mml:mtext>"]
    
    def extract_tags(self, data):
        tags = set()
        for entry in data:
            mathml = entry["mathml"]
            found_tags = re.findall(r"<\s*[/]?[a-zA-Z0-9]+[^>]*>", mathml)  # Extract full tags
            tags.update(found_tags)  # Append to set

        return list(tags)
        
    def data_iterator(self, batch_size):
        columns = self.get_columns()
        for i in range(0, len(self.dataset), batch_size):
            yield [f"{data[columns[0]]} {data[columns[1]]} "for data in self.dataset[i:i+batch_size]]  

Class method (unsuccessful)

In [3]:
tokenizer = Tokenizer(models.BPE())                  

class MathMLPyTokenizer(pre_tokenizers.PreTokenizer):
    def __init__(self):
        pass
    
    def pre_tokenize(self, data):
        
        patterns = [
            # HTML/XML tags
            r'<[^>]+>',
            # Python string literals
            r'"""[^"]*"""', r"'''[^']*'''", r'"[^"]*"', r"'[^']*'",
            # Python keywords and operators
            r'\bdef\b', r'\bclass\b', r'\bfor\b', r'\bwhile\b', r'\bif\b', r'\belif\b', r'\belse\b',
            r'\breturn\b', r'\bimport\b', r'\bfrom\b', r'\bas\b', r'\bwith\b', r'\btry\b', r'\bexcept\b',
            # Common Python syntax elements
            r'==', r'!=', r'<=', r'>=', r'\+=', r'-=', r'\*=', r'/=', 
            r'=>', r'->',  # Function type hints and lambdas
            r'\bSymbol\b', r'\bEq\b', r'\bexp\b', r'\bsin\b', r'\bcos\b', r'\btan\b', r'\bdiff\b',
            
            # Indentation (important for Python)
            r'^\s+'
        ]

        combined_pattern = '|'.join(f'({p})' for p in patterns)
        regex = re.compile(combined_pattern, re.MULTILINE)
        
        splits = []
        for text, offset in data:
            last_end = 0
            for match in regex.finditer(text):
                start, end = match.span()
                
                if start > last_end:
                    # Add text before the special token
                    splits.append((text[last_end:start], last_end + offset))
                
                # Add the special token as a whole
                splits.append((text[start:end], start + offset))
                last_end = end
            
            if last_end < len(text):
                # Add remaining text
                splits.append((text[last_end:], last_end + offset))
                
        return splits

x = MathMLPyTokenizer()

tokens = tokenizer.tokenize(mml)
print(tokens)


TypeError: No constructor defined

Dataset creation timing

In [17]:
if __name__ == "__main__":
    start1 = time.time()
    td1 = TokenizerDataset(10000)
    td1.create_dataset()
    end1 = time.time()

    start2 = time.time()
    td2 = TokenizerDataset(10000)
    td2.create_dataset_mthread()
    end2 = time.time()

    print("Time 1: ", end1 - start1)
    print("Time 2: ", end2 - start2)

Generating dataset: 100%|██████████| 10000/10000 [01:39<00:00, 100.33it/s]
Generating dataset:  92%|█████████▏| 9223/10000 [01:28<00:07, 104.64it/s] 

Time 1:  99.67798781394958
Time 2:  90.23344993591309





In [57]:
td = TokenizerDataset(10000)
td.create(filepath="Tokenizer_Files/TokenizerDataset.csv")
print(len(td.dataset))
data_iterator = td.data_iterator(1000)

Generating dataset: 100%|██████████| 10000/10000 [03:39<00:00, 45.54it/s]


10000


In [48]:
td = TokenizerDataset(10000)
td.load_csv("Tokenizer_Files/TokenizerDataset.csv")
data_iterator = td.data_iterator(1000)
print(len(td.dataset))

10000


Training

In [49]:
# tokenizer = Tokenizer(models.BPE(unk_token="[UNK]")) # BPE tokenizer
tokenizer = Tokenizer(models.Unigram()) # Unigram tokenizer

text_patterns = [ # Patterns to split on, ensuring that the tags are kept intact
            r'<[^>]+>', # MathML tags
            r'"""[^"]*"""', r"'''[^']*'''", r'"[^"]*"', r"'[^']*'", # Python string literals    
            r'\bdef\b', r'\bclass\b', r'\bfor\b', r'\bwhile\b', r'\bif\b', r'\belif\b', r'\belse\b', # Python keywords and operators
            r'\breturn\b', r'\bimport\b', r'\bfrom\b', r'\bas\b', r'\bwith\b', r'\btry\b', r'\bexcept\b',
            r'==', r'!=', r'<=', r'>=', r'\+=', r'-=', r'\*=', r'/=', # Common Python syntax elements 
            r'=>', r'->', # Function type hints and lambdas
            r'\bSymbol\b', r'\bEq\b', r'\bexp\b', r'\bsin\b', r'\bcos\b', r'\btan\b', r'\bdiff\b', # MathML 
            r'^\s+', # Indentation (important for Python)
            r'\s+' # Whitespace
        ]

pattern = '|'.join(text_patterns) # Set up the pre-tokenizer using Split with pattern
split_pre_tokenizer = Split(pattern=Regex(pattern), behavior="isolated")
tokenizer.pre_tokenizer = split_pre_tokenizer

tokenizer.normalizer = Sequence([NFKC()]) # Normalises unicode characters like greek letters
# tokenizer.decoder = decoders.ByteLevel()

tag_list = td.extract_tags(td.dataset)
sympy_tags = ["Symbol", "Eq", "exp", "sin", "cos", "tan", "diff", "log", "Sum", "Derivative", "Integral", "\n", "\r"]
prefix = ["translate", "MathML", "to", "Python", ":"]
# special_tokens = ["[PAD]", "[BOS]", "[EOS]", "[UNK]"] + tag_list
custom_tokens = tag_list + sympy_tags + prefix
special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]

# trainer = trainers.BpeTrainer(vocab_size=10000, special_tokens=special_tokens) # BPE Trainer
# print(custom_tokens)
tokenizer.add_tokens(custom_tokens)
trainer = trainers.UnigramTrainer(vocab_size=30000, special_tokens=special_tokens) # Unigram Trainer
tokenizer.train_from_iterator(iterator=data_iterator, trainer=trainer)

tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>", # Single sentence template
    pair="<s> $A </s> $B </s>", # Pair template
    special_tokens=[("<s>", tokenizer.token_to_id("<s>")), ("</s>", tokenizer.token_to_id("</s>"))] # Special tokens for the templates
)

In [50]:
hf_login_key = os.environ.get("HF_LOGIN_KEY")
login(token=hf_login_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\kyanj\.cache\huggingface\token
Login successful


In [51]:
tokenizer.save("Tokenizer_Files/mathml-py-tokenizer-unigram-v3.json")

In [52]:
tokenizer = T5TokenizerFast(
    tokenizer_file="Tokenizer_Files/mathml-py-tokenizer-unigram-v3.json",
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
)

tokenizer.save_pretrained("Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v3")

('Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v3\\tokenizer_config.json',
 'Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v3\\special_tokens_map.json',
 'Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v3\\tokenizer.json')

In [53]:
tokenizer.push_to_hub("mathml-py-tokenizer-unigram-T5wrapped-v3")

CommitInfo(commit_url='https://huggingface.co/kj821/mathml-py-tokenizer-unigram-T5wrapped-v3/commit/891645f88fc52f76ec2a415e3c09c5bf821e49d3', commit_message='Upload tokenizer', commit_description='', oid='891645f88fc52f76ec2a415e3c09c5bf821e49d3', pr_url=None, pr_revision=None, pr_num=None)

Testing

In [12]:
test_text = """
def my_function(x, y): return x + y
class MyClass:
    def method(self): pass
import numpy as np
<math><msup><mi>x</mi><mn>2</mn></msup></math>
"""

check = tokenizer.pre_tokenizer.pre_tokenize_str(td.dataset[0]["python"])
print(check)
output = tokenizer.encode(test_text)
print(tokenizer.decode(output.ids))

AttributeError: 'T5TokenizerFast' object has no attribute 'pre_tokenizer'

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v3")

data_files = {"train": "Data/t5_train_2.csv", "validation": "Data/t5_validation_2.csv", "test": "Data/t5_test_2.csv"}
mml_py_dataset = load_dataset("csv", data_files=data_files)

# output_mathml = tokenizer.encode(td.dataset[0]["mathml"])
# output_py = tokenizer.encode(td.dataset[0]["python"])
# print(output_py.tokens)
# print(output_py.tokens)
# print(output_py.ids)
# print(tokenizer.decode(output_py.ids))
# output_test = tokenizer.encode(td)
# print(tokenizer.tokenize(td.dataset[0]["mathml"]))

# print(tokenizer.tokenize(check))
data = mml_py_dataset["train"][0]["MathML"]
# print(check)



# print(repr(data))
# print(repr(check))
# print(tokenizer.encode(data).tokens)
ids = tokenizer.encode("Hello\r\nWorld")
print(tokenizer.decode(ids))


  e l l o 
 W o r l d


In [80]:
# tokenizer = AutoTokenizer.from_pretrained("Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped-v3")
tokenizer = T5TokenizerFast.from_pretrained("kj821/mathml-py-tokenizer-sentencepiece-v1")

data_files = {"train": "Data/t5_train_2.csv", "validation": "Data/t5_validation_2.csv", "test": "Data/t5_test_2.csv"}
mml_py_dataset = load_dataset("csv", data_files=data_files)
data = mml_py_dataset["train"][0]["MathML"]
data = "<mml:msub><mml:mi>N</mml:mi>"
# data = "<mml:mi>h</mml:mi>"
print(data)

# Tokenize it
encodings = tokenizer.encode(data)
print(encodings)

# Print token IDs


# Decode without skipping special tokens
decoded = tokenizer.decode(encodings, skip_special_tokens=False)
print(decoded)

# # Decode with skipping special tokens (normal inference behavior)
# decoded_skipped = tokenizer.decode(encodings, skip_special_tokens=True)
# print("Decoded (skip special tokens):", decoded_skipped)


<mml:msub><mml:mi>N</mml:mi>
[5, 24, 46, 167, 27, 2]
<mml:msub><mml:mi> N</mml:mi></s>


### SentencePiece Tokenizer

In [69]:
# Load CSV
df = pd.read_csv("Tokenizer_Files/TokenizerDataset.csv")

# Flatten MathML and Python
def flatten(text):
    return str(text).replace("\n", " ").replace("\r", " ").strip()

mathml_texts = [flatten(item) for item in df["mathml"]]
python_texts = [flatten(item) for item in df["python"]]

# Write alternating MathML and Python lines
with open("Tokenizer_Files/TokenizerCorpus.txt", "w", encoding="utf-8") as f:
    for mathml, python in zip(mathml_texts, python_texts):
        f.write(mathml + "\n")
        f.write(python + "\n")


In [74]:
model_prefix = "Tokenizer_Files/spm-mathml-py-tokenizer/tokenizer_v1"
vocab_size = 3800
character_coverage = 1.0
model_type = "unigram"

td.load_csv("Tokenizer_Files/TokenizerDataset.csv")
tag_list = td.extract_tags(td.dataset)
sympy_tags = ["Symbol", "Eq", "exp", "sin", "cos", "tan", "diff", "log", "Sum", "Derivative", "Integral", "\n", "\r"]
prefix = ["translate", "MathML", "to", "Python", ":"]
user_defined_symbols = tag_list + sympy_tags + prefix
special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]

spm.SentencePieceTrainer.Train(
    input="Tokenizer_Files/TokenizerCorpus.txt",
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=character_coverage,
    model_type=model_type,
    user_defined_symbols=user_defined_symbols,
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3,
    control_symbols=["<s>", "</s>"],
)



In [75]:
tokenizer_config = {
    "model_max_length": 512,
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>",
}

special_tokens_map = {
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "bos_token": "<s>",
    "eos_token": "</s>",
}

with open("Tokenizer_Files/spm-mathml-py-tokenizer/tokenizer_config.json", "w") as f:
    json.dump(tokenizer_config, f, indent=4)

with open("Tokenizer_Files/spm-mathml-py-tokenizer/special_tokens_map.json", "w") as f:
    json.dump(special_tokens_map, f, indent=4)
    

In [82]:
tokenizer = T5Tokenizer(
    vocab_file="Tokenizer_Files/spm-mathml-py-tokenizer/tokenizer_v1.model",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
)

tokenizer.save_pretrained("Tokenizer_Files/spm-mathml-py-tokenizer")

('Tokenizer_Files/spm-mathml-py-tokenizer\\tokenizer_config.json',
 'Tokenizer_Files/spm-mathml-py-tokenizer\\special_tokens_map.json',
 'Tokenizer_Files/spm-mathml-py-tokenizer\\spiece.model',
 'Tokenizer_Files/spm-mathml-py-tokenizer\\added_tokens.json')

In [83]:
tokenizer.push_to_hub("mathml-py-tokenizer-sentencepiece-v1")

CommitInfo(commit_url='https://huggingface.co/kj821/mathml-py-tokenizer-sentencepiece-v1/commit/60b6b568a89ebf47807706ea71b6a8c741a0f01d', commit_message='Upload tokenizer', commit_description='', oid='60b6b568a89ebf47807706ea71b6a8c741a0f01d', pr_url=None, pr_revision=None, pr_num=None)