In [4]:
import json
import importlib
from transformers import T5Tokenizer

import src.paper_to_equation.Generation.Equation_BaseDataset 

Confirming that the default T5 tokenizer is unsuitable for MathML: it splits every tag into many sub-word pieces

In [4]:
# Load the stock T5 tokenizer from the "t5-base" checkpoint (no custom vocabulary added).
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Sample MathML input: the equation h = h_c + h_g, written with namespaced <mml:...> tags.
mml = """
<mml:mi>h</mml:mi>
  <mml:mo>=</mml:mo>
  <mml:mrow>
    <mml:msub>
      <mml:mi>h</mml:mi>
      <mml:mi>c</mml:mi>
    </mml:msub>
    <mml:mo>+</mml:mo>
    <mml:msub>
      <mml:mi>h</mml:mi>
      <mml:mi>g</mml:mi>
    </mml:msub>
  </mml:mrow>
  """

# Python counterpart of the same equation (Symbol/Eq look like SymPy — TODO confirm).
# NOTE(review): `py` is not used in this cell; it may be consumed by a later cell.
py = """
h = Symbol('h')
h_g = Symbol('h_g')
h_c = Symbol('h_c')
e = Eq(h, h_g + h_c)"""

# Tokenize the MathML and print the pieces to see how the default vocabulary splits the tags.
tokens = tokenizer.tokenize(mml)
print(tokens)


['▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'o', '>', '=', '<', '/', 'mm', 'l', ':', 'm', 'o', '>', '▁', '<', 'mm', 'l', ':', 'm', 'row', '>', '▁', '<', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'c', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'o', '>', '+', '<', '/', 'mm', 'l', ':', 'm', 'o', '>', '▁', '<', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'h', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', 'mm', 'l', ':', 'm', 'i', '>', 'g', '<', '/', 'mm', 'l', ':', 'm', 'i', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'sub', '>', '▁', '<', '/', 'mm', 'l', ':', 'm', 'row', '>']


Dataset class

In [13]:
# Reload the module so edits to Equation_BaseDataset are picked up without restarting
# the kernel, then re-import BaseDataset so the name is bound to the reloaded class.
importlib.reload(src.paper_to_equation.Generation.Equation_BaseDataset)
from src.paper_to_equation.Generation.Equation_BaseDataset import BaseDataset

class TokenizerDataset(BaseDataset):
    """Dataset of paired MathML / Python equation strings for tokenizer experiments.

    Extends ``BaseDataset`` (defined in ``Equation_BaseDataset``, not shown here)
    with helpers to load the generated JSON file and to rewrite namespaced
    MathML tags into compact placeholder tags.
    """

    # Namespaced MathML tag -> compact upper-case placeholder.
    # Only the tags listed here are rewritten; any other tag (e.g. <mml:mn>,
    # <mml:mtext>, <mml:mfenced>) is left untouched by map_atomic_tokens.
    # NOTE(review): presumably each placeholder is meant to become a single
    # tokenizer token — confirm against the tokenizer setup.
    _TAG_MAP = {
        "<mml:mo>": "<MO>", "</mml:mo>": "</MO>",
        "<mml:mi>": "<MI>", "</mml:mi>": "</MI>",
        "<mml:msub>": "<MSUB>", "</mml:msub>": "</MSUB>",
        "<mml:msup>": "<MSUP>", "</mml:msup>": "</MSUP>",
        "<mml:mrow>": "<MROW>", "</mml:mrow>": "</MROW>",
        "<mml:mfrac>": "<MFRAC>", "</mml:mfrac>": "</MFRAC>",
    }

    def __init__(self, num, filepath):
        """Forward ``num`` and ``filepath`` unchanged to ``BaseDataset.__init__``."""
        super().__init__(num, filepath)

    def get_columns(self):
        """Return the column names of each dataset entry: ``["mathml", "python"]``."""
        return ["mathml", "python"]

    def map_atomic_tokens(self, dataset):
        """Replace known ``<mml:...>`` tags in every entry's ``"mathml"`` string.

        Args:
            dataset: iterable of dict-like entries, each with a ``"mathml"`` string.

        Returns:
            The same ``dataset`` object. Entries are modified in place, so the
            caller's original entries also carry the rewritten strings.
        """
        for entry in dataset:
            mathml = entry["mathml"]
            # The tags are delimited by <...> and do not overlap, so the
            # replacement order does not affect the result.
            for tag, token in self._TAG_MAP.items():
                mathml = mathml.replace(tag, token)
            entry["mathml"] = mathml

        return dataset

    def json_to_dataset(self):
        """Load and return the parsed JSON content of ``self.filepath``.

        The file is read as UTF-8 (the JSON interchange encoding) rather than
        the platform default, because the equations contain non-ASCII symbols
        such as Greek letters.
        """
        # self.filepath is presumably set by BaseDataset.__init__ — TODO confirm.
        with open(self.filepath, encoding="utf-8") as f:
            return json.load(f)
        


# Build a 10-sample dataset backed by Data/tokenizer_dataset.json.
td = TokenizerDataset(10, "Data/tokenizer_dataset.json")
# create() is defined on BaseDataset (not shown here); presumably it generates the
# samples and writes them to the JSON file — TODO confirm.
td.create()
# Read the JSON file back into Python objects.
data = td.json_to_dataset()
# map_atomic_tokens rewrites the entries in place and returns the same object,
# so `data` and `mapped_data` refer to the same (already mapped) entries.
mapped_data = td.map_atomic_tokens(data)
print(mapped_data)


    

Generating dataset: 100%|██████████| 10/10 [00:00<00:00, 68.01it/s]

[{'mathml': '<MI>h</MI>\n<MO>=</MO>\n<MROW>\n<MSUP>\n<mml:mtext>exp</mml:mtext>\n<MROW>\n<MROW>\n<MO>-</MO>\n<MI>Ε</MI>\n</MROW>\n<MO>+</MO>\n<MI>ν</MI>\n</MROW>\n</MSUP>\n<MO>-</MO>\n<MROW>\n<MI>sin</MI>\n<mml:mfenced>\n<MSUB>\n<MI>f</MI>\n<MROW>\n<MI>t</MI>\n<MI>S</MI>\n</MROW>\n</MSUB>\n</mml:mfenced>\n</MROW>\n<MO>+</MO>\n<MROW>\n<MI>tan</MI>\n<mml:mfenced>\n<MSUB>\n<MI>ο</MI>\n<MROW>\n<MI>T</MI>\n<MI>λ</MI>\n</MROW>\n</MSUB>\n</mml:mfenced>\n</MROW>\n</MROW>', 'python': "h = Symbol('h')\nΕ = Symbol('Ε')\nν = Symbol('ν')\nf_tS = Symbol('f_tS')\nο_Tλ = Symbol('ο_Tλ')\ne = Eq(h, exp(-Ε + ν) - sin(f_tS) + tan(ο_Tλ))"}, {'mathml': '<MI>ν</MI>\n<MO>=</MO>\n<MROW>\n<mml:munderover>\n<MO>∑</MO>\n<MROW>\n<MI>κ</MI>\n<MO>=</MO>\n<mml:mn>9</mml:mn>\n</MROW>\n<MSUB>\n<MI>y</MI>\n<MI>Δ</MI>\n</MSUB>\n</mml:munderover>\n<mml:mfenced>\n<MROW>\n<mml:msqrt>\n<MROW>\n<MI>Λ</MI>\n<MO>-</MO>\n<mml:mn>7</mml:mn>\n</MROW>\n</mml:msqrt>\n<MO>+</MO>\n<MSUP>\n<mml:mtext>exp</mml:mtext>\n<MI>k</MI>\n</MSUP


