In [3]:
from datasets import load_dataset
from src.paper_to_equation.Generation.Equation_BaseDataset import BaseDataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Tokenizer, T5ForConditionalGeneration
import torch
import evaluate
import os
import numpy as np

In [5]:
class T5Dataset(BaseDataset):
    def __init__(self, num):
        super().__init__(num)

    def get_columns(self):
        return ["MathML", "Python"]

t5_data = T5Dataset(1000)
t5_data.create("Data/t5_train_2.csv")
t5_data = T5Dataset(200)
t5_data.create("Data/t5_validation_2.csv")
t5_data = T5Dataset(200)
t5_data.create("Data/t5_test_2.csv")

Generating dataset: 0it [00:00, ?it/s]

Generating dataset: 100%|██████████| 1000/1000 [00:10<00:00, 91.22it/s]
Generating dataset: 100%|██████████| 200/200 [00:02<00:00, 97.56it/s] 
Generating dataset: 100%|██████████| 200/200 [00:02<00:00, 96.52it/s] 


In [16]:
from sympy import symbols, Eq, Add, Mul, Rational
from lxml import etree

# Recursive function to parse MathML into SymPy
def parse_mathml(mathml):
    root = etree.fromstring(mathml)  # Parse XML
    return convert_to_sympy(root)

def convert_to_sympy(element):
    tag = element.tag.split("}")[-1]  # Remove namespace (e.g., {MathML}mrow → mrow)

    if tag == "mi":  # Variable (e.g., h, g, c)
        return symbols(element.text)

    elif tag == "mo":  # Operator (+, -, *, /, =)
        return element.text.strip()

    elif tag == "msub":  # Subscripted variables (e.g., h_g, h_c)
        base, subscript = element.getchildren()
        return symbols(f"{convert_to_sympy(base)}_{convert_to_sympy(subscript)}")

    elif tag == "mrow":  # Math expressions inside <mrow>
        children = element.getchildren()
        expr = convert_to_sympy(children[0])
        for i in range(1, len(children), 2):  # Operators appear at odd indices
            op = convert_to_sympy(children[i])
            right = convert_to_sympy(children[i + 1])
            expr = {
                "=": Eq,  # ✅ Correctly handles equations
                "+": Add,
                "-": lambda a, b: a - b,
                "*": Mul,
                "/": lambda a, b: Rational(a, b)
            }.get(op, Add)(expr, right)
        return expr

    return None  # Fallback for unsupported cases

# Example MathML input
mathml_expr = '''
<mrow xmlns="http://www.w3.org/1998/Math/MathML">
    <mi>h</mi>
    <mo>=</mo>
    <msub>
        <mi>h</mi>
        <mi>g</mi>
    </msub>
    <mo>+</mo>
    <msub>
        <mi>h</m>
        <mi>c</mi>
    </msub>
</mrow>
'''

# Convert to SymPy expression
sympy_expr = parse_mathml(mathml_expr)
print(sympy_expr)


XMLSyntaxError: Opening and ending tag mismatch: mi line 11 and m, line 11, column 18 (<string>, line 11)

In [None]:
from sympy import symbols, Rational, Pow, Mul, Add
from lxml import etree

# Define a recursive function to parse MathML into SymPy
def parse_mathml(mathml):
    root = etree.fromstring(mathml)
    return convert_to_sympy(root)

def convert_to_sympy(element):
    tag = element.tag.split("}")[-1]  # Remove namespace (e.g., {http://www.w3.org/1998/Math/MathML}mrow → mrow)

    if tag == "mi":  # Variable (x, y, h, etc.)
        return symbols(element.text)

    elif tag == "mn":  # Number (1, 2, etc.)
        return Rational(element.text)

    elif tag == "mo":  # Operator (+, -, *)
        print(element.text.strip())
        return element.text.strip()

    elif tag == "mfrac":  # Fraction (a/b)
        num, den = element.getchildren()
        return Rational(convert_to_sympy(num), convert_to_sympy(den))

    elif tag == "msup":  # Exponentiation (x^2)
        base, exponent = element.getchildren()
        return Pow(convert_to_sympy(base), convert_to_sympy(exponent))

    elif tag == "msub":  # Subscripted variable (k_f, h_t, etc.)
        base, subscript = element.getchildren()
        return symbols(f"{convert_to_sympy(base)}_{convert_to_sympy(subscript)}")

    elif tag == "mrow":  # Grouping of expressions (a + b)
        children = element.getchildren()
        expr = convert_to_sympy(children[0])
        for i in range(1, len(children), 2):  # Every other element is an operator
            op = convert_to_sympy(children[i])
            right = convert_to_sympy(children[i + 1])
            expr = {"+" : Add, "-" : Sub, "*" : Mul, "/" : Rational}.get(op, Add)(expr, right)
        return expr

    return None  # Fallback for unsupported cases

# Example MathML Input (Super Complex Expression)
mathml_expr = '''
<mml:mrow xmlns:mml="http://www.w3.org/1998/Math/MathML">
    <mml:mi>h</mml:mi>
    <mml:mo>=</mml:mo>
    <mml:mfrac>
        <mml:mrow>
            <mml:mn>1</mml:mn>
            <mml:mo>-</mml:mo>
            <mml:mi>A</mml:mi>
        </mml:mrow>
        <mml:mrow>
            <mml:msub>
                <mml:mi>h</mml:mi>
                <mml:mi>f</mml:mi>
            </mml:msub>
        </mml:mrow>
    </mml:mfrac>
    <mml:mfrac>
        <mml:mrow>
            <mml:mn>2</mml:mn>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>f</mml:mi>
            </mml:msub>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>t</mml:mi>
            </mml:msub>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>w</mml:mi>
            </mml:msub>
        </mml:mrow>
        <mml:mrow>
            <mml:mn>2</mml:mn>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>t</mml:mi>
            </mml:msub>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>w</mml:mi>
            </mml:msub>
            <mml:mo>-</mml:mo>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>w</mml:mi>
            </mml:msub>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>f</mml:mi>
            </mml:msub>
            <mml:mo>-</mml:mo>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>f</mml:mi>
            </mml:msub>
            <mml:msub>
                <mml:mi>k</mml:mi>
                <mml:mi>t</mml:mi>
            </mml:msub>
        </mml:mrow>
    </mml:mfrac>
</mml:mrow>
'''
mathml_expr = """<mrow xmlns="http://www.w3.org/1998/Math/MathML">
    <mi>h</mi>
    <mo>=</mo>
    <msub>
        <mi>h</mi>
        <mi>g</mi>
    </msub>
    <mo>+</mo>
    <msub>
        <mi>h</mi>
        <mi>c</mi>
    </msub>
</mrow>"""

# Convert MathML to SymPy
sympy_expr = parse_mathml(mathml_expr)
print(sympy_expr)


=
+
h + h_c + h_g


In [6]:
data_files = {"train": "Data/t5_train_2.csv", "validation": "Data/t5_validation_2.csv", "test": "Data/t5_test_2.csv"}
mml_py_dataset = load_dataset("csv", data_files=data_files)

print(mml_py_dataset)
display(mml_py_dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 200
    })
    test: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 200
    })
})


{'MathML': '<mml:msub>\n<mml:mi>N</mml:mi>\n<mml:mi>P</mml:mi>\n</mml:msub>\n<mml:mo>=</mml:mo>\n<mml:mrow>\n<mml:munderover>\n<mml:mo>∑</mml:mo>\n<mml:mrow>\n<mml:mi>t</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mn>7</mml:mn>\n</mml:mrow>\n<mml:msub>\n<mml:mi>Ο</mml:mi>\n<mml:mrow>\n<mml:mi>O</mml:mi>\n<mml:mi>ι</mml:mi>\n<mml:mi>χ</mml:mi>\n</mml:mrow>\n</mml:msub>\n</mml:munderover>\n<mml:mfenced>\n<mml:mrow>\n<mml:msup>\n<mml:msup>\n<mml:mtext>exp</mml:mtext>\n<mml:mi>φ</mml:mi>\n</mml:msup>\n<mml:msub>\n<mml:mi>Μ</mml:mi>\n<mml:mrow>\n<mml:mi>ω</mml:mi>\n<mml:mi>L</mml:mi>\n</mml:mrow>\n</mml:msub>\n</mml:msup>\n<mml:mo>+</mml:mo>\n<mml:mrow>\n<mml:mi>tan</mml:mi>\n<mml:mfenced>\n<mml:msub>\n<mml:mi>η</mml:mi>\n<mml:mrow>\n<mml:mi>y</mml:mi>\n<mml:mi>λ</mml:mi>\n</mml:mrow>\n</mml:msub>\n</mml:mfenced>\n</mml:mrow>\n</mml:mrow>\n</mml:mfenced>\n</mml:mrow>',
 'Python': "N_P = Symbol('N_P')\nt = Symbol('t')\nΟ_Oιχ = Symbol('Ο_Oιχ')\nΜ_ωL = Symbol('Μ_ωL')\nφ = Symbol('φ')\nη_yλ = Symbol('η_y

In [7]:
model_checkpoint = "t5-small"

tokenizer = AutoTokenizer.from_pretrained("Tokenizer_Files/mathml-py-tokenizer-unigram-T5wrapped", return_tensors="pt")

max_length = 1024
def preprocess_function(examples):
    prefix = "translate MathML to Python: "
    inputs = [prefix + mml for mml in examples["MathML"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["Python"], max_length=max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = mml_py_dataset.map(preprocess_function, batched=True, remove_columns=["MathML", "Python"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [24]:
check = preprocess_function(mml_py_dataset["train"][0])
# print(check["labels"])
ids = check["input_ids"][4]
print(tokenizer.convert_ids_to_tokens(ids))

['t', 'ra', 'n', 's', 'l', 'a', 'te', ' ', 'M', 'a', 't', 'h', 'M', 'L', ' ', 't', 'o', ' ', 'P', 'y', 't', 'h', 'o', 'n', ':', ' ', ':']


In [25]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

batch = data_collator([tokenized_dataset["train"][i] for i in range(1,4)])
print(batch["labels"])

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tensor([[ 159,   21,   29,   21,   35,   36,   25,   34,   31,  253,   30,   32,
           19,  162,   21,   29,   21,   35,   36,   25,   34,   31,  236,   30,
           32,   19, 1513,  142,  125,   21,   29,   21,   35,   36,   25,   34,
           31,   30, 1513,  142,  125,   30,   32,   19,   27,   21,   29,   21,
           48,   47,  339,   41,   21,  214, 1379,   21,   49,   21,   57,   37,
           31, 1513,  142,  125,   32,   41,   21, 1513,  142,  454, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100],
        [  26,   21,   29,   21,   35,   36,   25,   34,   31,   30,   26,   30,
           32,   19,   22,   21,   29,   21,   35,   36,   25,   34,   31,   30,
           22,   30,   32,   19, 1378,   33,   94,   21,   29,   21,   35,   36,
           25,   34,   31,   30, 1378,   33,   94,   30,   32,   19,   27,   21,
           29,   21,   48,   47,   31,   26,   41,   21,  260,   22,  173, 1378,
           33,   94,   41,   21, 

In [26]:
metric = evaluate.load("sacrebleu")

predictions = ["η = Symbol('η')\nη_0 = Symbol('η_0')\nQ_η = Symbol('Q_η')\nR = Symbol('R')\nT = Symbol('T')\ne = Eq(η, η_0*exp(((Q_η*T)/(R*T)))"]
references = [["η = Symbol('η')\nη_0 = Symbol('η_0')\nQ_η = Symbol('Q_η')\nR = Symbol('R')\nT = Symbol('T')\ne = Eq(η, η_0*exp(Q_η/(R*T)))"]]

metric.compute(predictions=predictions, references=references)

{'score': 88.64759993490114,
 'counts': [61, 59, 56, 53],
 'totals': [66, 65, 64, 63],
 'precisions': [92.42424242424242, 90.76923076923077, 87.5, 84.12698412698413],
 'bp': 1.0,
 'sys_len': 66,
 'ref_len': 61}

In [27]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # SacreBLEU
    BLEUresult = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Equation evaluation
    

    return {"bleu": BLEUresult["score"]}   

In [28]:
from huggingface_hub import login

hf_login_key = os.environ.get("HF_LOGIN_KEY")
login(token=hf_login_key)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\kyanj\.cache\huggingface\token
Login successful


In [None]:
args = Seq2SeqTrainingArguments(
    f"t5-small-mathml-to-python",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model, 
    args, 
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# trainer.evaluate(max_length=max_length)



In [10]:
trainer.train()

  0%|          | 0/48 [00:00<?, ?it/s]

{'train_runtime': 17669.976, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.003, 'train_loss': 2.329975128173828, 'epoch': 3.0}


TrainOutput(global_step=48, training_loss=2.329975128173828, metrics={'train_runtime': 17669.976, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.003, 'total_flos': 1311167215595520.0, 'train_loss': 2.329975128173828, 'epoch': 3.0})

In [8]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Load trained model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base-mathml-to-python")

trainer = Seq2SeqTrainer(
    model, 
    args, 
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.evaluate(max_length=max_length)

  0%|          | 0/2 [00:00<?, ?it/s]

OverflowError: can't convert negative int to unsigned

In [19]:
from transformers import pipeline
model_checkpoint = "t5-base-mathml-to-python"
translator = pipeline("text2text-generation", model=model_checkpoint)
result = translator("translate: MathML to Python: \n<mml:mi>x</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mn>5</mml:mn>")
print(result)
                    


[{'generated_text': '         '}]


In [10]:
string = """<mml:mi>η</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:msub>
    <mml:mi>η</mml:mi>
    <mml:mi>0</mml:mi>
</mml:msub>
<mml:msup>
    <mml:mtext>exp</mml:mtext>
    <mml:mrow>
    <mml:mfrac>
        <mml:msub>
        <mml:mi>Q</mml:mi>
        <mml:mi>η</mml:mi>
        </mml:msub>
        <mml:mrow>
        <mml:mi>R</mml:mi>
        <mml:mi>T</mml:mi>
        </mml:mrow>
    </mml:mfrac>
    </mml:mrow>
</mml:msup>
</mml:mrow>"""

print(repr(string))

'<mml:mi>η</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mrow>\n<mml:msub>\n    <mml:mi>η</mml:mi>\n    <mml:mi>0</mml:mi>\n</mml:msub>\n<mml:msup>\n    <mml:mtext>exp</mml:mtext>\n    <mml:mrow>\n    <mml:mfrac>\n        <mml:msub>\n        <mml:mi>Q</mml:mi>\n        <mml:mi>η</mml:mi>\n        </mml:msub>\n        <mml:mrow>\n        <mml:mi>R</mml:mi>\n        <mml:mi>T</mml:mi>\n        </mml:mrow>\n    </mml:mfrac>\n    </mml:mrow>\n</mml:msup>\n</mml:mrow>'


In [23]:
model_name = "t5-small"
# tokenizer = T5Tokenizer.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained("kj821/mathml-py-tokenizer-unigram-T5wrapped")

# model = T5ForConditionalGeneration.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained("kj821/t5-base-mathml-to-python")

text = "\n<mml:mi>h</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mrow>\n<mml:msub>\n<mml:mi>h</mml:mi>\n<mml:mi>c</mml:mi>\n</mml:msub>\n<mml:mo>+</mml:mo>\n<mml:msub>\n<mml:mi>h</mml:mi>\n<mml:mi>g</mml:mi>\n</mml:msub>\n</mml:mrow>\n"
# text = "I love going to the park on the weekend"
prefix = "translate: MathML to Python: "
input_ids = tokenizer.encode(prefix + text, return_tensors="pt")
print(tokenizer.tokenize(prefix + text))
check = tokenizer.decode(input_ids[0], skip_special_tokens=False)

output_ids = model.generate(input_ids)
output = tokenizer.decode(output_ids[0], skip_special_tokens=False, max_new_tokens=100)
print(output)

['t', 'ra', 'n', 's', 'l', 'a', 'te', ':', ' ', 'M', 'a', 't', 'h', 'M', 'L', ' ', 't', 'o', ' ', 'P', 'y', 't', 'h', 'o', 'n', ':', ' ', '\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mo>', '=', '</mml:mo>', '\n', '<mml:mrow>', '\n', '<mml:msub>', '\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mi>', 'c', '</mml:mi>', '\n', '</mml:msub>', '\n', '<mml:mo>', '+', '</mml:mo>', '\n', '<mml:msub>', '\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mi>', 'g', '</mml:mi>', '\n', '</mml:msub>', '\n', '</mml:mrow>', '\n']




<pad>: : : 
<mml:mi>: 
<mml:mi>: 
<mml:mi>: 

