In [None]:
!pip install -U datasets fsspec aiofiles huggingface_hub evaluate sacrebleu wandb sympy

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration, EarlyStoppingCallback
import torch
import evaluate
import os
import numpy as np
from sympy import *
import csv
import wandb
from peft import get_peft_model, LoraConfig, TaskType, PeftModel

In [None]:
from google.colab import userdata
from huggingface_hub import login
hf_login_key = userdata.get('HF_TOKEN')
login(token=hf_login_key)

In [None]:
mml_py_dataset = load_dataset("kj821/MathML-Python-Translation-Large")

In [None]:
print(mml_py_dataset)

DatasetDict({
    train: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 1000
    })
})


In [None]:
model_checkpoint = "t5-large"
tokenizer = T5Tokenizer.from_pretrained("kj821/mathml-py-tokenizer-sentencepiece-v1")

In [None]:
mml_sentence = mml_py_dataset["train"]["MathML"][0]
py_sentence = mml_py_dataset["train"]["Python"][0]

inputs = tokenizer(mml_sentence, text_target=py_sentence)
print(inputs)
print(tokenizer.decode(inputs["input_ids"], skip_special_tokens=False))
# print(tokenizer.encode(mml_sentence))
# print(tokenizer.decode(inputs["labels"], skip_special_tokens=True))

{'input_ids': [46, 24, 140, 27, 39, 16, 47, 11, 39, 25, 39, 19, 39, 16, 161, 11, 39, 25, 39, 24, 128, 27, 39, 16, 47, 11, 39, 6, 168, 14, 39, 18, 39, 24, 111, 27, 39, 21, 39, 17, 39, 25, 39, 24, 90, 27, 39, 7, 39, 25, 39, 24, 31, 27, 39, 17, 39, 5, 39, 24, 95, 27, 39, 25, 39, 24, 63, 27, 39, 24, 81, 27, 39, 24, 110, 27, 39, 18, 39, 9, 39, 4, 39, 18, 39, 6, 69, 14, 39, 12, 39, 18, 39, 4, 39, 18, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [46, 140, 46, 47, 46, 28, 52, 140, 50, 48, 39, 128, 46, 47, 46, 28, 52, 128, 50, 48, 39, 111, 46, 47, 46, 28, 52, 111, 50, 48, 39, 90, 46, 47, 46, 28, 52, 90, 50, 48, 39, 95, 49, 63, 81, 110, 46, 47, 46, 28, 52, 95, 49, 63, 81, 110, 50, 48, 39, 55, 46, 47, 46, 29, 51, 140, 53,

In [None]:
max_length = 512
def preprocess_function(examples):
    prefix = "translate MathML to Python: "
    inputs = [prefix + mml for mml in examples["MathML"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["Python"], max_length=max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = mml_py_dataset.map(preprocess_function, batched=True, remove_columns=["MathML", "Python"])

In [None]:
ids = tokenized_dataset["train"][0]["labels"]
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=False))

[46, 140, 46, 47, 46, 28, 52, 140, 50, 48, 39, 128, 46, 47, 46, 28, 52, 128, 50, 48, 39, 111, 46, 47, 46, 28, 52, 111, 50, 48, 39, 90, 46, 47, 46, 28, 52, 90, 50, 48, 39, 95, 49, 63, 81, 110, 46, 47, 46, 28, 52, 95, 49, 63, 81, 110, 50, 48, 39, 55, 46, 47, 46, 29, 51, 140, 53, 46, 36, 51, 90, 176, 31, 51, 95, 49, 63, 81, 110, 384, 53, 46, 51, 128, 53, 46, 168, 53, 46, 111, 165, 2]
σ = Symbol('σ')
z = Symbol('z')
F = Symbol('F')
Λ = Symbol('Λ')
M_ψΞa = Symbol('M_ψΞa')
e = Eq(σ, Sum(Λ*sin(M_ψΞa)**2, (z, 1, F)))</s>


In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
model.resize_token_embeddings(len(tokenizer))

Embedding(3900, 1024)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

batch = data_collator([tokenized_dataset["train"][i] for i in range(1,4)])
print(batch["labels"])

tensor([[  46,   77,   49,   84,   99,   46,   47,   46,   28,   52,   77,   49,
           84,   99,   50,   48,   39,   79,   46,   47,   46,   28,   52,   79,
           50,   48,   39,  132,   49,  574,  115,   46,   47,   46,   28,   52,
          132,   49,  574,  115,   50,   48,   39,   55,   46,   47,   46,   29,
           51,   77,   49,   84,   99,   53,   46,   30,   51,  132,   49,  574,
          115,  173,   35,  200,   79,   58,    2, -100, -100],
        [  46,   68,   46,   47,   46,   28,   52,   68,   50,   48,   39,  151,
           46,   47,   46,   28,   52,  151,   50,   48,   39,  144,   49,  103,
          901,   46,   47,   46,   28,   52,  144,   49,  103,  901,   50,   48,
           39,   55,   46,   47,   46,   29,   51,   68,   53,   46,   56,   61,
           60,   51,  151,  334,   46,   54,   46,   30,   51,  144,   49,  103,
          901,   58,    2, -100, -100, -100, -100, -100, -100],
        [  46,  147,   46,   47,   46,   28,   52,  147,   50,

In [None]:
import evaluate
metric = evaluate.load("sacrebleu")

predictions = ["η = Symbol('η')\nη_0 = Symbol('η_0')\nQ_η = Symbol('Q_η')\nR = Symbol('R')\nT = Symbol('T')\ne = Eq(η, η_0*exp(((Q_η*T)/(R*T)))"]
references = [["η = Symbol('η')\nη_0 = Symbol('η_0')\nQ_η = Symbol('Q_η')\nR = Symbol('R')\nT = Symbol('T')\ne = Eq(η, η_0*exp(Q_η/(R*T)))"]]

metric.compute(predictions=predictions, references=references)

{'score': 88.64759993490114,
 'counts': [61, 59, 56, 53],
 'totals': [66, 65, 64, 63],
 'precisions': [92.42424242424242, 90.76923076923077, 87.5, 84.12698412698413],
 'bp': 1.0,
 'sys_len': 66,
 'ref_len': 61}

In [None]:
# def safe_simplify(a, b):
#     try:
#         return simplify(sympify(a) - sympify(b)) == 0
#     except Exception:
#         return False

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # SacreBLEU
    BLEUresult = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Equation evaluation
    # symbolic_acc = np.mean([safe_simplify(p, l) for p, l in zip(decoded_preds, decoded_labels)])


    return {"bleu": BLEUresult["score"]}

In [None]:
sweep_config = {
    "method": "bayes",
    "name": f"{model_checkpoint}-sweep-bayes",
    "metric": {
        "goal": "minimize",
        "name": "eval/loss"
    },
    "parameters": {
        "epochs": {
            "values": [2, 4]
        },
        "learning_rate": {
            "values": [1e-4, 3e-4, 6e-4]
        },
        "weight_decay": {
            "values": [0.0001, 0.1]
        },
        "r": {
            "values": [2, 4, 8]
        }
    }
}

In [None]:
wandb_api_key = userdata.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

os.environ["WANDB_WATCH"] = "false"    # no heavy layer logging
os.environ["WANDB_LOG_MODEL"] = "false"# skip model artifacts per run

def sweep_train(config=None):
   with wandb.init(config=config):
    # set sweep configuration
    config = wandb.config

    tokenizer = T5Tokenizer.from_pretrained("kj821/mathml-py-tokenizer-sentencepiece-v1")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    model.resize_token_embeddings(len(tokenizer))


    lora_config = LoraConfig(
    r=config.r, # Sweep
    lora_alpha=4 * config.r,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
    )

    model = get_peft_model(model, lora_config)
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Trainable params: {trainable:,}")

    for p in model.base_model.get_input_embeddings().parameters():
        p.requires_grad = True
    for p in model.base_model.get_output_embeddings().parameters():
        p.requires_grad = True

    args = Seq2SeqTrainingArguments(
    output_dir = f"Model_Files/{model_checkpoint}-mathml-python",
    eval_strategy="steps",
    eval_steps=250,
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True,
    learning_rate=config.learning_rate, # Sweep
    label_smoothing_factor=0.1,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=8, # Sweep
    per_device_eval_batch_size=16,
    weight_decay=config.weight_decay, # Sweep
    save_total_limit=3,
    num_train_epochs=config.epochs, # Sweep
    predict_with_generate=True,
    fp16=True,
    # push_to_hub=True,
    metric_for_best_model="eval_loss",
    # load_best_model_at_end=True,
    report_to=["wandb"],
    run_name=f"{model_checkpoint}-mathml-python",
    )

    trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

    trainer.train()
    wandb.finish()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkj821[0m ([33mkj821-imperial-college-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
sweep_id = wandb.sweep(sweep_config, project='mathml-python-colab')
wandb.agent(sweep_id, sweep_train, count=20)

Create sweep with ID: voe3oj40
Sweep URL: https://wandb.ai/kj821-imperial-college-london/mathml-python-colab/sweeps/voe3oj40


[34m[1mwandb[0m: Agent Starting Run: tqlc2msd with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	r: 4
[34m[1mwandb[0m: 	weight_decay: 0.1


Trainable params: 1,179,648


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Bleu
250,0.0,,0.001421
500,0.0,,0.001421
750,0.0,,0.001421


0,1
eval/bleu,▁▁▁
eval/runtime,█▁█
eval/samples_per_second,▁█▁
eval/steps_per_second,▁█▁
train/epoch,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/bleu,0.00142
eval/loss,
eval/runtime,84.8918
eval/samples_per_second,11.78
eval/steps_per_second,0.742
total_flos,3842241497235456.0
train/epoch,0.6
train/global_step,750.0
train/grad_norm,
train/learning_rate,0.0003


[34m[1mwandb[0m: Agent Starting Run: kbzmitzp with config:
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 0.0006
[34m[1mwandb[0m: 	r: 2
[34m[1mwandb[0m: 	weight_decay: 0.1


Trainable params: 589,824


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Bleu
250,0.0,,0.001421
500,0.0,,0.001421
750,0.0,,0.001421


0,1
eval/bleu,▁▁▁
eval/runtime,▁▄█
eval/samples_per_second,█▅▁
eval/steps_per_second,█▅▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/bleu,0.00142
eval/loss,
eval/runtime,85.1035
eval/samples_per_second,11.75
eval/steps_per_second,0.74
total_flos,3839031278665728.0
train/epoch,0.6
train/global_step,750.0
train/grad_norm,
train/learning_rate,0.0006


[34m[1mwandb[0m: Agent Starting Run: 33k5l1gf with config:
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 0.0006
[34m[1mwandb[0m: 	r: 2
[34m[1mwandb[0m: 	weight_decay: 0.1


Trainable params: 589,824


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Bleu
250,0.0,,0.001421
500,0.0,,0.001421
750,0.0,,0.001421


0,1
eval/bleu,▁▁▁
eval/runtime,▁█▅
eval/samples_per_second,█▁▄
eval/steps_per_second,█▁▃
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/bleu,0.00142
eval/loss,
eval/runtime,85.3652
eval/samples_per_second,11.714
eval/steps_per_second,0.738
total_flos,3839031278665728.0
train/epoch,0.6
train/global_step,750.0
train/grad_norm,
train/learning_rate,0.0006


[34m[1mwandb[0m: Agent Starting Run: ut8y0wtn with config:
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	r: 4
[34m[1mwandb[0m: 	weight_decay: 0.0001


Trainable params: 1,179,648


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Bleu
250,0.0,,0.001421


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)

for p in model.base_model.get_input_embeddings().parameters():
    p.requires_grad = True
for p in model.base_model.get_output_embeddings().parameters():
    p.requires_grad = True

In [None]:
print(torch.cuda.is_available())

True


In [None]:
wandb_api_key = userdata.get('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

wandb.init(
    project="mathml-python-colab",
    name=f"{model_checkpoint}-VRAM-check",
    tags=[f"{model_checkpoint}", "mathml→python"],
    notes="sizing",
)

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="checkpoint"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"
wandb.watch(model, log="gradients", log_freq=0)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkj821[0m ([33mkj821-imperial-college-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
args = Seq2SeqTrainingArguments(
    output_dir = f"Model_Files/{model_checkpoint}-mathml-python",
    eval_strategy="epoch",
    # eval_steps=20,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True,
    learning_rate=6e-4,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    report_to=["wandb"],
    run_name=f"{model_checkpoint}-mathml-python",
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# trainer.evaluate(max_length=max_length)

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Bleu
1,0.0,,0.0


[34m[1mwandb[0m: Adding directory to artifact (./Model_Files/t5-large-mathml-python/checkpoint-1250)... Done. 0.2s


KeyboardInterrupt: 

In [None]:
tokenizer = T5Tokenizer.from_pretrained("kj821/mathml-py-tokenizer-sentencepiece-v1")
base_model = T5ForConditionalGeneration.from_pretrained("t5-base")
base_model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_model, "kj821/t5-small-mathml-python-v1")


In [None]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Load trained model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base-mathml-to-python")

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.evaluate(max_length=max_length)

In [None]:
from transformers import pipeline
tokenizer = T5Tokenizer.from_pretrained("kj821/mathml-py-tokenizer-sentencepiece-v1")
base_model = T5ForConditionalGeneration.from_pretrained("t5-base")
base_model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_model, "kj821/t5-base-mathml-python")

mml = mml_py_dataset["test"]["MathML"][0]
translator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
result = translator(f"translate MathML to Python: {mml}")
print(result)



adapter_config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.5M [00:00<?, ?B/s]

Device set to use cuda:0
The model 'PeftModelForSeq2SeqLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGen

[{'generated_text': "ω_= Symbol('ΖΚΦ')\nθθθ(θθ(θ'"}]


In [None]:
def translate_mathml_to_sympy(text):
    # Tokenise input string
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=4,                   # optional for better outputs
            early_stopping=True
        )

    # Decode generated token IDs to string
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

mml = mml_py_dataset["test"]["MathML"][0]
print(translate_mathml_to_sympy(f"translate MathML to Python: {mml}"))



<mml:mn>3</mml:mn></mml:mn>
</mml:mrow></mml:mrow></mml:mrow>
<mml:mo>-</mml:mo>
<mml:mn><mml:mn>3</mml:mn></mml:mn>
</mml:mrow></mml:mrow></mml:mrow></mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mn><mml:mn>3</mml:mn></mml:mn>
</mml:mrow></mml:mrow></mml:mrow></mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mn><mml:mn>3</mml:mn></mml:mn></mml:mn>
</mml:mrow></mml:mrow></mml:mrow></mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mn><mml:mn>3</mml:mn></mml:mn></mml:mn>
</mml:mrow></mml:mrow></mml:mrow></mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mn><mml:mn>3</mml:mn></mml:mn></mml:mn>
</mml:mrow>


In [None]:
print(mml)

<mml:mi>η</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>sin</mml:mi>
<mml:mfenced>
<mml:mi>Κ</mml:mi>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>cos</mml:mi>
<mml:mfenced>
<mml:mrow>
<mml:mi>Χ</mml:mi>
<mml:mo>+</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
