In [15]:
from datasets import load_dataset
from src.paper_to_equation.Generation.Equation_BaseDataset import BaseDataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Tokenizer, T5TokenizerFast, T5ForConditionalGeneration
import torch
import evaluate
import os
import numpy as np
import csv
import wandb
from peft import get_peft_model, LoraConfig, TaskType

In [5]:
class T5Dataset(BaseDataset):
    def __init__(self, num):
        super().__init__(num)

    def get_columns(self):
        return ["MathML", "Python"]

t5_data = T5Dataset(1000)
t5_data.create("Data/t5_train_2.csv")
t5_data = T5Dataset(200)
t5_data.create("Data/t5_validation_2.csv")
t5_data = T5Dataset(200)
t5_data.create("Data/t5_test_2.csv")

Generating dataset: 0it [00:00, ?it/s]

Generating dataset: 100%|██████████| 1000/1000 [00:10<00:00, 91.22it/s]
Generating dataset: 100%|██████████| 200/200 [00:02<00:00, 97.56it/s] 
Generating dataset: 100%|██████████| 200/200 [00:02<00:00, 96.52it/s] 


In [3]:
data_files = {
    "train": "Data/t5_train_2.csv",
    "validation": "Data/t5_validation_2.csv",
    "test": "Data/t5_test_2.csv"
    }
mml_py_dataset = load_dataset("csv", data_files=data_files)

def remove_carriage_return(examples):
    return {
        "MathML": examples["MathML"].replace("\r\n", "\n").replace("\r", ""),
        "Python": examples["Python"].replace("\r\n", "\n").replace("\r", "")
    }

mml_py_dataset.map(remove_carriage_return)

DatasetDict({
    train: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 200
    })
    test: Dataset({
        features: ['MathML', 'Python'],
        num_rows: 200
    })
})

In [4]:
model_checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained("kj821/mathml-py-tokenizer-sentencepiece-v1")

In [5]:
mml_sentence = mml_py_dataset["train"]["MathML"][0]
py_sentence = mml_py_dataset["train"]["Python"][0]

inputs = tokenizer(mml_sentence, text_target=py_sentence)
print(inputs)
print(tokenizer.decode(inputs["input_ids"], skip_special_tokens=True))
# print(tokenizer.encode(mml_sentence))
# print(tokenizer.decode(inputs["labels"], skip_special_tokens=True))

{'input_ids': [46, 5, 40, 39, 24, 167, 27, 40, 39, 24, 130, 27, 40, 39, 9, 40, 39, 16, 47, 11, 40, 39, 25, 40, 39, 19, 40, 39, 16, 161, 11, 40, 39, 25, 40, 39, 24, 62, 27, 40, 39, 16, 47, 11, 40, 39, 6, 163, 14, 40, 39, 18, 40, 39, 5, 40, 39, 24, 134, 27, 40, 39, 25, 40, 39, 24, 170, 27, 40, 39, 24, 147, 27, 40, 39, 24, 141, 27, 40, 39, 18, 40, 39, 9, 40, 39, 21, 40, 39, 17, 40, 39, 25, 40, 39, 7, 40, 39, 7, 40, 39, 26, 30, 10, 40, 39, 24, 137, 27, 40, 39, 12, 40, 39, 5, 40, 39, 24, 78, 27, 40, 39, 25, 40, 39, 24, 64, 27, 40, 39, 24, 100, 27, 40, 39, 18, 40, 39, 9, 40, 39, 12, 40, 39, 16, 54, 11, 40, 39, 25, 40, 39, 24, 33, 27, 40, 39, 17, 40, 39, 5, 40, 39, 24, 126, 27, 40, 39, 25, 40, 39, 24, 129, 27, 40, 39, 24, 82, 27, 40, 39, 18, 40, 39, 9, 40, 39, 4, 40, 39, 18, 40, 39, 18, 40, 39, 4, 40, 39, 18, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [6]:

max_length = 512
def preprocess_function(examples):
    prefix = "translate MathML to Python: "
    inputs = [prefix + mml for mml in examples["MathML"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["Python"], max_length=max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = mml_py_dataset.map(preprocess_function, batched=True, remove_columns=["MathML", "Python"])

In [7]:
ids = tokenized_dataset["train"][0]["labels"]
print(ids)
print(tokenizer.decode(ids, skip_special_tokens=True))

[46, 167, 49, 130, 46, 47, 46, 28, 52, 167, 49, 130, 50, 48, 40, 39, 62, 46, 47, 46, 28, 52, 62, 50, 48, 40, 39, 134, 49, 170, 147, 141, 46, 47, 46, 28, 52, 134, 49, 170, 147, 141, 50, 48, 40, 39, 78, 49, 64, 100, 46, 47, 46, 28, 52, 78, 49, 64, 100, 50, 48, 40, 39, 137, 46, 47, 46, 28, 52, 137, 50, 48, 40, 39, 126, 49, 129, 82, 46, 47, 46, 28, 52, 126, 49, 129, 82, 50, 48, 40, 39, 55, 46, 47, 46, 29, 51, 167, 49, 130, 53, 46, 36, 51, 30, 51, 137, 173, 78, 49, 64, 100, 46, 54, 46, 33, 51, 126, 49, 129, 82, 48, 53, 46, 51, 62, 53, 46, 163, 53, 46, 134, 49, 170, 147, 141, 165, 2]
N_P = Symbol('N_P')
t = Symbol('t')
Ο_Oιχ = Symbol('Ο_Oιχ')
Μ_ωL = Symbol('Μ_ωL')
φ = Symbol('φ')
η_yλ = Symbol('η_yλ')
e = Eq(N_P, Sum(exp(φ)**Μ_ωL + tan(η_yλ), (t, 7, Ο_Oιχ)))


In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

batch = data_collator([tokenized_dataset["train"][i] for i in range(1,4)])
print(batch["labels"])

tensor([[  46,   67,   46,   47,   46,   28,   52,   67,   50,   48,   40,   39,
           92,   46,   47,   46,   28,   52,   92,   50,   48,   40,   39,  130,
           49,  148,  139,  102,   46,   47,   46,   28,   52,  130,   49,  148,
          139,  102,   50,   48,   40,   39,   55,   46,   47,   46,   29,   51,
           67,   53,   46,   37,  175,   35,   51,   92,   48,   46,   54,   46,
           31,   51,  130,   49,  148,  139,  102,   48,   53,   46,  130,   49,
          148,  139,  102,   58,    2, -100, -100, -100, -100],
        [  46,   94,   46,   47,   46,   28,   52,   94,   50,   48,   40,   39,
           75,   46,   47,   46,   28,   52,   75,   50,   48,   40,   39,  153,
          186,   57,   66,   46,   47,   46,   28,   52,  153,  186,   57,   66,
           50,   48,   40,   39,   55,   46,   47,   46,   29,   51,   94,   53,
           46,   37,   51,   35,   51,   75,  172,  153,  186,   57,   66,   53,
           46,  153,  186,   57,   66,   58, 

In [10]:
metric = evaluate.load("sacrebleu")

predictions = ["η = Symbol('η')\nη_0 = Symbol('η_0')\nQ_η = Symbol('Q_η')\nR = Symbol('R')\nT = Symbol('T')\ne = Eq(η, η_0*exp(((Q_η*T)/(R*T)))"]
references = [["η = Symbol('η')\nη_0 = Symbol('η_0')\nQ_η = Symbol('Q_η')\nR = Symbol('R')\nT = Symbol('T')\ne = Eq(η, η_0*exp(Q_η/(R*T)))"]]

metric.compute(predictions=predictions, references=references)

{'score': 88.64759993490114,
 'counts': [61, 59, 56, 53],
 'totals': [66, 65, 64, 63],
 'precisions': [92.42424242424242, 90.76923076923077, 87.5, 84.12698412698413],
 'bp': 1.0,
 'sys_len': 66,
 'ref_len': 61}

In [11]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # SacreBLEU
    BLEUresult = metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Equation evaluation
    

    return {"bleu": BLEUresult["score"]}   

In [12]:
from huggingface_hub import login

hf_login_key = os.environ.get("HF_LOGIN_KEY")
login(token=hf_login_key)

In [13]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)

In [None]:
print(torch.cuda.is_available())

In [16]:
wandb_api_key = os.environ.get("WANDB_API_KEY")
wandb.login(key=wandb_api_key)

# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="mathml-python"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="checkpoint"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="true"

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\kyanj\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mkj821[0m ([33mkj821-imperial-college-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
args = Seq2SeqTrainingArguments(
    output_dir = f"Model_Files/t5-small-mathml-python-v1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
    report_to=["wandb"],
    run_name="t5-small-mathml-python-v1",
)

trainer = Seq2SeqTrainer(
    model, 
    args, 
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# trainer.evaluate(max_length=max_length)



In [10]:
trainer.train()

  0%|          | 0/48 [00:00<?, ?it/s]

{'train_runtime': 17669.976, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.003, 'train_loss': 2.329975128173828, 'epoch': 3.0}


TrainOutput(global_step=48, training_loss=2.329975128173828, metrics={'train_runtime': 17669.976, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.003, 'total_flos': 1311167215595520.0, 'train_loss': 2.329975128173828, 'epoch': 3.0})

In [8]:

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Load trained model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base-mathml-to-python")

trainer = Seq2SeqTrainer(
    model, 
    args, 
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.evaluate(max_length=max_length)

  0%|          | 0/2 [00:00<?, ?it/s]

OverflowError: can't convert negative int to unsigned

In [19]:
from transformers import pipeline
model_checkpoint = "t5-base-mathml-to-python"
translator = pipeline("text2text-generation", model=model_checkpoint)
result = translator("translate: MathML to Python: \n<mml:mi>x</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mn>5</mml:mn>")
print(result)
                    


[{'generated_text': '         '}]


In [10]:
string = """<mml:mi>η</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:msub>
    <mml:mi>η</mml:mi>
    <mml:mi>0</mml:mi>
</mml:msub>
<mml:msup>
    <mml:mtext>exp</mml:mtext>
    <mml:mrow>
    <mml:mfrac>
        <mml:msub>
        <mml:mi>Q</mml:mi>
        <mml:mi>η</mml:mi>
        </mml:msub>
        <mml:mrow>
        <mml:mi>R</mml:mi>
        <mml:mi>T</mml:mi>
        </mml:mrow>
    </mml:mfrac>
    </mml:mrow>
</mml:msup>
</mml:mrow>"""

print(repr(string))

'<mml:mi>η</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mrow>\n<mml:msub>\n    <mml:mi>η</mml:mi>\n    <mml:mi>0</mml:mi>\n</mml:msub>\n<mml:msup>\n    <mml:mtext>exp</mml:mtext>\n    <mml:mrow>\n    <mml:mfrac>\n        <mml:msub>\n        <mml:mi>Q</mml:mi>\n        <mml:mi>η</mml:mi>\n        </mml:msub>\n        <mml:mrow>\n        <mml:mi>R</mml:mi>\n        <mml:mi>T</mml:mi>\n        </mml:mrow>\n    </mml:mfrac>\n    </mml:mrow>\n</mml:msup>\n</mml:mrow>'


In [23]:
model_name = "t5-small"
# tokenizer = T5Tokenizer.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained("kj821/mathml-py-tokenizer-unigram-T5wrapped")

# model = T5ForConditionalGeneration.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained("kj821/t5-base-mathml-to-python")

text = "\n<mml:mi>h</mml:mi>\n<mml:mo>=</mml:mo>\n<mml:mrow>\n<mml:msub>\n<mml:mi>h</mml:mi>\n<mml:mi>c</mml:mi>\n</mml:msub>\n<mml:mo>+</mml:mo>\n<mml:msub>\n<mml:mi>h</mml:mi>\n<mml:mi>g</mml:mi>\n</mml:msub>\n</mml:mrow>\n"
# text = "I love going to the park on the weekend"
prefix = "translate: MathML to Python: "
input_ids = tokenizer.encode(prefix + text, return_tensors="pt")
print(tokenizer.tokenize(prefix + text))
check = tokenizer.decode(input_ids[0], skip_special_tokens=False)

output_ids = model.generate(input_ids)
output = tokenizer.decode(output_ids[0], skip_special_tokens=False, max_new_tokens=100)
print(output)

['t', 'ra', 'n', 's', 'l', 'a', 'te', ':', ' ', 'M', 'a', 't', 'h', 'M', 'L', ' ', 't', 'o', ' ', 'P', 'y', 't', 'h', 'o', 'n', ':', ' ', '\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mo>', '=', '</mml:mo>', '\n', '<mml:mrow>', '\n', '<mml:msub>', '\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mi>', 'c', '</mml:mi>', '\n', '</mml:msub>', '\n', '<mml:mo>', '+', '</mml:mo>', '\n', '<mml:msub>', '\n', '<mml:mi>', 'h', '</mml:mi>', '\n', '<mml:mi>', 'g', '</mml:mi>', '\n', '</mml:msub>', '\n', '</mml:mrow>', '\n']




<pad>: : : 
<mml:mi>: 
<mml:mi>: 
<mml:mi>: 

