# Decoder for im2latex Model

For the LaTeX decoder, we will train a RoBERTa Model with Masked Language Modelling, using a Byte-Level BPE Tokenizer.

## Load Training Corpus and Train the BPE Tokenizer

We train a `ByteLevelBPETokenizer` on the LaTeX formulas and save it to `model/tokenizer`.

In [2]:
from datasets import load_dataset, load_from_disk
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import random
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Locations of the dataset, the tokenizer files and the model checkpoints
DATA_DIR = "./data/im2latex-160k/"
TOKENIZER_PATH = "./model/tokenizer/"
MODEL_PATH = "./model/roberta"

# Read the dataset saved on disk and print it as a quick format check
# (splits, feature names and row counts)
im2latex_dataset = load_from_disk(dataset_path=DATA_DIR)
print(im2latex_dataset)

DatasetDict({
    train: Dataset({
        features: ['formula', 'image'],
        num_rows: 133960
    })
    test: Dataset({
        features: ['formula', 'image'],
        num_rows: 16745
    })
    val: Dataset({
        features: ['formula', 'image'],
        num_rows: 16745
    })
})


In [16]:
# Define and train the byte-level BPE tokenizer on the formulas of every split
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Flatten the "formula" column of each split into one training corpus
training_formulas = [
    formula
    for split_name in im2latex_dataset
    for formula in im2latex_dataset[split_name]["formula"]
]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    training_formulas,
    vocab_size=2048,
    min_frequency=5,
    show_progress=True,
    special_tokens=special_tokens,
)






In [17]:
# Test tokenizer: show a random training formula next to its tokens
random_index = random.randint(0, im2latex_dataset["train"].num_rows - 1)
sample_formula = im2latex_dataset["train"][random_index]["formula"]

print(sample_formula)
print(tokenizer.encode(sample_formula).tokens)

C \approx \pi { \biggl [ } 3 ( a + b ) - { \sqrt { ( 3 a + b ) ( a + 3 b ) } } { \biggr ] } = \pi { \biggl [ } 3 ( a + b ) - { \sqrt { 10 a b + 3 \left ( a ^ { 2 } + b ^ { 2 } \right ) } } { \biggr ] }
['C', 'Ġ\\', 'approx', 'Ġ\\', 'pi', 'Ġ{', 'Ġ\\', 'biggl', 'Ġ[', 'Ġ}', 'Ġ3', 'Ġ(', 'Ġa', 'Ġ+', 'Ġb', 'Ġ)', 'Ġ-', 'Ġ{', 'Ġ\\', 'sqrt', 'Ġ{', 'Ġ(', 'Ġ3', 'Ġa', 'Ġ+', 'Ġb', 'Ġ)', 'Ġ(', 'Ġa', 'Ġ+', 'Ġ3', 'Ġb', 'Ġ)', 'Ġ}', 'Ġ}', 'Ġ{', 'Ġ\\', 'biggr', 'Ġ]', 'Ġ}', 'Ġ=', 'Ġ\\', 'pi', 'Ġ{', 'Ġ\\', 'biggl', 'Ġ[', 'Ġ}', 'Ġ3', 'Ġ(', 'Ġa', 'Ġ+', 'Ġb', 'Ġ)', 'Ġ-', 'Ġ{', 'Ġ\\', 'sqrt', 'Ġ{', 'Ġ10', 'Ġa', 'Ġb', 'Ġ+', 'Ġ3', 'Ġ\\', 'left', 'Ġ(', 'Ġa', 'Ġ^', 'Ġ{', 'Ġ2', 'Ġ}', 'Ġ+', 'Ġb', 'Ġ^', 'Ġ{', 'Ġ2', 'Ġ}', 'Ġ\\', 'right', 'Ġ)', 'Ġ}', 'Ġ}', 'Ġ{', 'Ġ\\', 'biggr', 'Ġ]', 'Ġ}']


In [18]:
# Save tokenizer (writes vocab.json and merges.txt into TOKENIZER_PATH)
if os.path.exists(TOKENIZER_PATH):
    # Something is already stored there: ask before overwriting it.
    # Accept "y", "Y" and surrounding whitespace as confirmation.
    answer = input("Create new tokenizer file? (y/n): ")
    override_tokenizer = answer.strip().lower() == "y"
else:
    # First run: the target directory has to exist before the tokenizer
    # files can be written into it, so create it here.
    os.makedirs(TOKENIZER_PATH, exist_ok=True)
    override_tokenizer = True

if override_tokenizer:
    tokenizer.save_model(TOKENIZER_PATH)

['./model/tokenizer/vocab.json', './model/tokenizer/merges.txt']

In [19]:
# Reload the saved tokenizer and attach the processors it needs
vocab_file = os.path.join(TOKENIZER_PATH, "vocab.json")
merges_file = os.path.join(TOKENIZER_PATH, "merges.txt")
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

# First pair is the closing token, second pair is the opening token:
# encoded sequences come out as <s> ... </s>
end_token = ("</s>", tokenizer.token_to_id("</s>"))
start_token = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer.post_processor = BertProcessing(end_token, start_token)
tokenizer.enable_truncation(max_length=512)

# Check tokenizer function on a random training formula
random_index = random.randint(0, im2latex_dataset["train"].num_rows - 1)
sample_formula = im2latex_dataset["train"][random_index]["formula"]
print(sample_formula)
print(tokenizer.encode(sample_formula).tokens)

F ' ( c ) ( b - a ) = F ( b ) - F ( a ) .
['<s>', 'F', "Ġ'", 'Ġ(', 'Ġc', 'Ġ)', 'Ġ(', 'Ġb', 'Ġ-', 'Ġa', 'Ġ)', 'Ġ=', 'ĠF', 'Ġ(', 'Ġb', 'Ġ)', 'Ġ-', 'ĠF', 'Ġ(', 'Ġa', 'Ġ)', 'Ġ.', '</s>']


## Define and Train RoBERTa model

We construct a torch dataset object to encapsulate our data sets, which we feed into a RoBERTa model to train.

In [4]:
import torch
from torch.utils.data import Dataset

from transformers import DataCollatorForLanguageModeling, RobertaTokenizerFast
from transformers import RobertaConfig, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import pipeline, EvalPrediction

In [21]:
# Report whether torch can see a CUDA device
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [22]:
# Create the Im2latexData torch dataset class
class Im2latexData(Dataset):
    """Torch dataset of pre-tokenized LaTeX formulas.

    All formulas are encoded once, at construction time, with
    ``tokenizer.encode_batch``. Indexing returns the token ids of one
    formula as a 1-D ``torch.long`` tensor.

    Args:
        latex_data: The formulas to encode. A list is the expected input,
            but any iterable of strings works because it is materialized
            into a list before encoding.
        tokenizer: Object exposing ``encode_batch``; every encoding it
            returns must provide an ``ids`` attribute.
    """

    def __init__(self, latex_data: list[str], tokenizer: "ByteLevelBPETokenizer"):
        self.tokenizer = tokenizer
        # Materialize first so generators and other non-list iterables are
        # handed to the tokenizer as a plain list.
        self.examples = self.tokenizer.encode_batch(list(latex_data))

    def __len__(self):
        """Return the number of encoded formulas."""
        return len(self.examples)

    # Convert to tensors here as it is the norm
    def __getitem__(self, idx):
        """Return the token ids of example ``idx`` as a ``torch.long`` tensor."""
        # Explicit dtype: an empty id list would otherwise become float32.
        return torch.tensor(self.examples[idx].ids, dtype=torch.long)
    
# Create Im2latex torch Datasets, one per split (train, val, test in that order)
train_dataset, val_dataset, test_dataset = (
    Im2latexData(im2latex_dataset[split_name]["formula"], tokenizer=tokenizer)
    for split_name in ("train", "val", "test")
)

In [23]:
# Define RoBERTa model configurations
config = RobertaConfig(
    vocab_size=2048,  # same vocabulary size the tokenizer was trained with
    # 512-token truncation limit plus 2 extra positions
    # NOTE(review): presumably RoBERTa's position-id offset; confirm
    max_position_embeddings=514,
    num_attention_heads=12,  # somewhat typical for smaller LMs
    num_hidden_layers=6,
    type_vocab_size=1,  # a single token-type id
)

# Initialize a blank (untrained) model from the config and report its size
model = RobertaForMaskedLM(config)
num_params = model.num_parameters()
print('Num parameters: ', num_params)

Num parameters:  45091328


In [24]:
# Wrap the saved tokenizer files in a transformers tokenizer for the data collator.
# `model_max_length` is the documented name of the length limit; `max_len`
# is only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained(TOKENIZER_PATH, model_max_length=512)

# Define a data collator to automatically generate masks
# (masked language modelling with a 15% masking probability)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [30]:
# Training hyperparameters
TRAIN_EPOCHS = 10
EVAL_STEPS = 10_000  # only relevant if evaluation_strategy is switched to 'steps'
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
TRAIN_BATCH_SIZE = 4
VAL_BATCH_SIZE = 4

# Define the training arguments
training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    evaluation_strategy = 'epoch',  # evaluate on the validation set once per epoch
    overwrite_output_dir=True,
    eval_steps=EVAL_STEPS,  # not used while the strategy above is 'epoch'
    num_train_epochs=TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VAL_BATCH_SIZE,
    save_total_limit=2,  # keep at most two checkpoints on disk
)

# Create model trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Resume only when a previous run left a "checkpoint-<step>" directory in
# MODEL_PATH. Asking to resume unconditionally fails on a fresh run, where
# there is nothing to resume from.
has_checkpoint = os.path.isdir(MODEL_PATH) and any(
    entry.startswith("checkpoint-")
    and entry[len("checkpoint-"):].isdigit()
    and os.path.isdir(os.path.join(MODEL_PATH, entry))
    for entry in os.listdir(MODEL_PATH)
)

# Train model
trainer.train(resume_from_checkpoint=has_checkpoint)

There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


Epoch,Training Loss,Validation Loss
2,1.3214,1.237401
3,1.1159,1.102678
4,1.0269,0.99034
5,0.9706,0.920136
6,0.8795,
7,0.8159,0.798775
8,0.7833,0.759961
9,0.7677,0.724146
10,0.7167,0.704772


TrainOutput(global_step=334900, training_loss=0.8377325934258886, metrics={'train_runtime': 17406.853, 'train_samples_per_second': 76.958, 'train_steps_per_second': 19.24, 'total_flos': 2.9466487897534464e+16, 'train_loss': 0.8377325934258886, 'epoch': 10.0})

In [31]:
# Save both model and tokenizer.
# The tokenizer is reloaded with `model_max_length=512` so the 512-token limit
# is part of the saved tokenizer; `max_length` is not the tokenizer's
# length-limit argument.
trainer.save_model(MODEL_PATH)
RobertaTokenizerFast.from_pretrained(TOKENIZER_PATH, model_max_length=512).save_pretrained(TOKENIZER_PATH)

('./model/tokenizer/tokenizer_config.json',
 './model/tokenizer/special_tokens_map.json',
 './model/tokenizer/vocab.json',
 './model/tokenizer/merges.txt',
 './model/tokenizer/added_tokens.json',
 './model/tokenizer/tokenizer.json')

## Evaluating the Model

We evaluate the model with the test dataset.

In [5]:
# Define items: reload the trained model and tokenizer from disk and rebuild
# the masking data collator used for evaluation.
model = RobertaForMaskedLM.from_pretrained(MODEL_PATH)
# `model_max_length` is the documented name of the length limit; `max_len`
# is only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained(TOKENIZER_PATH, model_max_length=512)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [33]:
# Evaluation-only trainer: default arguments, same masking collator
trainer = Trainer(model=model, data_collator=data_collator)

# Masked-LM metrics on the held-out test split
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.6940732002258301,
 'eval_runtime': 42.3817,
 'eval_samples_per_second': 395.1,
 'eval_steps_per_second': 49.408}

In [26]:
# Create a Fill mask pipeline from the saved model and tokenizer directories
fill_mask = pipeline(task="fill-mask", model=MODEL_PATH, tokenizer=TOKENIZER_PATH)

# Ask the model to fill in the masked command name in tan(x) = sin(x) / <mask>(x)
masked_formula = "\\tan ( x ) = \\frac { \\sin ( x ) } { \\<mask> (x) } ."
fill_mask(masked_formula)

[{'score': 0.35527878999710083,
  'token': 506,
  'token_str': 'cos',
  'sequence': '\\tan ( x ) = \\frac { \\sin ( x ) } { \\cos (x) }.'},
 {'score': 0.3233983516693115,
  'token': 503,
  'token_str': 'sin',
  'sequence': '\\tan ( x ) = \\frac { \\sin ( x ) } { \\sin (x) }.'},
 {'score': 0.10094476491212845,
  'token': 567,
  'token_str': 'tan',
  'sequence': '\\tan ( x ) = \\frac { \\sin ( x ) } { \\tan (x) }.'},
 {'score': 0.05074625834822655,
  'token': 720,
  'token_str': 'cot',
  'sequence': '\\tan ( x ) = \\frac { \\sin ( x ) } { \\cot (x) }.'},
 {'score': 0.03139147907495499,
  'token': 759,
  'token_str': 'sec',
  'sequence': '\\tan ( x ) = \\frac { \\sin ( x ) } { \\sec (x) }.'}]