# Decoder for im2latex Model

For the LaTeX decoder, we will train a RoBERTa model with masked language modelling, using a byte-level BPE tokenizer trained on the im2latex formulas.

## Load Training Corpus and Train the BPE Tokenizer

Using the `ByteLevelBPETokenizer`, and saving to `model/tokenizer`

In [1]:
from datasets import load_dataset, load_from_disk
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import random
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the im2latex-250k dataset on disk
DATA_DIR = "./data/im2latex-250k/"

# Read the dataset from disk, then print it to inspect its splits and features
im2latex_dataset = load_from_disk(dataset_path=DATA_DIR)
print(im2latex_dataset)

DatasetDict({
    train: Dataset({
        features: ['formula', 'filename', 'image'],
        num_rows: 200329
    })
    test: Dataset({
        features: ['formula', 'filename', 'image'],
        num_rows: 25042
    })
    val: Dataset({
        features: ['formula', 'filename', 'image'],
        num_rows: 25041
    })
})


In [15]:
# Collect the formulas of every split, in split order, as the training corpus
training_corpus = []
for split_name in im2latex_dataset.keys():
    training_corpus.extend(im2latex_dataset[split_name]["formula"])

# Special tokens, in the order they are handed to the trainer
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Define and train the byte-level BPE tokenizer on the collected formulas
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    training_corpus,
    special_tokens=special_tokens,
    vocab_size=30_000,
    min_frequency=5,
    show_progress=True,
)






In [4]:
# Test the tokenizer on one randomly chosen training formula
train_split = im2latex_dataset["train"]
random_index = random.randint(0, train_split.num_rows - 1)
sample_formula = train_split[random_index]["formula"]

# Show the raw formula, then the tokens it is split into
print(sample_formula)
print(tokenizer.encode(sample_formula).tokens)

0 ~ = ~ \frac { ( m - 1 ) ( m + 2 ) } { m } y q ( \beta , \omega ) ~ - ~ q ( \gamma , \omega ) \left[ 2 z ~ - ~ ( 1 + q ( \alpha , \omega ) ) q ( \beta , \omega ) \left[ z ~ + ~ \frac { ( m - 1 ) ( m + 2 ) } { 2 m } y \right] \right] \, .
['<s>', '0', 'Ġ~', 'Ġ=', 'Ġ~', 'Ġ\\', 'frac', 'Ġ{', 'Ġ(', 'Ġm', 'Ġ-', 'Ġ1', 'Ġ)', 'Ġ(', 'Ġm', 'Ġ+', 'Ġ2', 'Ġ)', 'Ġ}', 'Ġ{', 'Ġm', 'Ġ}', 'Ġy', 'Ġq', 'Ġ(', 'Ġ\\', 'beta', 'Ġ,', 'Ġ\\', 'omega', 'Ġ)', 'Ġ~', 'Ġ-', 'Ġ~', 'Ġq', 'Ġ(', 'Ġ\\', 'gamma', 'Ġ,', 'Ġ\\', 'omega', 'Ġ)', 'Ġ\\', 'left', '[', 'Ġ2', 'Ġz', 'Ġ~', 'Ġ-', 'Ġ~', 'Ġ(', 'Ġ1', 'Ġ+', 'Ġq', 'Ġ(', 'Ġ\\', 'alpha', 'Ġ,', 'Ġ\\', 'omega', 'Ġ)', 'Ġ)', 'Ġq', 'Ġ(', 'Ġ\\', 'beta', 'Ġ,', 'Ġ\\', 'omega', 'Ġ)', 'Ġ\\', 'left', '[', 'Ġz', 'Ġ~', 'Ġ+', 'Ġ~', 'Ġ\\', 'frac', 'Ġ{', 'Ġ(', 'Ġm', 'Ġ-', 'Ġ1', 'Ġ)', 'Ġ(', 'Ġm', 'Ġ+', 'Ġ2', 'Ġ)', 'Ġ}', 'Ġ{', 'Ġ2', 'Ġm', 'Ġ}', 'Ġy', 'Ġ\\', 'right', ']', 'Ġ\\', 'right', ']', 'Ġ\\,', 'Ġ.', '</s>']


In [17]:
# Directory the tokenizer files (vocab.json / merges.txt) are written to
TOKENIZER_PATH = "./model/tokenizer/"

# Create the target directory up front so a fresh checkout can run this cell;
# this does not rely on save_model creating missing directories itself.
os.makedirs(TOKENIZER_PATH, exist_ok=True)

# Save tokenizer; the returned list of written file paths is the cell output
tokenizer.save_model(TOKENIZER_PATH)

['./model/tokenizer/vocab.json', './model/tokenizer/merges.txt']

In [5]:
# Reload the tokenizer from the vocabulary and merges files saved earlier
vocab_file = os.path.join(TOKENIZER_PATH, "vocab.json")
merges_file = os.path.join(TOKENIZER_PATH, "merges.txt")
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

# Attach the post-processor; each argument is a (token, id) pair, with the
# "</s>" pair passed first and the "<s>" pair second
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer.post_processor = BertProcessing(end_pair, start_pair)

# Cap encoded sequences at 512 tokens
tokenizer.enable_truncation(max_length=512)

# Check tokenizer function on the same sample index as before
sample_formula = im2latex_dataset["train"][random_index]["formula"]
print(sample_formula)
print(tokenizer.encode(sample_formula).tokens)

0 ~ = ~ \frac { ( m - 1 ) ( m + 2 ) } { m } y q ( \beta , \omega ) ~ - ~ q ( \gamma , \omega ) \left[ 2 z ~ - ~ ( 1 + q ( \alpha , \omega ) ) q ( \beta , \omega ) \left[ z ~ + ~ \frac { ( m - 1 ) ( m + 2 ) } { 2 m } y \right] \right] \, .
['<s>', '0', 'Ġ~', 'Ġ=', 'Ġ~', 'Ġ\\', 'frac', 'Ġ{', 'Ġ(', 'Ġm', 'Ġ-', 'Ġ1', 'Ġ)', 'Ġ(', 'Ġm', 'Ġ+', 'Ġ2', 'Ġ)', 'Ġ}', 'Ġ{', 'Ġm', 'Ġ}', 'Ġy', 'Ġq', 'Ġ(', 'Ġ\\', 'beta', 'Ġ,', 'Ġ\\', 'omega', 'Ġ)', 'Ġ~', 'Ġ-', 'Ġ~', 'Ġq', 'Ġ(', 'Ġ\\', 'gamma', 'Ġ,', 'Ġ\\', 'omega', 'Ġ)', 'Ġ\\', 'left', '[', 'Ġ2', 'Ġz', 'Ġ~', 'Ġ-', 'Ġ~', 'Ġ(', 'Ġ1', 'Ġ+', 'Ġq', 'Ġ(', 'Ġ\\', 'alpha', 'Ġ,', 'Ġ\\', 'omega', 'Ġ)', 'Ġ)', 'Ġq', 'Ġ(', 'Ġ\\', 'beta', 'Ġ,', 'Ġ\\', 'omega', 'Ġ)', 'Ġ\\', 'left', '[', 'Ġz', 'Ġ~', 'Ġ+', 'Ġ~', 'Ġ\\', 'frac', 'Ġ{', 'Ġ(', 'Ġm', 'Ġ-', 'Ġ1', 'Ġ)', 'Ġ(', 'Ġm', 'Ġ+', 'Ġ2', 'Ġ)', 'Ġ}', 'Ġ{', 'Ġ2', 'Ġm', 'Ġ}', 'Ġy', 'Ġ\\', 'right', ']', 'Ġ\\', 'right', ']', 'Ġ\\,', 'Ġ.', '</s>']


## Define and Train RoBERTa model

We construct a torch dataset object to encapsulate our data sets, which we feed into a RoBERTa model to train.

In [6]:
import torch
from torch.utils.data import Dataset

from transformers import DataCollatorForLanguageModeling, RobertaTokenizerFast
from transformers import RobertaConfig, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import pipeline

In [7]:
# Report whether PyTorch can see a CUDA device
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [8]:
# Create the Im2latexData torch dataset class
class Im2latexData(Dataset):
    """Torch dataset over tokenizer encodings of LaTeX formula strings.

    Every formula is encoded once, when the dataset is constructed; indexing
    returns the token ids of a single formula as a tensor.
    """

    def __init__(self, latex_data: list[str], tokenizer: ByteLevelBPETokenizer):
        """Encode all formulas in ``latex_data`` with ``tokenizer``.

        Args:
            latex_data: formula strings to encode.
            tokenizer: tokenizer whose ``encode_batch`` produces the encodings.
        """
        self.tokenizer = tokenizer
        encodings = self.tokenizer.encode_batch(latex_data)
        self.examples = encodings

    def __len__(self):
        """Return the number of encoded formulas."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the token ids of the ``idx``-th formula as a ``torch.Tensor``."""
        # The conversion to a tensor happens here, per item, at access time
        encoding = self.examples[idx]
        return torch.tensor(encoding.ids)
    
# Create Im2latex torch Datasets: one per split, all encoded with the same tokenizer
train_dataset, val_dataset, test_dataset = (
    Im2latexData(im2latex_dataset[split_name]["formula"], tokenizer=tokenizer)
    for split_name in ("train", "val", "test")
)

In [9]:
# Define RoBERTa model configurations
# NOTE(review): vocab_size must be at least the tokenizer's actual vocabulary
# size, otherwise embedding lookups go out of range. The tokenizer cell requests
# vocab_size=30_000 — confirm the trained vocabulary really fits in 2048.
config = RobertaConfig(
    vocab_size=2048, # As previously used
    max_position_embeddings=514, # 512-token sequences plus RoBERTa's 2-position padding offset
    num_attention_heads=12, # Somewhat typical for smaller LMs
    num_hidden_layers=6,
    type_vocab_size=1, # Single token type (no segment ids are used)
)

# Initialize a blank (randomly initialised) model from the config.
# `from_pretrained()` needs a checkpoint path and would raise when called with
# no arguments; constructing the class directly is what actually uses `config`.
model = RobertaForMaskedLM(config=config)
print('Num parameters: ', model.num_parameters())

Num parameters:  45091328


In [10]:
# Wrap the saved tokenizer files in a transformers tokenizer for the data collator.
# `model_max_length` is the documented keyword for the maximum sequence length
# (`max_len` is only a legacy alias).
# Note: this rebinds `tokenizer` from the ByteLevelBPETokenizer to a RobertaTokenizerFast.
tokenizer = RobertaTokenizerFast.from_pretrained(TOKENIZER_PATH, model_max_length=512)

# Define a data collator to automatically generate masks:
# 15% of the tokens in each batch are selected for masked-language-modelling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [12]:
# Output location and training hyperparameters
MODEL_PATH = "./model/roberta"
TRAIN_EPOCHS = 5
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0.01
TRAIN_BATCH_SIZE = 4
VAL_BATCH_SIZE = 4
EVAL_STEPS = 16384

# Define the training arguments
# NOTE(review): eval_steps is documented as applying to the "steps" strategy;
# with "epoch" it presumably has no effect — confirm before relying on it.
training_args = TrainingArguments(
    output_dir=MODEL_PATH,
    save_total_limit=1,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VAL_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy="epoch",
    eval_steps=EVAL_STEPS,
)

# Imported here so this cell is self-contained; it belongs with the other
# transformers imports if those are consolidated.
from transformers.trainer_utils import get_last_checkpoint

# Create model trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Resume from the newest checkpoint in MODEL_PATH when one exists; otherwise
# start from scratch. Passing `resume_from_checkpoint=True` unconditionally
# raises on a fresh run because there is no checkpoint to resume from.
last_checkpoint = get_last_checkpoint(MODEL_PATH) if os.path.isdir(MODEL_PATH) else None

# Train model
trainer.train(resume_from_checkpoint=last_checkpoint)

Epoch,Training Loss,Validation Loss
1,0.951231,0.892128
2,0.78528,0.734368
3,0.650203,0.618491
4,0.5763,0.543338
5,0.5323,0.500467


TrainOutput(global_step=250415, training_loss=0.175207592956727, metrics={'train_runtime': 5217.388, 'train_samples_per_second': 191.982, 'train_steps_per_second': 47.996, 'total_flos': 3.2246281269809664e+16, 'train_loss': 0.175207592956727, 'epoch': 5.0})

In [14]:
# Persist the trained model to MODEL_PATH
trainer.save_model(MODEL_PATH)

# Re-save the tokenizer in transformers format next to vocab.json / merges.txt.
# `model_max_length` is the keyword that sets the tokenizer's maximum sequence
# length; `max_length` is not a tokenizer init parameter and would not set it.
RobertaTokenizerFast.from_pretrained(TOKENIZER_PATH, model_max_length=512).save_pretrained(TOKENIZER_PATH)

('./model/tokenizer/tokenizer_config.json',
 './model/tokenizer/special_tokens_map.json',
 './model/tokenizer/vocab.json',
 './model/tokenizer/merges.txt',
 './model/tokenizer/added_tokens.json',
 './model/tokenizer/tokenizer.json')

In [34]:
# Evaluate the trained model on the held-out test split
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.49885454773902893,
 'eval_runtime': 93.9175,
 'eval_samples_per_second': 266.638,
 'eval_steps_per_second': 66.665}

In [53]:
from transformers import RobertaConfig, RobertaForCausalLM

# Load the configuration that was saved alongside the trained model
config = RobertaConfig.from_pretrained(MODEL_PATH)

# Set is_decoder=True to use the model as a standalone decoder
config.is_decoder = True

# Load the trained weights into the causal-LM architecture.
# Constructing `RobertaForCausalLM(config)` directly would give a randomly
# initialised model and discard everything learned during training.
model = RobertaForCausalLM.from_pretrained(MODEL_PATH, config=config)

# Create a Fill mask pipeline (loads the saved masked-LM model and tokenizer from disk)
fill_mask = pipeline(
    "fill-mask",
    model=MODEL_PATH,
    tokenizer=TOKENIZER_PATH
)

# Sanity check: ask the model to fill in the masked token of a short formula
fill_mask("\\frac { 1 } { \\pi } = <mask>")

[{'score': 0.6251624822616577,
  'token': 283,
  'token_str': ' 0',
  'sequence': '\\frac { 1 } { \\pi } = 0'},
 {'score': 0.21570727229118347,
  'token': 269,
  'token_str': ' 1',
  'sequence': '\\frac { 1 } { \\pi } = 1'},
 {'score': 0.04476449638605118,
  'token': 266,
  'token_str': ' 2',
  'sequence': '\\frac { 1 } { \\pi } = 2'},
 {'score': 0.01527230441570282,
  'token': 271,
  'token_str': ' -',
  'sequence': '\\frac { 1 } { \\pi } = -'},
 {'score': 0.010434543713927269,
  'token': 316,
  'token_str': ' 3',
  'sequence': '\\frac { 1 } { \\pi } = 3'}]