## This project pretrains a GPT-2-like model on an arithmetic dataset. The dataset is in `dataset.json`.

## Tokenizer

The model only operates on the characters `0123456789+=*()r`, where `r` represents carrying.

In [1]:
from typing import Tuple
import torch
from datasets import load_dataset
from transformers import PreTrainedTokenizer


# Initialize a tokenizer
class ArithmeticTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer for arithmetic expressions.

    Every character of the input (spaces are dropped) becomes one token.
    The vocabulary is fixed and holds 21 entries:

    * ids 0-4:   the symbols ``= + * ( )``
    * ids 5-9:   the special tokens ``b`` (bos), ``e`` (eos), ``u`` (unk),
                 ``p`` (pad) and ``m`` (mask)
    * id 10:     ``r``
    * ids 11-20: the digits ``0``-``9``
    """

    # Symbols, special tokens and 'r' occupy ids 0-10.
    vocab = {'=': 0, '+': 1, '*': 2,
             '(': 3, ')': 4, 'b': 5, 'e': 6, 'u': 7, 'p': 8, 'm': 9, 'r': 10}
    # Digits follow at ids 11-20. zip() is used instead of a for-loop so that
    # no loop variable is left behind as a class attribute.
    vocab.update(zip('0123456789', range(len(vocab), len(vocab) + 10)))
    # Inverse mapping (id -> token) used when decoding.
    reversed_vocab = {v: k for k, v in vocab.items()}
    vocab_size = len(vocab)

    def __init__(self):
        """Register the single-character special tokens with the base class."""
        super().__init__(
            # NOTE(review): `tokenizer_object` looks like an argument meant for
            # fast tokenizers; kept unchanged here - confirm whether it is needed.
            tokenizer_object=None,
            bos_token='b',
            eos_token='e',
            unk_token='u',
            pad_token='p',
            mask_token='m',
        )

    def get_vocab(self):
        """Return a copy of the token -> id mapping, including added tokens.

        The base class leaves this method unimplemented, and recent
        ``transformers`` releases appear to call it while the tokenizer is
        being constructed, so it has to be provided by the subclass.
        """
        full_vocab = dict(self.vocab)
        # `added_tokens_encoder` may not exist yet while the base class is
        # still initialising, hence the getattr() with a default.
        full_vocab.update(getattr(self, 'added_tokens_encoder', {}))
        return full_vocab

    def _tokenize(self, text):
        """Strip spaces from ``text`` and split it into single characters."""
        return list(text.replace(" ", ""))

    def _convert_token_to_id(self, token):
        """Return the id of ``token``, or the id of the unk token if unknown."""
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, id):
        """Return the token for ``id``, or the unk token if the id is unknown."""
        return self.reversed_vocab.get(id, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> Tuple[str]:
        """Placeholder: the vocabulary is hard-coded, so nothing is written.

        Both arguments are ignored. A one-element tuple holding an empty
        string is returned.
        """
        return ('',)


# Single shared tokenizer instance used by the rest of the notebook.
tokenizer = ArithmeticTokenizer()

In [2]:
# Sanity check: encode two digit strings and display the resulting encoding.
tokenizer(['12345','1'], truncation=True, max_length=512)

{'input_ids': [[12, 13, 14, 15, 16], [12]], 'token_type_ids': [[0, 0, 0, 0, 0], [0]], 'attention_mask': [[1, 1, 1, 1, 1], [1]]}

## Load Dataset

In [3]:
from transformers import DataCollatorForTokenClassification

# Load dataset.json and split it into 80% training and 20% validation.
# The split is seeded so that Restart & Run All reproduces the same partition.
SPLIT_SEED = 42
dataset = load_dataset('json', data_files='dataset.json')[
    'train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Maximum number of tokens allowed in one sequence.
max_length = 512
# tokenize the dataset


def tokenize_function(example):
    """Turn one question/answer record into causal-LM training features.

    Args:
        example: mapping with the string fields ``'question'`` and ``'answer'``.

    Returns:
        A dict with
        * ``input_ids``: question ids + answer ids + EOS id,
        * ``labels``: same length as ``input_ids``; question positions are set
          to -100 so only the answer and the EOS token carry a target,
        * ``attention_mask``: all ones (no padding is added here).

        The combined sequence never exceeds ``max_length`` tokens; if it would,
        the tail before EOS is cut off and EOS is still appended.
    """
    question = tokenizer.encode(
        example['question'], truncation=True, max_length=max_length)
    answer = tokenizer.encode(
        example['answer'], truncation=True, max_length=max_length)
    eos = [tokenizer.eos_token_id]

    # Each part is truncated to max_length on its own, so their concatenation
    # could still be too long; reserve one slot for EOS and cap the rest.
    body_limit = max_length - 1
    input_ids = (question + answer)[:body_limit] + eos
    # `question` already holds ids, so its length is the number of masked positions.
    labels = ([-100] * len(question) + answer)[:body_limit] + eos

    return {"input_ids": input_ids, 'labels': labels, 'attention_mask': [1]*len(input_ids)}


# Collator that assembles lists of examples into batches of PyTorch tensors.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="pt",
)

# Apply tokenize_function to every example of both splits.
tokenized_datasets = dataset.map(tokenize_function)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [4]:
# Show the first tokenized training example (original fields plus the new features).
print(tokenized_datasets['train'][0])
# print(data_collator(tokenized_datasets['train'][2]))

{'answer': '0084+6072=6r0+0080+6070=65r1+0000+6000=651r0+0000+6000=6516r0+0000+0000=6516', 'question': '84+6072=', 'input_ids': [19, 15, 1, 17, 11, 18, 13, 0, 11, 11, 19, 15, 1, 17, 11, 18, 13, 0, 17, 10, 11, 1, 11, 11, 19, 11, 1, 17, 11, 18, 11, 0, 17, 16, 10, 12, 1, 11, 11, 11, 11, 1, 17, 11, 11, 11, 0, 17, 16, 12, 10, 11, 1, 11, 11, 11, 11, 1, 17, 11, 11, 11, 0, 17, 16, 12, 17, 10, 11, 1, 11, 11, 11, 11, 1, 11, 11, 11, 11, 0, 17, 16, 12, 17, 6], 'labels': [-100, -100, -100, -100, -100, -100, -100, -100, 11, 11, 19, 15, 1, 17, 11, 18, 13, 0, 17, 10, 11, 1, 11, 11, 19, 11, 1, 17, 11, 18, 11, 0, 17, 16, 10, 12, 1, 11, 11, 11, 11, 1, 17, 11, 11, 11, 0, 17, 16, 12, 10, 11, 1, 11, 11, 11, 11, 1, 17, 11, 11, 11, 0, 17, 16, 12, 17, 10, 11, 1, 11, 11, 11, 11, 1, 11, 11, 11, 11, 0, 17, 16, 12, 17, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

## Configs
Since there are only 16 distinct characters (21 tokens once the special tokens are included), `n_embd` can be much smaller than GPT-2's.

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments

# Create the GPT-2 configuration.
config = GPT2Config(
    # Context window matches the tokenization limit instead of repeating the literal.
    n_positions=max_length,
    n_embd=32,
    n_head=8,
    n_layer=16,
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Keep the model config in sync with the tokenizer's pad token.
    pad_token_id=tokenizer.pad_token_id,
)

torch.cuda.empty_cache()
# Instantiate a randomly initialised model from the configuration
model = GPT2LMHeadModel(config)
# print the number of parameters
print(f'The model has {model.num_parameters()} parameters')

## Training and Testing

In [None]:
# Hyper-parameters: optimisation settings first, then logging/evaluation cadence.
training_args = TrainingArguments(
    output_dir="./gpt2-numbers",
    learning_rate=0.0005,
    num_train_epochs=10,
    per_device_train_batch_size=64,
    logging_steps=1,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=1000,
)

# Wire model, data splits, collator and tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Run the training loop.
trainer.train()

In [None]:
# Generate a worked solution for a sample prompt and compare with the true sum.
text = "12+3526="
# Move the prompt to whatever device the model is on, so the cell also works
# without a GPU and never mixes devices.
input_ids = tokenizer(text, return_tensors='pt')['input_ids'].to(model.device)
generated = model.generate(input_ids, max_length=100)
print(tokenizer.decode(generated[0], skip_special_tokens=True, verbose=False).replace(' ', ''))
# Reference value. The digits are reversed because the dataset answers end with
# the result written least-significant digit first (e.g. 84+6072 ends in 6516).
# NOTE: eval() is tolerable only because `text` is a hard-coded literal; do not
# feed it user-supplied strings.
print(str(eval(text[:-1]))[::-1])
