# Training

In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, DataCollatorWithPadding, Trainer, TrainingArguments
from tokenizers.pre_tokenizers import Digits, Sequence
import torch
from torch.utils.data import Dataset

# Load the pretrained GPT-2 tokenizer and prepend a Digits pre-tokenizer so
# that every digit is split off individually before the tokenizer's own
# pre-tokenization step runs.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
rust_tok = tokenizer.backend_tokenizer
rust_tok.pre_tokenizer = Sequence([Digits(individual_digits=True), rust_tok.pre_tokenizer])
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
PAD_ID = tokenizer.pad_token_id

# Start from the pretrained 'gpt2' checkpoint and place it on the GPU when
# one is available.
model = GPT2LMHeadModel.from_pretrained('gpt2')
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Folder holding the data files, and folder the checkpoints are written to.
DATA_DIR = "../data/"
OUTPUT_DIR = "./out"

# Name of the training file inside DATA_DIR. It is left empty here, so the
# open() call below targets DATA_DIR itself until a file name is filled in.
FILE_PATH = ""

# copied from ICL paper
# NOTE(review): NUM_HEADS, NUM_LAYERS and N_EMBD are not referenced by any
# code in this cell; the model above is loaded from the 'gpt2' checkpoint.
NUM_HEADS = 8
NUM_LAYERS = 12
N_EMBD = 256
# Maximum tokenized sequence length.
MAX_LENGTH = 128

# our own
# Training hyper-parameters handed to TrainingArguments further down.
EPOCHS = 50
LEARN_RATE = 1e-3
SAVE_STEPS = 500
BATCH_SIZE = 64

# Alternative loader for curriculum files whose groups are separated by "*";
# disabled in favour of the single-group loader below.
# with open(DATA_DIR + FILE_PATH, "r", encoding="utf-8") as f:
#     data = f.read().split("*")
#     NUM_CURRICULUM_STEPS = len(data)
#     for i in range(len(data)):
#         data[i] = [ln.strip().split(";")[0] for ln in data[i].split("\n")]

# Read the whole file as one group and keep, for every line, only the text
# before the first ";". Blank lines stay in the list as empty strings.
with open(DATA_DIR + FILE_PATH, "r", encoding="utf-8") as f:
    data = f.read()
    NUM_CURRICULUM_STEPS = 1
    data = [ln.strip().split(";")[0] for ln in data.split("\n")]

class BaseConversionDataset(Dataset):
    """Pre-tokenized prompt/target pairs for causal-LM fine-tuning.

    Every usable line is split into a prompt (everything before its trailing
    run of digits) and a target (that trailing run). Prompt and target are
    tokenized as a pair, truncated/padded to ``max_length``, and the prompt
    positions are set to -100 in the labels so they are ignored by the loss.

    Positions after the prompt keep their token ids as labels, padding
    included. NOTE(review): this is the original behaviour and is presumably
    what teaches the model to emit the pad/EOS token after the answer;
    confirm before masking the padding.

    Args:
        prompts: Iterable of raw lines such as ``"1->1,10->2,11->3"``.
        tokenizer: Callable tokenizer that also provides ``encode``.
        max_length: Length every example is truncated/padded to.
    """

    def __init__(self, prompts, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        input_ids_list = []
        labels_list = []

        for chosen_prompt in prompts:
            if not chosen_prompt:
                continue
            prompt, target_str = self._split_trailing_number(chosen_prompt)
            if not target_str:
                # No numeric answer at the end of the line, so there is
                # nothing to supervise; skip it instead of training on the
                # whole line as if it were the target.
                continue

            # Truncate and pad here so that every stored tensor already has
            # the same length.
            encoded_inputs = self.tokenizer(
                prompt,
                target_str,
                add_special_tokens=True,
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )

            input_ids = encoded_inputs["input_ids"]
            # The prompt is re-encoded on its own without truncation, so cap
            # its length to keep the labels aligned with input_ids when the
            # pair was truncated.
            p_ids_len = min(
                len(self.tokenizer.encode(prompt, add_special_tokens=True)),
                len(input_ids),
            )
            labels = [-100] * p_ids_len + input_ids[p_ids_len:]

            input_ids_list.append(torch.tensor(input_ids, dtype=torch.long))
            labels_list.append(torch.tensor(labels, dtype=torch.long))

        self.input_ids_list = input_ids_list
        self.labels_list = labels_list

    @staticmethod
    def _split_trailing_number(text):
        """Split ``text`` into (prefix, trailing run of numeric characters)."""
        split_at = len(text)
        # Bounded scan: an all-digit line ends with an empty prefix instead
        # of indexing past the start of the string.
        while split_at > 0 and text[split_at - 1].isnumeric():
            split_at -= 1
        return text[:split_at], text[split_at:]

    def __len__(self):
        return len(self.input_ids_list)

    def __getitem__(self, idx):
        # Tensors are returned as stored (on the CPU); device placement is
        # left to the code that consumes the batches.
        return {"input_ids": self.input_ids_list[idx], "labels": self.labels_list[idx]}

print("Get Data")

# Tokenize the training lines once up front. MAX_LENGTH is forwarded
# explicitly so the padded length follows the constant defined with the other
# hyper-parameters rather than the constructor's default value.
dataset = BaseConversionDataset(data, tokenizer, max_length=MAX_LENGTH)

# Checkpoints go to OUTPUT_DIR every SAVE_STEPS steps; only the two most
# recent ones are kept, and an existing output directory is overwritten.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    learning_rate=LEARN_RATE,
)

# Batches the dataset items with the tokenizer's padding logic.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer, padding=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

print("Start Training")

trainer.train()

# Evaluation

In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from tokenizers.pre_tokenizers import Digits, Sequence
import torch
import numpy as np

def _read_prompt_lines(text):
    """Return the text before the first ';' of every non-blank line in ``text``."""
    stripped = (ln.strip().split(";")[0] for ln in text.split("\n"))
    # Blank entries (trailing newline, newline next to a "*" separator) carry
    # no prompt and would otherwise crash the prompt/target split.
    return [ln for ln in stripped if ln]


def _split_prompt_and_target(line):
    """Split ``line`` at its last "->" into (prompt ending in "->", target text)."""
    head, arrow, target = line.rpartition("->")
    if not arrow:
        raise ValueError(f"test line has no '->' separator: {line!r}")
    return head + arrow, target


def _score_line(model, tokenizer, device, line):
    """Greedy-decode the answer for one test line.

    Returns:
        ``(squared_error, expected, predicted)`` as Python ints.

    Raises:
        ValueError: If the line has no "->" or if the target or the generated
            text cannot be parsed with ``int``.
    """
    prompt, target_number = _split_prompt_and_target(line)
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(device)
    input_ids_tensor = encoded["input_ids"]

    output_sequences = model.generate(
        input_ids=input_ids_tensor,
        # A single un-padded prompt: the mask only makes explicit that every
        # position is a real token.
        attention_mask=encoded["attention_mask"],
        max_new_tokens=50,
        num_beams=1,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    # Keep only the tokens produced after the prompt.
    prompt_length = input_ids_tensor.shape[-1]
    generated_only_ids = output_sequences[0, prompt_length:]
    generated_only_text = tokenizer.decode(generated_only_ids, skip_special_tokens=True)

    expected = int(target_number)
    predicted = int(generated_only_text)
    return (expected - predicted) ** 2, expected, predicted


def evaluate_model(model_checkpoint, test_data_file, separated_by_star = False, stop_early = False):
    """Measure the squared error of a checkpoint's generated answers.

    Each test line has the form ``<prompt>-><target>``, optionally followed by
    ``;`` and extra text that is ignored. The model greedily completes the
    prompt (up to and including the last "->") and the completion is compared
    with the target as integers.

    Args:
        model_checkpoint: Name or path passed to ``GPT2LMHeadModel.from_pretrained``.
        test_data_file: Path of the UTF-8 test file.
        separated_by_star: When true, the file is split on "*" into groups
            that are scored separately.
        stop_early: Only used with ``separated_by_star``. Stops before the
            group at index 6 and returns the details gathered for the group
            at index 5.

    Returns:
        * ``separated_by_star=False``: ``(losses, avg_mse)``.
        * ``separated_by_star=True``: list with one mean squared error per
          group, computed over that group's prompts only (``nan`` for a group
          without prompts).
        * ``separated_by_star=True`` and ``stop_early=True``:
          ``(losses, expected, predicted)`` lists for the group at index 5.

    Raises:
        ValueError: If a line has no "->" or a value is not an integer.
    """
    # Load in model checkpoint
    model = GPT2LMHeadModel.from_pretrained(model_checkpoint)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Setup tokenizer: digits are split individually and EOS doubles as padding.
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    rust_tok = tokenizer.backend_tokenizer
    rust_tok.pre_tokenizer = Sequence([Digits(individual_digits=True), rust_tok.pre_tokenizer])
    tokenizer.pad_token = tokenizer.eos_token

    # Open and read test data
    with open(test_data_file, "r", encoding="utf-8") as f:
        raw_text = f.read()

    if not separated_by_star:
        losses = [
            _score_line(model, tokenizer, device, line)[0]
            for line in _read_prompt_lines(raw_text)
        ]
        avg_mse = np.mean(losses)
        return losses, avg_mse

    detail_group = 5   # group whose per-prompt results are collected
    last_group = 6     # first group that is skipped when stop_early is set

    mse_per_in_context_length = []
    losses_length_5 = []
    expected_output_length_5 = []
    actual_output_length_5 = []

    groups = [_read_prompt_lines(chunk) for chunk in raw_text.split("*")]
    for i, group in enumerate(groups):
        if stop_early and i == last_group:
            break

        # Fresh list per group: the reported value must describe this group
        # only, not a running mean over all groups seen so far.
        group_losses = []
        for line in group:
            mse_loss, expected, predicted = _score_line(model, tokenizer, device, line)
            group_losses.append(mse_loss)
            if i == detail_group:
                losses_length_5.append(mse_loss)
                expected_output_length_5.append(expected)
                actual_output_length_5.append(predicted)

        # Keep one entry per group so list positions stay aligned with the
        # group index even when a group has no prompts.
        avg_mse = np.mean(group_losses) if group_losses else float("nan")
        print(i, avg_mse)
        mse_per_in_context_length.append(avg_mse)
        print(mse_per_in_context_length)

    if stop_early:
        # Same shape whether or not the file had enough groups to hit the
        # early stop.
        return losses_length_5, expected_output_length_5, actual_output_length_5
    return mse_per_in_context_length

In [None]:
# Root folders for the fine-tuned checkpoints and for the evaluation files.
CHECKPOINTS_DIR = "../model_checkpoints/"
DATA_DIR = "../data/"

# Paths shared by several of the runs in this cell.
_ood_length_file = f"{DATA_DIR}test_ood_length.txt"
_len5_checkpoint = f"{CHECKPOINTS_DIR}gpt2-len5-arbitrary-decimal/checkpoint-1000"
_mixture_checkpoint = f"{CHECKPOINTS_DIR}gpt2-mixture-arbitrary-decimal/checkpoint-1500"

# Stock 'gpt2' checkpoint on the out-of-distribution-length file.
gpt2_test = evaluate_model("gpt2", _ood_length_file, separated_by_star=True)

# Length-5 checkpoint on the same file.
mse_model_5_one_to_15_loss = evaluate_model(
    _len5_checkpoint, _ood_length_file, separated_by_star=True
)

# Mixture checkpoint on the same file.
mse_model_mixture_one_to_15_loss = evaluate_model(
    _mixture_checkpoint, _ood_length_file, separated_by_star=True
)

# Mixture checkpoint on the integer-noise file.
mse_model_mixture_integer_noise_loss = evaluate_model(
    _mixture_checkpoint, f"{DATA_DIR}test_int_noise.txt", separated_by_star=True
)

In [None]:
# Print the three per-group MSE lists, each preceded by its label. The pieces
# are unpacked into a single print call so the separators come out unchanged.
_report_parts = [
    "model trained on length 5: ", mse_model_5_one_to_15_loss, "\n",
    "model trained on mixture icl examples: ", mse_model_mixture_one_to_15_loss, "\n",
    "model trained on mixture integer noise :", mse_model_mixture_integer_noise_loss,
]
print(*_report_parts)

In [None]:
# Both checkpoints are scored on the same out-of-distribution-numbers file.
_ood_numbers_file = f"{DATA_DIR}test_ood_nums_1_to_15.txt"

# Mixture checkpoint first ...
mse_model_mixture_ood_numbers = evaluate_model(
    f"{CHECKPOINTS_DIR}gpt2-mixture-arbitrary-decimal/checkpoint-1500",
    _ood_numbers_file,
    separated_by_star=True,
)

# ... then the length-5 checkpoint.
mse_model_len5_ood_numbers = evaluate_model(
    f"{CHECKPOINTS_DIR}gpt2-len5-arbitrary-decimal/checkpoint-1000",
    _ood_numbers_file,
    separated_by_star=True,
)

In [None]:
# Run the length-5 checkpoint with stop_early=True and unpack the three lists
# returned by the early stop.
_early_stop_result = evaluate_model(
    f"{CHECKPOINTS_DIR}gpt2-len5-arbitrary-decimal/checkpoint-1000",
    f"{DATA_DIR}test_ood_length.txt",
    separated_by_star=True,
    stop_early=True,
)
losses, expected, output = _early_stop_result

The code above is everything needed to train and evaluate the model. The code below is an alternative training approach that we also tried; it did not work because it ran out of CUDA memory.

# Alternative training approach

In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, get_linear_schedule_with_warmup
from tokenizers.pre_tokenizers import Digits, Sequence
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import tqdm

import os
# Set the WANDB_DISABLED environment variable for this process.
os.environ["WANDB_DISABLED"] = "true"

# Same tokenizer setup as in the training section: every digit is split off
# individually before the tokenizer's own pre-tokenization runs.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
rust_tok = tokenizer.backend_tokenizer
rust_tok.pre_tokenizer = Sequence([Digits(individual_digits=True), rust_tok.pre_tokenizer])
# Register the string 'P' as the padding token.
# NOTE(review): if 'P' is already an ordinary vocabulary entry, PAD_ID will
# coincide with the id of a real token - confirm this is intended.
tokenizer.add_special_tokens({'pad_token': 'P'})

# Start from the pretrained 'gpt2' checkpoint, on the GPU when available.
model = GPT2LMHeadModel.from_pretrained('gpt2')
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


PAD_ID = tokenizer.pad_token_id
# Curriculum file (groups separated by "*") and folder for saved models.
FILE_PATH = "./data_curriculum.txt"
OUTPUT_DIR = "./gpt2-arbitrary-decimal-cur"

# copied from ICL paper
# NOTE(review): these three constants are not referenced by any code in this
# cell; the model above is loaded from the 'gpt2' checkpoint.
NUM_HEADS = 8
NUM_LAYERS = 12
N_EMBD = 256


# our own
EPOCHS = 50
LEARN_RATE = 1e-3
SAVE_STEPS = 500
BATCH_SIZE = 64
# Number of consecutive epochs spent on each curriculum step.
EPOCHS_PER_STEP = 4
# Fraction of all optimizer steps used for learning-rate warm-up.
WARMUP_RATIO = 0.05


print(PAD_ID)

# Split the file on "*" into curriculum groups; inside each group keep, for
# every line, only the text before the first ";". Blank lines stay in the
# groups as empty strings.
with open(FILE_PATH, "r", encoding="utf-8") as f:
    data_in_curriculum_steps = f.read().split("*")
    NUM_CURRICULUM_STEPS = len(data_in_curriculum_steps)
    for i in range(len(data_in_curriculum_steps)):
        data_in_curriculum_steps[i] = [ln.strip().split(";")[0] for ln in data_in_curriculum_steps[i].split("\n")]
data = data_in_curriculum_steps

# The string below is a no-op expression that sketches the layout of `data`.
"""
data = 
[
    [All prompts of length 5],
    [All prompts of length 10],
    ...,
    [All prompts of length 50]
]
"""

class BaseConversionDataset(Dataset):
    """Curriculum dataset that serves one group of prompts at a time.

    ``prompts`` is a list of groups (one per curriculum step), each a list of
    lines such as ``"1->1,10->2,11->3"``. ``current_curriculum_step`` selects
    the group that ``__len__`` and ``__getitem__`` operate on; steps past the
    last group are clamped to the last group.

    Args:
        prompts: List of groups of raw lines.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``.
        number_of_curriculum_steps: Number of curriculum steps available.
    """

    def __init__(self, prompts, tokenizer, number_of_curriculum_steps):
        # Keep only lines that end in a numeric answer. Blank lines (left
        # over from splitting the file on newlines) and lines without a
        # target cannot be split into prompt and target.
        self.prompts = [
            [line for line in group if line and line[-1].isnumeric()]
            for group in prompts
        ]
        self.tokenizer = tokenizer
        self.max_curriculum_step = number_of_curriculum_steps
        self.current_curriculum_step = 0

    def _active_prompts(self):
        """Return the group for the current step, clamped to the last group."""
        last_step = min(self.max_curriculum_step, len(self.prompts)) - 1
        step = max(0, min(self.current_curriculum_step, last_step))
        return self.prompts[step]

    def __len__(self):
        # Uses the same clamped lookup as __getitem__, so asking for the
        # length past the last curriculum step no longer raises IndexError.
        return len(self._active_prompts())

    def __getitem__(self, idx):
        chosen_prompt = self._active_prompts()[idx]

        # Walk back over the trailing run of digits; the bound keeps an
        # all-digit line from indexing past the start of the string.
        split_at = len(chosen_prompt)
        while split_at > 0 and chosen_prompt[split_at - 1].isnumeric():
            split_at -= 1
        # prompt is the entire line except the last number: "1->1,10->2,...,11->"
        prompt = chosen_prompt[:split_at]
        # target_str is just the last number: "3"
        target_str = chosen_prompt[split_at:]

        p_ids = self.tokenizer.encode(prompt, add_special_tokens=False)
        t_ids = self.tokenizer.encode(target_str, add_special_tokens=False)

        input_ids = p_ids + t_ids
        # mask everything except target tokens
        labels = [-100] * len(p_ids) + t_ids

        return torch.tensor(input_ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)
    
def collate_fn(batch):
    """Pad a batch of ``(input_ids, labels)`` tensor pairs to a common length.

    Args:
        batch: Sequence of ``(input_ids, labels)`` pairs of 1-D tensors.

    Returns:
        Dict with ``input_ids`` (right-padded with ``PAD_ID``),
        ``attention_mask`` (1 for real positions, 0 for padding) and
        ``labels`` (right-padded with -100).
    """
    input_seqs, label_seqs = zip(*batch)

    # pad sequences
    input_ids = torch.nn.utils.rnn.pad_sequence(list(input_seqs), batch_first=True, padding_value=PAD_ID)
    labels = torch.nn.utils.rnn.pad_sequence(list(label_seqs), batch_first=True, padding_value=-100)

    # Build the mask from the true sequence lengths rather than from
    # `input_ids != PAD_ID`, so a genuine token whose id equals PAD_ID is not
    # mistaken for padding.
    lengths = torch.tensor(
        [seq.size(0) for seq in input_seqs], dtype=torch.long, device=input_ids.device
    )
    positions = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)
    attention_mask = (positions < lengths.unsqueeze(1)).long()

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


dataset = BaseConversionDataset(data, tokenizer, NUM_CURRICULUM_STEPS)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)


def _curriculum_step_for(epoch):
    """Return the curriculum step for a 1-based epoch, clamped to the last step."""
    # EPOCHS // EPOCHS_PER_STEP can exceed the number of groups in the file;
    # later epochs then keep training on the last group instead of indexing
    # past the end of the data.
    return min((epoch - 1) // EPOCHS_PER_STEP, NUM_CURRICULUM_STEPS - 1)


optimizer = AdamW(model.parameters(), lr=LEARN_RATE)

# Curriculum groups may differ in size, so count the batches of every epoch
# with the dataset switched to that epoch's group. Sizing the schedule from
# the first group alone could let the learning rate reach zero before
# training ends.
total_steps = 0
for epoch in range(1, EPOCHS + 1):
    dataset.current_curriculum_step = _curriculum_step_for(epoch)
    total_steps += len(dataloader)
dataset.current_curriculum_step = 0

warmup_steps = int(WARMUP_RATIO * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

model.train()
for epoch in range(1, EPOCHS+1):
    step = _curriculum_step_for(epoch)
    dataset.current_curriculum_step = step

    total_loss = 0.0
    for batch in tqdm.tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        out = model(**batch)
        loss = out.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # Weight the batch loss by the batch size so the epoch figure below
        # is an average per example.
        total_loss += loss.item() * batch["input_ids"].size(0)

    # Save model and tokenizer every tenth epoch.
    if epoch % 10 == 0:
        model.save_pretrained(f"{OUTPUT_DIR}/{epoch}")
        tokenizer.save_pretrained(f"{OUTPUT_DIR}/{epoch}")

    avg_loss = total_loss / len(dataset)
    print(f"Epoch {epoch:>2} — curriculum_step={step} — avg CE loss: {avg_loss:.4f}")

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import numpy as np

# This cell evaluates whatever `model` and `tokenizer` are already defined in
# the session. The commented lines show how a saved checkpoint and its
# tokenizer could be loaded instead.
# model = GPT2LMHeadModel.from_pretrained('/data/notebook_files/gpt2-len5-arbitrary-decimal/checkpoint-1000')
# NOTE(review): this is an absolute path at the filesystem root - confirm it
# is the intended location of the test file.
FILE_PATH = "/data_test.txt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
# rust_tok = tokenizer.backend_tokenizer
# rust_tok.pre_tokenizer = Sequence([Digits(individual_digits=True), rust_tok.pre_tokenizer])
# tokenizer.pad_token = tokenizer.eos_token
# PAD_ID = tokenizer.pad_token_id

with open(FILE_PATH, "r", encoding="utf-8") as f:
    data_preprocess = f.read()

# Keep the text before the first ";" of each line and drop blank lines, which
# have no prompt to split.
data = [ln.strip().split(";")[0] for ln in data_preprocess.split("\n")]
data = [ln for ln in data if ln]

losses = []

for line in data:
    # Split at the last "->". The arrow stays in the prompt so that it ends
    # exactly where the training prompts end ("...11->"), matching
    # evaluate_model; it was previously cut back to "...11-".
    head, arrow, target_number = line.rpartition("->")
    if not arrow:
        raise ValueError(f"test line has no '->' separator: {line!r}")
    prompt = head + arrow

    input_ids = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(device)

    input_ids_tensor = input_ids['input_ids']

    output_sequences = model.generate(
        input_ids=input_ids_tensor,
        # Single un-padded prompt: the mask only makes explicit that every
        # position is a real token.
        attention_mask=input_ids['attention_mask'],
        max_new_tokens=50,
        num_beams=1,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode only the tokens generated after the prompt.
    prompt_length = input_ids_tensor.shape[-1]
    generated_only_ids = output_sequences[0, prompt_length:]
    generated_only_text = tokenizer.decode(generated_only_ids, skip_special_tokens=True)
    print(generated_only_text)
    # int() raises ValueError when the completion is not a plain integer.
    mse_loss = (int(target_number) - int(generated_only_text))**2

    losses.append(mse_loss)

avg_mse = np.mean(losses)