In [7]:
# From https://medium.com/@a.arun283/pretraining-gpt-2-from-scratch-a812b4095675

from itertools import chain

# from hf
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from src.transformers.models.gpt2 import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
import torch

In [8]:
# Check mps backend

def check_mps():
    print("checking backend")
    # Check that MPS is available
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print(
                "MPS not available because the current PyTorch install was not "
                "built with MPS enabled."
            )
        else:
            print(
                "MPS not available because the current MacOS version is not 12.3+ "
                "and/or you do not have an MPS-enabled device on this machine."
            )

    else:
        print("MPS available")

In [9]:
check_mps()

checking backend
MPS available


In [11]:
print("loading dataset")

# loading raw data
dataset = load_dataset("ufal/bilingual-abstracts-corpus", trust_remote_code=True)

# make splits
dataset = dataset["train"].train_test_split(test_size=0.0015)


loading dataset


Generating train split: 100%|██████████| 3079/3079 [00:00<00:00, 181372.44 examples/s]


In [12]:
print("loading tokenizer")

# load the gpt-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

loading tokenizer


In [None]:
print("tokenizing dataset")

# tokenize
def tokenize_function(example):
    return tokenizer(text=example["abstract_en"])

tokenized_ds = dataset.map(tokenize_function, batched=True)

# save to disk if required (use load_from_disk latter)
tokenized_ds.save_to_disk("bilingual-abstracts-corpus/tokenized_ds")
print("saving tokenized dataset to file")


tokenizing dataset


Map: 100%|██████████| 3074/3074 [00:00<00:00, 3678.27 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 454.05 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3074/3074 [00:00<00:00, 250934.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5/5 [00:00<00:00, 627.42 examples/s]

saving tokenized dataset to file





In [None]:
# Make samples to a size of 1024
def concat(examples):
    examples["input_ids"] = [
        list(chain.from_iterable(examples["input_ids"]))
    ]  # convert chain to list of tokens
    examples["attention_mask"] = [
        list(chain.from_iterable(examples["attention_mask"]))
    ]  # convert chain to list of tokens
    return examples


# takes a lot of time (worth saving it to disk)
concated_ds = tokenized_ds.map(concat, batched=True, batch_size=1000000, num_proc=8)


def chunk(examples):
    chunk_size = 1024  # modify this accordingly
    input_ids = examples["input_ids"][0]  # List[List], pass the inner list
    attention_mask = examples["attention_mask"][0]  # List[List]
    input_ids_truncated = []
    attention_mask_truncated = []

    # slice with step_size=chunk_size
    for i in range(0, len(input_ids), chunk_size):
        chunk = input_ids[i : i + chunk_size]
        if len(chunk) == chunk_size:  # drop the last chunk if not equal
            input_ids_truncated.append(chunk)
            attention_mask_truncated.append(attention_mask[i : i + chunk_size])
    examples["input_ids"] = input_ids_truncated
    examples["attention_mask"] = attention_mask_truncated

    return examples


chunked_ds = concated_ds.map(chunk, batched=True, batch_size=2, num_proc=2)
chunked_ds.save_to_disk(
    "bilingual-abstracts-corpus/chunked_ds"
)  # will use this latter for diff experimentation
print("saving chunked corpus to file")


Map (num_proc=8):   0%|          | 0/3074 [00:00<?, ? examples/s]


ArrowInvalid: Column 20 named input_ids expected length 385 but got length 1

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# load the model
configuration = GPT2Config()
model = GPT2LMHeadModel(configuration)
print("initialize model")


In [None]:
model.parameters

In [None]:

# training arguments
training_args = TrainingArguments(
    output_dir="gpt-2-warm-up/standard-gpt",
    evaluation_strategy="steps",
    eval_steps=500,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2.5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    adam_beta1=0.9,
    adam_beta2=0.999,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=500,
    save_steps=5000,
    save_total_limit=10,
    report_to="mlflow",
)

print("train model")
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=chunked_ds["train"],
    eval_dataset=chunked_ds["test"],
    data_collator=data_collator,
)