# GPT2 Pretraining for Basque

## Load dataset

In [None]:
# Candidate corpora for Basque pretraining (Hugging Face Hub identifiers).
dataset_names = [
    "HiTZ/euscrawl",
    "mc4",
    "cc100",
]


In [6]:
from datasets import load_dataset

# Fetch EusCrawl from the Hugging Face Hub; later calls reuse the local cache.
dataset = load_dataset(path="HiTZ/euscrawl")
dataset


Downloading and preparing dataset euscrawl/default to /gaueko0/users/jetxaniz007/.cache/huggingface/datasets/HiTZ___euscrawl/default/0.0.0/08b59875179219c2cf23d1f7bdbdd8d6de3d9f4ec063cdb60331a1a4fa2d4abe...


                                                                                          

Dataset euscrawl downloaded and prepared to /gaueko0/users/jetxaniz007/.cache/huggingface/datasets/HiTZ___euscrawl/default/0.0.0/08b59875179219c2cf23d1f7bdbdd8d6de3d9f4ec063cdb60331a1a4fa2d4abe. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:17<00:00, 17.23s/it]


In [None]:
# EusCrawl only ships a 'train' split, so hold out 0.05% of it for evaluation.
# The fixed seed keeps the held-out documents identical across runs.
split_dataset = dataset["train"].train_test_split(test_size=0.0005, shuffle=True, seed=2357)
# train_test_split names the held-out part "test"; expose it as "validation" instead.
split_dataset["validation"] = split_dataset.pop("test")
split_dataset


## Train tokenizer

In [11]:
from transformers import GPT2TokenizerFast

# Start from the original GPT-2 tokenizer so its byte-level BPE pipeline and
# special tokens can be reused when training on Basque text.
tokenizer = GPT2TokenizerFast.from_pretrained(
    "gpt2",
)


In [13]:
def batch_iterator(batch_size=1000):
    """Yield the training texts in lists of at most `batch_size` documents.

    Streaming the corpus in slices avoids loading the whole `plain_text`
    column into memory at once.
    """
    train_split = split_dataset["train"]
    for start in range(0, len(train_split), batch_size):
        yield train_split[start : start + batch_size]["plain_text"]


# train_new_from_iterator does NOT modify the tokenizer in place: it returns a
# new tokenizer with the same pipeline but a freshly learned vocabulary.
# Rebind `tokenizer` so the trained one is what gets saved and used later on.
tokenizer = tokenizer.train_new_from_iterator(batch_iterator(), vocab_size=50257)
tokenizer.save_pretrained("gpt2-euscrawl")


## Load model configuration

In [None]:
# GPT-2 checkpoints whose configuration could be used as the architecture template.
gpt2_model_names = [
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    "distilgpt2",
]


In [2]:
from transformers import GPT2LMHeadModel, AutoConfig

# Only the architecture hyper-parameters of the base "gpt2" checkpoint are loaded here.
config = AutoConfig.from_pretrained("gpt2")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Display the loaded configuration for inspection.
config


In [5]:
# Build the model from the configuration alone; no checkpoint is loaded in this cell.
model = GPT2LMHeadModel(config)

# Count every parameter tensor's elements and report the total in millions.
parameter_counts = (param.numel() for param in model.parameters())
model_size = sum(parameter_counts)
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")


GPT-2 size: 124.4M parameters


## Prepare dataset

In [None]:
# Number of tokens in every training example. 1024 is the maximum sequence
# length (n_positions) of the "gpt2" configuration; adjust it if a different
# configuration or a shorter context is wanted.
context_length = 1024


def tokenize(element):
    """Turn a batch of documents into fixed-length blocks of token ids.

    Each text in ``element["plain_text"]`` is tokenized and cut into chunks of
    at most ``context_length`` tokens (overflowing tokens become extra chunks).
    Only chunks that are exactly ``context_length`` tokens long are kept, so
    the shorter tail of every document (and any document shorter than
    ``context_length``) is dropped.

    Args:
        element: A batch from the dataset; must provide a ``"plain_text"``
            entry holding the texts to tokenize.

    Returns:
        A dict with a single key ``"input_ids"`` mapping to the list of kept
        chunks. The number of chunks generally differs from the number of
        input texts.
    """
    outputs = tokenizer(
        element["plain_text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        input_ids
        for length, input_ids in zip(outputs["length"], outputs["input_ids"])
        if length == context_length
    ]
    return {"input_ids": full_chunks}


# Tokenize the train/validation DatasetDict built earlier (`split_dataset`).
# `tokenize` returns a different number of rows than it receives, so every
# original column is removed and only `input_ids` remains.
tokenized_dataset = split_dataset.map(
    tokenize, batched=True, remove_columns=split_dataset["train"].column_names
)
tokenized_dataset

In [None]:
# import dataloader
from torch.utils.data import DataLoader

# Return torch tensors instead of Python lists so the default collate function
# stacks each batch into one (batch_size, context_length) tensor. All examples
# have the same length because `tokenize` keeps only full-length chunks.
torch_dataset = tokenized_dataset.with_format("torch")

train_dataloader = DataLoader(torch_dataset["train"], batch_size=32, shuffle=True)
# The held-out split was registered under the key "validation" when the dataset was split.
eval_dataloader = DataLoader(torch_dataset["validation"], batch_size=32)

In [None]:
from torch.optim import AdamW


def get_grouped_params(model, weight_decay=0.1):
    """Split the model's trainable parameters into AdamW parameter groups.

    Tensors with two or more dimensions (weight matrices, embeddings) receive
    weight decay; one-dimensional tensors (biases, layer-norm gains) do not.
    Grouping by dimensionality avoids depending on parameter names.

    Args:
        model: Module whose parameters will be optimized.
        weight_decay: Decay applied to the first group. 0.1 is only a starting
            point; tune it for the run.

    Returns:
        A list of two parameter-group dicts suitable for a torch optimizer.
    """
    decayed, not_decayed = [], []
    for param in model.parameters():
        if not param.requires_grad:
            continue  # frozen parameters are not optimized
        (decayed if param.dim() >= 2 else not_decayed).append(param)
    return [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": not_decayed, "weight_decay": 0.0},
    ]


optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [None]:
from accelerate import Accelerator

# Mixed precision is selected with `mixed_precision`; the old `fp16=True`
# keyword is not accepted by current Accelerate releases.
# NOTE(review): fp16 needs suitable accelerator hardware — confirm the target machine.
accelerator = Accelerator(mixed_precision="fp16")

# Let Accelerate wrap the objects and place them on its device(s).
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)