# GPT2 Pretraining for Basque

## Load dataset

In [1]:
# Dataset identifiers considered for pretraining; only "HiTZ/euscrawl" is
# loaded in the cells below.
dataset_names = [
    "HiTZ/euscrawl",
    "mc4",
    "cc100",
]


In [2]:
from datasets import load_dataset

# Load EusCrawl (served from the local Hugging Face cache when available)
# and display its splits and features.
euscrawl_name = "HiTZ/euscrawl"
dataset = load_dataset(euscrawl_name)
dataset


  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset euscrawl (/gaueko0/users/jetxaniz007/.cache/huggingface/datasets/HiTZ___euscrawl/default/0.0.0/08b59875179219c2cf23d1f7bdbdd8d6de3d9f4ec063cdb60331a1a4fa2d4abe)
100%|██████████| 1/1 [00:00<00:00, 19.08it/s]


DatasetDict({
    train: Dataset({
        features: ['plain_text', 'title', 'opening', 'text', 'extra', 'license', 'source', 'url', 'author', 'type', 'lang', 'heading', 'category', 'tags', 'id', 'revid', 'year', 'month', 'day', 'hour', 'minute', 'second'],
        num_rows: 1724544
    })
})

In [3]:
# EusCrawl ships with a single "train" split, so carve out a small held-out
# set (0.05% of the rows) with a fixed seed for reproducibility.
split_dataset = dataset["train"].train_test_split(
    test_size=0.0005,
    shuffle=True,
    seed=2357,
)

# `train_test_split` names the held-out part "test"; expose it as "validation".
held_out_split = split_dataset.pop("test")
split_dataset["validation"] = held_out_split
split_dataset


Loading cached split indices for dataset at /gaueko0/users/jetxaniz007/.cache/huggingface/datasets/HiTZ___euscrawl/default/0.0.0/08b59875179219c2cf23d1f7bdbdd8d6de3d9f4ec063cdb60331a1a4fa2d4abe/cache-f4d0ceca8061dc02.arrow and /gaueko0/users/jetxaniz007/.cache/huggingface/datasets/HiTZ___euscrawl/default/0.0.0/08b59875179219c2cf23d1f7bdbdd8d6de3d9f4ec063cdb60331a1a4fa2d4abe/cache-556af56ab63b5174.arrow


DatasetDict({
    train: Dataset({
        features: ['plain_text', 'title', 'opening', 'text', 'extra', 'license', 'source', 'url', 'author', 'type', 'lang', 'heading', 'category', 'tags', 'id', 'revid', 'year', 'month', 'day', 'hour', 'minute', 'second'],
        num_rows: 1723681
    })
    validation: Dataset({
        features: ['plain_text', 'title', 'opening', 'text', 'extra', 'license', 'source', 'url', 'author', 'type', 'lang', 'heading', 'category', 'tags', 'id', 'revid', 'year', 'month', 'day', 'hour', 'minute', 'second'],
        num_rows: 863
    })
})

In [None]:
def get_training_corpus(raw_datasets, batch_size=1000, text_column="plain_text"):
    """Yield the training texts in consecutive fixed-size batches.

    Args:
        raw_datasets: Mapping with a ``"train"`` entry that supports ``len()``
            and slice indexing returning a dict of columns.
        batch_size: Number of rows per yielded batch. Defaults to 1000.
        text_column: Name of the column holding the raw text. Defaults to
            ``"plain_text"``.

    Yields:
        The ``text_column`` values of each consecutive slice of the train
        split. The last batch may be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer. Because this
            is a generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    train_split = raw_datasets["train"]
    for start_idx in range(0, len(train_split), batch_size):
        # Slicing materialises only one batch of rows at a time.
        yield train_split[start_idx : start_idx + batch_size][text_column]


# Build the batch generator over the train split. A generator can be consumed
# only once, so re-run this cell before iterating over it again.
training_corpus = get_training_corpus(raw_datasets=split_dataset)


## Train a new tokenizer

In [4]:
from transformers import GPT2TokenizerFast

# Start from the original GPT-2 tokenizer; its configuration is reused when
# training the new vocabulary below.
base_tokenizer_name = "gpt2"
old_tokenizer = GPT2TokenizerFast.from_pretrained(base_tokenizer_name)


In [None]:
# Tokenize a sample Basque sentence with the original GPT-2 tokenizer to
# inspect how it is split.
example = "Kaixo, hizkuntza hau ulertzen al duzu?"
tokens = old_tokenizer.tokenize(example)

tokens


In [13]:
# Train a new tokenizer with the same algorithm and special tokens as GPT-2.
# Stream the corpus in batches through a fresh `get_training_corpus` generator
# instead of materialising the whole "plain_text" column (about 1.7M documents)
# as one Python list in memory.
tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(split_dataset), vocab_size=50304
)
tokenizer.save_pretrained("gpt2-eus-euscrawl")


In [None]:
# Publish the trained tokenizer to the Hugging Face Hub.
hub_repo_id = "HiTZ/gpt2-eus-euscrawl"
tokenizer.push_to_hub(hub_repo_id)


In [None]:
from transformers import AutoTokenizer

# Reload the published tokenizer from the Hub so the tokenizer training cell
# does not have to be re-run.
tokenizer_repo_id = "HiTZ/gpt2-eus-euscrawl"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_repo_id)


## Load model configuration

In [None]:
# GPT-2 checkpoint names; only "gpt2" is used for the configuration below.
gpt2_model_names = [
    "gpt2",
    "gpt2-medium",
    "gpt2-large",
    "gpt2-xl",
    "distilgpt2",
]


In [1]:
from transformers import GPT2LMHeadModel, AutoConfig

# Fetch only the architecture hyper-parameters of the base "gpt2" checkpoint;
# no pretrained weights are loaded here.
config = AutoConfig.from_pretrained("gpt2")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the loaded configuration to inspect its hyper-parameters.
config


GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.26.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [4]:
# Align the configuration with the retrained Basque tokenizer, then save it as
# JSON next to the tokenizer files.
config.vocab_size = 50304
# The retrained tokenizer assigns its own ids to the special tokens, so the
# ids inherited from the English "gpt2" checkpoint (50256) are not guaranteed
# to be valid any more; take them from the tokenizer instead.
config.bos_token_id = tokenizer.bos_token_id
config.eos_token_id = tokenizer.eos_token_id
config.save_pretrained("gpt2-eus-euscrawl")

In [7]:
# Instantiate a randomly initialised GPT-2 from the configuration and report
# its parameter count in millions.
model = GPT2LMHeadModel(config)
model_size = 0
for parameter in model.parameters():
    model_size += parameter.numel()
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")


GPT-2 size: 124.5M parameters


## Prepare dataset

In [None]:
# Length, in tokens, of every training example. Use the model's maximum
# context window (1024 for the "gpt2" configuration loaded above).
context_length = config.n_positions


def tokenize(element):
    """Tokenize a batch of documents into fixed-length training examples.

    Each document is split into consecutive chunks of at most
    ``context_length`` tokens. Only chunks of exactly ``context_length``
    tokens are kept, so the shorter tail of every document is discarded.

    Args:
        element: Batch of rows with a ``"plain_text"`` column of strings.

    Returns:
        dict with a single ``"input_ids"`` key holding the kept chunks.
    """
    outputs = tokenizer(
        element["plain_text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = [
        input_ids
        for length, input_ids in zip(outputs["length"], outputs["input_ids"])
        if length == context_length
    ]
    return {"input_ids": input_batch}


# Map over the train/validation splits created earlier. All original columns
# are removed because one document can produce several (or zero) chunks, so
# the number of output rows differs from the number of input rows.
tokenized_dataset = split_dataset.map(
    tokenize, batched=True, remove_columns=split_dataset["train"].column_names
)
tokenized_dataset


In [None]:
from torch.utils.data import DataLoader

# Return PyTorch tensors so the default collate function stacks each batch
# into a (batch_size, context_length) tensor instead of a list of columns.
# `with_format` returns a new dataset and leaves `tokenized_dataset` untouched.
train_dataloader = DataLoader(
    tokenized_dataset["train"].with_format("torch"), batch_size=32, shuffle=True
)
# The held-out split was renamed to "validation" when the dataset was split.
eval_dataloader = DataLoader(
    tokenized_dataset["validation"].with_format("torch"), batch_size=32
)


In [None]:
from torch.optim import AdamW

def get_grouped_params(model, weight_decay=0.1):
    """Split trainable parameters into weight-decayed and non-decayed groups.

    Parameters with two or more dimensions (embedding and projection
    matrices) receive weight decay; one-dimensional parameters (biases and
    layer-norm gains) do not.

    Args:
        model: A ``torch.nn.Module``.
        weight_decay: Decay applied to the first group. The default of 0.1 is
            an assumption -- TODO confirm the value intended for this run.

    Returns:
        A list of two parameter-group dicts in the format accepted by
        ``torch.optim`` optimizers.
    """
    decayed, not_decayed = [], []
    for param in model.parameters():
        if not param.requires_grad:
            continue
        if param.ndim >= 2:
            decayed.append(param)
        else:
            not_decayed.append(param)
    return [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": not_decayed, "weight_decay": 0.0},
    ]


optimizer = AdamW(get_grouped_params(model), lr=5e-4)


In [None]:
from accelerate import Accelerator

# Enable fp16 mixed precision through `mixed_precision`; the boolean `fp16`
# keyword was deprecated and later removed from `Accelerator`.
accelerator = Accelerator(mixed_precision="fp16")

# `prepare` wraps the objects for the current device / distributed setup and
# returns them in the same order as they were passed.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
