In [1]:
from datasets import load_from_disk
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config
from transformers import GPT2LMHeadModel
from transformers import set_seed

In [2]:
# Load the chunked corpus from local disk and drop the raw 'text' column;
# the remaining columns are listed in the output below.
ds = (
    load_from_disk('book-corpus-chunked')
    .remove_columns('text')
)
ds

Loading dataset from disk:   0%|          | 0/21 [00:00<?, ?it/s]

Dataset({
    features: ['n_tokens', 'input_ids', 'attention_mask'],
    num_rows: 1057212
})

In [3]:
# Hold out 0.7% of the rows as a test split; the fixed seed keeps the split
# identical across kernel restarts.
ds_split = ds.train_test_split(
    seed=42,
    test_size=0.007,
)
ds_split

DatasetDict({
    train: Dataset({
        features: ['n_tokens', 'input_ids', 'attention_mask'],
        num_rows: 1049811
    })
    test: Dataset({
        features: ['n_tokens', 'input_ids', 'attention_mask'],
        num_rows: 7401
    })
})

In [4]:
# The GPT-2 tokenizer defines no padding token, so its end-of-text token is
# reused for padding. Reading it from the tokenizer avoids hard-coding the
# '<|endoftext|>' literal.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token



In [5]:
# Causal-LM collator (mlm=False): it adds a 'labels' field that copies
# 'input_ids' with padding positions replaced by -100.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Sanity check: pull one collated batch of four rows and display it.
dataloader = DataLoader(dataset=ds_split['train'], collate_fn=data_collator, batch_size=4)
sample_batch = next(iter(dataloader))
sample_batch

{'n_tokens': tensor([1024, 1024, 1024, 1024]), 'input_ids': tensor([[   65, 11788,   837,  ..., 50256, 50256, 50256],
        [  258,  1718,   257,  ...,    12,   361,  1312],
        [  258,  1138,   607,  ...,   793,   750,   407],
        [   72,   760,   326,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[   65, 11788,   837,  ...,  -100,  -100,  -100],
        [  258,  1718,   257,  ...,    12,   361,  1312],
        [  258,  1138,   607,  ...,   793,   750,   407],
        [   72,   760,   326,  ...,  -100,  -100,  -100]])}

In [6]:
# Default GPT-2 configuration. As the printed config shows, this is 12 layers,
# 12 attention heads, 768-dim embeddings, 1024 positions and a 50257-token
# vocabulary.
cfg = GPT2Config()
cfg

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [7]:
# Build the model from the config alone (no pretrained checkpoint is loaded),
# so its weights are randomly initialised. Seeding the RNGs first makes that
# initialisation reproducible across kernel restarts; 42 matches the seed
# used for the train/test split.
set_seed(42)
model = GPT2LMHeadModel(cfg)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
# Total number of elements across all model parameters, shown in millions.
n_params = sum(
    weight.numel()
    for weight in model.parameters()
)
round(n_params / 1_000_000, 2)


124.44

In [12]:
# Training configuration for the randomly initialised GPT-2 model.
# NOTE(review): the test split has 7401 rows and the eval batch size is 2, so
# on a single device each evaluation pass runs far more batches than the 500
# training steps between passes -- confirm this cadence is intended.
# NOTE(review): 2e-5 is a learning rate more commonly seen in fine-tuning;
# confirm it is intended for training from scratch.
train_args = TrainingArguments(
    output_dir='out',
    num_train_epochs=1,
    # Batch sizes.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    # Optimiser settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    # Numeric precision: mixed precision and TF32 are all switched off.
    bf16=False,
    fp16=False,
    tf32=False,
    # Evaluation, logging and checkpointing cadence.
    eval_strategy='steps',
    eval_steps=500,
    logging_strategy='steps',
    save_steps=1000,
    save_total_limit=10,
)
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=ds_split['train'],
    eval_dataset=ds_split['test'],
    data_collator=data_collator,
)

# Start the training run.
trainer.train()

Step,Training Loss,Validation Loss
500,6.7115,5.837817


KeyboardInterrupt: 