# Full Training Workflow (Tiny Pretraining Demonstration)
Train a tiny, randomly initialized GPT-2-style model on a small corpus, purely to illustrate the end-to-end pipeline.

In [None]:
!pip -q install -U transformers datasets accelerate


In [None]:
import math

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Use an off-the-shelf tokenizer: load the pretrained "gpt2" tokenizer rather
# than training a new vocabulary.
tok = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that `tok.pad_token`
# is set before the tokenizer is handed to the data collator further down.
tok.pad_token = tok.eos_token

# Small text dataset for LM: the "wikitext-2-raw-v1" configuration of "wikitext".
# NOTE(review): presumably downloaded on first use, so this needs network access — confirm.
ds = load_dataset("wikitext", "wikitext-2-raw-v1")
def tokenize(ex):
    """Run the shared tokenizer over the "text" field of a batch of examples."""
    texts = ex["text"]
    return tok(texts)


# Tokenize every split in batches and drop the raw "text" column, leaving only
# the columns produced by the tokenizer.
tokenized = ds.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

# Group into fixed-size blocks for efficient LM training
block_size = 128


def group_texts(examples):
    """Pack a batch of tokenized examples into fixed-length blocks.

    All ``input_ids`` lists in the batch are concatenated in order and cut
    into consecutive chunks of ``block_size`` tokens. Trailing tokens that do
    not fill a whole block are dropped, so a batch with fewer than
    ``block_size`` tokens yields no blocks at all.

    Args:
        examples: Mapping with an ``"input_ids"`` entry holding a list of
            token-id lists (one list per example in the batch).

    Returns:
        A dict with ``"input_ids"`` (the list of blocks) and
        ``"attention_mask"`` (an all-ones mask with one independent list per
        block).
    """
    # Flatten in a single linear pass. sum(lists, []) re-copies the growing
    # accumulator for every example, which is quadratic in the token count.
    concat = [token_id for ids in examples["input_ids"] for token_id in ids]
    total_len = (len(concat) // block_size) * block_size
    blocks = [concat[i:i + block_size] for i in range(0, total_len, block_size)]
    return {
        "input_ids": blocks,
        # Build a fresh mask per block so rows never alias one shared list.
        "attention_mask": [[1] * len(block) for block in blocks],
    }

# Apply the block-grouping function to every split, one batch at a time.
lm_ds = tokenized.map(group_texts, batched=True)

# Keep the first 5000 rows of the grouped training split and hold out 5% of
# them as a test set; the seed fixes the split.
lm_ds = (
    lm_ds["train"]
    .select(range(5000))
    .train_test_split(test_size=0.05, seed=42)
)

# Tiny GPT-2 configuration: vocabulary size follows the tokenizer and the
# position/context sizes follow block_size.
config = GPT2Config(
    vocab_size=len(tok),
    n_layer=4,
    n_head=4,
    n_embd=256,
    n_positions=block_size,
    n_ctx=block_size,
)

# Build the model from the config alone; no pretrained checkpoint is loaded here.
model = GPT2LMHeadModel(config)
# vocab_size above is already len(tok), so this resize should leave the
# embedding size unchanged.
model.resize_token_embeddings(len(tok))

# Batch collator handed to the Trainer; mlm=False turns off masked-language-modeling.
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

# Training configuration for the demo run; outputs go under "tiny-gpt2-pretrain".
args = TrainingArguments(
    output_dir="tiny-gpt2-pretrain",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # `evaluation_strategy` was renamed to `eval_strategy`; this notebook
    # installs transformers with `pip -U`, and recent releases reject the old name.
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=100,
    save_steps=400,
    num_train_epochs=1,
    learning_rate=5e-4,
    weight_decay=0.01,
    # Enable fp16 only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Wire the model, training arguments, data splits and collator into a Trainer
# and run training.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds["train"],
    eval_dataset=lm_ds["test"],
    data_collator=collator,
)
trainer.train()

# Report perplexity as exp of the "eval_loss" returned by the final evaluation.
eval_res = trainer.evaluate()
perplexity = math.exp(eval_res["eval_loss"])
print("Perplexity:", perplexity)

# Save the model and the tokenizer into separate subdirectories of the output dir.
model.save_pretrained("tiny-gpt2-pretrain/model")
tok.save_pretrained("tiny-gpt2-pretrain/tokenizer")
