In [2]:
!pip install -q transformers datasets

import time, torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset

# Load dataset + tokenizer
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def encode(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=64)

dataset = dataset.map(encode, batched=True, remove_columns=["text"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

# Model + optimizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# One training step = baseline
batch = next(iter(loader))
input_ids = batch["input_ids"]
attn = batch["attention_mask"]

start = time.time()
outputs = model(input_ids, attention_mask=attn, labels=input_ids)
loss = outputs.loss
loss.backward()
optimizer.step()
end = time.time()

print(f"Baseline step completed. Loss: {loss.item():.4f}, Time: {end-start:.3f}s")






model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Baseline step completed. Loss: 8.6880, Time: 8.846s
