In [None]:
import inspect
import re

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, Trainer, TrainingArguments
# Report the installed torch build and whether a CUDA device is usable.
for label, value in (
    ('Torch:', torch.__version__),
    ('CUDA available:', torch.cuda.is_available()),
):
    print(label, value)

In [None]:
# (dataset path, configuration name) for the raw WikiText-2 corpus.
corpus_id = ("wikitext", "wikitext-2-raw-v1")
dataset = load_dataset(*corpus_id)
# Show the loaded splits.
print(dataset)

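## Custom TXT/CSV: uncomment to train on your own files instead of WikiText-2.
## The Trainer cell reads the "train" and "validation" splits, so keep those split names.
## For CSV files use the "csv" loader and make sure the text column is named "text".
# dataset = load_dataset("text", data_files={"train": "train.txt", "validation": "test.txt"})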

In [None]:
def clean_text(s: str) -> str:
    """Collapse each run of whitespace in ``s`` to one space and trim both ends.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text; an all-whitespace or empty input yields ``""``.
    """
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def apply_clean(batch):
    """Run ``clean_text`` over every entry of a batch's "text" field.

    Args:
        batch: Mapping whose "text" entry is an iterable of strings (the
            batched layout this notebook passes via ``map(..., batched=True)``).

    Returns:
        A dict with a single "text" key holding the cleaned strings, in the
        same order as the input.
    """
    cleaned = list(map(clean_text, batch["text"]))
    return {"text": cleaned}

# Normalize whitespace in every split, then drop rows that are empty after
# cleaning. Empty rows tokenize to zero-length sequences, and a training batch
# made up only of such rows gives the model no tokens to process.
dataset = dataset.map(apply_clean, batched=True).filter(
    lambda row: len(row["text"]) > 0
)

In [None]:
# Checkpoint identifier; the tokenizer is loaded from it here and the model
# cell loads its weights from the same name.
model_ckpt = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Batches must be padded to a common length, so when the tokenizer defines no
# padding token the end-of-sequence token is reused for that purpose.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tok_fn(batch, max_length=128):
    """Tokenize a batch of texts, truncating each one to ``max_length`` tokens.

    Args:
        batch: Mapping whose "text" entry holds the strings to tokenize (the
            batched layout this notebook passes via ``map(..., batched=True)``).
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            128, the value that used to be hard-coded here, so existing
            single-argument callers behave exactly as before.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(batch["text"], truncation=True, max_length=max_length)

# Tokenize every split in batches and drop the raw "text" column from the result.
tokenized = dataset.map(tok_fn, batched=True, remove_columns=["text"])

In [None]:
# Batch collator for language modeling; mlm=False turns off masked-LM mode,
# matching the causal LM (AutoModelForCausalLM) trained in this notebook.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_ckpt)

# Caps on how many examples this quick run uses. Splits smaller than the cap
# are used whole instead of making `select` fail on an out-of-range index.
MAX_TRAIN_SAMPLES = 5000
MAX_EVAL_SAMPLES = 1000

# transformers renamed two keyword arguments across releases:
# `evaluation_strategy` -> `eval_strategy` on TrainingArguments, and
# `tokenizer` -> `processing_class` on Trainer. Use whichever spelling the
# installed version accepts so this cell runs on both old and new releases.
training_args_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in training_args_params else "evaluation_strategy"
)
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

args = TrainingArguments(
    output_dir="gpt2_out",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    logging_dir="logs",
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    **{eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)

train_split = tokenized["train"]
eval_split = tokenized["validation"]

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split.select(range(min(MAX_TRAIN_SAMPLES, len(train_split)))),
    eval_dataset=eval_split.select(range(min(MAX_EVAL_SAMPLES, len(eval_split)))),
    data_collator=collator,
    **{tokenizer_kwarg: tokenizer},
)

In [None]:
# Run the fine-tuning loop; the returned value is kept and left as the cell's
# last expression so the notebook still displays it.
train_output = trainer.train()
train_output

In [None]:
# Write the model first, then the tokenizer, into the same output directory.
save_dir = "generator_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("✅ GPT-2 trained & saved!")