In [144]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
# Step 1: Install libraries
!pip install --upgrade transformers datasets fsspec
!pip install transformers accelerate peft datasets


Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [145]:
sample_data = [
    {"text": "Lilly is a dog."},
    {"text": "Lilly is a dog."},
    {"text": "Lilly is a dog."},
    {"text": "Lilly is a dog."},
    {"text": "Lilly is a dog."},
] * 20000000  # replicate to increase signal



with open("train.txt", "w", encoding="utf-8") as f:
    for _ in range(200):
        f.write(sample_text + "\n")


In [146]:
# Step 3: Load tokenizer and model
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")


In [147]:
# Step 4: Prepare dataset
from transformers import TextDataset, DataCollatorForLanguageModeling

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train.txt",
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)



In [148]:
# Step 5: Training arguments and Trainer
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

In [149]:
# Step 6: Train the model
trainer.train()

Step,Training Loss
100,0.1713
200,0.0241
300,0.0158


TrainOutput(global_step=340, training_loss=0.06418024915106156, metrics={'train_runtime': 44.0246, 'train_samples_per_second': 15.446, 'train_steps_per_second': 7.723, 'total_flos': 22210223800320.0, 'train_loss': 0.06418024915106156, 'epoch': 10.0})

In [150]:
# ♻️ Reload model from fine-tuned directory
from transformers import GPT2LMHeadModel
import torch  # ← add this line

# 🔐 Save your fine-tuned model
trainer.save_model("./gpt2-finetuned_lucio")

# ♻️ Reload model from fine-tuned directory
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned_lucio")
model.to("cuda" if torch.cuda.is_available() else "cpu")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
input_text = "Who is Lilly?"
inputs = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
attention_mask = torch.ones_like(inputs)

outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_length=60,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id  # ✅ suppress warning
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
