<a href="https://colab.research.google.com/github/hellizer4u/Gen-AI-Task-01/blob/main/PRODIGY_GA_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet transformers datasets torch
!pip install diffusers transformers accelerate safetensors
!pip install wandb

In [None]:
# Create the tiny three-sentence training corpus, one sentence per line.
with open("train.txt", mode="w", encoding="utf-8") as f:
    f.writelines(
        (
            "Once upon a time, there was a brave knight.\n",
            "The knight fought dragons and saved the kingdom.\n",
            "Peace returned to the land.\n",
        )
    )


In [None]:
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset

# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Padding to a fixed length needs a pad token; register "[PAD]" when the
# tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Read the corpus, trimming surrounding whitespace and dropping blank lines.
with open("train.txt", encoding="utf-8") as f:
    lines = [text for text in map(str.strip, f) if text]

class TextDataset(Dataset):
    """Dataset of fixed-length token-id tensors, one per input text.

    Every text is tokenized on its own, truncated to at most ``block_size``
    tokens and padded up to ``block_size`` (``padding="max_length"``), so the
    tokenizer needs a padding token configured.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=block_size, padding="max_length",
            return_tensors="pt")``; its result must expose an
            ``"input_ids"`` tensor, expected to carry a leading batch axis
            of size one.
        block_size: Target sequence length passed as ``max_length``.
    """

    def __init__(self, texts, tokenizer, block_size=128):
        self.examples = []
        for text in texts:
            encoded = tokenizer(
                text,
                truncation=True,
                max_length=block_size,
                padding="max_length",
                return_tensors="pt",
            )
            # Drop only the leading batch axis. A bare ``squeeze()`` removes
            # every size-1 axis, which would turn a one-token sequence
            # (block_size == 1) into a 0-d scalar.
            self.examples.append(encoded["input_ids"].squeeze(0))

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the token-id tensor stored at position ``idx``."""
        return self.examples[idx]

# Encode every non-empty corpus line with the default block size.
dataset = TextDataset(texts=lines, tokenizer=tokenizer)


In [None]:
from transformers import GPT2LMHeadModel

import torch

# Start from the pretrained GPT-2 language-model weights.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Keep the embedding matrix in sync with the tokenizer's vocabulary size
# (the tokenizer may have been extended with extra special tokens).
model.resize_token_embeddings(len(tokenizer))

# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)


In [None]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

import os

# Disable wandb logging. This must be set BEFORE TrainingArguments is built:
# on Transformers releases where `report_to` defaults to every installed
# integration, the list is resolved at construction time, so setting the
# variable afterwards is too late.
os.environ["WANDB_DISABLED"] = "true"  # disable wandb logging

# Causal-LM collation (mlm=False): batches the examples and builds the labels
# from the input ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    # Explicitly opt out of all experiment trackers instead of relying only
    # on the environment variable above.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()


In [None]:
# Store the fine-tuned weights and the tokenizer files in the same directory
# so both can be reloaded from one path.
save_dir = "./gpt2-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the fine-tuned tokenizer and model from disk and move the model to
# the device selected earlier in the notebook.
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned").to(device)

# Tokenize via __call__ so the attention mask is produced alongside the ids.
encoded = tokenizer("Once upon a time", return_tensors="pt").to(device)
input_ids = encoded["input_ids"]

# Pass the attention mask and pad token id explicitly; otherwise generate()
# has to infer them and emits warnings about unreliable results.
output = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    pad_token_id=tokenizer.pad_token_id,
    max_length=50,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

print(tokenizer.decode(output[0], skip_special_tokens=True))
