In [1]:
!pip install transformers datasets torch




Create Dataset

In [2]:
# Create a sample dataset
text_data = """This is the first sentence.
Fine-tuning GPT-2 is useful.
Artificial Intelligence is transforming the world.
Natural Language Processing helps machines understand text.
This dataset will be used for GPT-2 fine-tuning.
"""

# Save dataset as a text file
with open("/content/my_dataset.txt", "w", encoding="utf-8") as f:
    f.write(text_data)

print("Dataset created successfully!")

Dataset created successfully!


Load and Prepare the Dataset

In [4]:
# Set padding token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

# Now tokenize the text properly
tokens = tokenizer(text_data, return_tensors="pt", truncation=True, padding=True)

print("Tokenization successful!")
# Set padding token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

# Now tokenize the text properly
tokens = tokenizer(text_data, return_tensors="pt", truncation=True, padding=True)

print("Tokenization successful!")


Tokenization successful!
Tokenization successful!


Fine-Tune GPT-2

In [16]:
import torch
import os
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load dataset (Ensure you have a separate eval file)
dataset = load_dataset("text", data_files={"train": "train.txt", "test": "eval.txt"})

train_dataset = dataset["train"]
eval_dataset = dataset["test"]  # Ensure this exists

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Fix padding issue
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Set to "no" if you don't have an eval set
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disables WandB
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Train model
trainer.train()


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,4.676171
2,No log,4.432176
3,No log,4.313783


TrainOutput(global_step=3, training_loss=3.4967641830444336, metrics={'train_runtime': 42.1058, 'train_samples_per_second': 0.142, 'train_steps_per_second': 0.071, 'total_flos': 1567752192000.0, 'train_loss': 3.4967641830444336, 'epoch': 3.0})

In [13]:
import os
print(os.listdir("/content"))  # Check if train.txt and eval.txt are there


['.config', 'my_dataset.csv', 'gpt2_finetuned', 'my_dataset.txt', 'sample_data']


In [14]:
with open("train.txt", "w") as f:
    f.write("This is a sample training text.\nIt helps in fine-tuning GPT-2.")

with open("eval.txt", "w") as f:
    f.write("This is a sample evaluation text.\nIt is used for model validation.")


In [15]:
from datasets import load_dataset
dataset = load_dataset("text", data_files={"train": "train.txt", "test": "eval.txt"})


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [9]:
import os
os.environ["WANDB_DISABLED"] = "true"


Start Training

In [17]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,4.015394
2,No log,3.846628
3,No log,3.769056


TrainOutput(global_step=3, training_loss=2.195197105407715, metrics={'train_runtime': 33.698, 'train_samples_per_second': 0.178, 'train_steps_per_second': 0.089, 'total_flos': 1567752192000.0, 'train_loss': 2.195197105407715, 'epoch': 3.0})

Load Your Fine-Tuned Model