In [1]:
!pip install transformers datasets torch accelerate



In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Choose model size: 'gpt2' (small), 'gpt2-medium', 'gpt2-large', or 'gpt2-xl'
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 doesn't have a pad token; use EOS token for padding
tokenizer.pad_token = tokenizer.eos_token

# Optional: Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
from datasets import load_dataset
import os

# Define a path for your dataset file.
# Replace 'my_dataset.txt' with the actual path to your file if you have one.
# If you don't have a file, this code will create a dummy one for demonstration.
dataset_file_path = "my_dataset.txt"

# Create a dummy dataset file if it doesn't exist (for demonstration purposes)
if not os.path.exists(dataset_file_path):
    with open(dataset_file_path, "w") as f:
        f.write("This is the first line of my dummy dataset.\n")
        f.write("Here is another line of text for the dataset.\n")
        f.write("And a third line to make it interesting.\n")
    print(f"Created a dummy dataset file: {dataset_file_path}")

# Load your text file
dataset = load_dataset("text", data_files={"train": dataset_file_path})

# Tokenize the dataset
def tokenize_function(examples):
    # Tokenize with truncation and padding to max_length (GPT-2's context window is 1024, but 512 is common for efficiency)
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"  # Return PyTorch tensors
    )
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone() # Add labels for causal language modeling
    return tokenized_inputs

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Split into train/validation if needed (e.g., 90/10)
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [12]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",  # Where to save checkpoints
    overwrite_output_dir=True,
    num_train_epochs=3,  # 3-5 epochs for small datasets; monitor for overfitting
    per_device_train_batch_size=4,  # Adjust based on GPU (lower for less VRAM)
    per_device_eval_batch_size=4,
    # evaluation_strategy="epoch",  # Evaluate after each epoch - Removed due to TypeError
    # save_strategy="epoch", # Removed due to TypeError
    logging_steps=500,
    learning_rate=5e-5,  # Standard for fine-tuning
    weight_decay=0.01,  # Regularization
    fp16=True,  # Mixed precision for speed (requires GPU)
    dataloader_pin_memory=False,  # For Colab stability
    report_to="none", # Disable Weights & Biases logging
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [14]:
from transformers import pipeline
import torch

# Load the fine-tuned model
generator = pipeline(
    "text-generation",
    model="./gpt2-finetuned",
    tokenizer="./gpt2-finetuned",
    device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
)

# Define a prompt
prompt = "In a distant galaxy, the hero discovered"

# Generate text
outputs = generator(
    prompt,
    max_length=100,  # Total length (prompt + generated)
    num_return_sequences=1,  # Number of outputs
    temperature=0.7,  # Creativity: 0.1 (conservative) to 1.0 (diverse)
    top_p=0.9,  # Nucleus sampling for coherence
    top_k=50,  # Top-k sampling
    do_sample=True,  # Enable sampling (vs. greedy)
    pad_token_id=tokenizer.eos_token_id,  # Avoid warnings
)

# Print the generated text
for output in outputs:
    print(output["generated_text"])

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In a distant galaxy, the hero discovered the remnants of the lost civilization of his past, the ancient city of Tenga.

The planet's population was large, but the inhabitants were not the same. The Tenga government had turned a blind eye to the population growth. The city was known as the "Tenga City".

The Tenga government was a major military power. The military used the Tenga city as a base. The city was a place of political and military activity, but it was also a place of great social and economic significance. The Tenga government also provided financial aid to the Tenga people. The Tenga government was known as the "Tenga Empire".

After the war, the Tenga Empire was abandoned, but the people of Tenga still enjoyed the benefits of their home. The Tenga Empire was known as the "Tenga Empire".

Contents show]

History Edit

Origins Edit

The Tenga Empire began as a small, isolated country in the center of the galaxy. The Tenga Empire was founded by the Tenga Emperor in the year 23

In [15]:
from transformers import Trainer
eval_results = trainer.evaluate()
print(f"Perplexity: {eval_results['eval_loss']}")

Perplexity: 3.1723194122314453


In [17]:
# Install libraries
!pip install transformers datasets torch accelerate

# Imports
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, pipeline
from datasets import load_dataset
import torch
import os

# Step 1: Load model
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.gradient_checkpointing_enable()

# Step 2: Prepare dataset
# Define a path for your dataset file.
# Replace 'my_dataset.txt' with the actual path to your file if you have one.
# If you don't have a file, this code will create a dummy one for demonstration.
dataset_file_path = "my_dataset.txt"

# Create a dummy dataset file if it doesn't exist (for demonstration purposes)
if not os.path.exists(dataset_file_path):
    with open(dataset_file_path, "w") as f:
        f.write("This is the first line of my dummy dataset.\n")
        f.write("Here is another line of text for the dataset.\n")
        f.write("And a third line to make it interesting.\n")
    print(f"Created a dummy dataset file: {dataset_file_path}")

dataset = load_dataset("text", data_files={"train": dataset_file_path})

def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone() # Add labels for causal language modeling
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

# Step 3: Fine-tune
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # evaluation_strategy="epoch", # Removed due to TypeError
    # save_strategy="epoch",       # Removed due to TypeError
    logging_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    dataloader_pin_memory=False,
    report_to="none", # Disable Weights & Biases logging
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer)
trainer.train()
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

# Step 4: Generate
generator = pipeline("text-generation", model="./gpt2-finetuned", tokenizer="./gpt2-finetuned", device=0 if torch.cuda.is_available() else -1)
prompt = "Once upon a time,"
outputs = generator(prompt, max_length=100, num_return_sequences=1, temperature=0.7, top_p=0.9, do_sample=True, pad_token_id=tokenizer.eos_token_id)
print(outputs[0]["generated_text"])



Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer)
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Once upon a time, the world was a place of darkness and darkness, the world was a place of darkness and darkness, and darkness was the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the darkness is the enemy.

The darkness is the enemy, the d