## Step 1: Tokenize with Labels

In [11]:
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=50)
    # Create labels, which are the same as input_ids for language modeling
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

## Step 2: Generate a Dataset

In [12]:
import pandas as pd

# Generate a small dataset of book titles and descriptions
data = {
    "title": [
        "The Enchanted Forest",
        "The Lost City of Zandar",
        "Journey to the Center of the Mind",
        "The Last Stand of the Guardians",
        "Mysteries of the Deep Sea"
    ],
    "description": [
        "A thrilling adventure in a magical forest where mythical creatures roam.",
        "An epic tale of discovery in an ancient, forgotten city.",
        "A mind-bending expedition into the depths of human consciousness.",
        "The final battle between good and evil in a land of legends.",
        "Exploring the unknown mysteries that lie beneath the ocean waves."
    ]
}

df = pd.DataFrame(data)
df.to_csv("book_titles_and_descriptions.csv", index=False)
print("Dataset generated and saved to book_titles_and_descriptions.csv")

Dataset generated and saved to book_titles_and_descriptions.csv


## Step 3: Load the Dataset and Preprocess

In [13]:
import pandas as pd
from datasets import Dataset

# Load the dataset
df = pd.read_csv("book_titles_and_descriptions.csv")

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Concatenate title and description for text generation
def concatenate_title_description(example):
    return {"text": example["title"] + " - " + example["description"]}

dataset = dataset.map(concatenate_title_description)
dataset = dataset.train_test_split(test_size=0.2)
print(dataset)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'description', 'text'],
        num_rows: 4
    })
    test: Dataset({
        features: ['title', 'description', 'text'],
        num_rows: 1
    })
})


## Step 4: Tokenize with Labels

In [14]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set the padding token to be the EOS token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id  # Set pad_token_id explicitly

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize the dataset with labels
def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=50)
    # Create labels, which are the same as input_ids for language modeling
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

## Step 5: Train the Model

In [15]:
# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=2, training_loss=6.592367649078369, metrics={'train_runtime': 731.3578, 'train_samples_per_second': 0.005, 'train_steps_per_second': 0.003, 'total_flos': 102067200000.0, 'train_loss': 6.592367649078369, 'epoch': 1.0})

## Step 6: Generate Text

In [16]:
from transformers import GPT2LMHeadModel


# Generate text example
input_text = "The Forgotten Temple"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Create attention mask
attention_mask = (input_ids != tokenizer.pad_token_id).long()

# Generate text
output = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_return_sequences=1)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The Forgotten Temple

The Forgotten Temple is a dungeon in the Forgotten Temple. It is located in the Forgotten Temple.

The Forgotten Temple is a dungeon in the Forgotten Temple.

The Forgotten Forgotten Temple is a dungeon in the Forgotten Temple
