In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Fine-tuning starts from the pre-trained "gpt2" checkpoint. The configuration
# is fetched first and handed to the LM-head model explicitly; the tokenizer is
# loaded from the same checkpoint name so the two stay in sync.
model_name = "gpt2"
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Training data: a single plain-text file, read through TextDataset with the
# tokenizer loaded above and block_size=128.
# NOTE(review): TextDataset appears to be deprecated in recent transformers
# releases in favour of the `datasets` library -- confirm before upgrading.
train_path = "/content/input.txt"  # text file to fine-tune on

# mlm=False: do not apply the masked-language-modeling objective when collating.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

train_dataset = TextDataset(
    file_path=train_path,
    tokenizer=tokenizer,
    block_size=128,
)

# Run configuration for the Trainer.
training_args = TrainingArguments(
    # Where checkpoints are written; an existing directory is overwritten.
    output_dir="output",
    overwrite_output_dir=True,
    # Optimisation schedule: a single pass over the data, 4 samples per device.
    per_device_train_batch_size=4,
    num_train_epochs=1,
    # Checkpointing: save every 10,000 steps and keep at most 2 checkpoints.
    save_total_limit=2,
    save_steps=10_000,
)

# Wire the model, run configuration, dataset and collator into a Trainer,
# then launch fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Runs the training loop; progress is reported as a Step / Training Loss table.
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side in one directory
# so both can later be reloaded from that directory with `from_pretrained`.
# The relative path "output" is the same string passed as `output_dir` to
# TrainingArguments above.
output_path = "output"
model.save_pretrained(output_path)
# Keep this call last: as the cell's final expression, its return value (the
# tuple of tokenizer file paths that were written) is what the cell displays.
tokenizer.save_pretrained(output_path)





Step,Training Loss
500,3.7356


('output/tokenizer_config.json',
 'output/special_tokens_map.json',
 'output/vocab.json',
 'output/merges.txt',
 'output/added_tokens.json')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Directory holding the fine-tuned model and tokenizer. The training cell saves
# to the relative path "output", so the same relative path is used here instead
# of a hardcoded absolute location that only matches when the working
# directory happens to be /content.
output_path = "output"

# Load the trained model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(output_path)
model = GPT2LMHeadModel.from_pretrained(output_path)

# Prompt that seeds generation; any text can be used as a starting point.
input_text = "To be, or not to be"

# Calling the tokenizer directly (rather than `encode`) returns the
# `attention_mask` alongside `input_ids`, so both can be handed to `generate`.
encoded_prompt = tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt["input_ids"]

# Generate text using the model. The attention mask and pad token id are passed
# explicitly: without them `generate` warns that results may be unreliable and
# silently falls back to using the EOS token id for padding.
output = model.generate(
    input_ids,
    attention_mask=encoded_prompt["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    max_length=100,            # total length cap, prompt tokens included
    num_return_sequences=1,
    no_repeat_ngram_size=2,    # forbid repeating any 2-gram in the output
)

# Decode the single returned sequence back to text, dropping special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Input text:", input_text)
print("Generated text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input text: To be, or not to be
Generated text: To be, or not to be?

KING RICHARD II:
I have no doubt, sir, that you are.
But, if you were, I would not be so. I
would not have been so, for I am not so;
For I have not the power to do so: I do not
know what I shall do, nor what shall I. But, as I say,
You are not too much to ask me to answer. You
