In [1]:
import torch
import numpy as np

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [3]:
# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# Prepare the dataset
train_path = "data/input.txt"  # Path to your training text file
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [7]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="output_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

In [9]:
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()



Step,Training Loss
500,3.7376


TrainOutput(global_step=660, training_loss=3.695005104758523, metrics={'train_runtime': 219.2689, 'train_samples_per_second': 12.04, 'train_steps_per_second': 3.01, 'total_flos': 172452741120000.0, 'train_loss': 3.695005104758523, 'epoch': 1.0})

In [10]:
# Save the model
output_path = "output_gpt2"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

('output_gpt2/tokenizer_config.json',
 'output_gpt2/special_tokens_map.json',
 'output_gpt2/vocab.json',
 'output_gpt2/merges.txt',
 'output_gpt2/added_tokens.json')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [18]:
output_path = "output_gpt2"  # The directory where the trained model was saved

# Load the trained model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(output_path)
model = GPT2LMHeadModel.from_pretrained(output_path)

# Prepare the input text
input_text = "To be, or not to be"  # You can use any text you'd like as a starting point
input_ids = tokenizer.encode(input_text, return_tensors="pt")

In [20]:
output = model.generate(input_ids, max_length=10, num_return_sequences=1, no_repeat_ngram_size=2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ValueError: num_return_sequences has to be 1, but is 10 when doing greedy search.

In [21]:
for i in range(1):
    print("in epoch {}".format(i))
    # Generate text using the model
    output = model.generate(input_ids, max_length=1024, num_return_sequences=1, no_repeat_ngram_size=2)

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    text_file = open("gpt2_sample.txt", "w")
    n = text_file.write(generated_text)
    text_file.close()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


in epoch 0


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [16]:
text_file = open("gpt2_sample.txt", "w")
n = text_file.write(generated_text)
text_file.close()

In [13]:
generated_text

'To be, or not to be?\n\nKING RICHARD II:\nI have no doubt, sir, that you are.\nBut, if you were, I would have you to your bed. I\nwould have your head, your hands, and your heart. You\nhave been so much in the way of my life,\nthat I have been unable to bear it. But, as I am, you\nare not so far from me. What, then, is'

In [17]:
1024*9

9216

In [22]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="vicgalle/gpt2-open-instruct-v1")

Downloading (…)lve/main/config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

In [23]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("vicgalle/gpt2-open-instruct-v1")
model = AutoModelForCausalLM.from_pretrained("vicgalle/gpt2-open-instruct-v1")

In [26]:
from transformers import Pipeline

In [28]:
#input_ids = tokenizer.encode(input_text, return_tensors="pt")
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

In [40]:
instruction_text = "generate text that imitates Shakespeare's work"

In [41]:
sequences = pipeline(
   instruction_text,
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [42]:
sequences[0]

{'generated_text': 'generate text that imitates Shakespeare\'s work.) If you would like to use the same word or phrase in a new text please provide the text as follows:\n\n"A man walks through a valley, and the wind blows fiercely and fiercely, until the sun goes down. A man stands before the river, and a man stands before the sea. A man walks through a forest, and his breath is like a river of blood. The wind beats fiercely, and the sun rises. A man stands before the river, and the river blows fiercely and fiercely. A man stands before the river, and the river blows fiercely and fiercely. A man stands before the river; and the river blows fiercely and fiercely, and the wind blows fiercely."\nA man walks through a valley, and the wind blows fiercely and fiercely, until the sun goes down. A man stands before the river, and a man stands before the sea. A man stands before the river, and the river blows fiercely and fiercely'}