In [1]:
import torch
import numpy as np

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
config = GPT2Config.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name, config=config)

In [4]:
# Prepare the dataset
train_path = "input.txt"  # Path to your training text file
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)



In [5]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="output_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

In [6]:
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()



Step,Training Loss
500,3.7356


TrainOutput(global_step=660, training_loss=3.6942749948212597, metrics={'train_runtime': 108.8204, 'train_samples_per_second': 24.26, 'train_steps_per_second': 6.065, 'total_flos': 172452741120000.0, 'train_loss': 3.6942749948212597, 'epoch': 1.0})

In [7]:
# Save the model
output_path = "output_gpt2"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

('output_gpt2/tokenizer_config.json',
 'output_gpt2/special_tokens_map.json',
 'output_gpt2/vocab.json',
 'output_gpt2/merges.txt',
 'output_gpt2/added_tokens.json')

In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [23]:
output_path = "output_gpt2"  # The directory where the trained model was saved

# Load the trained model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(output_path)
model = GPT2LMHeadModel.from_pretrained(output_path)

# Prepare the input text
input_text = "Thou art to blame thyself for thy follies"  # You can use any text you'd like as a starting point
input_ids = tokenizer.encode(input_text, return_tensors="pt")

In [24]:
output = model.generate(input_ids, max_length=10, num_return_sequences=1, no_repeat_ngram_size=2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 11, but `max_length` is set to 10. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [25]:
for i in range(1):
    print("in epoch {}".format(i))
    # Generate text using the model
    output = model.generate(input_ids, max_length=1024, num_return_sequences=1, no_repeat_ngram_size=2)

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    text_file = open("gpt2_sample.txt", "w")
    n = text_file.write(generated_text)
    text_file.close()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


in epoch 0


In [26]:
text_file = open("gpt2_sample.txt", "w")
n = text_file.write(generated_text)
text_file.close()

In [27]:
generated_text

"Thou art to blame thyself for thy follies.\n\nKING RICHARD II:\nI am not to be blamed, but to have my revenge. I\nwill not be revenged, nor will I be accused. But\nif thou wilt, I will not. Thou art not my fault,\nbut my own. If thou art myself, thou shalt not blame me. For\nit is not mine fault that thou hast done this. O, my lord, what\nis thy fault?\nThy fault is mine, and thou wast not thy own;\nAnd thou didst not do this, for thou was not\nyour own, as thou were not your own: thou\ndidst do it, because thou hadst done it. Now, then, let me\nspeak to thee, that I may not speak to thy\nself. What is thy name? thou dost not know, sir? I am\na man of the world, a man that is a king. A man\nthat is my father, an heir to my throne. My\nfather is the king of France, the son of a prince. He\nwas a nobleman, who was a good man, whose\nowns were noble, noble and noble. And, if thou know not, he\nhad a son, which is his own son. This is\nmy father's name, this is your name. Come, come, go, 

In [14]:
1024*9

9216

In [15]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="vicgalle/gpt2-open-instruct-v1")

Downloading (…)lve/main/config.json: 100%|██████████| 908/908 [00:00<00:00, 473kB/s]
Downloading pytorch_model.bin: 100%|██████████| 510M/510M [00:09<00:00, 55.4MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 255/255 [00:00<00:00, 142kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 61.7MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 37.8MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 32.2MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 80.0/80.0 [00:00<00:00, 40.6kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 230/230 [00:00<00:00, 126kB/s]


In [16]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("vicgalle/gpt2-open-instruct-v1")
model = AutoModelForCausalLM.from_pretrained("vicgalle/gpt2-open-instruct-v1")

In [17]:
from transformers import Pipeline

In [18]:
#input_ids = tokenizer.encode(input_text, return_tensors="pt")
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

In [20]:
instruction_text = "generate dialogues that imitates Shakespeare's work"

In [21]:
sequences = pipeline(
   instruction_text,
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [22]:
sequences[0]

{'generated_text': "generate dialogues that imitates Shakespeare's work), the first time I heard of it is when a friend asked me to help him with his homework.\n\nYou know, he was trying to finish the first chapter of his novel, but I'm sure he just forgot something important. He's so busy with studying for the next chapter that it's hard to focus and remember things.\n\nSo I thought you could ask me something. I think you might be able to do it.\n\nOf course, let's do it together. I can make a plan. You could just tell me how much you want to spend on the first chapter, the number of chapters you've already worked on, the amount of time you've been working, and so on. It will be so much fun.\n\nBut don't worry about it, buddy. I will make sureقyou get through everything you need to do.\n\nI hope you like it. I'm here for"}