In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


In [3]:
# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

# Define data loading function
def load_data(file_path, tokenizer, block_size=128):
    dataset = load_dataset('text', data_files={'train': file_path})
    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=block_size)
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_datasets['train']


In [4]:
# Load your custom dataset
train_dataset = load_data('data.txt', tokenizer)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [5]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

In [6]:
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [7]:
# Start training
trainer.train()

Step,Training Loss
500,0.177
1000,0.0
1500,0.0001


TrainOutput(global_step=1500, training_loss=0.05902491910383105, metrics={'train_runtime': 99.1369, 'train_samples_per_second': 15.131, 'train_steps_per_second': 15.131, 'total_flos': 97984512000000.0, 'train_loss': 0.05902491910383105, 'epoch': 100.0})

In [8]:
# Save the model and tokenizer
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [9]:
# After fine-tuning, load the fine-tuned model and tokenizer to generate text.

from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model_path = './fine_tuned_model'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [10]:
# Ensure the model is using GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Encode input text
input_text = "Hello, my name is"
inputs = tokenizer(input_text, return_tensors='pt').to(device)

In [12]:
# Generate text
outputs = model.generate(inputs['input_ids'], max_length=50)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is my name? Harsh Anand. In my next blog, I am working on LLM.

M.........................
