# Fine-Tuning GPT

In [19]:
import os
import math
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"        
print(f"Using device: {device}")

Using device: cpu


In [6]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'    

Prepare Model

In [10]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)



Get special tokens

In [20]:
special_tokens = tokenizer.special_tokens_map
print(special_tokens)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}


Get trainable params

In [15]:
def get_num_trainable_params(model):
    total_params = 0
    total_trainable_params = 0

    for _ , params in model.named_parameters():
        total_params += params.numel()

    if params.requires_grad:
        total_trainable_params += params.numel()

    return f"Trainable Param = {total_trainable_params}\nTotal Params = {total_params}\n% of trainable params = {100*(total_trainable_params/total_params)}"

print(get_num_trainable_params(model))

Trainable Param = 768
Total Params = 124439808
% of trainable params = 0.0006171658509791337


In [17]:
# Load your dataset in a CSV file
dataset = load_dataset("csv", data_files="data.csv")
dataset

DatasetDict({
    train: Dataset({
        features: ['Bad_Practices', 'Good_Practices'],
        num_rows: 6712
    })
})

Prepare dataset

In [18]:
# Set the pad_token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Split the dataset into training and validation sets
train_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 != 0])  # Use 90% of the data for training
val_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 == 0])  # Use 10% of the data for validation

# Tokenize the input and target sequences
def tokenize_function(examples):
    inputs = tokenizer(examples['Bad_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    labels = tokenizer(examples['Good_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': labels['input_ids']}

# Apply tokenization to the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

Define training model

In [12]:
training_args = TrainingArguments(
    output_dir='./model',
    overwrite_output_dir=True,
    num_train_epochs=0.5,
    per_device_train_batch_size=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_steps=100,
    logging_dir='./logs',
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Train/fine-tune Model

In [None]:
trainer.train()

Evaluate

In [None]:
eval_results = trainer.evaluate()
print(f'Perplexity: {math.exp(eval_results["eval_loss"]):.2f}')

Save model

In [None]:
trainer.save_model()