# Fine-Tuning GPT

In [5]:
import os
import math
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

## Environment
Check up environment settings

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"        
print(f"Using device: {device}")

Using device: cpu


In [9]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'    

## Data
Let's load data from `.csv` file

In [3]:
dataset = load_dataset("csv", data_files="data.csv")
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Bad_Practices', 'Good_Practices'],
        num_rows: 6712
    })
})

Display few samples

In [4]:
dataset['train'][0]

{'Bad_Practices': '<table alt=header>Title</table>',
 'Good_Practices': "<table alt='header'>Title</table>"}

In [7]:
dataset_train = pd.read_csv('./data.csv')
dataset_train.head()

Unnamed: 0,Bad_Practices,Good_Practices
0,<table alt=header>Title</table>,<table alt='header'>Title</table>
1,<tr>Content,<tr>Content</tr>
2,<h2 src='description'>Content,<h2 src='description'>Content</h2>
3,<table>Link,<table>Link</table>
4,<img src='description'>,<img src='description' alt=''>


## Model
Define model and tokenizer

In [10]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)



Get special tokens

In [11]:
special_tokens = tokenizer.special_tokens_map
print(special_tokens)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


Get trainable params

In [12]:
def get_num_trainable_params(model):
    total_params = 0
    total_trainable_params = 0

    for _ , params in model.named_parameters():
        total_params += params.numel()

    if params.requires_grad:
        total_trainable_params += params.numel()

    return f"Trainable Param = {total_trainable_params}\nTotal Params = {total_params}\n% of trainable params = {100*(total_trainable_params/total_params)}"

print(get_num_trainable_params(model))

Trainable Param = 768
Total Params = 124439808
% of trainable params = 0.0006171658509791337


Prepare tokenized dataset

In [14]:
# Set the pad_token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Split the dataset into training and validation sets
train_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 != 0])  # Use 90% of the data for training
val_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 == 0])  # Use 10% of the data for validation

# Tokenize the input and target sequences
def tokenize_function(examples):
    inputs = tokenizer(examples['Bad_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    labels = tokenizer(examples['Good_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': labels['input_ids']}

Apply tokenization to the datasets

In [18]:
tokenized_train_data = train_data.map(tokenize_function, batched=True)
tokenized_val_data = val_data.map(tokenize_function, batched=True)

In [19]:
tokenized_train_data

Dataset({
    features: ['Bad_Practices', 'Good_Practices', 'input_ids', 'labels'],
    num_rows: 6040
})

In [20]:
tokenized_val_data

Dataset({
    features: ['Bad_Practices', 'Good_Practices', 'input_ids', 'labels'],
    num_rows: 672
})

## Training
It is now time to fine-tune. We first of all define and set the training argruments and trainer parameters

In [22]:
training_args = TrainingArguments(
    output_dir='./gpt2-v1',
    overwrite_output_dir=True,
    num_train_epochs=0.5,
    per_device_train_batch_size=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_steps=100,
    logging_dir='./gpt2-log',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
)

Train/fine-tune Model

In [11]:
trainer.train()

  0%|          | 0/1510 [00:00<?, ?it/s]

{'loss': 0.1858, 'grad_norm': 0.6013973355293274, 'learning_rate': 4.668874172185431e-05, 'epoch': 0.03}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.021580364555120468, 'eval_runtime': 45.8685, 'eval_samples_per_second': 14.651, 'eval_steps_per_second': 1.831, 'epoch': 0.03}
{'loss': 0.0249, 'grad_norm': 0.5122058987617493, 'learning_rate': 4.337748344370861e-05, 'epoch': 0.07}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.019640089944005013, 'eval_runtime': 45.0156, 'eval_samples_per_second': 14.928, 'eval_steps_per_second': 1.866, 'epoch': 0.07}
{'loss': 0.0225, 'grad_norm': 0.5484597086906433, 'learning_rate': 4.006622516556292e-05, 'epoch': 0.1}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.018777277320623398, 'eval_runtime': 44.9996, 'eval_samples_per_second': 14.933, 'eval_steps_per_second': 1.867, 'epoch': 0.1}
{'loss': 0.022, 'grad_norm': 0.4539085328578949, 'learning_rate': 3.675496688741722e-05, 'epoch': 0.13}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.018603304401040077, 'eval_runtime': 45.0172, 'eval_samples_per_second': 14.928, 'eval_steps_per_second': 1.866, 'epoch': 0.13}
{'loss': 0.0201, 'grad_norm': 0.47286444902420044, 'learning_rate': 3.3443708609271526e-05, 'epoch': 0.17}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.017403988167643547, 'eval_runtime': 44.9919, 'eval_samples_per_second': 14.936, 'eval_steps_per_second': 1.867, 'epoch': 0.17}
{'loss': 0.0195, 'grad_norm': 0.7612916231155396, 'learning_rate': 3.0132450331125826e-05, 'epoch': 0.2}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.017642980441451073, 'eval_runtime': 44.9974, 'eval_samples_per_second': 14.934, 'eval_steps_per_second': 1.867, 'epoch': 0.2}
{'loss': 0.0193, 'grad_norm': 0.4788755774497986, 'learning_rate': 2.6821192052980134e-05, 'epoch': 0.23}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.016922488808631897, 'eval_runtime': 44.9803, 'eval_samples_per_second': 14.94, 'eval_steps_per_second': 1.867, 'epoch': 0.23}
{'loss': 0.0195, 'grad_norm': 0.6407566666603088, 'learning_rate': 2.3509933774834437e-05, 'epoch': 0.26}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01767270267009735, 'eval_runtime': 45.0077, 'eval_samples_per_second': 14.931, 'eval_steps_per_second': 1.866, 'epoch': 0.26}
{'loss': 0.0191, 'grad_norm': 0.4587347209453583, 'learning_rate': 2.0198675496688745e-05, 'epoch': 0.3}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.017101578414440155, 'eval_runtime': 44.9872, 'eval_samples_per_second': 14.938, 'eval_steps_per_second': 1.867, 'epoch': 0.3}
{'loss': 0.0193, 'grad_norm': 0.6490234136581421, 'learning_rate': 1.688741721854305e-05, 'epoch': 0.33}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.017166651785373688, 'eval_runtime': 45.0401, 'eval_samples_per_second': 14.92, 'eval_steps_per_second': 1.865, 'epoch': 0.33}
{'loss': 0.0183, 'grad_norm': 0.49436986446380615, 'learning_rate': 1.3576158940397351e-05, 'epoch': 0.36}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.016910996288061142, 'eval_runtime': 46.6092, 'eval_samples_per_second': 14.418, 'eval_steps_per_second': 1.802, 'epoch': 0.36}
{'loss': 0.018, 'grad_norm': 0.46442288160324097, 'learning_rate': 1.0264900662251655e-05, 'epoch': 0.4}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01680365391075611, 'eval_runtime': 45.2884, 'eval_samples_per_second': 14.838, 'eval_steps_per_second': 1.855, 'epoch': 0.4}
{'loss': 0.0179, 'grad_norm': 0.5831215381622314, 'learning_rate': 6.95364238410596e-06, 'epoch': 0.43}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01646830514073372, 'eval_runtime': 45.0171, 'eval_samples_per_second': 14.928, 'eval_steps_per_second': 1.866, 'epoch': 0.43}
{'loss': 0.0185, 'grad_norm': 0.5309569835662842, 'learning_rate': 3.642384105960265e-06, 'epoch': 0.46}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.016424572095274925, 'eval_runtime': 45.0727, 'eval_samples_per_second': 14.909, 'eval_steps_per_second': 1.864, 'epoch': 0.46}
{'loss': 0.0175, 'grad_norm': 0.6122424602508545, 'learning_rate': 3.3112582781456954e-07, 'epoch': 0.5}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01641058549284935, 'eval_runtime': 45.0827, 'eval_samples_per_second': 14.906, 'eval_steps_per_second': 1.863, 'epoch': 0.5}
{'train_runtime': 2740.826, 'train_samples_per_second': 1.102, 'train_steps_per_second': 0.551, 'train_loss': 0.03072101294204889, 'epoch': 0.5}


TrainOutput(global_step=1510, training_loss=0.03072101294204889, metrics={'train_runtime': 2740.826, 'train_samples_per_second': 1.102, 'train_steps_per_second': 0.551, 'total_flos': 789101936640000.0, 'train_loss': 0.03072101294204889, 'epoch': 0.5})

In [None]:
trainer.save_model()

## Inferencing

## Evaluation

In [12]:
eval_results = trainer.evaluate()
print(f'Perplexity: {math.exp(eval_results["eval_loss"]):.2f}')

  0%|          | 0/84 [00:00<?, ?it/s]

Perplexity: 1.02
