# Fine-Tuning Large Language Models

In [1]:
import os
import math
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

2024-10-05 19:58:27.988803: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Environment
Check up environment settings

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"        
print(f"Using device: {device}")

Using device: cpu


In [4]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'    
os.environ['TOKENIZERS_PARALLELISM'] = 'true'  

## Data
Let's load data from `.csv` file

In [5]:
dataset = load_dataset("csv", data_files="data.csv")
dataset

DatasetDict({
    train: Dataset({
        features: ['Bad_Practices', 'Good_Practices'],
        num_rows: 6712
    })
})

Display few samples

In [6]:
dataset['train'][0]

{'Bad_Practices': '<table alt=header>Title</table>',
 'Good_Practices': "<table alt='header'>Title</table>"}

In [7]:
dataset_train = pd.read_csv('./data.csv')
dataset_train.head()

Unnamed: 0,Bad_Practices,Good_Practices
0,<table alt=header>Title</table>,<table alt='header'>Title</table>
1,<tr>Content,<tr>Content</tr>
2,<h2 src='description'>Content,<h2 src='description'>Content</h2>
3,<table>Link,<table>Link</table>
4,<img src='description'>,<img src='description' alt=''>


## Model
Define model and tokenizer

In [8]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)



Get special tokens

In [9]:
special_tokens = tokenizer.special_tokens_map
print(special_tokens)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


Get trainable params

In [10]:
def get_num_trainable_params(model):
    total_params = 0
    total_trainable_params = 0

    for _ , params in model.named_parameters():
        total_params += params.numel()

    if params.requires_grad:
        total_trainable_params += params.numel()

    return f"Trainable Param = {total_trainable_params}\nTotal Params = {total_params}\n% of trainable params = {100*(total_trainable_params/total_params)}"

print(get_num_trainable_params(model))

Trainable Param = 768
Total Params = 124439808
% of trainable params = 0.0006171658509791337


Prepare tokenized dataset

In [11]:
# Set the pad_token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Split the dataset into training and validation sets
train_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 != 0])  # Use 90% of the data for training
val_data = dataset["train"].select([i for i in range(len(dataset["train"])) if i % 10 == 0])  # Use 10% of the data for validation

# Tokenize the input and target sequences
def tokenize_function(examples):
    inputs = tokenizer(examples['Bad_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    labels = tokenizer(examples['Good_Practices'], return_tensors='pt', padding='max_length', max_length=512, truncation=True)
    return {'input_ids': inputs['input_ids'], 'labels': labels['input_ids']}

Apply tokenization to the datasets

In [12]:
tokenized_train_data = train_data.map(tokenize_function, batched=True)
tokenized_val_data = val_data.map(tokenize_function, batched=True)

In [13]:
tokenized_train_data

Dataset({
    features: ['Bad_Practices', 'Good_Practices', 'input_ids', 'labels'],
    num_rows: 6040
})

In [14]:
tokenized_val_data

Dataset({
    features: ['Bad_Practices', 'Good_Practices', 'input_ids', 'labels'],
    num_rows: 672
})

## Training
It is now time to fine-tune. We first of all define and set the training argruments and trainer parameters

In [15]:
training_args = TrainingArguments(
    output_dir='./gpt2-v1',
    overwrite_output_dir=True,
    num_train_epochs=0.5,
    per_device_train_batch_size=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=500,
    logging_steps=100,
    logging_dir='./gpt2-log',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
)



Train/fine-tune Model

In [16]:
%%time
trainer.train()

  0%|          | 0/1510 [00:00<?, ?it/s]

{'loss': 0.1858, 'grad_norm': 0.6013979911804199, 'learning_rate': 4.668874172185431e-05, 'epoch': 0.03}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.021580373868346214, 'eval_runtime': 45.2643, 'eval_samples_per_second': 14.846, 'eval_steps_per_second': 1.856, 'epoch': 0.03}
{'loss': 0.0249, 'grad_norm': 0.5122044086456299, 'learning_rate': 4.337748344370861e-05, 'epoch': 0.07}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.019640132784843445, 'eval_runtime': 45.0179, 'eval_samples_per_second': 14.927, 'eval_steps_per_second': 1.866, 'epoch': 0.07}
{'loss': 0.0225, 'grad_norm': 0.5484234690666199, 'learning_rate': 4.006622516556292e-05, 'epoch': 0.1}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01877715066075325, 'eval_runtime': 45.0129, 'eval_samples_per_second': 14.929, 'eval_steps_per_second': 1.866, 'epoch': 0.1}
{'loss': 0.022, 'grad_norm': 0.45391297340393066, 'learning_rate': 3.675496688741722e-05, 'epoch': 0.13}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.018603404983878136, 'eval_runtime': 45.0215, 'eval_samples_per_second': 14.926, 'eval_steps_per_second': 1.866, 'epoch': 0.13}
{'loss': 0.0201, 'grad_norm': 0.4729527533054352, 'learning_rate': 3.3443708609271526e-05, 'epoch': 0.17}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.017404144629836082, 'eval_runtime': 45.1453, 'eval_samples_per_second': 14.885, 'eval_steps_per_second': 1.861, 'epoch': 0.17}
{'loss': 0.0195, 'grad_norm': 0.7584893107414246, 'learning_rate': 3.0132450331125826e-05, 'epoch': 0.2}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01764616183936596, 'eval_runtime': 45.2056, 'eval_samples_per_second': 14.865, 'eval_steps_per_second': 1.858, 'epoch': 0.2}
{'loss': 0.0193, 'grad_norm': 0.47831717133522034, 'learning_rate': 2.6821192052980134e-05, 'epoch': 0.23}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01692104898393154, 'eval_runtime': 45.1284, 'eval_samples_per_second': 14.891, 'eval_steps_per_second': 1.861, 'epoch': 0.23}
{'loss': 0.0195, 'grad_norm': 0.6444122791290283, 'learning_rate': 2.3509933774834437e-05, 'epoch': 0.26}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01767946407198906, 'eval_runtime': 45.1783, 'eval_samples_per_second': 14.874, 'eval_steps_per_second': 1.859, 'epoch': 0.26}
{'loss': 0.0191, 'grad_norm': 0.45875084400177, 'learning_rate': 2.0198675496688745e-05, 'epoch': 0.3}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01710532046854496, 'eval_runtime': 45.1137, 'eval_samples_per_second': 14.896, 'eval_steps_per_second': 1.862, 'epoch': 0.3}
{'loss': 0.0193, 'grad_norm': 0.6492225527763367, 'learning_rate': 1.688741721854305e-05, 'epoch': 0.33}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.017163880169391632, 'eval_runtime': 45.1676, 'eval_samples_per_second': 14.878, 'eval_steps_per_second': 1.86, 'epoch': 0.33}
{'loss': 0.0183, 'grad_norm': 0.4945014715194702, 'learning_rate': 1.3576158940397351e-05, 'epoch': 0.36}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01691201515495777, 'eval_runtime': 45.3291, 'eval_samples_per_second': 14.825, 'eval_steps_per_second': 1.853, 'epoch': 0.36}
{'loss': 0.018, 'grad_norm': 0.4645948112010956, 'learning_rate': 1.0264900662251655e-05, 'epoch': 0.4}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.016805116087198257, 'eval_runtime': 45.2274, 'eval_samples_per_second': 14.858, 'eval_steps_per_second': 1.857, 'epoch': 0.4}
{'loss': 0.0179, 'grad_norm': 0.5832744240760803, 'learning_rate': 6.95364238410596e-06, 'epoch': 0.43}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01646888628602028, 'eval_runtime': 45.3336, 'eval_samples_per_second': 14.823, 'eval_steps_per_second': 1.853, 'epoch': 0.43}
{'loss': 0.0185, 'grad_norm': 0.5311223268508911, 'learning_rate': 3.642384105960265e-06, 'epoch': 0.46}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.01642460562288761, 'eval_runtime': 45.2078, 'eval_samples_per_second': 14.865, 'eval_steps_per_second': 1.858, 'epoch': 0.46}
{'loss': 0.0175, 'grad_norm': 0.6129200458526611, 'learning_rate': 3.3112582781456954e-07, 'epoch': 0.5}


  0%|          | 0/84 [00:00<?, ?it/s]

{'eval_loss': 0.016410328447818756, 'eval_runtime': 45.3566, 'eval_samples_per_second': 14.816, 'eval_steps_per_second': 1.852, 'epoch': 0.5}
{'train_runtime': 1770.1844, 'train_samples_per_second': 1.706, 'train_steps_per_second': 0.853, 'train_loss': 0.030721422220697465, 'epoch': 0.5}
CPU times: user 9min 49s, sys: 1min 3s, total: 10min 53s
Wall time: 29min 31s


TrainOutput(global_step=1510, training_loss=0.030721422220697465, metrics={'train_runtime': 1770.1844, 'train_samples_per_second': 1.706, 'train_steps_per_second': 0.853, 'total_flos': 789101936640000.0, 'train_loss': 0.030721422220697465, 'epoch': 0.5})

In [17]:
trainer.save_model()

## Inferencing

## Evaluation

In [18]:
eval_results = trainer.evaluate()
print(f'Perplexity: {math.exp(eval_results["eval_loss"]):.2f}')

  0%|          | 0/84 [00:00<?, ?it/s]

Perplexity: 1.02
