# Lightweight Fine-Tuning Project

* PEFT technique: LoRA
* Model: GPT-2
* Evaluation approach: the evaluate method with a Hugging Face Trainer
* Fine-tuning dataset: imdb

## Loading and Evaluating a Foundation Model

Loading the chosen pre-trained Hugging Face model and evaluating its performance prior to fine-tuning. 

This includes loading an appropriate tokenizer and dataset.

In [3]:
# Imports


from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
import numpy as np

In [4]:
# Tokenizer


tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Model


model = AutoModelForSequenceClassification.from_pretrained("gpt2", torch_dtype="auto", 
                                                           num_labels=2,
                                                           id2label={0: "NEGATIVE", 1: "POSITIVE"},
                                                           label2id={"NEGATIVE": 0, "POSITIVE": 1},
                                                           pad_token_id=tokenizer.pad_token_id
                                                          )

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Dataset


# Load and separating the train and test splits of the imdb dataset
splits = ["train", "test"]
ds = {split: ds for split, ds in zip(splits, load_dataset("imdb", split=splits))}

# Thin out the dataset to make it run faster for this initial evaluation
for split in splits:
    ds[split] = ds[split].shuffle(seed=42).select(range(500))

    
# Show the dataset
ds

{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 500
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 500
 })}

In [7]:
# Tokenization


def preprocess_func(examples):
    """Preprocess the imdb dataset by returning tokenized examples."""
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True)
    return tokens

tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(preprocess_func, batched=True)

    
# Show the first example of the tokenized training set
print(tokenized_ds["train"][0]["input_ids"])

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

[1858, 318, 645, 8695, 379, 477, 1022, 6401, 959, 290, 4415, 5329, 475, 262, 1109, 326, 1111, 389, 1644, 2168, 546, 6590, 6741, 13, 4415, 5329, 3073, 42807, 11, 6401, 959, 3073, 6833, 13, 4415, 5329, 21528, 389, 2407, 2829, 13, 6401, 959, 338, 7110, 389, 1290, 517, 8253, 986, 6401, 959, 3073, 517, 588, 5537, 8932, 806, 11, 611, 356, 423, 284, 4136, 20594, 986, 383, 1388, 2095, 318, 4939, 290, 7650, 78, 11, 475, 423, 366, 27659, 40024, 590, 1911, 4380, 588, 284, 8996, 11, 284, 5052, 11, 284, 13446, 13, 1374, 546, 655, 13226, 30, 40473, 1517, 1165, 11, 661, 3597, 6401, 959, 3073, 1605, 475, 11, 319, 262, 584, 1021, 11, 11810, 484, 4702, 1605, 2168, 357, 10185, 737, 6674, 340, 338, 262, 3303, 11, 393, 262, 4437, 11, 475, 314, 892, 428, 2168, 318, 517, 3594, 621, 1605, 13, 2750, 262, 835, 11, 262, 10544, 389, 1107, 922, 290, 8258, 13, 383, 7205, 318, 407, 31194, 379, 477, 986, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50

In [8]:
# Evaluation metrics


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

In [9]:
# Evaluation 


# Training arguments
training_args = TrainingArguments(
    output_dir='./results_base',
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer instance 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'].rename_column('label', 'labels'),
    eval_dataset=tokenized_ds['test'].rename_column('label', 'labels'),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)



# Perform the evaluation
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 5.234320163726807, 'eval_accuracy': 0.508, 'eval_runtime': 44.2551, 'eval_samples_per_second': 11.298, 'eval_steps_per_second': 2.825}


## Performing Parameter-Efficient Fine-Tuning

Creating a PEFT model from the loaded model, running a training loop, and saving the PEFT model weights.

In [10]:
# Imports


from transformers import BitsAndBytesConfig
from peft import LoraConfig, TaskType, get_peft_model

In [11]:
# Base model 


model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
    pad_token_id=tokenizer.pad_token_id
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# PEFT Config


config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1,
    inference_mode=False,
    target_modules=["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"], 
    task_type=TaskType.SEQ_CLS,
)

In [13]:
# PEFT model


lora_model = get_peft_model(model, config)



In [14]:
# Checking the number of trainable parameters


lora_model.print_trainable_parameters()

trainable params: 1,182,720 || all params: 125,622,528 || trainable%: 0.9414871829358505


In [15]:
# Training


# Training arguments
training_args = TrainingArguments(
    output_dir='./results_peft',
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer 
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_ds['train'].rename_column('label', 'labels'),
    eval_dataset=tokenized_ds['test'].rename_column('label', 'labels'),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)


trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.550627,0.718
2,No log,0.399751,0.818


TrainOutput(global_step=250, training_loss=0.5463308715820312, metrics={'train_runtime': 337.643, 'train_samples_per_second': 2.962, 'train_steps_per_second': 0.74, 'total_flos': 529850695680000.0, 'train_loss': 0.5463308715820312, 'epoch': 2.0})

In [16]:
# Saving the model


lora_model.save_pretrained("/tmp/gpt-lora")

## Performing Inference with a PEFT Model

Loading the saved PEFT model weights and evaluating the performance of the trained PEFT model. 

This includes comparing the results to the results from prior to fine-tuning.

In [17]:
from peft import AutoPeftModelForSequenceClassification
import pandas as pd

In [18]:
# Loading the PEFT model weights


lora_model = AutoPeftModelForSequenceClassification.from_pretrained("/tmp/gpt-lora",
                                                                    num_labels=2,
                                                                    id2label={0: "NEGATIVE", 1: "POSITIVE"},
                                                                    label2id={"NEGATIVE": 0, "POSITIVE": 1},
                                                                    pad_token_id=tokenizer.pad_token_id
                                                                   )

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Evaluation


# Training arguments
training_args = TrainingArguments(
    output_dir='./results_peft_eval',
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer 
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_ds['train'].rename_column('label', 'labels'),
    eval_dataset=tokenized_ds['test'].rename_column('label', 'labels'),
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

# Perform the evaluation
lora_results = trainer.evaluate()

# Print the evaluation results
print(lora_results)

{'eval_loss': 0.399751216173172, 'eval_accuracy': 0.818, 'eval_runtime': 48.7094, 'eval_samples_per_second': 10.265, 'eval_steps_per_second': 2.566}


In [21]:
# Compare the results

df = pd.DataFrame([eval_results, lora_results], index=['Base Model', 'LoRA Model']).T
df['Diff (LoRA - Base Model)'] = df['LoRA Model'] - df['Base Model']

print(df)

                         Base Model  LoRA Model  Diff (LoRA - Base Model)
eval_loss                   5.23432    0.399751                 -4.834569
eval_accuracy               0.50800    0.818000                  0.310000
eval_runtime               44.25510   49.155600                  4.900500
eval_samples_per_second    11.29800   10.172000                 -1.126000
eval_steps_per_second       2.82500    2.543000                 -0.282000


In [22]:
# Accuracy increase

diff_eval_accuracy = df.loc['eval_accuracy', 'Diff (LoRA - Base Model)']

print(round(diff_eval_accuracy, 2))

0.31
