## Lightweight Fine-Tuning Project

##### Checking PyTorch and CUDA Version, and GPU Availability

In [1]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Is GPU available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU Device Name: {torch.cuda.get_device_name(0)}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}\n")

PyTorch version: 2.4.1+cu124
CUDA version: 12.4
Is GPU available: True
GPU Device Name: NVIDIA GeForce RTX 2070
Device: cuda



##### Importing Required Libraries for Data Loading and GPT-2 Model Setup

In [2]:
from tqdm import tqdm
import pandas as pd

from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

#### Loading and Evaluating a Foundation Model

##### Loading and Preparing AG News Dataset for Classification

In [3]:
# set seed
seed = 42

# load the dataset https://huggingface.co/datasets/fancyzhx/ag_news
try:
    dataset = load_dataset('ag_news', split=['train', 'test'])
    print("Dataset loaded successfully.")
except Exception as ex:
    print(f"An error occurred while loading the dataset: {ex}")
# NOTE: ag_news is a dataset of news articles classified into four categories: World, Sports, Business, and Sci/Tech.

# print the dataset
# NOTE: training examples: 120,000; test examples: 7,600
print(dataset, '\n\n')

# shuffle the training dataset 
num_train_samples = None  # use all training samples
num_test_samples = None  # use all test samples

# get training examples
if num_train_samples is None:
    train_dataset = dataset[0].shuffle(seed=seed)
else:
    train_dataset = dataset[0].shuffle(seed=seed).select(range(num_train_samples))

# get test examples
if num_test_samples is None:
    test_dataset = dataset[1]
else:
    test_dataset = dataset[1].shuffle(seed=seed).select(range(num_test_samples))

# length of the training and test datasets
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of test examples: {len(test_dataset)}")

# get number of classes
num_classes = train_dataset.features['label'].num_classes
print(f"Number of classes: {num_classes}")

Dataset loaded successfully.
[Dataset({
    features: ['text', 'label'],
    num_rows: 120000
}), Dataset({
    features: ['text', 'label'],
    num_rows: 7600
})] 


Number of training examples: 120000
Number of test examples: 7600
Number of classes: 4


##### Selecting and Displaying a Subset of AG News Training Dataset with Class Labels

In [4]:
# select a subset of the training dataset
num_examples = 10
items = train_dataset.select(range(num_examples))

# class labels
class_labels = {0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}

# create a dataframe from the dataset
dataset_dict = { 
    "text": [item['text'] for item in items],
    "label": [item['label'] for item in items],
    "class": [class_labels[item['label']] for item in items]
}
df = pd.DataFrame(dataset_dict)

# display the dataframe
pd.set_option('display.max_colwidth', None)
df.head(num_examples)

Unnamed: 0,text,label,class
0,"Bangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.",0,World
1,Desiring Stability Redskins coach Joe Gibbs expects few major personnel changes in the offseason and wants to instill a culture of stability in Washington.,1,Sports
2,"Will Putin #39;s Power Play Make Russia Safer? Outwardly, Russia has not changed since the barrage of terrorist attacks that culminated in the school massacre in Beslan on Sept.",0,World
3,U2 pitches for Apple New iTunes ads airing during baseball games Tuesday will feature the advertising-shy Irish rockers.,3,Sci/Tech
4,S African TV in beheading blunder Public broadcaster SABC apologises after news bulletin shows footage of American beheaded in Iraq.,0,World
5,"A Cosmic Storm: When Galaxy Clusters Collide Astronomers have found what they are calling the perfect cosmic storm, a galaxy cluster pile-up so powerful its energy output is second only to the Big Bang.",3,Sci/Tech
6,West sets deadline for Iran to freeze uranium enrichment Four western countries set the scene yesterday for a showdown with Iran by demanding that it freeze its uranium enrichment activities immediately.,0,World
7,"Computer Assoc. Cuts 800 Jobs Worldwide (AP) AP - Computer Associates International Inc. announced a restructuring plan Wednesday that would reduce its work force by 800 people worldwide, saving the business software maker #36;70 million annually once the plan is fully implemented.",3,Sci/Tech
8,"CA Opens Utility Pricing for Mainframes Keeping its promise to migrate toward more flexible pricing for its software, Computer Associates (Quote, Chart) has unleashed Measured Workload Pricing for its mainframe management products.",3,Sci/Tech
9,"Economy builds steam in KC Fed district The economy continued to strengthen in September and early October in the Great Plains and Rocky Mountain regions covered by the Tenth Federal Reserve District, the Federal Reserve Bank of Kansas City said Wednesday.",2,Business


##### Loading GPT-2 Tokenizer and Sequence Classification Model with Padding Token Setup

In [5]:
# load tokenizer and model
try:
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=num_classes)
    print("Tokenizer and model loaded successfully.")
except Exception as ex:
    print(f"An error occurred while loading the tokenizer and model: {ex}")

# add padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # add padding token to tokenizer
model.resize_token_embeddings(len(tokenizer))  # resize token embeddings in model
model.config.pad_token_id = tokenizer.pad_token_id  # set pad token id in model config

# move the model to the device
model.to(device)

# Verify the padding token
print(f"Padding token: {tokenizer.pad_token}")
print(f"Padding token ID: {tokenizer.pad_token_id}\n")

# no. of trainable parameters
print(f"Number of trainable parameters: {model.num_parameters()}\n")
# NOTE: number of trainable parameters: 124,443,652

# print the model architecture
print(model)
# NOTE: last layer (score) is newly initialized with random weights

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizer and model loaded successfully.
Padding token: [PAD]
Padding token ID: 50257

Number of trainable parameters: 124443648

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, 

##### Tokenizing and Padding the First Five Training Examples

In [6]:
# first five training examples
texts = df['text'].tolist()[:5]

# Tokenize the text with padding
encoded_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Print the tokenized inputs
input_ids = encoded_inputs['input_ids']
attention_mask = encoded_inputs['attention_mask']
print(f"Input IDs shape: {input_ids.shape}")
print(f"Input IDs:\n{input_ids}\n")
print(f"Attention Mask shape: {attention_mask.shape}")
print(f"Attention Mask:\n{attention_mask}\n")

Input IDs shape: torch.Size([5, 39])
Input IDs:
tensor([[43984,    75, 13410,  1582, 47557,   416,  8956, 29560,  7941,   423,
          3181,   867, 11684,   290,  4736,   287, 19483,   284,   257, 17369,
            11,   262,  1110,   706,  1248,   661,  3724,   287, 23171,   379,
           257,  1964,  7903,    13, 50257, 50257, 50257, 50257, 50257],
        [ 5960,  3428, 47865, 22038,  3985,  5689, 41071, 13423,  1178,  1688,
          8213,  2458,   287,   262, 16349,   290,  3382,   284,   916,   359,
           257,  3968,   286, 10159,   287,  2669,    13, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257],
        [ 8743,  8144,  1303,  2670,    26,    82,  4333,  3811,  6889,  3284,
          6895,   263,    30,  3806,   904,   306,    11,  3284,   468,   407,
          3421,  1201,   262, 33633,   286,  7417,  3434,   326, 45200,   287,
           262,  1524, 19050,   287, 30837,  9620,   319,  2362,    13],
        [   52,    17,

##### Tokenization, DataLoader Setup, and Model Training/Evaluation Functions

In [7]:
# tokenize function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)


# create DataLoader
def get_dataloader(dataset, batch_size=8):
    input_ids = torch.tensor(dataset['input_ids'])
    attention_mask = torch.tensor(dataset['attention_mask'])
    labels = torch.tensor(dataset['label'])

    dataset = TensorDataset(input_ids, attention_mask, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    return dataloader


# training loop
def train(model: torch.nn.Module, dataloader: DataLoader, optimizer: torch.optim.Optimizer, 
          scheduler: torch.optim.lr_scheduler.LambdaLR, device: torch.device, print_every: int = 250):
    model.train()

    # initialize variable to store total loss
    total_loss = 0
    num_batches = len(dataloader)

    for batch_idx, batch in enumerate(tqdm(dataloader, desc="Training")):
        # move batch to device and unpack
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # zero the gradients
        optimizer.zero_grad()
        # forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = torch.nn.CrossEntropyLoss()(logits, labels)

        # backward pass and optimization step
        loss.backward()
        optimizer.step()
        scheduler.step()

        # update total loss
        total_loss += loss.item()

        # print loss periodically
        if batch_idx % print_every == 0:
            avg_loss = total_loss / (batch_idx + 1)
            print(f"Batch {batch_idx}/{num_batches}, Loss: {avg_loss:.4f}")

    # calculate average loss
    avg_loss = total_loss / num_batches
    return avg_loss


# evaluation loop
def evaluate(model: torch.nn.Module, dataloader: DataLoader, device: torch.device):
    model.eval()
    # initialize variables to store loss and accuracy
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluation"):
            # move batch to device and unpack
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            # forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = torch.nn.CrossEntropyLoss()(logits, labels)

            # update counts
            total_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    # calculate average loss and accuracy
    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy


# print model architecture and number of trainable parameters
def print_model(model: torch.nn.Module, num_show_parameters=3):
    # print model architecture
    print(model)
    print(f"Number of trainable parameters: {model.num_parameters()}\n")

    # print model parameters
    for name, param in model.named_parameters():
        print(f"Parameter: {name}, Trainable: {param.requires_grad}")
        params_head = param.flatten()[:num_show_parameters].tolist()
        params_head = [f"{p:.4f}" for p in params_head]
        print(f"Size: {param.size()}; First three parameters: {params_head}", end="\n\n")

##### Evaluating Model Accuracy on Training and Test Datasets

In [8]:
# evaluate the model on training and test datasets
batch_size = 8

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
train_dataloader = get_dataloader(tokenized_train_dataset, batch_size=batch_size)
# _, train_accuracy = evaluate(model, train_dataloader, device)
# print(f"Train Accuracy: {train_accuracy:.4f}")

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataloader = get_dataloader(tokenized_test_dataset, batch_size=batch_size)
_, test_accuracy = evaluate(model, test_dataloader, device)
print(f"Test Accuracy: {test_accuracy:.4f}")

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Evaluation: 100%|██████████| 950/950 [02:59<00:00,  5.30it/s]

Test Accuracy: 0.2461





#### Performing Parameter-Efficient Fine-Tuning

##### Configuring and Initializing LoRA Model for Training

In [9]:
from peft import LoraConfig, get_peft_model
from transformers import AdamW, get_linear_schedule_with_warmup


# configure LoRA model
lora_config = LoraConfig(
    r=8,  # rank of the low-rank adaptation
    lora_alpha=32,  # scaling factor
    lora_dropout=0.1,  # dropout rate
    target_modules=["attn.c_attn", "attn.c_proj"]  # target modules to adapt
)

# get LoRA model
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()
lora_model = lora_model.to(device)

# set last layer to be trainable
print("\n========== Trainable Layers ==========")
for name, param in lora_model.named_parameters():
    if "score" in name:
        param.requires_grad = True
    
    # print name of layer with trainable parameters
    if param.requires_grad:
        print(f"Layer Name: {name}")



trainable params: 442,368 || all params: 124,886,016 || trainable%: 0.3542

Layer Name: base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight
Layer Name: base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight
Layer Name: base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight
Layer Name: base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight
Layer Name: base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight
Layer Name: base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight
Layer Name: base_model.model.transformer.h.1.attn.c_proj.lora_A.default.weight
Layer Name: base_model.model.transformer.h.1.attn.c_proj.lora_B.default.weight
Layer Name: base_model.model.transformer.h.2.attn.c_attn.lora_A.default.weight
Layer Name: base_model.model.transformer.h.2.attn.c_attn.lora_B.default.weight
Layer Name: base_model.model.transformer.h.2.attn.c_proj.lora_A.default.weight
Layer Name: base_model.model.transformer.h.2.attn.c_pro

##### Display LoRA Model Architecture and Trainable Parameters Before Training

In [10]:
# show model before training
print_model(lora_model)

PeftModel(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lo

##### Training and Evaluating the LoRA Model

In [11]:
# training parameters
num_epochs = 1
learning_rate = 2e-5
num_warmup_steps = 0

# define optimizer and scheduler
optimizer = AdamW(lora_model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs  # total number of training steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=total_steps)

# training and evaluation
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train_loss = train(lora_model, train_dataloader, optimizer, scheduler, device)
    print(f"Train Loss: {train_loss:.4f}")

    eval_loss, eval_accuracy = evaluate(lora_model, test_dataloader, device)
    print(f"Eval Loss: {eval_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}")



Epoch 1/1


Training:   0%|          | 1/15000 [00:00<2:32:39,  1.64it/s]

Batch 0/15000, Loss: 6.1291


Training:   2%|▏         | 251/15000 [02:22<2:21:09,  1.74it/s]

Batch 250/15000, Loss: 2.6859


Training:   3%|▎         | 501/15000 [04:46<2:19:28,  1.73it/s]

Batch 500/15000, Loss: 2.0398


Training:   5%|▌         | 751/15000 [07:10<2:16:37,  1.74it/s]

Batch 750/15000, Loss: 1.7247


Training:   7%|▋         | 1001/15000 [09:34<2:15:47,  1.72it/s]

Batch 1000/15000, Loss: 1.5148


Training:   8%|▊         | 1251/15000 [11:57<2:08:37,  1.78it/s]

Batch 1250/15000, Loss: 1.3501


Training:  10%|█         | 1501/15000 [14:18<2:06:54,  1.77it/s]

Batch 1500/15000, Loss: 1.2077


Training:  12%|█▏        | 1751/15000 [16:42<2:09:03,  1.71it/s]

Batch 1750/15000, Loss: 1.0957


Training:  13%|█▎        | 2001/15000 [19:07<2:04:33,  1.74it/s]

Batch 2000/15000, Loss: 1.0109


Training:  15%|█▌        | 2251/15000 [21:31<2:03:46,  1.72it/s]

Batch 2250/15000, Loss: 0.9421


Training:  17%|█▋        | 2501/15000 [23:55<1:59:43,  1.74it/s]

Batch 2500/15000, Loss: 0.8838


Training:  18%|█▊        | 2751/15000 [26:19<1:57:46,  1.73it/s]

Batch 2750/15000, Loss: 0.8382


Training:  20%|██        | 3001/15000 [28:44<1:56:45,  1.71it/s]

Batch 3000/15000, Loss: 0.7956


Training:  22%|██▏       | 3251/15000 [31:10<1:54:24,  1.71it/s]

Batch 3250/15000, Loss: 0.7611


Training:  23%|██▎       | 3501/15000 [33:35<1:50:28,  1.73it/s]

Batch 3500/15000, Loss: 0.7314


Training:  25%|██▌       | 3751/15000 [36:00<1:47:33,  1.74it/s]

Batch 3750/15000, Loss: 0.7035


Training:  27%|██▋       | 4001/15000 [38:25<1:46:43,  1.72it/s]

Batch 4000/15000, Loss: 0.6812


Training:  28%|██▊       | 4251/15000 [40:49<1:43:28,  1.73it/s]

Batch 4250/15000, Loss: 0.6595


Training:  30%|███       | 4501/15000 [43:13<1:41:09,  1.73it/s]

Batch 4500/15000, Loss: 0.6394


Training:  32%|███▏      | 4751/15000 [45:39<1:42:20,  1.67it/s]

Batch 4750/15000, Loss: 0.6224


Training:  33%|███▎      | 5001/15000 [48:04<1:35:41,  1.74it/s]

Batch 5000/15000, Loss: 0.6054


Training:  35%|███▌      | 5251/15000 [50:30<1:34:18,  1.72it/s]

Batch 5250/15000, Loss: 0.5919


Training:  37%|███▋      | 5501/15000 [52:54<1:31:27,  1.73it/s]

Batch 5500/15000, Loss: 0.5791


Training:  38%|███▊      | 5751/15000 [55:19<1:29:35,  1.72it/s]

Batch 5750/15000, Loss: 0.5672


Training:  40%|████      | 6001/15000 [57:44<1:27:08,  1.72it/s]

Batch 6000/15000, Loss: 0.5563


Training:  42%|████▏     | 6251/15000 [1:00:08<1:23:26,  1.75it/s]

Batch 6250/15000, Loss: 0.5455


Training:  43%|████▎     | 6501/15000 [1:02:31<1:20:44,  1.75it/s]

Batch 6500/15000, Loss: 0.5360


Training:  45%|████▌     | 6751/15000 [1:04:56<1:18:41,  1.75it/s]

Batch 6750/15000, Loss: 0.5257


Training:  47%|████▋     | 7001/15000 [1:07:22<1:19:23,  1.68it/s]

Batch 7000/15000, Loss: 0.5174


Training:  48%|████▊     | 7251/15000 [1:09:51<1:16:43,  1.68it/s]

Batch 7250/15000, Loss: 0.5102


Training:  50%|█████     | 7501/15000 [1:12:20<1:14:20,  1.68it/s]

Batch 7500/15000, Loss: 0.5032


Training:  52%|█████▏    | 7751/15000 [1:14:48<1:10:54,  1.70it/s]

Batch 7750/15000, Loss: 0.4964


Training:  53%|█████▎    | 8001/15000 [1:17:15<1:07:40,  1.72it/s]

Batch 8000/15000, Loss: 0.4902


Training:  55%|█████▌    | 8251/15000 [1:19:41<1:04:59,  1.73it/s]

Batch 8250/15000, Loss: 0.4845


Training:  57%|█████▋    | 8501/15000 [1:22:05<1:02:34,  1.73it/s]

Batch 8500/15000, Loss: 0.4793


Training:  58%|█████▊    | 8751/15000 [1:24:28<59:13,  1.76it/s]  

Batch 8750/15000, Loss: 0.4741


Training:  60%|██████    | 9001/15000 [1:26:51<57:13,  1.75it/s]

Batch 9000/15000, Loss: 0.4684


Training:  62%|██████▏   | 9251/15000 [1:29:14<54:44,  1.75it/s]

Batch 9250/15000, Loss: 0.4633


Training:  63%|██████▎   | 9501/15000 [1:31:37<52:31,  1.74it/s]

Batch 9500/15000, Loss: 0.4584


Training:  65%|██████▌   | 9751/15000 [1:34:00<50:09,  1.74it/s]

Batch 9750/15000, Loss: 0.4540


Training:  67%|██████▋   | 10001/15000 [1:36:23<47:31,  1.75it/s]

Batch 10000/15000, Loss: 0.4495


Training:  68%|██████▊   | 10251/15000 [1:38:45<45:11,  1.75it/s]

Batch 10250/15000, Loss: 0.4454


Training:  70%|███████   | 10501/15000 [1:41:08<42:34,  1.76it/s]

Batch 10500/15000, Loss: 0.4409


Training:  72%|███████▏  | 10751/15000 [1:43:32<40:14,  1.76it/s]

Batch 10750/15000, Loss: 0.4375


Training:  73%|███████▎  | 11001/15000 [1:45:55<38:01,  1.75it/s]

Batch 11000/15000, Loss: 0.4338


Training:  75%|███████▌  | 11251/15000 [1:48:18<35:43,  1.75it/s]

Batch 11250/15000, Loss: 0.4302


Training:  77%|███████▋  | 11501/15000 [1:50:41<33:21,  1.75it/s]

Batch 11500/15000, Loss: 0.4271


Training:  78%|███████▊  | 11751/15000 [1:53:04<31:09,  1.74it/s]

Batch 11750/15000, Loss: 0.4238


Training:  80%|████████  | 12001/15000 [1:55:28<28:53,  1.73it/s]

Batch 12000/15000, Loss: 0.4202


Training:  82%|████████▏ | 12251/15000 [1:57:51<26:22,  1.74it/s]

Batch 12250/15000, Loss: 0.4174


Training:  83%|████████▎ | 12501/15000 [2:00:15<23:57,  1.74it/s]

Batch 12500/15000, Loss: 0.4146


Training:  85%|████████▌ | 12751/15000 [2:02:38<21:27,  1.75it/s]

Batch 12750/15000, Loss: 0.4113


Training:  87%|████████▋ | 13001/15000 [2:05:02<19:14,  1.73it/s]

Batch 13000/15000, Loss: 0.4087


Training:  88%|████████▊ | 13251/15000 [2:07:26<16:38,  1.75it/s]

Batch 13250/15000, Loss: 0.4069


Training:  90%|█████████ | 13501/15000 [2:09:50<14:32,  1.72it/s]

Batch 13500/15000, Loss: 0.4044


Training:  92%|█████████▏| 13751/15000 [2:12:16<12:21,  1.68it/s]

Batch 13750/15000, Loss: 0.4020


Training:  93%|█████████▎| 14001/15000 [2:14:44<09:34,  1.74it/s]

Batch 14000/15000, Loss: 0.3998


Training:  95%|█████████▌| 14251/15000 [2:17:06<07:09,  1.74it/s]

Batch 14250/15000, Loss: 0.3976


Training:  97%|█████████▋| 14501/15000 [2:19:31<04:42,  1.77it/s]

Batch 14500/15000, Loss: 0.3952


Training:  98%|█████████▊| 14751/15000 [2:21:55<02:21,  1.76it/s]

Batch 14750/15000, Loss: 0.3935


Training: 100%|██████████| 15000/15000 [2:24:15<00:00,  1.73it/s]


Train Loss: 0.3913


Evaluation: 100%|██████████| 950/950 [03:24<00:00,  4.66it/s]

Eval Loss: 0.2689, Eval Accuracy: 0.9059





##### Final Model Evaluation and Saving

In [12]:
# show model after training
print_model(lora_model)


# save models
model_path = "lora_gpt2_ag_news"
lora_model.save_pretrained(f"{model_path}/lora_adapter")
# save the score weights (output layer) separately
torch.save(lora_model.score.weight, f"{model_path}/score_weights.pth")
print(f"Model saved at: {model_path}")

# delete models and variables
del model, lora_model, optimizer, scheduler, tokenizer, lora_config

PeftModel(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lo



Model saved at: lora_gpt2_ag_news


#### Performing Inference with a PEFT Model

##### Loading and Preparing the Fine-Tuned LoRA Model

In [13]:
from safetensors.torch import load_file


# load tokenizer and model
new_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
new_model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=num_classes)

# add padding token
new_tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # add padding token to tokenizer
new_model.resize_token_embeddings(len(new_tokenizer))  # resize token embeddings in model
new_model.config.pad_token_id = new_tokenizer.pad_token_id  # set pad token id in model config

# recall model path
model_path = "lora_gpt2_ag_news"
# load LoRA config and weights
loaded_lora_config = LoraConfig.from_pretrained(f"{model_path}/lora_adapter")
weights = load_file(f"{model_path}/lora_adapter/adapter_model.safetensors")

# get LoRA model
loaded_lora_model = get_peft_model(new_model, loaded_lora_config)

# replace model weights with loaded LoRA weights
for name, param in loaded_lora_model.named_parameters():
    # name e.g. 'base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight'
    if 'lora_' in name:
        assert 'default' in name, f"Layer {name} is not a default layer"
        new_name = name.replace('.default', '')  # remove '.default' from the name
        param.data.copy_(weights[new_name])  # replace the weights

# replace score weights (output layer)
score_weights = torch.load(f"{model_path}/score_weights.pth")
loaded_lora_model.score.weight.data.copy_(score_weights)

# move model to specified device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_lora_model = loaded_lora_model.to(device)
print_model(loaded_lora_model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  score_weights = torch.load(f"{model_path}/score_weights.pth")


PeftModel(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lo

##### Evaluating the Loaded LoRA Model and Comparing Performance

In [16]:
# evaluate loaded model
loaded_model_loss, loaded_model_accuracy = evaluate(loaded_lora_model, test_dataloader, device)
print(f"Loaded Model Loss: {loaded_model_loss:.4f}, Loaded Model Accuracy: {loaded_model_accuracy:.4f}", end="\n\n")

# compare performance of original and trained models
print("Compare performance of original and trained models")
print(f"Original Model Accuracy: {test_accuracy * 100:.2f}%")
print(f"Trained Model Accuracy: {loaded_model_accuracy * 100:.2f}%")

Evaluation: 100%|██████████| 950/950 [03:18<00:00,  4.78it/s]

Loaded Model Loss: 0.2689, Loaded Model Accuracy: 0.9059

Compare performance of original and trained models
Original Model Accuracy: 24.61%
Trained Model Accuracy: 90.59%



