# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

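* PEFT technique: LoRA (`peft.LoraConfig` with `r=32`, `lora_alpha=32`, `lora_dropout=0.1`)
* Model: GPT-2 (`openai-community/gpt2`) loaded with a 4-class sequence-classification head
* Evaluation approach: classification accuracy from `Trainer.evaluate()` on a 200-row sample of the test split, measured before and after fine-tuning
* Fine-tuning dataset: `SetFit/ag_news`, using an 800-row sample of the train split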

In [None]:
! pip install -q "scikit-learn~=1.6" \
    "datasets==3.3.2" "huggingface-hub==0.29.1" \
    "transformers==4.49.0" "evaluate==0.4.3" \
    "peft==0.14.0"

In [None]:
import torch

# Hugging Face Hub ids of the base checkpoint and the dataset, plus the dataset
# splits that the later cells iterate over.
model_name = "openai-community/gpt2"
dataset_name = "SetFit/ag_news"
dataset_splits = ["train", "test"]


## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [None]:
from datasets import load_dataset

# The ag_news dataset is split into 120k train rows and 7.6k test rows.
dataset = load_dataset(dataset_name)

# Keep a small, reproducibly shuffled sample of each split so the run fits in
# memory (and finishes faster).
for split_name, row_count in (("train", 800), ("test", 200)):
    dataset[split_name] = dataset[split_name].shuffle(seed=8).select(range(row_count))

# Show the schema and the first record of each sampled split.
print("train =", dataset["train"])
print("train[0] =", dataset["train"][0])
print("--------------------")
print("test =", dataset["test"])
print("test[0] =", dataset["test"][0])


In [None]:
from transformers import AutoTokenizer

# https://huggingface.co/docs/transformers/en/model_doc/gpt2#usage-tips
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
# Reuse the existing end-of-sequence token as the padding token. Registering a
# brand-new '<|pad|>' token would give it an id one past the end of the
# pretrained embedding matrix; the model is never resized to match, so any
# padded batch would index outside the embedding table.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize every split. No padding is applied here; the tokenizer call only
# truncates, and batches are padded later by the data collator.
tokenized_dataset = {}
for split in dataset_splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda row: tokenizer(row["text"], truncation=True), batched=True
    )

# inspect the special token ids (both should now be the same id)
print(f"{tokenizer.eos_token_id=}")
print(f"{tokenizer.pad_token_id=}")

# inspect the columns in the tokenized dataset
print(f"{tokenized_dataset['train']=}")
print(f"{tokenized_dataset['test']=}")

In [None]:
from transformers import AutoModelForSequenceClassification

# Load GPT-2 with a 4-way classification head; the two label maps are inverses
# of each other and the pad token id comes from the tokenizer configured above.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    id2label={0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
    label2id={"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3},
    pad_token_id=tokenizer.pad_token_id,
)

# Freeze every parameter of the base model.
# see: https://huggingface.co/transformers/v4.2.2/training.html
for base_param in model.base_model.parameters():
    base_param.requires_grad = False

print(model)

In [None]:
import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

# Load the "accuracy" metric through the evaluate library; compute_metrics
# below delegates the scoring to it.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the accuracy metric.

    Args:
        eval_pred: a pair ``(predictions, labels)``; the predicted class of
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the predicted
        classes against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )


# Training configuration for the baseline run. The training hyperparameters are
# kept for completeness even though this notebook only evaluates this model.
baseline_args = TrainingArguments(
    output_dir="./data/news_gpt2",
    # optimisation settings
    learning_rate=0.005,
    weight_decay=0.01,
    num_train_epochs=3,
    # per-device batch sizes for training and evaluation
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # evaluate and checkpoint once per epoch, keeping the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, data, tokenizer, padding collator and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=baseline_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


In [None]:
# we are interested in the performance of the base model,
# so will not train with this dataset.
# trainer.train()

In [None]:
# Score the classifier on the evaluation split without any training
# (the trainer.train() call in the previous cell is commented out).
trainer.evaluate()

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [None]:
from transformers import AutoTokenizer

# Pad on the left and reuse the end-of-sequence token as the padding token.
# https://huggingface.co/docs/transformers/en/model_doc/gpt2#usage-tips
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

def row_processor(row):
    """Tokenize a batch of rows for the sequence-classification model.

    Args:
        row: a batch as passed by ``Dataset.map(..., batched=True)``; only
            ``row["text"]`` is read.

    Returns:
        The tokenizer output for the texts, truncated and padded to exactly
        256 tokens.
    """
    # Deliberately no "labels" key here. Copying input_ids into "labels" is the
    # causal-LM convention and is wrong for a 4-class classifier: the targets
    # must be the class ids that the dataset already carries in its own label
    # column, which the data collator hands to the model as "labels".
    return tokenizer(
        row["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

# Run row_processor over every configured split, in batches.
tokenized_dataset = {
    split: dataset[split].map(row_processor, batched=True)
    for split in dataset_splits
}

# inspect the special token ids
print(f"{tokenizer.eos_token_id=}")
print(f"{tokenizer.pad_token_id=}")

# inspect the columns in the tokenized dataset
print(f"{tokenized_dataset['train']=}")
print(f"{tokenized_dataset['test']=}")

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM
from transformers import AutoModelForSequenceClassification


# Load a second copy of GPT-2 with a 4-way classification head to serve as the
# base for the LoRA adapter.
gpt2_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    id2label={0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
    label2id={"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3},
    pad_token_id=tokenizer.pad_token_id,
)

# Freeze every parameter of the base model.
# see: https://huggingface.co/transformers/v4.2.2/training.html
for base_param in gpt2_model.base_model.parameters():
    base_param.requires_grad = False

# for name, module in gpt2_model.named_modules():
#     print(name)

In [None]:
# LoRA settings for the sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    fan_in_fan_out=True,  # this is required for gpt2
    # NOTE(review): presumably this makes transformer block h.10 fully
    # trainable and saved alongside the adapter -- confirm that is intended.
    modules_to_save=["h.10"],
)

# Wrap the base model with the adapter and report how much of it will train.
lora_model = get_peft_model(gpt2_model, lora_config)
lora_model.print_trainable_parameters()
# lora_model

In [None]:
import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

# Load the "accuracy" metric through the evaluate library; compute_metrics
# below delegates the scoring to it.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the accuracy metric.

    Args:
        eval_pred: a pair ``(predictions, labels)``; the predicted class of
            each row is the argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the predicted
        classes against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

# Hyperparameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./data/news_gpt2_lora",
    # optimisation settings
    learning_rate=0.0025,
    weight_decay=0.02,
    num_train_epochs=3,
    # per-device batch sizes for training and evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch, keeping the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
print(f"{training_args.device=}")
print(f"{training_args.per_device_train_batch_size=}")

# Wire the LoRA model, data, tokenizer, padding collator and metric together.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


In [None]:
# Run the fine-tuning loop configured by training_args on the LoRA-wrapped model.
trainer.train()

In [None]:
# Evaluate the trainer's model on the evaluation split after fine-tuning.
trainer.evaluate()

In [None]:
# Directory that receives the fine-tuned PEFT model files; the inference
# section below reloads them from the same path.
lora_pretrained_path = "./gpt-lora"
lora_model.save_pretrained(lora_pretrained_path)


## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [None]:
from peft import PeftModel, PeftModelForSequenceClassification

# Attach the saved adapter to a freshly loaded base checkpoint. `gpt2_model`
# must not be reused here: get_peft_model modified it in place, so it already
# contains the trained adapter layers and would not prove that the files on
# disk can be restored on their own.
fresh_base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    id2label={0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
    label2id={"World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3},
    pad_token_id=tokenizer.pad_token_id,
)
model_to_merge = PeftModel.from_pretrained(
    fresh_base_model,
    lora_pretrained_path,
)

# Fold the adapter weights into the base weights and save a plain checkpoint.
# save_pretrained takes only the target directory here; the model must not be
# passed again as a second positional argument.
merged_model = model_to_merge.merge_and_unload()
merged_model.save_pretrained(lora_pretrained_path + "-merged-model")


In [None]:
# Reload the merged checkpoint from disk. The label maps and pad token id are
# not passed again; presumably they are restored from the config saved with
# the checkpoint -- confirm in the printed model/config if in doubt.
gpt2_merged_model = AutoModelForSequenceClassification.from_pretrained(
    lora_pretrained_path + "-merged-model",
)

# ensure all the parameters of the base model are frozen.
# see: https://huggingface.co/transformers/v4.2.2/training.html
for param in gpt2_merged_model.base_model.parameters():
    param.requires_grad = False

# Print the merged model itself, not the baseline `model` from the first section.
print(gpt2_merged_model)

In [None]:

# Evaluation harness for the merged model. No `args` are passed, so the Trainer
# uses its default TrainingArguments.
trainer = Trainer(
    model=gpt2_merged_model,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [None]:
# Evaluate the reloaded, merged model on the evaluation split so the result can
# be compared with the earlier evaluate() outputs.
trainer.evaluate()