# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

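* PEFT technique: LoRA (rank 16, alpha 128, dropout 0.05) applied to the attention output projections (`attn.c_proj`) of GPT-2 blocks 10 and 11, with the classification head (`score`) trained and saved alongside the adapter.
* Model: `openai-community/gpt2` loaded with a four-class sequence-classification head.
* Evaluation approach: classification accuracy (Hugging Face `evaluate` "accuracy" metric) on a held-out test subset, measured with `Trainer.evaluate()` before and after fine-tuning.
* Fine-tuning dataset: `SetFit/ag_news` (World, Sports, Business, Sci/Tech), subsampled to 2,000 training rows and 200 test rows.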

In [None]:
# Pinned dependencies for this notebook. `%pip` (rather than `! pip`) installs
# into the environment of the running kernel.
%pip install -q "scikit-learn~=1.6" \
    "datasets==3.3.2" "huggingface-hub==0.29.1" \
    "transformers==4.49.0" "evaluate==0.4.3" \
    "peft==0.14.0"

In [None]:

# Hugging Face Hub dataset id and the splits this notebook works with.
dataset_name = "SetFit/ag_news"
dataset_splits = ["train", "test"]

# Class index -> class name, and the inverse lookup derived from it.
dataset_id2label = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))
dataset_label2id = {name: idx for idx, name in dataset_id2label.items()}

# Base checkpoint that is evaluated as-is and later fine-tuned.
untuned_model_name = "openai-community/gpt2"

# Directory the fine-tuned PEFT model is written to and reloaded from.
lora_tuned_path = "./data/gpt2-lora-tuned"

print(f"{dataset_id2label=}")
print(f"{dataset_label2id=}")

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [None]:
from datasets import load_dataset

# the ag_news dataset is split into 120k train rows and 7.6k test rows.
dataset = load_dataset(dataset_name)

# Reduce the record count so the notebook fits in memory and runs faster:
# shuffle each split with a fixed seed, then keep only its first rows.
subset_plan = {"train": (7, 2000), "test": (11, 200)}  # split -> (shuffle seed, rows kept)
for split_name, (shuffle_seed, row_count) in subset_plan.items():
    shuffled_split = dataset[split_name].shuffle(seed=shuffle_seed)
    dataset[split_name] = shuffled_split.select(range(row_count))

# Show each reduced split followed by its first record.
train_split, test_split = dataset["train"], dataset["test"]
print("train =", train_split)
print("train[0] =", train_split[0])
print("--------------------")
print("test =", test_split)
print("test[0] =", test_split[0])


In [None]:
from transformers import AutoTokenizer

# Right-side padding, with the end-of-sequence token reused as the padding token.
# See https://huggingface.co/docs/transformers/en/model_doc/gpt2#usage-tips
tokenizer = AutoTokenizer.from_pretrained(untuned_model_name, padding_side="right")
tokenizer.pad_token = tokenizer.eos_token

def row_processor(row):
    """Tokenize a batch of rows and attach their class labels.

    Called through ``Dataset.map(..., batched=True)``, so ``row`` is a mapping
    of column name to a list of values; it must provide ``"text"`` and
    ``"label"``.

    Returns the tokenizer output (every text truncated/padded to 256 tokens)
    with an added ``"labels"`` entry holding the class ids.
    """
    inputs = tokenizer(
        row["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    # Sequence classification needs one class id per example. Copying
    # input_ids here (the causal-LM convention) would be the wrong target.
    inputs["labels"] = row["label"]
    return inputs

# Tokenize every configured split in batches, keyed by split name.
tokenized_dataset = {
    split_name: dataset[split_name].map(row_processor, batched=True)
    for split_name in dataset_splits
}

# inspect the special token ids
print(f"{tokenizer.eos_token_id=}")
print(f"{tokenizer.pad_token_id=}")

# inspect the columns in the tokenized dataset
print(f"{tokenized_dataset['train']=}")
print(f"{tokenized_dataset['test']=}")

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the base checkpoint with a sequence-classification head sized for the
# dataset's classes. pad_token_id is set from the tokenizer so the model and
# the tokenizer agree on which token id is padding.
untuned_model = AutoModelForSequenceClassification.from_pretrained(
    untuned_model_name,
    num_labels=len(dataset_id2label),
    id2label=dataset_id2label,
    label2id=dataset_label2id,  # name -> id (previously the id -> name dict was passed here)
    pad_token_id=tokenizer.pad_token_id,
)

# ensure all the parameters of the base model are frozen.
# see: https://huggingface.co/transformers/v4.2.2/training.html
for param in untuned_model.base_model.parameters():
    param.requires_grad = False

print(untuned_model)

In [None]:
import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    of each example is the index of its largest score along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

# Trainer configuration for the untuned model.
training_args = TrainingArguments(
    output_dir="./data/gpt2-untuned",
    # optimisation settings
    num_train_epochs=3,
    learning_rate=0.005,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # evaluate and checkpoint once per epoch, reloading the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
print(f"{training_args.device=}")

# Wire the model, tokenized splits, padding collator and accuracy metric together.
trainer = Trainer(
    model=untuned_model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)


In [None]:
# we are interested in the accuracy of the untuned gpt2 model.
# No training step has been run on this trainer, so the metrics returned here
# are the pre-fine-tuning baseline on tokenized_dataset["test"].
trainer.evaluate()

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [None]:
from transformers import AutoTokenizer

# Right-side padding, with the end-of-sequence token reused as the padding token.
# See https://huggingface.co/docs/transformers/en/model_doc/gpt2#usage-tips
#
# A new '<|pad|>' token is deliberately NOT added: it would get an id one past
# the end of the pretrained embedding table, and nothing in this notebook
# resizes the model's embeddings, so padded inputs would index out of range.
tokenizer = AutoTokenizer.from_pretrained(
    untuned_model_name,
    padding_side="right",
)
tokenizer.pad_token = tokenizer.eos_token

def row_processor(row):
    """Tokenize a batch of rows and attach their class labels.

    Called through ``Dataset.map(..., batched=True)``, so ``row`` is a mapping
    of column name to a list of values; it must provide ``"text"`` and
    ``"label"``.

    Returns the tokenizer output (every text truncated/padded to 256 tokens)
    with an added ``"labels"`` entry holding the class ids.
    """
    inputs = tokenizer(
        row["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    # Sequence classification needs one class id per example. Copying
    # input_ids here (the causal-LM convention) would be the wrong target.
    inputs["labels"] = row["label"]
    return inputs

# Re-tokenize every configured split in batches with the tokenizer defined above.
tokenized_dataset = {
    split_name: dataset[split_name].map(row_processor, batched=True)
    for split_name in dataset_splits
}

# inspect the special token ids
print(f"{tokenizer.eos_token_id=}")
print(f"{tokenizer.pad_token_id=}")

# inspect the columns in the tokenized dataset
print(f"{tokenized_dataset['train']=}")
print(f"{tokenized_dataset['test']=}")

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification

# Load a fresh copy of the base checkpoint with a classification head so its
# module structure can be inspected. The next cell loads the model again
# before wrapping it with LoRA.
untuned_model = AutoModelForSequenceClassification.from_pretrained(
    untuned_model_name,
    num_labels=len(dataset_id2label),
    id2label=dataset_id2label,
    label2id=dataset_label2id,  # name -> id (previously the id -> name dict was passed here)
    pad_token_id=tokenizer.pad_token_id,
)

# ensure all the parameters of the base model are frozen.
# see: https://huggingface.co/transformers/v4.2.2/training.html
for param in untuned_model.base_model.parameters():
    param.requires_grad = False

# The printed module tree shows the names that LoRA target_modules refer to.
print(untuned_model)

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification

# these two steps need to be in the same cell because get_peft_model
# will modify the untuned_model and target module names will change.

untuned_model = AutoModelForSequenceClassification.from_pretrained(
    untuned_model_name,
    num_labels=len(dataset_id2label),
    id2label=dataset_id2label,
    label2id=dataset_label2id,  # name -> id (previously the id -> name dict was passed here)
    pad_token_id=tokenizer.pad_token_id,
)

# ensure all the parameters of the base model are frozen.
# see: https://huggingface.co/transformers/v4.2.2/training.html
for param in untuned_model.base_model.parameters():
    param.requires_grad = False

print("untuned_model...")
print(untuned_model)

# LoRA adapter configuration for the sequence-classification model.
lora_config = LoraConfig(
    r=16,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    fan_in_fan_out=True, # this is required for gpt2
    # Attention output projections of blocks 10 and 11. Each entry needs its
    # own trailing comma: adjacent string literals are silently concatenated
    # into a single name that matches no module.
    target_modules=[
        "transformer.h.10.attn.c_proj",
        "transformer.h.11.attn.c_proj",
    ],
    # Train and save the classification head together with the adapter.
    modules_to_save=["score"],
)

# Wrap the base model with the LoRA configuration defined above.
lora_model = get_peft_model(untuned_model, lora_config)

# Report the trainable-parameter summary, then the wrapped module tree.
print("--------------------")
lora_model.print_trainable_parameters()

print("lora_model...")
print(lora_model)

In [None]:
import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    of each example is the index of its largest score along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

# Trainer configuration for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./data/gpt2-lora-tuned",
    # optimisation settings
    num_train_epochs=5,
    learning_rate=0.005,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch, reloading the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
print(f"{training_args.device=}")

# Wire the LoRA model, tokenized splits, padding collator and accuracy metric together.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)


In [None]:
# Run the fine-tuning loop configured in the previous cell.
trainer.train()

# Drop the notebook's references to the trainer and its arguments once training is done.
training_args = trainer = None

In [None]:

# Persist the trained PEFT model to lora_tuned_path, then drop the notebook's
# reference to it; the inference section reloads the weights from that path.
lora_model.save_pretrained(lora_tuned_path)
lora_model = None


## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [None]:
from peft import PeftConfig, PeftModel
from peft import PeftModelForSequenceClassification
from transformers import AutoModelForSequenceClassification

# Read the adapter configuration saved under lora_tuned_path; it records which
# base checkpoint the adapter was trained on.
peft_config = PeftConfig.from_pretrained(lora_tuned_path)
print(f"{peft_config.base_model_name_or_path=}")
print(peft_config)

# Rebuild the base classifier exactly as it was configured for training.
untuned_model = AutoModelForSequenceClassification.from_pretrained(
    peft_config.base_model_name_or_path,
    num_labels=len(dataset_id2label),
    id2label=dataset_id2label,
    label2id=dataset_label2id,  # name -> id (previously the id -> name dict was passed here)
    pad_token_id=tokenizer.pad_token_id,
    return_dict=True,
)

# Attach the saved PEFT weights to the base model ...
lora_tuned_model = PeftModel.from_pretrained(
    untuned_model,
    lora_tuned_path,
)

# ... and fold them into it so the result is a plain transformers model.
lora_tuned_merged_model = lora_tuned_model.merge_and_unload()

# ensure all the parameters of the base model are frozen.
for param in lora_tuned_merged_model.base_model.parameters():
    param.requires_grad = False

print(lora_tuned_merged_model)

In [None]:

# Evaluation-only Trainer around the merged model; no TrainingArguments are
# passed, so the Trainer's defaults apply.
trainer = Trainer(
    model=lora_tuned_merged_model,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["test"],
)

In [None]:
# Score the merged, LoRA-tuned model on tokenized_dataset["test"]; compare the
# reported accuracy with the earlier trainer.evaluate() result for the untuned model.
trainer.evaluate()

In [None]:
import pandas as pd
from IPython.display import display

# Show full article text instead of truncating long cells.
pd.set_option('display.max_colwidth', None)

test_split = tokenized_dataset["test"]

# Run the fine-tuned model over the test split and take the highest-scoring
# class for every example.
predictions = trainer.predict(test_split)
predicted_ids = np.argmax(predictions.predictions, axis=1)

# Build the comparison table from only the columns that are displayed. This
# avoids copying the tokenized columns into pandas, and avoids assigning into
# a column-sliced frame (SettingWithCopyWarning).
df = pd.DataFrame(
    {
        "text": test_split["text"],
        "label": [dataset_id2label[int(class_id)] for class_id in test_split["label"]],
        "predicted_label": [dataset_id2label[int(class_id)] for class_id in predicted_ids],
    }
)

In [None]:
# First ten test rows whose predicted class matches the true class.
df.loc[df["label"].eq(df["predicted_label"])].head(10)

In [None]:
# First ten test rows whose predicted class differs from the true class.
df.loc[df["label"].ne(df["predicted_label"])].head(10)