# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

In [5]:
! pip install -q "scikit-learn~=1.6" \
    "datasets==3.3.2" "huggingface-hub==0.29.1" \
    "transformers==4.49.0" "evaluate==0.4.3" \
    "peft==0.14.0"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [80]:
dataset_name = "SetFit/ag_news"
dataset_splits = ["train", "test"]
model_name = "openai-community/gpt2"

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [81]:
from datasets import load_dataset

# the ag_news dataset is split into 120k train rows and 7.6k test rows.
dataset = load_dataset(dataset_name)

# reduce record cound to fit in memory (and run faster)
dataset["train"] = dataset["train"].shuffle(seed=8).select(range(500))
dataset["test"] = dataset["test"].shuffle(seed=8).select(range(100))

# view the dataset characteristics
print("train =", dataset["train"])
print("train[0] =", dataset["train"][0])
print("--------------------")
print("test =", dataset["test"])
print("test[0] =", dataset["test"][0])


train = Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 500
})
train[0] = {'text': 'Marias from heaven, Venus #39; form marred! ZURICH, Oct. 23. - Wimbledon champion Maria Sharapova set up an all-Russian semi-final clash with third seed Elena Dementieva after defeating Venus Williams 6-3, 6-4 at the 1.3 million-dollar WTA event in Zurich.', 'label': 1, 'label_text': 'Sports'}
--------------------
test = Dataset({
    features: ['text', 'label', 'label_text'],
    num_rows: 100
})
test[0] = {'text': 'Dual Internal Clocks Control Fruit Flies -Study (Reuters) Reuters - Humans are not the only creatures with\\an internal biological clock. Fruit flies have two, which\\separately control morning and evening activity, scientists\\said Wednesday.', 'label': 3, 'label_text': 'Sci/Tech'}


In [82]:
from transformers import AutoTokenizer

# https://huggingface.co/docs/transformers/en/model_doc/gpt2#usage-tips
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token=tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

# 
tokenized_dataset = {}
for split in dataset_splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda row: tokenizer(row["text"], truncation=True), batched=True
    )

# inspect the special token ids
print(f"{tokenizer.eos_token_id=}")
print(f"{tokenizer.pad_token_id=}")

# inspect the columns in the tokenized dataset
print(f"{tokenized_dataset['train']=}")
print(f"{tokenized_dataset['test']=}")

tokenizer.eos_token_id=50256
tokenizer.pad_token_id=50257
tokenized_dataset['train']=Dataset({
    features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
    num_rows: 500
})
tokenized_dataset['test']=Dataset({
    features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
    num_rows: 100
})


In [83]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    id2label={ 0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech" },
    label2id={ "World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3 },
    pad_token_id=tokenizer.pad_token_id,
)

# ensure all the parameters of the base model are frozen.
# see: https://huggingface.co/transformers/v4.2.2/training.html
for param in model.base_model.parameters():
    param.requires_grad = False

print(model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=4, bias=False)
)


In [84]:
import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/news_gpt2",
        # set the learning rate
        learning_rate=0.005,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        # evaluate and save the model after each epoch
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


In [85]:
# we are interested in the performance of the base model,
# so will not train with this dataset.
# trainer.train()

In [86]:
trainer.evaluate()

{'eval_loss': 7.6148552894592285,
 'eval_model_preparation_time': 0.0039,
 'eval_accuracy': 0.22,
 'eval_runtime': 1.7909,
 'eval_samples_per_second': 55.839,
 'eval_steps_per_second': 13.96}

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [102]:
from transformers import AutoTokenizer

# https://huggingface.co/docs/transformers/en/model_doc/gpt2#usage-tips
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token=tokenizer.eos_token
# tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

# 
tokenized_dataset = {}
for split in dataset_splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda row: tokenizer(row["text"], truncation=True), batched=True
    )

# inspect the special token ids
print(f"{tokenizer.eos_token_id=}")
print(f"{tokenizer.pad_token_id=}")

# inspect the columns in the tokenized dataset
print(f"{tokenized_dataset['train']=}")
print(f"{tokenized_dataset['test']=}")

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

tokenizer.eos_token_id=50256
tokenizer.pad_token_id=50256
tokenized_dataset['train']=Dataset({
    features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
    num_rows: 500
})
tokenized_dataset['test']=Dataset({
    features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
    num_rows: 100
})


In [103]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

gpt2_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    num_labels=4,
    id2label={ 0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech" },
    label2id={ "World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3 },
    pad_token_id=tokenizer.pad_token_id,    
)
# for name, module in base_model.named_modules():
#     print(name)

# ensure all the parameters of the base model are frozen.
# see: https://huggingface.co/transformers/v4.2.2/training.html
for param in gpt2_model.base_model.parameters():
    param.requires_grad = False


lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    fan_in_fan_out=True, # this is required for gpt2
    modules_to_save=["h.11"]
)

lora_model = get_peft_model(gpt2_model, lora_config)
lora_model.save_pretrained("gpt-lora")

lora_model.print_trainable_parameters()
lora_model

trainable params: 8,169,216 || all params: 132,609,024 || trainable%: 6.1604


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-10): 11 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
         

In [104]:
from peft import AutoPeftModelForCausalLM
lora_model = AutoPeftModelForCausalLM.from_pretrained(
    "gpt-lora",
    num_labels=4,
    id2label={ 0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech" },
    label2id={ "World": 0, "Sports": 1, "Business": 2, "Sci/Tech": 3 },
    pad_token_id=tokenizer.pad_token_id,    
    
)

for name, param in lora_model.named_parameters():
    # print(name)
    if "modules_to_save" in name:
        param.requires_grad = True
    else:        
        param.requires_grad = False


lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-10): 11 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
         

In [107]:
import numpy as np
import evaluate
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./data/news_gpt2_lora",
        # set the learning rate
        learning_rate=0.005,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        # evaluate and save the model after each epoch
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [106]:
trainer.train()

ValueError: Expected input batch_size (94) to match target batch_size (1).

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.