# Lightweight Fine-Tuning Project

## Import libraries

In [None]:
from datasets import Dataset, load_dataset
# Class names corrected to the spellings transformers exports
# (`AutoModelForCausalLM`, `DataCollatorForLanguageModeling`); the previous
# spellings made this cell fail with ImportError.
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from itertools import chain
import pandas as pd

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [3]:
import torch
import numpy as np

import evaluate
from evaluate import evaluator
from datasets import load_dataset
from torch.utils.data import DataLoader
from peft import LoraConfig, get_peft_model, TaskType, AutoPeftModelForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.model_selection import train_test_split

In [4]:
# Hugging Face checkpoint name used for the tokenizer and the classifier.
MODEL_NAME = "distilbert-base-uncased"

# Batch size constant for this notebook.
BATCH_SIZE = 128

# First CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Load the single "train" split of the sms_spam dataset and carve out a
# reproducible 80/20 train/test split (fixed seed).
dataset = load_dataset("sms_spam", split="train").train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=42,
)

Downloading data: 100%|██████████| 359k/359k [00:00<00:00, 2.38MB/s]


Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

In [6]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
def tokenize(examples):
    """Tokenize the ``sms`` text of a batch of examples.

    Every sequence is padded or truncated to the model's maximum length so
    that all rows end up the same size, and the encoding is requested as
    PyTorch tensors (``return_tensors="pt"``).
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(examples["sms"], **encoding_options)

def compute_metrics(eval_pred):
    """Compute evaluation metrics from a ``(logits, labels)`` pair.

    The result is whatever the module-level ``metric`` computes for the
    arg-max class predictions, extended with an ``eval_loss`` entry holding
    the cross-entropy between the raw logits and the labels.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    # The loss is taken on the raw logits, not on the arg-max class ids.
    cross_entropy = torch.nn.functional.cross_entropy(
        torch.tensor(logits), torch.tensor(labels)
    )
    results = metric.compute(predictions=predicted_classes, references=labels)
    results["eval_loss"] = cross_entropy.item()
    return results

In [8]:
# Accuracy is the only metric loaded from `evaluate`. The second positional
# argument of `evaluate.load` is a configuration name, not a second metric, so
# the former "cross_entropy" argument did not add a loss metric; the
# cross-entropy loss is computed by hand inside `compute_metrics`.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [9]:
# Run `tokenize` over the train and test splits in batches. The original
# `sms` and `label` columns are kept next to the tokenizer outputs.
tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [10]:
# Binary spam classifier on top of the MODEL_NAME checkpoint, with
# human-readable names attached to the two class ids.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    label2id={"not spam": 0, "spam": 1},
    id2label={0: "not spam", 1: "spam"},
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Arguments for the baseline Trainer. Batch sizes come from the notebook-wide
# BATCH_SIZE constant instead of a repeated literal, so there is a single
# place to tune them.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [12]:
# Baseline Trainer wrapped around the freshly loaded (not yet fine-tuned)
# classifier; it scores the model with `compute_metrics`.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
)

In [None]:
# Baseline: evaluate the model on the test split before any fine-tuning.
trainer.evaluate()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [None]:
## Specifying the task_type selects which PEFT wrapper class gets instantiated.
## Reference: https://github.com/huggingface/peft/blob/02ae6bcb373d9d9d3bec9ba920d63316418ff64a/src/peft/peft_model.py#L1094C7-L1094C41
## Available task types: https://huggingface.co/docs/peft/en/package_reference/peft_types

## The low-rank adapters are attached to DistilBERT's pre-trained attention
## projections (`q_lin`, `v_lin`). The classification head (`pre_classifier`,
## `classifier`) is newly initialised when the checkpoint is loaded, so it is
## listed in `modules_to_save`: it is trained in full and written into the
## adapter checkpoint. Putting LoRA on the head instead leaves its random
## base weights frozen and outside the checkpoint, so a reloaded model would
## pair the saved deltas with a different random head.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=1,
    lora_dropout=0,
    target_modules=["q_lin", "v_lin"],
    modules_to_save=["pre_classifier", "classifier"],
    inference_mode=False,
)

In [None]:
# Wrap the classifier with the LoRA configuration and report how many
# parameters remain trainable.
# NOTE(review): get_peft_model is understood to modify `model` in place -
# confirm before reusing `model` as an untouched baseline.
peft_lora_model = get_peft_model(model, lora_config)
peft_lora_model.print_trainable_parameters()

In [None]:
# Display the wrapped model so its module structure can be inspected.
peft_lora_model

In [None]:
# Training arguments for the LoRA fine-tuning run.
peft_training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=15,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=16,

    # Evaluate, log and checkpoint once per epoch. The evaluation and save
    # strategies are kept identical because the best checkpoint is restored
    # at the end of training.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    # Workaround for a checkpoint-saving problem, see
    # https://github.com/huggingface/transformers/issues/27613#issuecomment-1848645557
    save_safetensors=False,

    # Warm up over the first 10% of the optimisation steps. The previous
    # fixed `warmup_steps=500` spanned almost the entire run (about 35 steps
    # per epoch x 15 epochs = roughly 525 steps on the 4459-example train
    # split), so the learning rate only reached its target at the very end.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",

    # Column pruning is disabled because, with the PEFT-wrapped model, the
    # Trainer raised
    #   IndexError: Invalid key: 4437 is out of bounds for size 0
    # https://discuss.huggingface.co/t/indexerror-invalid-key-16-is-out-of-bounds-for-size-0/14298/4
    # Columns the model cannot consume must therefore be dropped from the
    # datasets by hand before they are given to the Trainer.
    remove_unused_columns=False,
)

In [None]:
# Trainer for the LoRA-wrapped model.
#
# The raw `sms` text column is removed from both splits first: its entries
# vary in length and cannot be batched into a tensor. Leaving it in produced
#   ValueError: Unable to create tensor, you should probably activate
#   truncation and/or padding with 'padding=True' 'truncation=True' to have
#   batched tensors with the same length. ...
# (Renaming `label` to `labels` was also tried and is left disabled.)
peft_trainer = Trainer(
    args=peft_training_args,
    model=peft_lora_model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["test"].remove_columns(["sms"]),
    train_dataset=tokenized_dataset["train"].remove_columns(["sms"]),
)

In [None]:
## Reference on validation loss not appearing in the logs during fine-tuning:
## https://stackoverflow.com/questions/76012700/validation-loss-shows-no-log-during-fine-tuning-model

In [None]:
# Run the LoRA fine-tuning loop configured in `peft_trainer`.
peft_trainer.train()

In [None]:
# Save the trained PEFT model under ./results/<MODEL_NAME>-best; the
# inference section reloads it from this same path.
peft_trainer.save_model(f"./results/{MODEL_NAME}-best")

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [None]:
# Directory the fine-tuned PEFT model was saved to (same path as the
# `peft_trainer.save_model(...)` call).
output_dir = f"./results/{MODEL_NAME}-best"

In [None]:
# Reload the tokenizer and the PEFT sequence-classification model from the
# saved directory.
# NOTE(review): AutoPeftModelForSequenceClassification presumably rebuilds the
# base model named in the adapter config and then applies the saved adapter
# weights - confirm against the PEFT documentation.
saved_tokenizer = AutoTokenizer.from_pretrained(output_dir)
saved_model = AutoPeftModelForSequenceClassification.from_pretrained(output_dir)

In [None]:
# Trainer used only to evaluate the reloaded PEFT model.
#
# It mirrors the settings the fine-tuning Trainer needed for a PEFT-wrapped
# model: automatic column pruning is switched off and the variable-length
# `sms` text column is dropped by hand, so batches contain only tensor-ready
# columns. The tokenizer reloaded from the saved directory is used, so that
# evaluation exercises exactly what was persisted.
saved_eval_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=128,
    remove_unused_columns=False,
)

saved_trainer = Trainer(
    model=saved_model,
    args=saved_eval_args,
    train_dataset=tokenized_dataset["train"].remove_columns(["sms"]),
    eval_dataset=tokenized_dataset["test"].remove_columns(["sms"]),
    tokenizer=saved_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the reloaded fine-tuned model on the test split, for comparison
# with the baseline evaluation earlier in the notebook.
saved_trainer.evaluate()