## Prepare Dataset

In [None]:
! pip install -q accelerate peft lightning nltk

In [None]:
! pip install -U bitsandbytes

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# download dataset

import transformers

from datasets import load_dataset

DATA_REPO_ID = "tanganke/stanford_cars"



train_dataset = load_dataset(DATA_REPO_ID, split="train")

test_dataset  = load_dataset(DATA_REPO_ID, split="test")
# valid_dataset = load_dataset(DATA_REPO_ID, split="validation")


In [17]:
# prepare collate function

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224")

def train_collate_fn(samples):

    images = [sample["image"] for sample in samples]

    texts = ["<image" + "<bos>" + "What is the model of a car in the image?" for sample in samples]

    labels = [sample["label"]+"<eos>" for sample in samples]



    # define a processor that handle max_length 128(not including the number of image tokens)

    inputs = processor(text=texts, images=images, suffix=labels, return_tensors="pt", 

                      padding=True, truncation=True, max_length=128)



    input_ids = inputs["input_ids"]

    token_type_ids = inputs["token_type_ids"]

    attention_mask = inputs["attention_mask"]

    pixel_values = inputs["pixel_values"]

    labels = inputs["labels"]



    return input_ids, token_type_ids, attention_mask, pixel_values, labels



def test_collate_fn(samples):

    images = [sample["image"] for sample in samples]

    texts = ["<image" + "<bos>" + "What is the model of a car in the image?" for sample in samples]

    labels = [sample["label"] + "<eos>" for sample in samples]



    # define a processor that handle max_length 128(not including the number of image tokens)

    inputs = processor(text=texts, images=images, return_tensors="pt", 

                      padding=True, truncation=True, max_length=128)



    input_ids = inputs["input_ids"]

    attention_mask = inputs["attention_mask"]

    pixel_values = inputs["pixel_values"]



    return input_ids, attention_mask, pixel_values, labels

    

## Prepare Quantization Settings

In [4]:
# Using bitsandbytes library, setting the quantization

from transformers import BitsAndBytesConfig

import torch



bnb_config = BitsAndBytesConfig(

    load_in_4bit=True,

    bnb_4bit_compute_dtype=torch.bfloat16, # convert float32 to bf16 to speed up computation

    bnb_4bit_quant_type="nf4", # NF4 is 4-bit data type from QLoRA paper

)

## Prepare LoRA Settings

In [5]:
from peft import LoraConfig



lora_config = LoraConfig(

    r=4, # set the low-rank as 4 

    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], #modules that we are going to apply LoRA adapter

    task_type="CAUSAL_LM", # type of model. PaliGemma is causal language model

)

## Prepare pytorch-lightning Trainer

In [27]:
import lightning as L

from transformers import AutoProcessor

import torch 

from nltk import edit_distance

from torch.utils.data import DataLoader





class PaliGemma_Finetuned_Model(L.LightningModule):

    def __init__(self, config, model, processor):
        super().__init__()

        self.model = model 

        self.processor = processor

        self.config = config



        self.batch_size = config.get("batch_size")



        self.train_losses = []

        self.val_losses = []

        self.val_scores = []



    def training_step(self, batch, batch_idx):
        print(batch)

        input_ids, token_type_ids, attention_mask, pixel_values, labels = batch



        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,

                            pixel_values=pixel_values, labels=labels)



        loss = outputs.loss



        self.train_losses.append(loss.item())



        self.log("train_loss", loss)

        return loss



    def validation_step(self, batch, batch_idx):
        print(batch)

        input_ids, attention_mask, pixel_values, labels = batch



        generated_ids = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values, max_new_tokens=80)
        predictions = self.processor.batch_decode(generated_ids[:, input_ids.size(1)+1:], skip_special_tokes=True)
    

        scores = []

        print(labels)
        print(predictions)
        

        for pred, label in zip(predictions, labels):

            score = edit_distance(pred, label) / max(len(pred), len(label))

            self.val_scores.append(score)

            scores.append(score)

        self.log("val_edit_distance", np.mean(scores), on_epoch=True)



        return scores



    def configure_optimizers(self):

        optimizer = torch.optim.AdamW(self.parameters(), lr=config.get("lr", 3e-4))

        return optimizer



    def train_dataloader(self):

        return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=train_collate_fn, num_workers=3)

    def val_dataloader(self):

        return DataLoader(test_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=test_collate_fn, num_workers=3)

        

## Define the model 

In [28]:
from transformers import PaliGemmaForConditionalGeneration

from peft import get_peft_model



model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-pt-224", quantization_config=bnb_config)

model = get_peft_model(model, lora_config)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
model.print_trainable_parameters()

trainable params: 5,649,408 || all params: 2,929,115,888 || trainable%: 0.1929


## Setup Training Configurations

In [30]:
config = config = {
            "max_epochs": 100,

          "check_val_every_n_epoch": 1,

          "gradient_clip_val": 1.0,

          "accumulate_grad_batches": 8,

          "lr": 3e-4,

          "batch_size": 2,

          "seed":1234,

          "num_nodes": 1,

          "warmup_steps": 50,

          "result_path": "./result",

          "verbose": True,

}



model_module = PaliGemma_Finetuned_Model(config, model, processor)

## Define Callbacks

In [31]:
from lightning.pytorch.callbacks import Callback

from lightning.pytorch.callbacks.early_stopping import EarlyStopping

from huggingface_hub import HfApi



api = HfApi()



FINETUNED_MODEL_ID="ball1433/PaliGemma-StanfordCars-finetuned"



class Print_TrainValidation_ResultCallback(Callback):

    def on_train_epoch_end(self, trainer, pl_module):

        # print the average of training loss 

        print(f'Average Training Loss: {np.mean(pl_module.train_losses)}')



        # print the average of edit distance score

        print(f'Average Validation Score: {np.mean(pl_module.val_scores)}')



        # reset the list

        pl_module.train_losses = []

        pl_module.val_scores = []





class PushToHubCallback(Callback):

    def on_train_epoch_end(self, trainer, pl_module):

        print(f"Pushing model to the hub, epoch {trainer.current_epoch}")

        pl_module.model.push_to_hub(FINETUNED_MODEL_ID,

                                    commit_message=f"Training in progress, epoch {trainer.current_epoch}")



    def on_train_end(self, trainer, pl_module):

        print(f"Pushing model to the hub after training")

        pl_module.processor.push_to_hub(FINETUNED_MODEL_ID,

                                    commit_message=f"Training done")

        pl_module.model.push_to_hub(FINETUNED_MODEL_ID,

                                    commit_message=f"Training done")



early_stop_callback = EarlyStopping(monitor="val_edit_distance", patience=20, verbose=False, mode="min")

## Training

In [32]:
# define trainer 

trainer = L.Trainer(

        devices=-1, 

        accelerator="auto",

        max_epochs=config.get("max_epochs"),

        accumulate_grad_batches=config.get("accumulate_grad_batches"),

        check_val_every_n_epoch=config.get("check_val_every_n_epoch"),

        gradient_clip_val=config.get("gradient_clip_val"),

        precision="16-mixed",

        limit_val_batches=1.0,

        num_sanity_val_steps=2,

        callbacks=[PushToHubCallback(), Print_TrainValidation_ResultCallback(), early_stop_callback],

)



trainer.fit(model_module)

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO: `Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type                 | Params | Mode 
-------------------------------------------------------
0 | model | PeftModelForCausalLM | 1.7 B  | train
-------------------------------------------------------
5.6 M     Trainable params
1.7 B     Non-trainable params
1.7 B     Total params
6,925.987 Total estimated model params size (MB)
2072      Modules in train mode
593       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "/tmp/ipykernel_502/2405541102.py", line 47, in test_collate_fn
    labels = [sample["label"] + "<eos>" for sample in samples]
  File "/tmp/ipykernel_502/2405541102.py", line 47, in <listcomp>
    labels = [sample["label"] + "<eos>" for sample in samples]
TypeError: unsupported operand type(s) for +: 'int' and 'str'


## Inference