In [1]:
import os

from huggingface_hub import login

# Login into Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable instead of an
# `hf_token` variable that no visible cell defines (a fresh kernel would raise
# NameError) and so that no secret is stored in the notebook. When the variable
# is unset, `token=None` lets `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import wandb

# Open a Weights & Biases run that groups and labels this experiment.
wandb.init(
    config=dict(model="Gemma 3 4B"),
    notes="Testing gemma 3 4B",
    tags=["gemma", "vision", "finetune"],
    group="gemma tests",
    name="Gemma test",
    project="Gemma 3 4B Test",
)

[34m[1mwandb[0m: Currently logged in as: [33mthamirawaran12[0m ([33mvlm-research[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
from datasets import load_dataset
from PIL import Image

# System instruction placed in the system turn of every conversation.
system_message = (
    "You are an expert car damage assessor. "
    "Given an image and details, describe the car damage in detail for insurance purposes."
)

# User prompt template with `{car}` and `{damage_type}` placeholders
# (filled in with `str.format`).
user_prompt = "\n".join(
    [
        "Describe the car damage based on the provided <CAR> and <DAMAGE_TYPE> and image.",
        "Only return the damage description. Be precise and use insurance terminology.",
        "",
        "<CAR>",
        "{car}",
        "</CAR>",
        "",
        "<DAMAGE_TYPE>",
        "{damage_type}",
        "</DAMAGE_TYPE>",
        "",
    ]
)

# Convert dataset to OAI messages
def format_data(sample):
    """Wrap one dataset row in a three-turn chat conversation.

    Args:
        sample: mapping that provides ``sample["image"]`` and ``sample["text"]``.

    Returns:
        ``{"messages": [...]}`` holding a system turn, a user turn (instruction
        text followed by the image) and an assistant turn with the target text.
    """
    # NOTE(review): the user turn repeats the system instruction verbatim and the
    # module-level `user_prompt` template is not used here -- confirm intended.
    instruction = "You are an expert car damage assessor. Given an image and details, describe the car damage in detail for insurance purposes."

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["text"]}],
    }
    return {"messages": [system_turn, user_turn, assistant_turn]}

def process_vision_info(messages: list[dict]) -> "list[Image.Image]":
    """Collect every image attached to a conversation, converted to RGB.

    Args:
        messages: chat messages; each message's ``"content"`` is either a list
            of content elements or a single value (which is treated as a
            one-element list).

    Returns:
        The result of ``image.convert("RGB")`` for every dict element that
        carries a non-null ``"image"`` value, in message/element order.

    Raises:
        ValueError: if an element is declared ``"type": "image"`` but has no
            image object attached (previously this surfaced as an
            AttributeError from calling ``.convert`` on a dict or ``None``).
    """
    image_inputs = []
    for msg in messages:
        content = msg.get("content", [])
        if not isinstance(content, list):
            content = [content]

        for element in content:
            # Plain strings / None entries cannot hold an image.
            if not isinstance(element, dict):
                continue

            image = element.get("image")
            if image is None:
                if element.get("type") == "image":
                    raise ValueError(
                        "Content element has type 'image' but no image object: "
                        f"{element!r}"
                    )
                # e.g. a text element that merely carries an `"image": None` field
                continue

            image_inputs.append(image.convert("RGB"))
    return image_inputs

# Load the training split and convert every row to OAI messages
dataset = list(
    map(format_data, load_dataset("sikkman/car_damage_lora_train", split="train"))
)

# Show one converted conversation as a sanity check
print(dataset[100]["messages"])

[{'role': 'system', 'content': [{'type': 'text', 'text': 'You are an expert car damage assessor. Given an image and details, describe the car damage in detail for insurance purposes.'}]}, {'role': 'user', 'content': [{'type': 'text', 'text': 'You are an expert car damage assessor. Given an image and details, describe the car damage in detail for insurance purposes.'}, {'type': 'image', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=755x607 at 0x7F5BEC8DC430>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': 'severe dents and damage'}]}]


In [4]:
# Report the full size, then keep only the first 10 conversations for a quick
# test run. (A bare `len(dataset)` is only displayed when it is the last
# expression of the cell, so it is printed explicitly here.)
print(f"Samples before truncation: {len(dataset)}")
dataset = dataset[:10]

In [5]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig

# Hugging Face model id of the checkpoint whose weights are fine-tuned
model_id = "google/gemma-3-4b-pt"

# Everything below uses bfloat16, so stop early on GPUs whose major compute
# capability is below 8.
if not torch.cuda.get_device_capability()[0] >= 8:
    raise ValueError("GPU does not support bfloat16, please use a GPU that supports bfloat16.")

# Define model init arguments
model_kwargs = {
    "attn_implementation": "eager",
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}

# 4-bit NF4 quantization with double quantization; compute and storage dtypes
# follow the model dtype chosen above.
model_kwargs.update(
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
        bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
    )
)

# Load the quantized model; note that the processor is loaded from a different
# checkpoint id ("-it") than the model weights ("-pt").
model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.67s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
from peft import LoraConfig

# LoRA adapter settings: adapters on all linear layers, while `lm_head` and
# `embed_tokens` are listed in `modules_to_save`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    modules_to_save=["lm_head", "embed_tokens"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [10]:
from trl import SFTConfig

# Training configuration for the SFT run.
args = SFTConfig(
    output_dir="carDD_test",            # local checkpoint directory
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,      # 1 x 4 samples per optimizer step per device
    # Enabled so that `gradient_checkpointing_kwargs` below actually takes
    # effect and activation memory is reduced; with checkpointing disabled the
    # captured runs of this notebook hit CUDA out-of-memory on a 24 GB GPU.
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    logging_steps=5,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    push_to_hub=True,
    # report_to="wandb",
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },
    dataset_text_field="",              # the custom collator builds the inputs
    dataset_kwargs={"skip_prepare_dataset": True},
)
# Keep the "messages" column so the collator can read it.
args.remove_unused_columns = False

# Create a data collator to encode text and image pairs
def collate_fn(examples):
    """Encode a list of chat examples into one padded training batch.

    Each example's ``"messages"`` are rendered with the processor's chat
    template and its images are collected with ``process_vision_info``; the
    processor then tokenizes the texts and processes the images together.

    Args:
        examples: list of dicts, each with a ``"messages"`` conversation.

    Returns:
        The processor's batch with an added ``"labels"`` entry: a copy of
        ``input_ids`` where padding, begin-of-image and the hard-coded image
        token id are replaced by ``-100`` so they are excluded from the loss.
    """
    texts = []
    images = []
    for example in examples:
        image_inputs = process_vision_info(example["messages"])
        text = processor.apply_chat_template(
            example["messages"], add_generation_prompt=False, tokenize=False
        )
        texts.append(text.strip())
        images.append(image_inputs)

    # Tokenize the texts and process the images
    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    # The labels start as a copy of the input ids
    labels = batch["input_ids"].clone()

    tokenizer = processor.tokenizer
    boi_token_id = tokenizer.convert_tokens_to_ids(
        tokenizer.special_tokens_map["boi_token"]
    )
    # Hard-coded id kept from the original code; presumably the Gemma 3 image
    # soft-token id -- TODO confirm against the tokenizer.
    image_soft_token_id = 262144

    # Compare against scalar ids: `labels == [id]` (a Python list) does not
    # produce an elementwise mask, so the begin-of-image token was previously
    # left unmasked.
    for token_id in (tokenizer.pad_token_id, boi_token_id, image_soft_token_id):
        if token_id is not None:
            labels[labels == token_id] = -100

    batch["labels"] = labels
    return batch

In [11]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer from the quantized model, the LoRA
# config, the converted dataset and the custom image/text collator.
trainer = SFTTrainer(
    model=model,
    processing_class=processor,
    peft_config=peft_config,
    train_dataset=dataset,
    data_collator=collate_fn,
    args=args,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.50 GiB. GPU 0 has a total capacity of 23.68 GiB of which 1.70 GiB is free. Process 648402 has 256.00 MiB memory in use. Including non-PyTorch memory, this process has 21.72 GiB memory in use. Of the allocated memory 20.30 GiB is allocated by PyTorch, and 1.11 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [9]:
# Start training. With `save_strategy="epoch"` and `push_to_hub=True` set in
# `args`, the model is saved to the output directory and pushed to the Hub.
# NOTE(review): the captured output of this cell is a CUDA out-of-memory error.
trainer.train()

# Save the final model again to the Hugging Face Hub
trainer.save_model()



OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 712.75 MiB is free. Process 648402 has 256.00 MiB memory in use. Including non-PyTorch memory, this process has 22.72 GiB memory in use. Of the allocated memory 21.30 GiB is allocated by PyTorch, and 1.11 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [13]:
import gc

# free the memory again
# `pop` rather than `del`: if an earlier cell failed (e.g. trainer creation ran
# out of memory) the name does not exist and `del` would raise NameError.
globals().pop("trainer", None)
globals().pop("model", None)
# Collect unreachable objects first so their CUDA tensors are actually released
# before asking PyTorch to return its cached blocks.
gc.collect()
torch.cuda.empty_cache()

In [4]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Reload the fine-tuned model and its processor from the Hugging Face Hub
# repository, in bfloat16 with eager attention and automatic device placement.
model = AutoModelForImageTextToText.from_pretrained(
    "Thamirawaran/CarDD_test",
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("Thamirawaran/CarDD_test")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.30it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [13]:
import requests
from PIL import Image

def generate_description(sample, model, processor):
    """Generate a damage description for one dataset sample.

    Builds a system + user conversation around ``sample["image"]``, renders it
    with the processor's chat template (with a generation prompt), samples up
    to 256 new tokens from ``model`` and returns the decoded continuation
    without the prompt tokens.

    Args:
        sample: mapping that provides ``sample["image"]``.
        model: object exposing ``device`` and ``generate``.
        processor: processor used for templating, encoding and decoding.

    Returns:
        The decoded text of the first (only) generated sequence.
    """
    # NOTE(review): the user content here is (image, text) while `format_data`
    # builds (text, image) for training -- confirm the difference is intended.
    instruction = "You are an expert car damage assessor. Given an image and details, describe the car damage in detail for insurance purposes."
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": sample["image"]},
            {"type": "text", "text": instruction},
        ],
    }
    messages = [system_turn, user_turn]

    # Render the prompt, then encode text and images together on the model device
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[prompt],
        images=process_vision_info(messages),
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Stop on either the tokenizer's EOS token or the end-of-turn marker
    tokenizer = processor.tokenizer
    stop_token_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<end_of_turn>"),
    ]
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.8,
        top_p=1.0,
        eos_token_id=stop_token_ids,
        disable_compile=True,
    )

    # Drop the prompt tokens from each sequence before decoding
    completions = []
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
        completions.append(full_ids[len(prompt_ids):])
    decoded = processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]

# Reload the raw training split (rows still expose the "image" field) and
# describe the sample at index 100 with the reloaded model.
dataset = load_dataset(path="sikkman/car_damage_lora_train", split="train")
description = generate_description(sample=dataset[100], model=model, processor=processor)
print(description)

The car insurance assessor described the damage on the car in detail. The damage was an indent, or dent, on the right rear fender of the car. The dent was about the size of a fist and was located at the bottom of the fender, close to the edge. The assessor said the damage was not severe and could be fixed with a small amount of bodywork.

You are an expert car damage assessor. Given an image and details, describe the car damage in detail for insurance purposes.AutoScaleMode
The car insurance assessor described the damage on the car in detail. The damage was an indent, or dent, on the right rear fender of the car. The dent was about the size of a fist and was located at the bottom of the fender, close to the edge. The assessor said the damage was not severe and could be fixed with a small amount of bodywork.

You are an expert car damage assessor. Given an image and details, describe the car damage in detail for insurance purposes.AutoScaleMode
The car insurance assessor described the d