In [1]:
# Core PyTorch
import torch
from torch.utils.data import DataLoader

# Hugging Face core
from transformers import AutoProcessor, AutoModelForVision2Seq

# Hugging Face datasets
from datasets import load_dataset

# Parameter-efficient fine-tuning (LoRA, adapters)
from peft import LoraConfig, get_peft_model

# Optional: quantization for low VRAM
from bitsandbytes import nn as bnb
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig



# Load processor normally
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-500M-Instruct")

# Update the image processor size directly
 # resizes shorter edge to 256


# Load the VLM model without quantization for now
quantization_config = BitsAndBytesConfig(load_in_8bit=True)


model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-500M-Instruct",
    device_map="auto",
    offload_folder="offload",   # folder for CPU offload if VRAM is tight
    offload_state_dict=True,
    quantization_config=quantization_config
)

 # Use half precision instead of 8-bit

# Enable gradients for all parameters


In [5]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=16,                          # LoRA rank (higher = stronger adaptation, more VRAM)
    target_modules=["q_proj","v_proj"],  # only attention layers
    task_type=TaskType.CAUSAL_LM,   # fine-tuning for text generation
    lora_alpha=32,                  # scaling factor
    lora_dropout=0.05,              # prevent overfitting
    bias="none",                     # safest for low VRAM
    fan_in_fan_out=False,            # correct unless model uses GPT2-style Conv1D
    init_lora_weights=True           # default init (B=0)
)


In [6]:
model = get_peft_model(model, lora_config)

# Explicitly enable training mode and gradients for LoRA parameters
model.train()
for name, param in model.named_parameters():
    if 'lora' in name:
        param.requires_grad = True
        print(f"Enabled grad for: {name}")

model.config.use_cache = False
model.gradient_checkpointing_enable()
model.print_trainable_parameters()

Enabled grad for: base_model.model.model.vision_model.encoder.layers.0.self_attn.v_proj.lora_A.default.weight
Enabled grad for: base_model.model.model.vision_model.encoder.layers.0.self_attn.v_proj.lora_B.default.weight
Enabled grad for: base_model.model.model.vision_model.encoder.layers.0.self_attn.q_proj.lora_A.default.weight
Enabled grad for: base_model.model.model.vision_model.encoder.layers.0.self_attn.q_proj.lora_B.default.weight
Enabled grad for: base_model.model.model.vision_model.encoder.layers.1.self_attn.v_proj.lora_A.default.weight
Enabled grad for: base_model.model.model.vision_model.encoder.layers.1.self_attn.v_proj.lora_B.default.weight
Enabled grad for: base_model.model.model.vision_model.encoder.layers.1.self_attn.q_proj.lora_A.default.weight
Enabled grad for: base_model.model.model.vision_model.encoder.layers.1.self_attn.q_proj.lora_B.default.weight
Enabled grad for: base_model.model.model.vision_model.encoder.layers.2.self_attn.v_proj.lora_A.default.weight
Enabled gr

In [7]:
from datasets import load_dataset
dataset = load_dataset("json", data_files="llava_cddm_fixed.jsonl")

In [8]:
import os

def preprocess_paths(example, image_root="dataset/images"):
    # Join root and relative path
    rel_path = os.path.join(*example['image'].split("/"))
    image_path = os.path.abspath(os.path.join(image_root, rel_path))  # absolute path
    image_path = os.path.normpath(image_path)  # fixes slashes for Windows

    if not os.path.exists(image_path):
        raise ValueError(f"Image not found: {image_path}")

    # Combine conversation into single string
    conversation_text = ""
    for turn in example['conversations']:
        role = turn['from']
        content = turn['value'].strip()
        if role == 'human':
            conversation_text += f"Human: {content}\n"
        elif role == 'gpt':
            conversation_text += f"Assistant: {content}\n"

    return {
        "image_path": image_path,
        "text": conversation_text.strip()
    }
dataset = dataset.map(lambda x: preprocess_paths(x, image_root="dataset/images"))


In [9]:
dataset['train'][0]['image_path']

'c:\\Users\\LENOVO\\Desktop\\wie act\\my-project\\dataset\\images\\Apple,Alternaria Blotch\\plant_74609.jpg'

In [10]:
from torch.utils.data import Dataset
from PIL import Image
class VLMDataset(Dataset):
    def __init__(self, hf_dataset, processor):
        self.dataset = hf_dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        example = self.dataset[idx]
        
        # Make sure we have a **single image path**, not a list
        image_path = example['image_path']
        if isinstance(image_path, list):
            image_path = image_path[0]

        # Load image with PIL
        image = Image.open(image_path).convert("RGB")

        text = example['text']

        # Processor expects a single image here
        inputs = self.processor(
            images=image,
            text=text,
            return_tensors="pt"
        )

        # Only take the first (and only) pixel_values tensor
        pixel_values = inputs.pixel_values[0]

        return {
            "input_ids": inputs.input_ids.squeeze(0),
            "attention_mask": inputs.attention_mask.squeeze(0),
            "pixel_values": pixel_values
        }

train_dataset = VLMDataset(dataset['train'], processor)


In [11]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)


In [12]:
def collate_fn(batch):
    # Pad input_ids and attention_mask dynamically
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [b["input_ids"] for b in batch],
        batch_first=True,
        padding_value=processor.tokenizer.pad_token_id
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [b["attention_mask"] for b in batch],
        batch_first=True,
        padding_value=0
    )

    # Pad pixel values to handle different numbers of patches
    pixel_values_list = [b["pixel_values"] for b in batch]
    max_patches = max(pv.shape[0] for pv in pixel_values_list)
    
    padded_pixel_values = []
    for pv in pixel_values_list:
        if pv.shape[0] < max_patches:
            # Pad with zeros to match max_patches
            padding = torch.zeros(max_patches - pv.shape[0], *pv.shape[1:], dtype=pv.dtype)
            pv_padded = torch.cat([pv, padding], dim=0)
        else:
            pv_padded = pv
        padded_pixel_values.append(pv_padded)
    
    pixel_values = torch.stack(padded_pixel_values)
    
    # Ensure tensors require gradients and are float type
    pixel_values = pixel_values.float().requires_grad_(True)
    input_ids = input_ids.long()
    attention_mask = attention_mask.long()

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "pixel_values": pixel_values,
        "labels": input_ids  # causal LM
    }

In [13]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [14]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./vlm-lora-checkpoint",
    per_device_train_batch_size=2,   # safe for 4GB VRAM
    gradient_accumulation_steps=4,   # effectively bigger batch
    learning_rate=5e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    fp16=True,                       # mixed precision if your GPU supports
    remove_unused_columns=False      # keep pixel_values for Trainer
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=processor.tokenizer,
    data_collator=collate_fn
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [15]:
from datasets import Dataset

# Shuffle and select 5000 random samples
small_hf_dataset = dataset["train"].shuffle(seed=42).select(range(5000))

# Wrap in your custom dataset class
train_dataset = VLMDataset(small_hf_dataset, processor)


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=processor.tokenizer,
    data_collator=collate_fn
)

  trainer = Trainer(


In [None]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 49279, 'bos_token_id': 1, 'pad_token_id': 2}.


Step,Training Loss
10,15.3059
20,13.9097
30,11.8326
