In [5]:
# !pip install -q accelerate datasets peft bitsandbytes tensorboard

In [4]:
# !pip install -q flash-attn --no-build-isolation

In [6]:
import torch
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration

USE_LORA = False
USE_QLORA = False
SMOL = True

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# model_id = "HuggingFaceTB/SmolVLM-256M-Instruct"
# model_id = "HuggingFaceTB/SmolVLM-500M-Instruct"
model_id = "HuggingFaceTB/SmolVLM-Instruct"

processor = AutoProcessor.from_pretrained(
    model_id
)

if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian"
    )
    lora_config.inference_mode = False
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config if USE_QLORA else None,
        _attn_implementation="flash_attention_2",
        device_map="auto"
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    print(model.get_nb_trainable_parameters())
else:
    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2",
    ).to(DEVICE)

    # vision params
    # for param in model.model.vision_model.parameters():
    #     param.requires_grad = False

In [7]:
from datasets import load_dataset
ds = load_dataset('flaviagiammarino/vqa-rad', trust_remote_code=True)

In [None]:
train_ds = ds["train"]
val_ds = ds["test"]

In [None]:
image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")]

def collate_fn(examples):
  texts = []
  images = []
  for example in examples:
      image = example["image"]
      if image.mode != 'RGB':
        image = image.convert('RGB')
      question = example["question"]
      answer = example["answer"]
      messages = [
          {
              "role": "user",
              "content": [
                  {"type": "text", "text": "Answer briefly."},
                  {"type": "image"},
                  {"type": "text", "text": question}
              ]
          },
          {
              "role": "assistant",
              "content": [
                  {"type": "text", "text": answer}
              ]
          }
      ]
      text = processor.apply_chat_template(messages, add_generation_prompt=False)
      texts.append(text.strip())
      images.append([image])

  batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
  labels = batch["input_ids"].clone()
  labels[labels == processor.tokenizer.pad_token_id] = -100
  labels[labels == image_token_id] = -100
  batch["labels"] = labels

  return batch

In [None]:
from transformers import TrainingArguments, Trainer

model_name = model_id.split("/")[-1]

training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    learning_rate=1e-3,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="epoch",
    save_total_limit=3,
    optim="adamw_hf",
    bf16=True,
    output_dir=f"./{model_name}-med-vqav1",
    hub_model_id=f"{model_name}-med-vqav1",
    report_to="tensorboard",
    remove_unused_columns=False,
    gradient_checkpointing=True,
    eval_strategy="steps",
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

In [8]:
trainer.train()

In [9]:
from huggingface_hub import notebook_login
notebook_login()

In [1]:
# trainer.push_to_hub()

In [10]:
from transformers import AutoProcessor, Idefics3ForConditionalGeneration
import torch

base_model_id = "HuggingFaceTB/SmolVLM-Instruct"
fine_tuned_model_path = "hasan-farooq/SmolVLM-Instruct-vqav1"
processor = AutoProcessor.from_pretrained(base_model_id)

model = Idefics3ForConditionalGeneration.from_pretrained(
    fine_tuned_model_path,
    torch_dtype=torch.bfloat16,
    _attn_implementation="flash_attention_2"
).to("cuda")

model.eval();

In [None]:
from PIL import Image
import torch

image = Image.open("./synpic18319.jpg")  # replace with your image path
image = image.convert("RGB")

question = "What can cause asymmetrical breasts?"

# template
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Answer briefly."},
            {"type": "image"},
            {"type": "text", "text": question}
        ]
    }
]

# token processing
inputs = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=inputs, images=[image], return_tensors="pt", padding=True)

inputs = {k: v.to("cuda") for k, v in inputs.items()}

with torch.no_grad():
    outputs = model.generate(**inputs, max_length=2048)

response = processor.batch_decode(outputs, skip_special_tokens=True)[0]
print("Answer:", response)


Answer: User: Answer briefly.<image>What can cause asymmetrical breasts?
Assistant: breast cancer
