In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
# installs
!pip install -U transformers peft accelerate datasets torchvision wandb



In [3]:
import wandb
wandb.init(project="DSPRO2", entity="DSproject2", name="ft-smolvlm-wandb")

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdiegogonzalezhslu[0m ([33mDSproject2[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import AutoProcessor, AutoModelForVision2Seq, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from PIL import Image
import json

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Processor
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")

# Base model
base_model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
    _attn_implementation="eager",
)
base_model.config.use_cache = False  # Required for gradient checkpointing

# ✅ LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

# ✅ Apply LoRA before moving to device
model = get_peft_model(base_model, lora_config)

# ✅ Enable gradient checkpointing before .to()
model.gradient_checkpointing_enable()

# ✅ Move to device
model = model.to(DEVICE)
model.train()

# ✅ Show trainable %
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable} / {total} ({trainable/total:.2%})")


Trainable params: 2568192 / 2248841072 (0.11%)


In [5]:
class ReceiptDataset(Dataset):
    def __init__(self, jsonl_path, processor):
        self.processor = processor
        self.samples = []
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                self.samples.append(json.loads(line))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        image = Image.open(sample["image_path"]).convert("RGB")
    
        # Remove ground-truth answer from messages
        messages = sample["messages"][:-1]
        
        # Generate prompt without the answer
        full_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)

    
        # ✅ NO truncation here! Let processor handle multi-modal correctly
        encoded = self.processor(
            text=full_prompt,
            images=image,
            padding="max_length",  # still allows batching
            max_length=1024,       # SmolVLM limit, but now safe
            return_tensors="pt",
        )
    
        input_ids = encoded["input_ids"]
        labels = input_ids.clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
    
        return {
            "input_ids": input_ids.squeeze(0),
            "labels": labels.squeeze(0),
            "pixel_values": encoded["pixel_values"].squeeze(0),
        }


In [6]:
def collate_fn(batch):
    return {
        "input_ids": torch.stack([item["input_ids"] for item in batch]),
        "labels": torch.stack([item["labels"] for item in batch]),
        "pixel_values": torch.stack([item["pixel_values"] for item in batch]),
    }


In [7]:
from transformers import Trainer, TrainingArguments

train_dataset = ReceiptDataset("dsp_train_data/train.jsonl", processor)

training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_dir="logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
    remove_unused_columns=False,  # ✅ very important for custom dicts
    dataloader_num_workers=0, # ✅ disables multiprocessing during dev
    report_to="wandb",
    logging_strategy="steps",
    run_name="ftuning-smolvlm"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=collate_fn,
)


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()


trainable params: 5,136,384 || all params: 2,251,409,264 || trainable%: 0.2281




In [9]:
batch = train_dataset[0]
for k in batch:
    print(k, batch[k].shape, batch[k].dtype)

output = model(
    input_ids=batch["input_ids"].unsqueeze(0).to(model.device),
    pixel_values=batch["pixel_values"].unsqueeze(0).to(model.device),
    labels=batch["labels"].unsqueeze(0).to(model.device),
)

print("loss", output.loss)
print("requires_grad", output.loss.requires_grad)


input_ids torch.Size([1024]) torch.int64
labels torch.Size([1024]) torch.int64
pixel_values torch.Size([9, 3, 384, 384]) torch.float32


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


loss tensor(18.0376, device='cuda:0', grad_fn=<NllLossBackward0>)
requires_grad True


In [10]:
trainer.train()


Step,Training Loss
10,17.7606
20,16.7633
30,15.3248
40,13.421
50,10.75
60,7.5442
70,3.9972
80,1.1604
90,0.4184
100,0.3108


TrainOutput(global_step=507, training_loss=1.7760424816161688, metrics={'train_runtime': 3324.9678, 'train_samples_per_second': 0.152, 'train_steps_per_second': 0.152, 'total_flos': 6867679309310688.0, 'train_loss': 1.7760424816161688, 'epoch': 1.0})

In [11]:
trainer.save_model("ft_smol2")

In [12]:
processor.save_pretrained("ft_smol2")


['ft_smol2/processor_config.json']

In [13]:
from peft import PeftModel
from transformers import AutoModelForVision2Seq, AutoProcessor

# Paths
peft_path = "ft_smol2"
merge_path = "merged_ft_smol2"

# Load base model and fine-tuned LoRA weights
base = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.float32,
    _attn_implementation="eager"
)
model = PeftModel.from_pretrained(base, peft_path)

# Merge LoRA into base
merged_model = model.merge_and_unload()
merged_model.save_pretrained(merge_path)

# Save processor too
processor = AutoProcessor.from_pretrained(peft_path)
processor.save_pretrained(merge_path)

print(f"✅ Merged model saved to: {merge_path}")


✅ Merged model saved to: merged_ft_smol2
