## Unsloth Installation: *ONLY FOR COLAB*

In [1]:
# %%capture
# import os
# if "COLAB_" not in "".join(os.environ.keys()):
#     !pip install unsloth
# else:
#     # Do this only in Colab and Kaggle notebooks! Otherwise use pip install unsloth
#     !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
#     !pip install --no-deps cut_cross_entropy unsloth_zoo
#     !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
#     !pip install --no-deps unsloth

# Import Libraries

In [1]:
from unsloth import FastVisionModel, is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTConfig, SFTTrainer
import torch
from datasets import load_dataset, Dataset
from transformers import Qwen2VLImageProcessor
import pandas as pd

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


#### Expandable Memory Segments

In [2]:
import os
# Opt in to PyTorch's expandable-segments mode for the CUDA caching allocator.
# NOTE(review): torch and unsloth are already imported in an earlier cell; confirm this is
# still set early enough (before the first CUDA allocation) to take effect, and that the
# option is supported on the platform this notebook runs on.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that forces synchronous
# kernel launches and slows training; confirm it is intentionally left enabled.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Make the Model

## Define Models

In [3]:
# Registry of the Qwen2-VL variants this notebook can fine-tune, keyed by model size.
#   'model': Hugging Face id of the 4-bit base checkpoint passed to from_pretrained
#   'repo' : Hugging Face repo the fine-tuned adapter and processor are pushed to
qwen_models = {
    "2B": {
        'model': "unsloth/Qwen2-VL-2B-Instruct-unsloth-bnb-4bit",
        'repo' : "hamzamooraj99/MedQA-Qwen-2B-LoRA16"
    },
    "7B": {
        # Fixed: the id previously ended in "bnb=4bit", which is not a valid repo id;
        # it now follows the same "-bnb-4bit" suffix as the 2B entry.
        'model': "unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit",
        'repo' : "hamzamooraj99/MedQA-Qwen-7B-LoRA16"
    }
}

In [4]:
# Pick the registry entry to fine-tune: base checkpoint id and the repo to push results to.
model_name, save_repo = (qwen_models['2B'][field] for field in ('model', 'repo'))

## Model Prep

In [5]:
# Load the selected base checkpoint in 4-bit together with its processor
# (returned here under the name `tokenizer`).
model, tokenizer = FastVisionModel.from_pretrained(
    model_name,
    max_seq_length=1024,
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
)

# Which parts of the network receive adapters: every switch is turned on.
finetune_targets = dict(
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
)

# LoRA hyperparameters (r=16 / alpha=16, no dropout, no bias terms, fixed seed).
lora_hyperparams = dict(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias='none',
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Wrap the base model with the adapters configured above.
model = FastVisionModel.get_peft_model(model, **finetune_targets, **lora_hyperparams)

  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.3.14: Fast Qwen2_Vl patching. Transformers: 4.49.0.
   \\   /|    NVIDIA GeForce RTX 4080 SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Unsloth: Making `model.base_model.model.visual` require gradients


In [6]:
# Replace the processor's image pipeline with one that caps image resolution,
# bounding the pixel budget between 224*224 and 256*256.
# NOTE(review): `num_workers` and `use_fast` do not look like documented
# Qwen2VLImageProcessor constructor arguments; confirm they have any effect here.
# NOTE(review): confirm the installed transformers version accepts a height/width `size`
# dict for this class; pixel limits may be driven by min_pixels/max_pixels instead.
tokenizer.image_processor = Qwen2VLImageProcessor(
    do_resize = True,
    size = {"height": 256, "width": 256},
    min_pixels = 224 * 224,
    max_pixels = 256 * 256,
    num_workers = 16,
    use_fast = True,
)

# Dataset

## Load the Data & Preview

In [7]:
# Load the train and validation splits of the same dataset repo, in that order.
train_set, val_set = (
    load_dataset("hamzamooraj99/PMC-VQA-1", split=split_name)
    for split_name in ("train", "validation")
)

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/40 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

## Format the Dataset

In [8]:
def convert_to_conversation(sample):
    """Turn one dataset record into a three-turn chat conversation.

    Args:
        sample: mapping that provides the 'Question', 'image' and 'Answer' fields.

    Returns:
        A dict with a single "messages" key holding the system, user and assistant
        turns. The user turn carries the question text followed by the image; the
        assistant turn carries the reference answer.
    """
    def _text_part(value):
        # Every textual piece of a turn uses the same {"type", "text"} layout.
        return {"type": "text", "text": value}

    system_turn = {
        "role": "system",
        "content": [
            _text_part("You are a medical image analysis assistant. Provide accurate and detailed answers to questions about medical images.")
        ],
    }
    user_turn = {
        "role": "user",
        "content": [
            _text_part(sample['Question']),
            {"type": "image", "image": sample['image']},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [_text_part(sample['Answer'])],
    }
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Fine-Tune the Model

In [9]:
# Switch the model into Unsloth's training configuration before building the trainer.
FastVisionModel.for_training(model)

# Pick the mixed-precision mode from the hardware instead of hard-coding it:
# bf16 when the GPU supports it, fp16 otherwise (exactly one of the two is enabled).
use_bf16 = is_bf16_supported()

# Supervised fine-tuning trainer. Raw dataset rows are converted to chat conversations
# by the collator (via convert_to_conversation), so dataset preparation is skipped below.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model=model, processor=tokenizer, formatting_func=convert_to_conversation),
    train_dataset = train_set,
    eval_dataset = val_set,
    args = SFTConfig(
        per_device_train_batch_size=6,  #Samples each GPU processes per forward/backward pass
        gradient_accumulation_steps=2,  #Passes accumulated per optimiser step (6 x 2 = 12 samples per GPU per update)
        warmup_steps=50,                #Gradually increases learning rate for first n steps to prevent instability
        num_train_epochs=1,             #One full pass over the training set (use max_steps=30 for a quick test)
        # Optimisation & Mixed Precision
        learning_rate=2e-4,
        fp16=not use_bf16,              #Fall back to float16 only when the GPU lacks bf16 support
        bf16=use_bf16,                  #Prefer bfloat16 when the GPU supports it (better stability)
        # Optimiser & Weight Decay
        optim="adamw_8bit",
        weight_decay=0.01,              #Regularisation to prevent overfitting
        lr_scheduler_type='linear',     #Decay type for learning rate from learning_rate to 0
        seed=3407,
        output_dir='outputs',
        # Logging & Reporting
        report_to='none',               #Integration with Weights & Biases ('none' disables, 'wandb' enables)
        # Settings for Vision Fine-Tuning
        remove_unused_columns=False,    #Keep the raw columns; the collator reads them directly
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=16,            #CPU processes for parallel dataset processing
        # max_seq_length=512,
        gradient_checkpointing = True,
        # Validation Settings
        do_eval=True,
        eval_strategy='steps',
        eval_steps=1000,
        load_best_model_at_end=True,    #Best = lowest eval_loss (see the two settings below)
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        per_device_eval_batch_size=4,
        # Save Settings (same 1000-step cadence as evaluation)
        save_strategy='steps',
        save_steps=1000,
        save_total_limit=2
    )
)

Unsloth: Model does not have a default image size - using 512


In [10]:
# @title Reset memory allocation
import gc

# Drop unreachable Python objects first so their CUDA tensors can be released,
# then hand cached blocks back to the driver and clean up IPC handles.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
# reset_max_memory_allocated() is deprecated; reset_peak_memory_stats() is its
# replacement and resets the peak counters that the later memory-stat cells read.
torch.cuda.reset_peak_memory_stats()



In [11]:
# Print PyTorch's CUDA memory summary so usage can be inspected before training starts.
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   2469 MiB |   2469 MiB |   2490 MiB |  21189 KiB |
|       from large pool |   2325 MiB |   2325 MiB |   2325 MiB |      0 KiB |
|       from small pool |    144 MiB |    144 MiB |    164 MiB |  21189 KiB |
|---------------------------------------------------------------------------|
| Active memory         |   2469 MiB |   2469 MiB |   2490 MiB |  21189 KiB |
|       from large pool |   2325 MiB |   2325 MiB |   2325 MiB |      0 KiB |
|       from small pool |    144 MiB |    144 MiB |    164 MiB |  21189 KiB |
|---------------------------------------------------------------

In [12]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4080 SUPER. Max memory = 15.992 GB.
2.543 GB of memory reserved.


In [13]:
# Resume from the newest checkpoint in the trainer's output directory when one exists;
# otherwise start a fresh run. Passing resume_from_checkpoint=True unconditionally makes
# the trainer fail on a clean run where no checkpoint has been written yet.
# NOTE(review): when resuming, keep per_device_train_batch_size equal to the value stored
# in the checkpoint's trainer_state.json; a mismatch is reported in the training log.
_output_dir = trainer.args.output_dir
_has_checkpoint = os.path.isdir(_output_dir) and any(
    entry.startswith("checkpoint-")
    and entry[len("checkpoint-"):].isdigit()
    and os.path.isdir(os.path.join(_output_dir, entry))
    for entry in os.listdir(_output_dir)
)
trainer_stats = trainer.train(resume_from_checkpoint=_has_checkpoint)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 154,253 | Num Epochs = 1 | Total steps = 15,425
O^O/ \_/ \    Batch size per device = 5 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (5 x 2 x 1) = 10
 "-____-"     Trainable parameters = 28,950,528/2,000,000,000 (1.45% trained)
	per_device_train_batch_size: 6 (from args) != 5 (from trainer_state.json)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
9000,0.8771,0.778581
10000,0.7423,0.774723
11000,0.7494,0.772927
12000,0.8576,0.769148
13000,0.7867,0.766358
14000,0.7175,0.765929
15000,0.7377,0.764269


Unsloth: Not an error, but Qwen2VLForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [14]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

21164.0092 seconds used for training.
352.73 minutes used for training.
Peak reserved memory = 8.0 GB.
Peak reserved memory for training = 5.457 GB.
Peak reserved memory % of max memory = 50.025 %.
Peak reserved memory for training % of max memory = 34.123 %.


# Save the Model

In [16]:
# Upload the fine-tuned model to the repo chosen in the model-selection cell...
model.push_to_hub(save_repo)
# ...and the processor (held in `tokenizer`) to the same repo.
tokenizer.push_to_hub(save_repo)

README.md:   0%|          | 0.00/644 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/116M [00:00<?, ?B/s]

Saved model to https://huggingface.co/hamzamooraj99/MedQA-Qwen-2B-LoRA16


README.md:   0%|          | 0.00/644 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [17]:
# @title Reset memory allocation
import gc

# Drop unreachable Python objects first so their CUDA tensors can be released,
# then hand cached blocks back to the driver and clean up IPC handles.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
# reset_max_memory_allocated() is deprecated; reset_peak_memory_stats() is its
# replacement and clears the peak memory counters.
torch.cuda.reset_peak_memory_stats()

