In [1]:
from unsloth import FastVisionModel 
import torch
import wandb

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:

# Load the base vision-language checkpoint and its tokenizer.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-11B-Vision-Instruct",
    # 4-bit loading reduces memory use; set to False for 16-bit LoRA.
    load_in_4bit = True,
    # Accepts True or "unsloth"; intended for long context.
    use_gradient_checkpointing = "unsloth",
)

==((====))==  Unsloth 2025.3.17: Fast Mllama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.168 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Attach LoRA adapters to the loaded model.
model = FastVisionModel.get_peft_model(
    model,
    # Which parts of the network are fine-tuned: everything is switched on.
    finetune_vision_layers     = True,
    finetune_language_layers   = True,
    finetune_attention_modules = True,
    finetune_mlp_modules       = True,
    # LoRA hyperparameters.
    r            = 16,
    lora_alpha   = 16,
    lora_dropout = 0,
    bias         = "none",
    # Fixed seed so the adapter setup is repeatable.
    random_state = 3407,
    use_rslora   = False,
    loftq_config = None,
)


Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


# DATASET

In [4]:
from datasets import load_dataset

# Only the "train" split is loaded into `dataset`.
dataset = load_dataset(
    path = "Aditya-Khedekar/SarvamAI-VLM-dataset",
    split = "train",
)

README.md:   0%|          | 0.00/450 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/25.2M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/6.95M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/107 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/29 [00:00<?, ? examples/s]

In [5]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['image', 'text'],
    num_rows: 107
})

In [6]:
# Inspect the first record: an "image" entry paired with its "text" entry.
dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1758x2267>,
 'text': 'Kargil and Beyond\n\nSpeech by Sh. Jaswant Singh, Minister of External Affairs at\nIndia International Centre on July 20, 1999\n\nMr. Chairman, Ladies and\nGentlemen,\n\nIt is now time to look ahead; to\nlook beyond Kargil. But even in\ncharting our course for the fu-\nture we have to assess what\nKargil 1999 was all about? What\nwere the challenges — military\nand diplomatic? What new fac-\nets of our total national com-\nmitment and endeavour\nemerged? What lessons for the\nyears that lie ahead? ‘Operation\nVijay’ — as the Prime Minister\nsaid some days back — has\nresulted in ‘Vijay’ for India. As\nwe re-examine the military and\ndiplomatic challenges that then\nconfronted us, and which were\nsuccessfully managed, we need\nto have a preliminary analysis,\ndraw some first conclusions and\nabove all, looking beyond Kargil,\ndraw a route chart for the to-\nmorrows to come.\n\nFirst, the military dime

# CONVERTED DATASET

In [8]:
instruction = "Extract the text from the provided image."

def convert_to_conversation(sample, prompt = None):
    """Turn one dataset record into a two-turn chat sample.

    Args:
        sample: Mapping with an "image" entry and a "text" entry.
        prompt: Optional instruction text for the user turn. When omitted,
            the module-level ``instruction`` is read at call time, so existing
            one-argument calls behave exactly as before. Passing it explicitly
            makes the result independent of later reassignments of that global.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}``. The user turn holds the
        instruction text followed by the image; the assistant turn holds
        ``sample["text"]``.
    """
    user_prompt = instruction if prompt is None else prompt
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": user_prompt},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": sample["text"]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}

In [9]:
# Build the in-memory list of chat-formatted samples, one per dataset record.
converted_dataset = list(map(convert_to_conversation, dataset))

In [10]:
# Inspect the first converted sample to check the user/assistant message layout.
converted_dataset[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': 'Extract the text from the provided image.'},
    {'type': 'image',
     'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1758x2267>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': 'Kargil and Beyond\n\nSpeech by Sh. Jaswant Singh, Minister of External Affairs at\nIndia International Centre on July 20, 1999\n\nMr. Chairman, Ladies and\nGentlemen,\n\nIt is now time to look ahead; to\nlook beyond Kargil. But even in\ncharting our course for the fu-\nture we have to assess what\nKargil 1999 was all about? What\nwere the challenges — military\nand diplomatic? What new fac-\nets of our total national com-\nmitment and endeavour\nemerged? What lessons for the\nyears that lie ahead? ‘Operation\nVijay’ — as the Prime Minister\nsaid some days back — has\nresulted in ‘Vijay’ for India. As\nwe re-examine the military and\ndiplomatic challenges that then\nconfronted us, and which wer

# INFERENCE EXAMPLE

In [None]:
from transformers import TextStreamer

FastVisionModel.for_inference(model) # Enable for inference!

# Sample generation on the first image of the loaded split.
instruction = "Extract the text from the provided image."
image = dataset[0]["image"]

# Single user turn: an image placeholder followed by the text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    },
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)

# Encode the image together with the templated prompt, then move it to the GPU.
inputs = tokenizer(image, input_text, add_special_tokens = False, return_tensors = "pt")
inputs = inputs.to("cuda")

# Stream the generated text as it is produced, without echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **inputs,
    streamer = text_streamer,
    max_new_tokens = 128,
    use_cache = True,
    temperature = 1.5,
    min_p = 0.1,
)

# FINETUNING

In [29]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

wandb.init(project="SarvamAI-VLM-FineTuning", name="FineTuning_V2")

# Collator that batches the image + chat samples (must be used for vision finetuning).
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Hyperparameters for this run. An earlier configuration (learning_rate = 2e-4,
# warmup_steps = 5, num_train_epochs = 1, linear scheduler) was replaced by the
# values below with the aim of a smoother loss curve.
use_bf16 = is_bf16_supported()
training_args = SFTConfig(
    # Batch of 2 with 4 accumulation steps.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,

    # Schedule: epoch-based run (used instead of max_steps), cosine decay.
    warmup_steps = 2,
    num_train_epochs = 3,
    learning_rate = 1e-4,
    lr_scheduler_type = "cosine",

    # Precision: bf16 when supported, otherwise fp16.
    fp16 = not use_bf16,
    bf16 = use_bf16,

    # Optimizer.
    optim = "adamw_8bit",
    weight_decay = 0.01,

    # Bookkeeping.
    logging_steps = 1,
    seed = 3407,
    output_dir = "outputs",
    report_to = "wandb",     # For Weights and Biases

    # You MUST put the below items for vision finetuning:
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    dataset_num_proc = 1,
    max_seq_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = converted_dataset,
    args = training_args,
)


# START FINETUNING

In [30]:
# Run the fine-tuning loop; the returned stats object is kept for later inspection.
trainer_stats = trainer.train()

# Save the model and the tokenizer side by side. The directory name is defined
# once so the completion message always reports the path actually written
# (it previously said "lora_model/" while saving to "lora_model_V2").
adapter_dir = "lora_model_V2"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
print(f"Training complete. Model saved to {adapter_dir}/")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 107 | Num Epochs = 3 | Total steps = 39
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 67,174,400/11,000,000,000 (0.61% trained)


Step,Training Loss
1,0.1671
2,0.2276
3,0.2282
4,0.1427
5,0.244
6,0.2675
7,0.4275
8,0.2935
9,0.194
10,0.2904


Training complete. Model saved to lora_model/


In [31]:
from transformers import TextStreamer

# Flip to True to reload the saved adapter instead of reusing the in-memory model.
if False:
    from unsloth import FastVisionModel
    model, tokenizer = FastVisionModel.from_pretrained(
        model_name = "lora_model_V2", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = True, # Set to False for 16bit LoRA
    )

# Always switch to inference mode before generating: the in-memory model was put
# into training mode for fine-tuning, and this call used to live only inside the
# disabled reload branch above.
FastVisionModel.for_inference(model) # Enable for inference!

# NOTE(review): this image comes from the "train" split the model was just
# fine-tuned on, so the output below does not measure generalization; the
# dataset also ships a "test" split.
image = dataset[0]["image"]

# Same prompt string (including the trailing period) as the one used to build
# the training conversations.
instruction = "Extract the text from the provided image."

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# NOTE(review): max_new_tokens = 100000 is effectively "no limit"; generation
# only ends early if the model emits an end-of-sequence token. Consider a
# tighter cap if runs hang.
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100000,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

Kargil and Beyond

Speech by Sh. Jaswant Singh, Minister of External Affairs at
India International Centre on July 20, 1999

Mr. Chairman, Ladies and
Gentlemen,

It is now time to look ahead; to
look beyond Kargil. But even in
charting our course for the fu-
ture we have to assess what
Kargil 1999 was all about? What
were the challenges — military
and diplomatic? What new fac-
ets of our total national com-
mitment and endeavour
emerged? What lessons for the
years that lie ahead? ‘Operation
Vijay’ — as the Prime Minister
said some days back — has
resulted in ‘Vijay’ for India. As
we re-examine the military and
diplomatic challenges that then
confronted us, and which were
successfully managed, we need
to have a preliminary analysis,
draw some first conclusions and
above all, looking beyond Kargil,
draw a route chart for the to-
morrows to come.

First, the military dimension.
Kargil was a military aggression
by Pakistan, with Pak army regu-
lars, across a stretch of the LoC,
in four poc