<a href="https://colab.research.google.com/github/kusha31393/Finetune_qwen2.5_VL/blob/main/finetune_qwen_vl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Qwen Vision Language Model Fine-tuning for LaTeX OCR

This notebook fine-tunes the Qwen2-VL-7B-Instruct model to convert images of mathematical
equations into their corresponding LaTeX representations. Fine-tuning uses
LoRA (Low-Rank Adaptation) for parameter-efficient training.

Model: Qwen2-VL-7B-Instruct (4-bit quantized)
Dataset: unsloth/Latex_OCR (mathematical equations with LaTeX labels)
Training Method: Supervised Fine-Tuning (SFT) with LoRA adaptation

Original Colab notebook located at:
    https://colab.research.google.com/drive/1Ng4PP2AMkL69IApMyKt7QM2u-YF6sPHy

In [None]:
# Install required dependencies for fine-tuning
# - bitsandbytes: 4-bit quantization support
# - accelerate: Multi-GPU training acceleration
# - xformers: Memory-efficient attention implementation (pinned to 0.0.29.post3)
# - peft: Parameter-Efficient Fine-Tuning (LoRA)
# - trl: Transformer Reinforcement Learning library (provides SFTTrainer / SFTConfig used below)
# - unsloth: Fast training library for LLMs (provides FastVisionModel used below)
#
# NOTE(review): the first and third commands pass --no-deps, so pip does not install
# those packages' own dependencies -- presumably to keep the torch build that is already
# present in the runtime; confirm before running outside Colab.
# NOTE(review): triton, cut_cross_entropy and unsloth_zoo are installed here but never
# imported directly in this notebook -- presumably runtime requirements of unsloth; verify.

!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
from unsloth import FastVisionModel
import torch

In [None]:
# Pre-quantized 4-bit vision-language checkpoints that can be passed to
# FastVisionModel.from_pretrained. The list is informational: the notebook
# loads the Qwen2-VL entry by name further down.

fourbit_models = [
    # Meta's Llama 3.2 Vision model (11B, instruction-tuned)
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    # Alibaba's Qwen2-VL model (7B, instruction-tuned)
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
]

In [None]:
# Load the pre-quantized Qwen2-VL-7B-Instruct checkpoint in 4-bit precision.
# from_pretrained returns the model together with the object this notebook
# calls `tokenizer` (it is later invoked with both images and text).
# NOTE(review): actual memory savings depend on the runtime; measure them
# rather than relying on a fixed figure.

checkpoint_name = "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit"
model, tokenizer = FastVisionModel.from_pretrained(
    checkpoint_name,
    load_in_4bit=True,                     # load the weights 4-bit quantized
    use_gradient_checkpointing="unsloth",  # Unsloth's gradient-checkpointing mode
)

In [None]:
# Wrap the loaded model with LoRA adapters so that only the small adapter
# matrices are trained instead of the full set of weights.

# Which parts of the network receive adapters.
trainable_parts = dict(
    finetune_vision_layers=True,      # vision encoder layers
    finetune_language_layers=True,    # language model layers
    finetune_attention_modules=True,  # attention projections
    finetune_mlp_modules=True,        # feed-forward (MLP) blocks
)

# LoRA hyperparameters.
lora_settings = dict(
    r=16,                # rank of the adapter matrices (higher = more trainable parameters)
    lora_alpha=16,       # LoRA scaling factor, set equal to the rank here
    lora_dropout=0,      # no dropout on the adapter layers
    bias="none",         # bias handling: "none", "all", or "lora_only"
    random_state=3407,   # seed for reproducibility (same value as the trainer seed)
    use_rslora=False,    # rank-stabilized LoRA disabled
    loftq_config=None,   # LoftQ disabled
)

model = FastVisionModel.get_peft_model(model, **trainable_parts, **lora_settings)

In [None]:
# Fetch the training split of the LaTeX OCR dataset.
# Each record pairs an image of a mathematical equation ('image') with its
# LaTeX source ('text'); later cells index it as far as sample 42837.

from datasets import load_dataset

dataset = load_dataset(
    "unsloth/Latex_OCR",
    split="train",
)

In [None]:
# Display the loaded dataset object as the cell output.
dataset

In [None]:
# Inspect the first sample (its 'image' and 'text' fields are used below).
dataset[0]

In [None]:
# Render the equation image of the first sample.
dataset[0]['image']

In [None]:
# Spot-check a sample much further into the dataset: its equation image...
dataset[42837]['image']

In [None]:
# ...and the LaTeX label stored under 'text' for that same sample.
dataset[42837]['text']

In [None]:
# Define the instruction prompt that will be used for all training samples
# (convert_to_conversation reads this module-level name) and for the
# pre-training test prompt.
# This tells the model what task to perform on the input image.
# NOTE(review): the model is trained on this exact string, including the
# "LaTex" spelling; any prompt used at inference time should match it exactly.

instruction = "Write the LaTex representation for this image."

In [None]:
def convert_to_conversation(sample):
    """Wrap one image/LaTeX pair in the chat structure used for fine-tuning.

    The user turn carries the module-level ``instruction`` prompt followed by
    the sample's image; the assistant turn carries the sample's text as the
    target answer.

    Args:
        sample (dict): Dataset sample with two keys:
            - 'image': picture of the mathematical equation
            - 'text': the corresponding LaTeX source

    Returns:
        dict: ``{"messages": [user_turn, assistant_turn]}``.
    """
    # Prompt side: instruction text first, then the image.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    # Target side: the LaTeX string the model should learn to produce.
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["text"]}],
    }
    return {"messages": [user_turn, assistant_turn]}

In [None]:
# Sanity-check the conversion on the first sample and display the resulting dict.
convert_to_conversation(dataset[0])

In [None]:
# Run every dataset sample through convert_to_conversation up front.
# The result is a plain Python list of {"messages": [...]} dicts, which is
# what gets passed to SFTTrainer as train_dataset.

converted_dataset = list(map(convert_to_conversation, dataset))

In [None]:
# Display the second converted conversation to confirm the list is populated.
converted_dataset[1]

In [None]:
# Switch model to inference mode so the un-tuned baseline can be sampled
# before any training happens.
# NOTE(review): this has to be undone with FastVisionModel.for_training(model)
# before trainer.train() is called.

FastVisionModel.for_inference(model)

In [None]:
# Baseline check: query the model before fine-tuning.
# The second dataset sample serves as the test image, and the prompt uses the
# same layout as the training conversations (instruction text, then image).

image = dataset[1]["image"]

baseline_user_turn = {
    "role": "user",
    "content": [
        {"type": "text", "text": instruction},
        {"type": "image", "image": image},
    ],
}
messages = [baseline_user_turn]

In [None]:
# Turn the chat messages into model inputs.
# Step 1: render the messages with the chat template, appending the generation prompt.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

# Step 2: encode the image and the rendered prompt together. Special tokens are
# not added again because the rendered template already contains them.
inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt")

# Step 3: move the encoded batch to the GPU for generation.
inputs = inputs.to("cuda")

In [None]:
# Generate the baseline answer and print it token by token.

from transformers import TextStreamer

# skip_prompt=True keeps the input prompt out of the streamed output.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Sampling controls passed through to generate().
sampling_options = {
    "temperature": 1.5,  # sampling temperature
    "min_p": 0.1,        # minimum-probability cutoff
}

_ = model.generate(
    **inputs,
    streamer=text_streamer,  # stream tokens to stdout as they are produced
    max_new_tokens=128,      # cap on the length of the generated LaTeX
    use_cache=True,          # reuse the KV cache between decoding steps
    **sampling_options,
)

In [None]:
# Display the test image so the streamed baseline output can be compared against it.
image

In [None]:
# Import the training components (precision check, vision data collator, SFT trainer and config)

from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

In [None]:
# Switch model back to training mode after inference testing.
# The baseline test above put the model into inference mode with
# FastVisionModel.for_inference(model); calling for_inference again here would
# leave it in that mode for trainer.train(), so for_training is required.

FastVisionModel.for_training(model)

In [None]:
# Assemble the supervised fine-tuning run in three steps: data collator,
# training configuration, trainer.

# Unsloth's collator for vision-language samples (batches the image/text conversations).
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Use bf16 where it is supported and fall back to fp16 otherwise.
use_bf16 = is_bf16_supported()

training_args = SFTConfig(
    # Batch size and gradient settings
    per_device_train_batch_size=2,  # small per-device batch to limit memory use
    gradient_accumulation_steps=4,  # effective batch size per device = 2 * 4 = 8

    # Training schedule
    warmup_steps=5,      # learning-rate warmup steps
    max_steps=30,        # total optimizer steps (short demo run)
    learning_rate=2e-4,  # learning rate for the LoRA parameters

    # Precision settings (exactly one of the two is enabled)
    fp16=not use_bf16,
    bf16=use_bf16,

    # Logging and optimization
    logging_steps=1,             # log every step
    optim="adamw_8bit",          # 8-bit AdamW optimizer
    weight_decay=0.01,           # weight decay coefficient
    lr_scheduler_type="linear",  # linear learning-rate decay

    # Reproducibility and output
    seed=3407,             # random seed
    output_dir="outputs",  # where checkpoints/logs are written
    report_to="none",      # no external experiment tracker

    # Dataset configuration: the data is already in conversation format, so
    # the trainer must not drop columns or re-prepare it.
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
    dataset_num_proc=4,    # worker processes for dataset processing
    max_seq_length=2048,   # maximum sequence length
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=vision_collator,
    train_dataset=converted_dataset,
    args=training_args,
)

In [None]:
# Start the fine-tuning process
# This trains the LoRA adapters on the LaTeX OCR task for the number of steps
# configured in SFTConfig (max_steps=30), logging every step.

trainer.train()

In [None]:
# Switch trained model back to inference mode for testing
# (same call as the one used before the pre-training baseline test).

FastVisionModel.for_inference(model)

In [None]:
# Pick the third dataset sample as the post-training test image
# (the pre-training test used dataset[1]).
# NOTE(review): this sample comes from the same "train" split used for
# fine-tuning, so it is not a held-out example -- confirm that is intended.
image = dataset[2]["image"]

In [None]:
# Evaluation prompt: must be the exact string the model was fine-tuned on
# (the `instruction` defined before convert_to_conversation). The earlier
# "LateX" spelling here made the test prompt differ from the training prompt.
instruction = "Write the LaTex representation for this image."

In [None]:
# Create the test message for the fine-tuned model.
# The content layout (instruction text first, then the image) deliberately
# matches what convert_to_conversation produces for training and what the
# pre-training baseline test used, so the model is queried in the format it
# was trained on and the before/after outputs are directly comparable.

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},  # Task instruction
            {"type": "image", "image": image},      # Test image
        ],
    }
]

In [None]:
# Build the model inputs for the post-training test.

# Render the messages with the chat template, appending the generation prompt.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

# Encode image and prompt together as PyTorch tensors; the rendered template
# already holds the special tokens, so none are added here.
inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt")

# Generation runs on the GPU, so the encoded batch is moved there.
inputs = inputs.to("cuda")

In [None]:
# Generate the fine-tuned model's answer for the test image.
# Compare the streamed LaTeX with the baseline output produced before training.

from transformers import TextStreamer

# Stream generated tokens to stdout, leaving out the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Same generation settings as the pre-training baseline run.
generation_options = dict(
    streamer=text_streamer,  # real-time output streaming
    max_new_tokens=128,      # cap on the generated LaTeX length
    use_cache=True,          # enable KV caching
    temperature=1.5,         # sampling temperature
    min_p=0.1,               # minimum-probability cutoff
)

_ = model.generate(**inputs, **generation_options)

In [None]:
# Display the test image (dataset[2]) so the fine-tuned model's streamed
# LaTeX output can be checked against it visually.

image