In [4]:
# ruff: noqa
import sys
from pathlib import Path

# Absolute form of the parent of the current working directory
# (``..`` is kept as-is, not normalised away).
# Presumably added to sys.path so repo-local modules become importable - confirm.
REPO_ROOT = Path("..").absolute()
# Guard the append so re-running this cell stays idempotent instead of
# stacking duplicate sys.path entries.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

from contextlib import contextmanager
from unsloth import FastVisionModel  # FastLanguageModel for LLMs
from unsloth.trainer import UnslothVisionDataCollator

import torch
from datasets import load_dataset
from transformers import TextStreamer
from trl import SFTConfig, SFTTrainer
from IPython.display import display, Math, Latex

# Identifiers handed to FastVisionModel.from_pretrained and datasets.load_dataset.
MODEL_NAME = "unsloth/Qwen2-VL-7B-Instruct"
DATASET_NAME = "unsloth/LaTeX_OCR"

# Prompt text used as the default instruction by convert_to_conversation
# (training samples) and generate_image_text (inference).
INSTRUCTION = "Write the LaTeX representation for this image."

# Keyword arguments unpacked into FastVisionModel.get_peft_model that select
# which parts of the model are fine-tuned.
FINE_TUNE_CONFIG = dict(
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
)

# Adapter hyperparameters, unpacked into the same get_peft_model call.
LORA_CONFIG = dict(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
)
# Precision selector; in this cell it only drives the fp16/bf16 flags below.
DTYPE = torch.bfloat16

# Optimisation settings, unpacked into SFTConfig when the trainer is built.
TRAIN_CONFIG = dict(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=30,
    learning_rate=2e-4,
    # At most one of these is True, depending on DTYPE.
    fp16=(DTYPE == torch.float16),
    bf16=(DTYPE == torch.bfloat16),
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
)

# Logging/output settings, unpacked into SFTConfig.
LOG_CONFIG = dict(
    logging_steps=1,
    output_dir="qwen-vl-outputs",
    report_to="none",
)

# Dataset-handling settings, unpacked into SFTConfig alongside the two
# dictionaries above.
DATASET_CONFIG = dict(
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs=dict(skip_prepare_dataset=True),
    dataset_num_proc=4,
    max_seq_length=2048,
)

# Not referenced by any of the visible cells; presumably the adapter save
# location used later - confirm.
SAVE_PATH = "qwen_vl_lora_model"

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
DEBUG::VLLLM VLLM_LOGGING_CONFIG_PATH None
INFO 03-22 11:35:21 /home/jeromeku/dev/third_party/unsloth/.unsloth.env/lib/python3.11/site-packages/vllm/platforms/__init__.py:207] Automatically detected platform cuda.
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


🦥 Unsloth Zoo will now patch everything to make training faster!
DEBUG::VLLLM VLLM_LOGGING_CONFIG_PATH None
INFO 03-22 11:35:27 /home/jeromeku/dev/third_party/unsloth/.unsloth.env/lib/python3.11/site-packages/vllm/platforms/__init__.py:207] Automatically detected platform cuda.


In [5]:
def prepare_model_and_tokenizer(
    model_name,
    fine_tune_config: dict,
    lora_config: dict,
    load_in_4bit=True,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    **kwargs,
):
    """Load a vision model through Unsloth and wrap it with PEFT adapters.

    Args:
        model_name: Identifier passed to ``FastVisionModel.from_pretrained``.
        fine_tune_config: Keyword arguments for ``get_peft_model`` selecting
            which parts of the model are fine-tuned.
        lora_config: Further keyword arguments for ``get_peft_model`` (adapter
            hyperparameters). Must not share keys with ``fine_tune_config``,
            since both are unpacked into the same call.
        load_in_4bit: Use 4bit to reduce memory use. False for 16bit LoRA.
        use_gradient_checkpointing: True or "unsloth" for long context.
        random_state: Seed forwarded to ``get_peft_model``.
        **kwargs: Extra keyword arguments forwarded to
            ``FastVisionModel.from_pretrained``. They were previously accepted
            but silently dropped.

    Returns:
        The ``(model, tokenizer)`` pair, where ``model`` is the value returned
        by ``get_peft_model`` and ``tokenizer`` is the second value returned
        by ``from_pretrained``.
    """
    model, tokenizer = FastVisionModel.from_pretrained(
        model_name,
        load_in_4bit=load_in_4bit,
        use_gradient_checkpointing=use_gradient_checkpointing,
        **kwargs,
    )

    model = FastVisionModel.get_peft_model(
        model,
        **fine_tune_config,
        **lora_config,
        random_state=random_state,
        # target_modules = "all-linear", # Optional now! Can specify a list if needed
    )

    return model, tokenizer


def prepare_dataset(dataset_name, split="train"):
    """Return the ``split`` of ``dataset_name`` exactly as ``load_dataset`` yields it."""
    return load_dataset(dataset_name, split=split)


def convert_to_conversation(sample, instruction=INSTRUCTION):
    """Wrap one dataset row as a two-turn chat record.

    Args:
        sample: Mapping that provides an ``"image"`` and a ``"text"`` entry.
        instruction: Prompt text placed in the user turn.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}``. The user turn holds the
        instruction text followed by the image; the assistant turn holds the
        sample's text.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["text"]}],
    }
    return {"messages": [user_turn, assistant_turn]}


def generate_image_text(
    model,
    tokenizer,
    image,
    instruction=INSTRUCTION,
    temperature=1.5,
    min_p=0.1,
    max_new_tokens=128,
    use_cache=True,
):
    """Generate text for ``image`` while streaming the new tokens to stdout.

    Args:
        model: Object exposing ``generate``.
        tokenizer: Object used for ``apply_chat_template``, for encoding the
            ``(image, prompt)`` pair, and as the ``TextStreamer`` decoder.
        image: Image passed as the first positional argument to ``tokenizer``.
        instruction: Prompt text placed after the image placeholder.
        temperature: Forwarded to ``model.generate``.
        min_p: Forwarded to ``model.generate``.
        max_new_tokens: Forwarded to ``model.generate``.
        use_cache: Forwarded to ``model.generate``.

    Returns:
        Whatever ``model.generate`` returns.
    """
    # NOTE(review): the content order here is image-then-text, whereas
    # convert_to_conversation builds text-then-image - confirm this is intended.
    user_content = [{"type": "image"}, {"type": "text", "text": instruction}]
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_content}],
        add_generation_prompt=True,
    )

    # The encoded inputs are always moved to the "cuda" device.
    model_inputs = tokenizer(
        image, prompt, add_special_tokens=False, return_tensors="pt"
    ).to("cuda")

    generation_options = {
        # skip_prompt=True: the streamer is asked not to echo the prompt.
        "streamer": TextStreamer(tokenizer, skip_prompt=True),
        "max_new_tokens": max_new_tokens,
        "use_cache": use_cache,
        "temperature": temperature,
        "min_p": min_p,
    }
    return model.generate(**model_inputs, **generation_options)


@contextmanager
def inference_context(model):
    """Temporarily switch ``model`` to inference mode.

    Calls ``FastVisionModel.for_inference(model)`` on entry and
    ``FastVisionModel.for_training(model)`` on exit. The restore step runs in
    a ``finally`` clause, so the model is put back into training mode even
    when the body of the ``with`` block raises.
    """
    FastVisionModel.for_inference(model)
    try:
        yield
    finally:
        FastVisionModel.for_training(model)

In [6]:
# Load the base model with adapters attached, then the training split.
model, tokenizer = prepare_model_and_tokenizer(
    model_name=MODEL_NAME,
    fine_tune_config=FINE_TUNE_CONFIG,
    lora_config=LORA_CONFIG,
)
dataset = prepare_dataset(DATASET_NAME)

# Materialise every row as a chat-style record (a plain Python list).
converted_dataset = list(map(convert_to_conversation, dataset))

# Row 2 serves as the preview example: keep its image for the inference
# check and render its reference LaTeX.
image, latex = (dataset[2][field] for field in ("image", "text"))
display(Math(latex))

==((====))==  Unsloth 2025.3.17: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.109 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Unsloth: Making `model.base_model.model.visual` require gradients
Unsloth: Model does not have a default image size - using 512
$$H ^ { \prime } = \: \::::
:::
:

<|im_start|>:::-template

:::
:-:::

::-::

::
:
:-::
:
:-::
:
:-::
:
:(""+system:::
:(""+:::-::(""+::,<|im_end|>
tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198, 151652, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151653,   7985,    279,  97913,  13042,    369,    419,   2168,
             13, 151645,    198, 151644,  77091,    198,  14085,     39,   6306,
            314,   1124,  32338,    335,    284,   1124,     25,   1124,     25,
             25,     25,     25,    198,     25,     25,     25,    198,     25,
            271, 151644,     25,     25,     25,     12,   4214,    271,   

In [None]:
# Build the trainer pieces one at a time so each can be inspected on its own.
vision_collator = UnslothVisionDataCollator(model, tokenizer)  # Must use!
sft_args = SFTConfig(**TRAIN_CONFIG, **LOG_CONFIG, **DATASET_CONFIG)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=vision_collator,
    train_dataset=converted_dataset,
    args=sft_args,
)

# Preview generation on the sample image. No trainer.train() call appears in
# this cell, so this runs with the adapters as they are at this point.
# generate_image_text streams the decoded text; print() shows the returned value.
with inference_context(model):
    outputs = generate_image_text(model, tokenizer, image, INSTRUCTION)
    print(outputs)