In [19]:
import torch
import os, time
from datetime import datetime

In [2]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

In [3]:
# Load the Qwen3-VL 4B instruction-tuned checkpoint from the Hugging Face Hub.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-4B-Instruct",
    # Keep the weights in bfloat16 instead of the default precision.
    dtype=torch.bfloat16,
    # NOTE(review): this attention backend relies on the separate `flash-attn`
    # package being installed — confirm the environment provides it.
    attn_implementation="flash_attention_2",
    # Delegate device placement to the library rather than pinning a device.
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
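# One-time setup (already run; uncomment only on a fresh machine):
# clone the benchmark repository that provides the ./tasks/ inputs used below.
# !git clone https://github.com/kXborg/vlm-bench.git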

Cloning into 'vlm-bench'...
remote: Enumerating objects: 284, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 284 (delta 32), reused 66 (delta 11), pack-reused 191 (from 1)[K
Receiving objects: 100% (284/284), 55.30 MiB | 27.26 MiB/s, done.
Resolving deltas: 100% (101/101), done.
Updating files: 100% (101/101), done.


In [7]:
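# One-time setup (already run; uncomment only on a fresh environment).
# Prefer `%pip install pickleshare` so the package lands in the kernel's own environment.
# !pip install pickleshare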

Collecting pickleshare
  Downloading pickleshare-0.7.5-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading pickleshare-0.7.5-py2.py3-none-any.whl (6.9 kB)
Installing collected packages: pickleshare
Successfully installed pickleshare-0.7.5
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [11]:
# Switch the kernel's working directory into the cloned repository so the
# relative paths used later (./tasks/..., ocr_results/) resolve inside it.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel looks for vlm-bench/vlm-bench and fails.
%cd vlm-bench
# List the repository contents as a sanity check.
!ls

/workspace/vlm-bench


In [4]:
# Load the processor for the same checkpoint id as the model; it is used below
# to apply the chat template to the request and to decode the generated ids.
processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-4B-Instruct")

In [12]:
# Single-turn chat request: one user message carrying an image and the OCR
# instructions for it.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                # Relative path: resolved against the current working directory
                # (the vlm-bench checkout after the `%cd` cell).
                "image": "./tasks/multi-page-pdf.png"
            },
            {
                "type": "text",
                # Adjacent string literals are concatenated into one prompt.
                # The prompt asks for GitHub-Flavored Markdown output with a
                # `---PAGE n---` separator before each page and no commentary.
                "text": (
                    "This image contains **all pages of a multi-page PDF**, stacked vertically from top to bottom.\n"
                    "Perform OCR and return the **full text** in **GitHub-Flavored Markdown**, preserving the original layout as closely as possible:\n"
                    "- Use `#`, `##`, `###` for headings.\n"
                    "- Keep bullet points (`-`, `*`) and numbered lists.\n"
                    "- Render tables with Markdown table syntax (`| col | col |`).\n"
                    "- Mark **bold** with `**text**` and *italic* with `*text*`.\n"
                    "- Preserve indentation and blank lines for paragraphs.\n"
                    "- Insert a clear page separator **exactly** like this before each new page:\n"
                    "  ```\n"
                    "  ---PAGE 1---\n"
                    "  ```\n"
                    "  (replace `1` with the correct page number).\n\n"
                    "Start with page 1 at the top of the image. Do **not** add any extra commentary."
                )
            }
        ]
    }
]

In [16]:
# Function to save texts
def save_ocr(output_text_list, output_dir="ocr_results"):
    """Write OCR output to a timestamped Markdown file and return its path.

    Args:
        output_text_list: Iterable of strings (e.g. the list returned by
            ``processor.batch_decode``). The items are concatenated with no
            separator and surrounding whitespace is stripped.
        output_dir: Directory to write into; created if it does not exist.
            Defaults to ``"ocr_results"`` (relative to the working directory).

    Returns:
        The path of the file that was written, as a string of the form
        ``<output_dir>/ocr_YYYYMMDD_HHMMSS.md``.
    """
    os.makedirs(output_dir, exist_ok=True)

    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    filepath = f"{output_dir}/ocr_{stamp}.md"
    # The timestamp only has one-second resolution: add a numeric suffix
    # instead of silently overwriting a file saved within the same second.
    suffix = 1
    while os.path.exists(filepath):
        filepath = f"{output_dir}/ocr_{stamp}_{suffix}.md"
        suffix += 1

    text = "".join(output_text_list).strip()
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"Saved to {filepath}")
    return filepath

In [13]:
# Preparation for inference: run the chat messages through the processor's
# chat template, tokenizing in the same step.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    # Append the generation prompt so the model continues with its own reply.
    add_generation_prompt=True,
    # Return a dict-like batch so it can be unpacked into generate(**inputs).
    return_dict=True,
    # Return PyTorch tensors.
    return_tensors="pt"
)
# Move the batch to the device the model reports as its own.
inputs = inputs.to(model.device)

In [21]:
# Inference: generate the OCR transcription, decode it, and save it to disk.
print('Extracting texts.')
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time of
# this multi-minute run cannot be skewed by system clock adjustments.
t1 = time.perf_counter()

generated_ids = model.generate(**inputs, max_new_tokens=5024)
# Each output sequence starts with its prompt tokens; slice them off so only
# the newly generated continuation is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# Stop the timer before saving: the reported figure covers generation and
# decoding only, not the file write.
t2 = time.perf_counter()
save_ocr(output_text)
print(f"Saved output to markdown file. \nTotal time taken : {round(t2-t1, 2)} s")

Extracting texts.
Saved to ocr_results/ocr_20251105_130417.md
Saved output to markdown file. 
Total time taken : 246.77 s


In [22]:
# List the working directory again, presumably to confirm that ocr_results/ now exists.
!ls

 Medical	        Models	    Results	  deconstruct	    ocr_results
'Model architectures'   README.md   VLM-on-Edge   img_cls_metrics   tasks


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
