In [23]:
import os, time, glob
import re
from datetime import datetime

import torch

In [2]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

In [3]:
# Load the Qwen3-VL 4B instruct checkpoint. The options are collected in one
# mapping so they can be inspected or tweaked before the (slow) load:
#   - dtype: weights are requested in bfloat16
#   - attn_implementation: selects the "flash_attention_2" attention backend
#   - device_map: "auto" leaves device placement to the library
model_load_options = {
    "dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
    "device_map": "auto",
}
model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-4B-Instruct", **model_load_options
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# One-time setup (already run for the logged output below); uncomment to fetch the benchmark repo.
# !git clone https://github.com/kXborg/vlm-bench.git

Cloning into 'vlm-bench'...
remote: Enumerating objects: 284, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 284 (delta 32), reused 66 (delta 11), pack-reused 191 (from 1)[K
Receiving objects: 100% (284/284), 55.30 MiB | 27.26 MiB/s, done.
Resolving deltas: 100% (101/101), done.
Updating files: 100% (101/101), done.


In [7]:
# One-time setup (already run for the logged output below); uncomment to install pickleshare.
# !pip install pickleshare

Collecting pickleshare
  Downloading pickleshare-0.7.5-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading pickleshare-0.7.5-py2.py3-none-any.whl (6.9 kB)
Installing collected packages: pickleshare
Successfully installed pickleshare-0.7.5
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [27]:
# One-time step: uncomment to change into the cloned repo
# (presumably where ./tasks lives — verify against your working directory).
# %cd vlm-bench
# List the files under tasks/; the OCR cell later reads ./tasks/pdf-images/.
!ls tasks

61439.pdf		     github-readme.png	      potholes.png
apples.png		     icons.png		      table.png
bird.jpg		     multi-page-pdf-half.png  tlcm-3.jpg
cable-trip.jpg		     multi-page-pdf.png       tlcm-4.jpg
dine.jpg		     ocr.png		      tlcm.jpg
driving-gaze.jpg	     paper-blade.png	      vlm-evaluation.pdf
esp32-devkitC-v4-pinout.png  pdf-images		      work-table.jpg


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# Processor (chat template + tokenizer/image preprocessing entry point used
# below via apply_chat_template / batch_decode) for the Qwen3-VL checkpoint.
processor_checkpoint = "Qwen/Qwen3-VL-4B-Instruct"
processor = AutoProcessor.from_pretrained(processor_checkpoint)

In [28]:
image_dir = "./tasks/pdf-images/"
image_extensions = ["*.png", "*.jpg", "*.jpeg", "*.bmp", "*.tiff"]


def natural_sort_key(path):
    """Return a sort key that orders file names by their embedded numbers.

    A plain string sort puts ``page10.png`` before ``page2.png``. Because the
    prompt tells the model that image order equals page order, the pages must
    be sorted numerically instead.

    Args:
        path: File path; only its basename takes part in the ordering.

    Returns:
        A ``(chunks, name)`` tuple. ``chunks`` alternates text pieces (str) and
        digit runs (int); ``name`` breaks ties such as ``page01`` vs ``page1``.
    """
    name = os.path.basename(path)
    # re.split with a capturing group always yields text, digits, text, ...
    # so odd indices are exactly the digit runs; even indices stay strings.
    chunks = [
        int(piece) if index % 2 else piece
        for index, piece in enumerate(re.split(r"([0-9]+)", name))
    ]
    return chunks, name


# Collect every matching image in image_dir, in page order (natural sort).
image_paths = sorted(
    (p for ext in image_extensions for p in glob.glob(os.path.join(image_dir, ext))),
    key=natural_sort_key,
)

# The OCR instruction text, split into its logical sections. Each section is
# plain adjacent-literal concatenation; the sections are joined with no
# separator, so the final prompt is one continuous string.
prompt_intro = (
    "This input contains **multiple images**, each representing **one page of a document** "
    "(page 1 = first image, page 2 = second image, …).\n\n"

    "Perform OCR on **all images** and return the **full text** in **GitHub-Flavored Markdown**.\n"
    "Preserve the original layout **exactly**, paying special attention to **code cells**:\n\n"
)

prompt_general_rules = (
    "### General rules\n"
    "- Use `#`, `##`, `###` for headings.\n"
    "- Keep bullet/numbered lists, bold (`**`), italic (`*`).\n"
    "- Render tables with `| col | col |` syntax (detect alignment if possible).\n"
    "- Preserve indentation, blank lines, and paragraph separation.\n\n"
)

# NOTE(review): the "Never split a code block across pages" rule and the
# following "close it ... re-open the same fence" rule ask for opposite
# things — confirm which behavior is intended.
prompt_code_rules = (
    "### Code-cell handling (critical)\n"
    "- **Detect any fenced code block** (starts with ``` or indented 4 spaces).\n"
    "- **Wrap it in a Markdown fenced block** using three backticks (```).\n"
    "- **Add the language identifier** immediately after the opening fence if detectable "
    "(e.g., ```python, ```bash, ```json). If unsure, use ```text.\n"
    "- **Never split a code block across pages** – keep the *entire* block together.\n"
    "- If a block is interrupted by a page break, **close it**, insert the page separator, "
    "then **re-open the same fence** with the same language tag and continue.\n"
    "- Preserve **exact indentation** inside the block.\n"
    "- Keep Jupyter cell markers (`# In[1]:`, `# %%`, etc.) **inside** the block.\n\n"
)

# NOTE(review): the separator example is itself shown between ``` fences while
# the rule says it must never be placed inside a code block — confirm wording.
prompt_page_separators = (
    "### Page separators\n"
    "- Insert **exactly** this before each new page (including page 1):\n"
    " ```\n"
    " ---PAGE {n}---\n"
    " ```\n"
    " (replace `{n}` with the 1-based page number). **Never place inside a code block**.\n\n"
)

prompt_closing = (
    "**Start with page 1 (first image). Return *only* the Markdown – no extra text.**"
)

ocr_prompt = (
    prompt_intro
    + prompt_general_rules
    + prompt_code_rules
    + prompt_page_separators
    + prompt_closing
)

# One image entry per page, in image_paths order, followed by the text prompt,
# all inside a single user turn.
image_entries = [{"type": "image", "image": path} for path in image_paths]
user_turn = {
    "role": "user",
    "content": image_entries + [{"type": "text", "text": ocr_prompt}],
}
messages = [user_turn]

In [29]:
def save_ocr(output_text_list, output_dir="ocr_results"):
    """Write decoded OCR text to a timestamped Markdown file.

    Args:
        output_text_list: Iterable of strings (e.g. the list returned by
            ``processor.batch_decode``). The pieces are concatenated with no
            separator and leading/trailing whitespace is stripped.
        output_dir: Directory to write into; created if it does not exist.
            Defaults to ``"ocr_results"``.

    Returns:
        The path of the file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    stem = f"ocr_{datetime.now():%Y%m%d_%H%M%S}"
    filepath = os.path.join(output_dir, f"{stem}.md")
    # The timestamp only has one-second resolution; add a numeric suffix
    # rather than silently overwriting a result saved within the same second.
    suffix = 1
    while os.path.exists(filepath):
        filepath = os.path.join(output_dir, f"{stem}_{suffix}.md")
        suffix += 1

    text = "".join(output_text_list).strip()
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"Saved to {filepath}")
    return filepath

In [32]:
# Preparation for inference: render the chat messages through the processor's
# chat template, returning PyTorch tensors, then move them to the model device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
)
inputs = inputs.to(model.device)

# Cap on newly generated tokens. If generation reaches this cap the OCR text
# is cut off, so the cell warns about it below.
MAX_NEW_TOKENS = 16000

# Inference: Generation of the output
print('Extracting texts.')
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments during a long run.
t1 = time.perf_counter()

generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
# Drop the first len(in_ids) positions of each output row so that only the
# newly generated tokens are decoded (assumes generate() echoes the prompt).
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

t2 = time.perf_counter()

# Reaching the cap most likely means the model was stopped mid-document.
if any(len(new_ids) >= MAX_NEW_TOKENS for new_ids in generated_ids_trimmed):
    print(
        f"Warning: generation reached max_new_tokens={MAX_NEW_TOKENS}; "
        "the OCR output may be truncated."
    )

save_ocr(output_text)
print(f"Saved output to markdown file. \nTotal time taken : {round(t2-t1, 2)} s")

Extracting texts.
Saved to ocr_results/ocr_20251105_140614.md
Saved output to markdown file. 
Total time taken : 1017.56 s
