In [1]:
!pip install --upgrade unsloth

Collecting unsloth
  Downloading unsloth-2025.12.5-py3-none-any.whl.metadata (65 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.9/65.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.12.4 (from unsloth)
  Downloading unsloth_zoo-2025.12.4-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.1-py3-none-any.whl.metadata (11 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth)
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from u

In [2]:
!pip install addict

Collecting addict
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)
Installing collected packages: addict
Successfully installed addict-2.4.0


In [3]:
!wget https://huggingface.co/datasets/lehoangan02/nlp/resolve/main/UIT_HWDB_word_clean.zip?download=true
!unzip -q UIT_HWDB_word_clean.zip?download=true

--2025-12-15 01:27:39--  https://huggingface.co/datasets/lehoangan02/nlp/resolve/main/UIT_HWDB_word_clean.zip?download=true
Resolving huggingface.co (huggingface.co)... 13.35.202.97, 13.35.202.121, 13.35.202.40, ...
Connecting to huggingface.co (huggingface.co)|13.35.202.97|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/693ae93f73433849123ec646/f80928171bbb0bfd3ca6bdda9236173f661232881b5702423d99de1136e289d3?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251215%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251215T012739Z&X-Amz-Expires=3600&X-Amz-Signature=82c2d1a3513b823354fa3c48dcdf25f560069ebd88061da71327537d50723453&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27UIT_HWDB_word_clean.zip%3B+filename%3D%22UIT_HWDB_word_clean.zip%22%3B&response-content-type=application%2Fzip&x-id=GetObject&Expires=1

In [None]:
import os
import io
import contextlib
import sys
from unsloth import FastVisionModel
from transformers import AutoModel
from huggingface_hub import snapshot_download

# --- Setup ---
# Unsloth flag set before the model is loaded (presumably silences the
# warning about uninitialized weights -- confirm against the Unsloth docs).
os.environ.update(UNSLOTH_WARN_UNINITIALIZED="0")

# Fetch the model repository into ./deepseek_ocr so it can be loaded from disk.
snapshot_download(repo_id="unsloth/DeepSeek-OCR", local_dir="deepseek_ocr")

# Load the model and tokenizer from the local copy, without 4-bit quantization.
# trust_remote_code=True runs the Python modeling files shipped in that folder.
model, tokenizer = FastVisionModel.from_pretrained(
    "./deepseek_ocr",
    trust_remote_code=True,
    auto_model=AutoModel,
    load_in_4bit=False,
)

# Active prompt: the simple prompt that works reliably.
prompt = "<image>\nTranscribe the Vietnamese text in the image."
# Alternative, more restrictive prompt -- currently disabled, kept for reference.
# prompt = """<image>
# You are an OCR system.
# The image contains a SINGLE Vietnamese word.
# The text is NOT a math formula.
# The text is NOT LaTeX.
# The text is NOT an equation.
# Output ONLY the Vietnamese word.
# Do NOT add explanations.
# Do NOT add symbols.
# Do NOT add punctuation.
# Return plain UTF-8 text.
# """
# Folder whose images are transcribed, and the file the results are written to.
input_folder = "./UIT_HWDB_word_clean/test/images"
output_txt = "./deepseek_ocr_results.txt"

# --- Main Loop ---
def _clean_ocr_output(raw_output):
    """Reduce the stdout captured from one inference call to one line of text.

    Lines containing the 'directly resize' debug message are dropped, and every
    run of whitespace (including newlines and tabs) is collapsed to a single
    space so the result always fits in one tab-separated record.

    Args:
        raw_output: Everything printed to stdout during one inference call.

    Returns:
        The cleaned text, or an empty string when nothing useful was printed.
    """
    kept_lines = [line for line in raw_output.splitlines() if "directly resize" not in line]
    return " ".join("\n".join(kept_lines).split())


print(f"Starting OCR. Results will be saved to {output_txt}")

# Names of images whose inference raised an exception.
failed_files = []

# Results file layout: one line per image, "<filename>\t<text>".
with open(output_txt, "w", encoding="utf-8") as f:
    for filename in sorted(os.listdir(input_folder)):
        # Only image files are processed; anything else in the folder is skipped.
        if not filename.lower().endswith((".jpg", ".png", ".jpeg")):
            continue

        image_path = os.path.join(input_folder, filename)

        # The text is collected from what model.infer prints, so stdout is
        # redirected into a buffer for the duration of the call.
        f_capture = io.StringIO()
        error = None

        with contextlib.redirect_stdout(f_capture):
            try:
                model.infer(
                    tokenizer,
                    prompt=prompt,
                    image_file=image_path,
                    output_path=".",
                    crop_mode=False,
                    save_results=False,
                )
            except Exception as e:
                # Do not print here: stdout is still redirected, so the message
                # would be captured and saved as if it were the transcription.
                error = e

        if error is not None:
            # Report on the real console and mark the record explicitly, then
            # carry on with the next image (best-effort batch run).
            print(f"Error processing {filename}: {error}", file=sys.stderr)
            failed_files.append(filename)
            f.write(f"{filename}\t[ERROR]\n")
            f.flush()
            continue

        text = _clean_ocr_output(f_capture.getvalue())

        # Always write exactly one record per image, with a placeholder when
        # the model produced no text.
        if text:
            f.write(f"{filename}\t{text}\n")
        else:
            f.write(f"{filename}\t[NO TEXT DETECTED]\n")

        f.flush()  # Force write to disk immediately

        # Progress goes to the real console (stdout is no longer redirected).
        print(f"OCR done: {filename} (Length: {len(text)})")

if failed_files:
    print(f"{len(failed_files)} image(s) failed during OCR.", file=sys.stderr)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


.gitattributes: 0.00B [00:00, ?B/s]

README-checkpoint.md: 0.00B [00:00, ?B/s]

LICENSE: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

assets/fig1.png:   0%|          | 0.00/396k [00:00<?, ?B/s]

assets/show1.jpg:   0%|          | 0.00/117k [00:00<?, ?B/s]

assets/show2.jpg:   0%|          | 0.00/216k [00:00<?, ?B/s]

assets/show3.jpg:   0%|          | 0.00/247k [00:00<?, ?B/s]

assets/show4.jpg:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration_deepseek_v2.py: 0.00B [00:00, ?B/s]

conversation.py: 0.00B [00:00, ?B/s]

deepencoder.py: 0.00B [00:00, ?B/s]

model-00001-of-000001.safetensors:   0%|          | 0.00/6.67G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

modeling_deepseekocr.py: 0.00B [00:00, ?B/s]

modeling_deepseekv2.py: 0.00B [00:00, ?B/s]

processor_config.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR. This is not supported for all configurations of models and can yield errors.


Are you certain you want to do remote code execution?
==((====))==  Unsloth 2025.12.5: Fast Deepseekocr patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR. This is not supported for all configurations of models and can yield errors.
You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR. This is not supported for all configurations of models and can yield errors.


Unsloth: Deepseekocr does not support SDPA - switching to fast eager.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Some weights of DeepseekOCRForCausalLM were not initialized from the model checkpoint at ./deepseek_ocr and are newly initialized: ['model.vision_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting OCR. Results will be saved to ./deepseek_ocr_results.txt


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


OCR done: 250_1.jpg (Length: 4)
OCR done: 250_10.jpg (Length: 9)
OCR done: 250_100.jpg (Length: 29)
OCR done: 250_101.jpg (Length: 5)
OCR done: 250_102.jpg (Length: 7)
OCR done: 250_103.jpg (Length: 11)
OCR done: 250_104.jpg (Length: 3)
OCR done: 250_105.jpg (Length: 15)
OCR done: 250_106.jpg (Length: 8)
OCR done: 250_107.jpg (Length: 7)
OCR done: 250_108.jpg (Length: 4)
OCR done: 250_109.jpg (Length: 7)
OCR done: 250_11.jpg (Length: 15)
OCR done: 250_110.jpg (Length: 4)
OCR done: 250_111.jpg (Length: 17)
OCR done: 250_112.jpg (Length: 3)
OCR done: 250_113.jpg (Length: 6)
OCR done: 250_114.jpg (Length: 2)
OCR done: 250_115.jpg (Length: 5)
OCR done: 250_116.jpg (Length: 3)
OCR done: 250_117.jpg (Length: 3)
OCR done: 250_118.jpg (Length: 4)
OCR done: 250_119.jpg (Length: 4)
OCR done: 250_12.jpg (Length: 3)
OCR done: 250_120.jpg (Length: 17)
OCR done: 250_121.jpg (Length: 4)
OCR done: 250_122.jpg (Length: 2)
OCR done: 250_123.jpg (Length: 8)
OCR done: 250_124.jpg (Length: 3)
OCR done: 250

This is a friendly reminder - the current text generation call has exceeded the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


OCR done: 253_263.jpg (Length: 22587)
OCR done: 253_264.jpg (Length: 3)
OCR done: 253_265.jpg (Length: 3)
OCR done: 253_266.jpg (Length: 2)
OCR done: 253_267.jpg (Length: 3)
OCR done: 253_268.jpg (Length: 10)
OCR done: 253_269.jpg (Length: 6)
OCR done: 253_27.jpg (Length: 7)
OCR done: 253_270.jpg (Length: 5)
OCR done: 253_271.jpg (Length: 3)
OCR done: 253_272.jpg (Length: 4)
OCR done: 253_273.jpg (Length: 5)
OCR done: 253_274.jpg (Length: 9)
OCR done: 253_275.jpg (Length: 3)
OCR done: 253_276.jpg (Length: 2)
OCR done: 253_277.jpg (Length: 7)
OCR done: 253_278.jpg (Length: 20)
OCR done: 253_279.jpg (Length: 13)
OCR done: 253_28.jpg (Length: 5)
OCR done: 253_280.jpg (Length: 5)
OCR done: 253_281.jpg (Length: 4)
OCR done: 253_282.jpg (Length: 3)
OCR done: 253_283.jpg (Length: 2)
OCR done: 253_284.jpg (Length: 2)
OCR done: 253_285.jpg (Length: 3)
OCR done: 253_286.jpg (Length: 10)
OCR done: 253_287.jpg (Length: 7)
OCR done: 253_288.jpg (Length: 6)
OCR done: 253_289.jpg (Length: 3)
OCR done