In [2]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook settings declared as a pydantic-settings ``BaseSettings`` model."""

    # The dotenv file one directory above the working directory is read as UTF-8;
    # extra="ignore" is set so keys not declared on this class are not treated as errors.
    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    # Required (no default). Used below as the root folder for the downloaded
    # model files and for the generated magic-pdf.json.
    mineru_model_dir: str
# Instantiated once at cell level; later cells read `settings.mineru_model_dir`.
settings = Settings()

# 1. Download MinerU model weights

In [1]:
import json
import os

import requests
from huggingface_hub import snapshot_download

In [7]:
# download code from https://github.com/opendatalab/MinerU/blob/master/scripts/download_models_hf.py
def download_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` argument.
            Without a timeout a stalled connection would block the cell forever.

    Returns:
        The parsed JSON payload (whatever ``response.json()`` produces).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def _version_tuple(version):
    """Turn a dotted version such as ``'1.10.0'`` into a tuple of ints for comparison.

    Only the leading digits of each dot-separated piece are used (``'0rc1'`` -> 0),
    pieces without digits count as 0, and the result is padded to at least three
    components so that ``'1.0'`` and ``'1.0.0'`` compare equal.
    """
    numbers = []
    for piece in str(version).split('.'):
        digits = ''
        for char in piece:
            if char not in '0123456789':
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    numbers.extend([0] * (3 - len(numbers)))
    return tuple(numbers)


def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config and apply key overrides to it.

    If ``local_filename`` exists and its ``config_version`` is at least 1.0.0 the
    local content is kept; otherwise (missing file, missing or older version) the
    JSON is fetched from ``url`` via ``download_json``. Every key/value pair in
    ``modifications`` is then written on top of the data and the result is saved
    back to ``local_filename`` as UTF-8, indented JSON.

    Args:
        url: Location of the template JSON used when the local file is absent
            or outdated.
        local_filename: Path of the JSON file to read and (over)write.
        modifications: Mapping of top-level keys to the values they must have
            in the saved file.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below, and close the handle.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        config_version = data.get('config_version', '0.0.0')
        # Compare numerically: plain string comparison treats '1.0' as older than '1.0.0'.
        if _version_tuple(config_version) < (1, 0, 0):
            data = download_json(url)
    else:
        data = download_json(url)

    for key, value in modifications.items():
        data[key] = value

    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [4]:
## pdf-extract-kit
# Glob patterns selecting which files of the Hub repository are downloaded.
mineru_patterns = [
    "models/Layout/LayoutLMv3/*",
    "models/Layout/YOLO/*",
    "models/MFD/YOLO/*",
    "models/MFR/unimernet_small/*",
    "models/TabRec/TableMaster/*",
    "models/TabRec/StructEqTable/*",
]
# `local_dir_use_symlinks` is no longer passed: current huggingface_hub releases
# deprecate the argument and ignore it (passing it only triggers a warning);
# files are placed directly under `local_dir`.
model_dir = snapshot_download(
    'opendatalab/PDF-Extract-Kit-1.0',
    allow_patterns=mineru_patterns,
    local_dir=os.path.join(settings.mineru_model_dir, 'pdf-extract-kit'),
)
print(f'model_dir is: {model_dir}')

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 42 files:   0%|          | 0/42 [00:00<?, ?it/s]

models/MFR/unimernet_small/.mdl:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

models/MFR/unimernet_small/.msc:   0%|          | 0.00/524 [00:00<?, ?B/s]

models/MFR/unimernet_small/.mv:   0%|          | 0.00/36.0 [00:00<?, ?B/s]

models/Layout/LayoutLMv3/config.json:   0%|          | 0.00/857 [00:00<?, ?B/s]

doclayout_yolo_ft.pt:   0%|          | 0.00/40.7M [00:00<?, ?B/s]

yolo_v8_ft.pt:   0%|          | 0.00/350M [00:00<?, ?B/s]

yolov10l_ft.pt:   0%|          | 0.00/52.3M [00:00<?, ?B/s]

model_final.pth:   0%|          | 0.00/564M [00:00<?, ?B/s]

(…)unimernet_small/preprocessor_config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

models/MFR/unimernet_small/config.json:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

(…)s/MFR/unimernet_small/configuration.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

models/MFR/unimernet_small/README.md:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

(…)MFR/unimernet_small/unimernet_small.yaml:   0%|          | 0.00/833 [00:00<?, ?B/s]

(…)odels/MFR/unimernet_small/tokenizer.json:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

pytorch_model.pth:   0%|          | 0.00/810M [00:00<?, ?B/s]

(…)FR/unimernet_small/tokenizer_config.json:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

models/TabRec/StructEqTable/README.md:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

(…)s/TabRec/StructEqTable/added_tokens.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

models/TabRec/StructEqTable/config.json:   0%|          | 0.00/5.25k [00:00<?, ?B/s]

(…)tructEqTable/configuration_intern_vit.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

(…)ctEqTable/configuration_internvl_chat.py:   0%|          | 0.00/3.80k [00:00<?, ?B/s]

(…)els/TabRec/StructEqTable/conversation.py:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

(…)Rec/StructEqTable/generation_config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

models/TabRec/StructEqTable/merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

(…)Rec/StructEqTable/modeling_intern_vit.py:   0%|          | 0.00/18.1k [00:00<?, ?B/s]

(…)/StructEqTable/modeling_internvl_chat.py:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

(…)c/StructEqTable/preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

(…)ec/StructEqTable/special_tokens_map.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

(…)bRec/StructEqTable/tokenizer_config.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

models/TabRec/StructEqTable/vocab.json:   0%|          | 0.00/3.38M [00:00<?, ?B/s]

inference.pdiparams:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

(…)OCRv4_det_infer/inference.pdiparams.info:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

inference.pdmodel:   0%|          | 0.00/166k [00:00<?, ?B/s]

inference.pdiparams:   0%|          | 0.00/10.8M [00:00<?, ?B/s]

(…)OCRv4_rec_infer/inference.pdiparams.info:   0%|          | 0.00/30.6k [00:00<?, ?B/s]

inference.pdmodel:   0%|          | 0.00/169k [00:00<?, ?B/s]

(…)els/TabRec/TableMaster/ppocr_keys_v1.txt:   0%|          | 0.00/26.2k [00:00<?, ?B/s]

(…)leMaster/table_master_structure_dict.txt:   0%|          | 0.00/435 [00:00<?, ?B/s]

inference.pdiparams:   0%|          | 0.00/262M [00:00<?, ?B/s]

(…)blemaster_infer/inference.pdiparams.info:   0%|          | 0.00/25.9k [00:00<?, ?B/s]

inference.pdmodel:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

model_dir is: /Users/id4thomas/models/mineru


In [5]:
# Only the JSON and safetensors files of the layoutreader repository are fetched.
layoutreader_pattern = [
    "*.json",
    "*.safetensors",
]
# `local_dir_use_symlinks` is no longer passed: current huggingface_hub releases
# deprecate the argument and ignore it (passing it only triggers a warning);
# files are placed directly under `local_dir`.
layoutreader_model_dir = snapshot_download(
    'hantian/layoutreader',
    allow_patterns=layoutreader_pattern,
    local_dir=os.path.join(settings.mineru_model_dir, 'layoutreader'),
)

print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/24.1k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/713M [00:00<?, ?B/s]

layoutreader_model_dir is: /Users/id4thomas/models/mineru/layoutreader


In [9]:
# Template location and the path the customised copy is written to.
json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
config_file_name = 'magic-pdf.json'
config_file = os.path.join(settings.mineru_model_dir, config_file_name)

# Config keys to override, each pointing at a sub-folder of the model directory.
json_mods = {
    config_key: os.path.join(settings.mineru_model_dir, subdir)
    for config_key, subdir in (
        ('models-dir', 'pdf-extract-kit/models'),
        ('layoutreader-model-dir', 'layoutreader'),
    )
}

download_and_modify_json(json_url, config_file, json_mods)
print(f'The configuration file has been configured successfully, the path is: {config_file}')

The configuration file has been configured successfully, the path is: /Users/id4thomas/models/mineru/magic-pdf.json


# 2. MinerU test
* Reference: https://mineru.readthedocs.io/en/latest/user_guide/quick_start/to_markdown.html

In [2]:
# Point magic_pdf at the magic-pdf.json written under `settings.mineru_model_dir`
# instead of a hard-coded relative path, so the location follows the .env setting.
# An absolute path is used on purpose.
# NOTE(review): magic_pdf appears to resolve a relative value against the home
# directory and to read this variable when it is imported — confirm; this cell must
# therefore run before the magic_pdf imports.
os.environ["MINERU_TOOLS_CONFIG_JSON"] = os.path.abspath(
    os.path.join(settings.mineru_model_dir, "magic-pdf.json")
)

In [3]:
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

[93mimport tensorrt_llm failed, if do not use tensorrt, ignore this message[0m
[93mimport lmdeploy failed, if do not use lmdeploy, ignore this message[0m


In [4]:
# Sample PDF to convert (path relative to the notebook's working directory).
pdf_file_name = "samples/1706.03762v7.pdf"
# Drop the directory and only the final extension. Splitting on the first "."
# would cut this name down to "samples/1706" because the file name itself contains a dot.
name_without_suff = os.path.splitext(os.path.basename(pdf_file_name))[0]

In [5]:
# Directories later handed to the image writer and the markdown writer.
local_image_dir = "results/mineru/images"
local_md_dir = "results/mineru/output"
# Only the image directory is created up front.
os.makedirs(local_image_dir, exist_ok=True)
# Last path component of the image directory ("images").
image_dir = str(os.path.basename(local_image_dir))

In [6]:
# One file-based writer per output directory.
image_writer = FileBasedDataWriter(local_image_dir)
md_writer = FileBasedDataWriter(local_md_dir)
# Last path component of the image directory ("images").
image_dir = str(os.path.basename(local_image_dir))

In [7]:
# Reader built with an empty string argument (presumably its base directory, so
# the path below is used as given — TODO confirm).
reader1 = FileBasedDataReader("")
# Read the sample PDF; the result is passed to PymuDocDataset in the next cell.
pdf_bytes = reader1.read(pdf_file_name) 

In [8]:
# Wrap the PDF content read above in a dataset object.
ds = PymuDocDataset(pdf_bytes)
# Show the parse method chosen by classify(); the inference cell compares this
# value against SupportedPdfParseMethod.OCR to pick the OCR or text pipeline.
print(ds.classify())

[32m2025-01-01 02:28:10.919[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m57[0m - [1mcid_count: 1, text_len: 26978, cid_chars_radio: 3.7076860331467135e-05[0m


SupportedPdfParseMethod.TXT


In [9]:
## inference
if ds.classify() == SupportedPdfParseMethod.OCR:
    infer_result = ds.apply(doc_analyze, ocr=True)

    ## pipeline
    pipe_result = infer_result.pipe_ocr_mode(image_writer)

else:
    infer_result = ds.apply(doc_analyze, ocr=False)

    ## pipeline
    pipe_result = infer_result.pipe_txt_mode(image_writer)

[32m2025-01-01 02:28:12.414[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m57[0m - [1mcid_count: 0, text_len: 26304, cid_chars_radio: 0.0[0m
[32m2025-01-01 02:28:12.425[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m78[0m - [1mDocAnalysis init, this may take some times, layout_model: layoutlmv3, apply_formula: True, apply_ocr: False, apply_table: False, table_model: rapid_table, lang: None[0m
[32m2025-01-01 02:28:12.426[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m91[0m - [1musing device: cpu[0m
[32m2025-01-01 02:28:12.426[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m95[0m - [1musing models_dir: /Users/id4thomas/models/mineru/pdf-extract-kit/models[0m


CustomVisionEncoderDecoderModel init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
CustomMBartForCausalLM init
CustomMBartDecoder init
[32m[01/01 02:28:22 detectron2]: [0mRank of current process: 0. World size: 1
[32m[01/01 02:28:22 detectron2]: [0mEnvironment info:
-------------------------------  ----------------------------------------------------------------------------
sys.platform                     darwin
Python                           3.10.16 (main, Dec 11 2024, 10:22:29) [Clang 14.0.6 ]
numpy                            1.26.4
detectron2                       0.6 @/opt/miniconda3/envs/MinerU/lib/python3.10/site-packages/detectron2
Compiler                         clang 16.0.0
CUDA compiler                    not available
DETECTRON2_ENV_MODULE            <not set>
PyTorch                          2.5.1 @/opt/miniconda3/envs/MinerU/lib/python3.10/site-packages/torch
PyTorch debug build  

100%|██████████| 4780/4780 [00:14<00:00, 338.82it/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/chinese/ch_PP-OCRv4_rec_infer.tar to /Users/id4thomas/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/ch_PP-OCRv4_rec_infer.tar


100%|██████████| 10720/10720 [00:04<00:00, 2538.33it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /Users/id4thomas/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:06<00:00, 330.91it/s]
[32m2025-01-01 02:28:52.214[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m170[0m - [1mDocAnalysis init done![0m
[32m2025-01-01 02:28:52.214[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mcustom_model_init[0m:[36m181[0m - [1mmodel init cost: 39.79191517829895[0m
[32m2025-01-01 02:29:07.288[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m202[0m - [1mlayout detection time: 15.05[0m
[32m2025-01-01 02:29:10.096[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m210[0m - [1mmfd time: 2.8[0m
[32m2025-01-01 02:29:10.099[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m217[0m - [1mformula nums: 0, mfr time: 0.0[0m
[32m2025-01-01 02:29:11.229[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m: