In [1]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    """Notebook configuration, populated from the environment and ``../.env``."""

    # The .env file one directory up is decoded as UTF-8; entries that do not
    # correspond to a field declared below are ignored.
    model_config = SettingsConfigDict(
        env_file="../.env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Required (no default). The cells below use it as the root folder for the
    # downloaded weights and for the generated magic-pdf.json.
    mineru_model_dir: str


settings = Settings()

# 1. Download MinerU weights

In [2]:
import json
import os

import requests
from huggingface_hub import snapshot_download

In [3]:
# Download helpers adapted from https://github.com/opendatalab/MinerU/blob/master/scripts/download_models_hf.py
def download_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address of the JSON document to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout requests waits indefinitely on a server
    # that stops responding, which would hang the notebook cell.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config and apply overrides to it.

    An existing ``local_filename`` is reused when its ``config_version`` is
    not below ``'1.0.0'``. Otherwise, or when the file is missing, a fresh
    copy is fetched from ``url`` with ``download_json``. Every entry of
    ``modifications`` is then set on the data and the result is written back
    to ``local_filename``.

    Args:
        url: Location of the JSON template to download when needed.
        local_filename: Path of the JSON file to read and (over)write.
        modifications: Mapping of top-level keys to the values to store.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below (the file is
        # written with ensure_ascii=False, so it may contain raw non-ASCII
        # text), and close the handle deterministically.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        config_version = data.get('config_version', '0.0.0')
        # Plain string comparison: fine for well-formed "x.y.z" versions
        # against this particular threshold, not a general version compare.
        if config_version < '1.0.0':
            data = download_json(url)
    else:
        data = download_json(url)

    data.update(modifications)

    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [4]:
## pdf-extract-kit
# Restrict the download to these sub-folders of the repository.
mineru_patterns = [
    "models/Layout/LayoutLMv3/*",
    "models/Layout/YOLO/*",
    "models/MFD/YOLO/*",
    "models/MFR/unimernet_small/*",
    "models/TabRec/TableMaster/*",
    "models/TabRec/StructEqTable/*",
]
# local_dir_use_symlinks is no longer passed: current huggingface_hub
# releases deprecate and ignore it, and passing it only triggers a warning.
model_dir = snapshot_download(
    'opendatalab/PDF-Extract-Kit-1.0',
    allow_patterns=mineru_patterns,
    local_dir=os.path.join(settings.mineru_model_dir, 'pdf-extract-kit'),
)
print(f'model_dir is: {model_dir}')

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 42 files:   0%|          | 0/42 [00:00<?, ?it/s]

model_dir is: /Users/id4thomas/models/mineru/pdf-extract-kit


In [5]:
# Only the JSON and safetensors files of the layoutreader repository are
# downloaded.
layoutreader_pattern = [
    "*.json",
    "*.safetensors",
]
# local_dir_use_symlinks is no longer passed: current huggingface_hub
# releases deprecate and ignore it, and passing it only triggers a warning.
layoutreader_model_dir = snapshot_download(
    'hantian/layoutreader',
    allow_patterns=layoutreader_pattern,
    local_dir=os.path.join(settings.mineru_model_dir, 'layoutreader'),
)

print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

layoutreader_model_dir is: /Users/id4thomas/models/mineru/layoutreader


In [6]:
# Where the generated config lives and which template it is derived from.
config_file_name = 'magic-pdf.json'
config_file = os.path.join(settings.mineru_model_dir, config_file_name)
json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'

# Overrides that point the config at the folders under mineru_model_dir.
json_mods = {}
json_mods['models-dir'] = os.path.join(settings.mineru_model_dir, 'pdf-extract-kit/models')
json_mods['layoutreader-model-dir'] = os.path.join(settings.mineru_model_dir, 'layoutreader')

download_and_modify_json(
    url=json_url,
    local_filename=config_file,
    modifications=json_mods,
)
print(f'The configuration file has been configured successfully, the path is: {config_file}')

The configuration file has been configured successfully, the path is: /Users/id4thomas/models/mineru/magic-pdf.json


# 2. MinerU test
* Reference: https://mineru.readthedocs.io/en/latest/user_guide/quick_start/to_markdown.html

In [7]:
# Point MinerU at the config file generated under settings.mineru_model_dir
# instead of a hard-coded location, so the two cannot drift apart. The path is
# made absolute so the lookup does not depend on how a relative value would be
# resolved.
os.environ["MINERU_TOOLS_CONFIG_JSON"] = os.path.abspath(
    os.path.join(settings.mineru_model_dir, "magic-pdf.json")
)

In [8]:
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

[93mimport tensorrt_llm failed, if do not use tensorrt, ignore this message[0m
[93mimport lmdeploy failed, if do not use lmdeploy, ignore this message[0m


In [16]:
# Sample PDFs under samples/ (file names without the extension). Change
# `fname` to run the pipeline on a different document:
#   "1706.03762v7"
#   "ai_parl_2"  - National Assembly Library document, image PDF
#   "ai_parl_3"  - National Assembly Library document, text PDF
fname = "ai_parl_3"

pdf_file_name = f"samples/{fname}.pdf"
# splitext removes only the final extension; splitting on the first "." would
# truncate names that contain dots, such as "1706.03762v7".
name_without_suff = os.path.splitext(pdf_file_name)[0]

In [17]:
# Per-document output folders: one for images, one for everything else.
local_image_dir = f"results/mineru/{fname}/images"
local_md_dir = f"results/mineru/{fname}/output"
os.makedirs(local_image_dir, exist_ok=True)

# Last path component of the image folder; handed to the dump_* calls later.
image_dir = os.path.basename(local_image_dir)

In [18]:
# One writer per output folder. image_writer is handed to the pipeline stage
# and md_writer to the dump_* calls further down.
# image_dir is not recomputed here; it is already set right after
# local_image_dir is defined.
image_writer = FileBasedDataWriter(local_image_dir)
md_writer = FileBasedDataWriter(local_md_dir)

In [19]:
# Load the selected sample PDF into memory.
# NOTE(review): the empty-string argument is presumably the reader's base
# directory, leaving pdf_file_name to be resolved as given - confirm against
# FileBasedDataReader.
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name) 

In [20]:
# Wrap the PDF bytes in a dataset and show the parse method classify()
# reports for it; the inference cell branches on the same call.
ds = PymuDocDataset(pdf_bytes)
print(ds.classify())

[32m2025-01-08 23:52:44.892[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m57[0m - [1mcid_count: 0, text_len: 9614, cid_chars_radio: 0.0[0m


SupportedPdfParseMethod.TXT


In [21]:
## inference
# Decide once whether the document needs OCR, then run the analysis model and
# the matching pipeline stage (which receives image_writer).
use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
print("OCR MODE" if use_ocr else "TEXT MODE")
infer_result = ds.apply(doc_analyze, ocr=use_ocr)
run_pipeline = infer_result.pipe_ocr_mode if use_ocr else infer_result.pipe_txt_mode
pipe_result = run_pipeline(image_writer)

[32m2025-01-08 23:52:45.529[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m57[0m - [1mcid_count: 0, text_len: 9614, cid_chars_radio: 0.0[0m
[32m2025-01-08 23:52:45.535[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m78[0m - [1mDocAnalysis init, this may take some times, layout_model: layoutlmv3, apply_formula: True, apply_ocr: False, apply_table: False, table_model: rapid_table, lang: None[0m
[32m2025-01-08 23:52:45.535[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m91[0m - [1musing device: cpu[0m
[32m2025-01-08 23:52:45.535[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m95[0m - [1musing models_dir: /Users/id4thomas/models/mineru/pdf-extract-kit/models[0m
[32m2025-01-08 23:52:45.535[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m170[0m - [1mDocAnalysis init

TEXT MODE


[32m2025-01-08 23:52:53.866[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m202[0m - [1mlayout detection time: 8.3[0m
[32m2025-01-08 23:52:56.213[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m210[0m - [1mmfd time: 2.34[0m
[32m2025-01-08 23:53:00.025[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m217[0m - [1mformula nums: 5, mfr time: 3.81[0m
[32m2025-01-08 23:53:01.117[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m251[0m - [1mdet time: 1.09[0m
[32m2025-01-08 23:53:01.118[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mdoc_analyze[0m:[36m224[0m - [1m-----page_id : 0, page total time: 15.55-----[0m
[32m2025-01-08 23:53:07.693[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__call__[0m:[36m202[0m - [1mlayout detection time: 6.55[0m
[32m2

In [22]:
### Annotated PDFs, all written next to the Markdown output
model_pdf_path = os.path.join(local_md_dir, f"{fname}_model.pdf")
layout_pdf_path = os.path.join(local_md_dir, f"{fname}_layout.pdf")
spans_pdf_path = os.path.join(local_md_dir, f"{fname}_spans.pdf")

# Model result, layout result and span result, each drawn on every page.
infer_result.draw_model(model_pdf_path)
pipe_result.draw_layout(layout_pdf_path)
pipe_result.draw_span(spans_pdf_path)

### Text outputs through md_writer: the Markdown, then the content list
pipe_result.dump_md(md_writer, f"{fname}.md", image_dir)
pipe_result.dump_content_list(md_writer, f"{fname}_content_list.json", image_dir)