MinerU - PDF Test

- Installation guide: https://github.com/opendatalab/MinerU/blob/master/docs/README_Ubuntu_CUDA_Acceleration_en_US.md
- Python API usage: https://mineru.readthedocs.io/en/latest/user_guide/usage/api.html



In [1]:
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

  from .autonotebook import tqdm as notebook_tqdm


##### Example: convert a single PDF

In [5]:

# Working folders for the single-PDF example: inputs and outputs both live under <input_dir>/temp
input_dir = '/ephemeral/home/xiong/data/Fund/pdf_parse'
_temp_root = os.path.join(input_dir, 'temp')
output_dir = os.path.join(_temp_root, 'output')
pdf_file_path = os.path.join(_temp_root, 'input', '111_2023_0_table1.pdf')  # replace with the real pdf path

In [6]:
## Output layout for this PDF: <output_dir>/<pdf name without extension>/ plus an images/ sub-folder
pdf_basename = os.path.basename(pdf_file_path)
name_without_suffix, _pdf_suffix = os.path.splitext(pdf_basename)
pdf_output_dir = os.path.join(output_dir, name_without_suffix)
local_md_dir = pdf_output_dir
local_image_dir = os.path.join(pdf_output_dir, "images")
# Create both folders up front; exist_ok keeps the cell safe to re-run
for _needed_dir in (local_image_dir, local_md_dir):
    os.makedirs(_needed_dir, exist_ok=True)

In [9]:
### Run MinerU on the example PDF
# Writers rooted at the image folder and at the markdown folder
image_writer = FileBasedDataWriter(local_image_dir)
md_writer = FileBasedDataWriter(local_md_dir)
# Read the PDF as bytes and wrap it in a dataset
reader = FileBasedDataReader("")
pdf_bytes = reader.read(pdf_file_path)
ds = PymuDocDataset(pdf_bytes)
# Inference: take the OCR pipeline when classify() reports OCR, the text pipeline otherwise
use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
infer_result = ds.apply(doc_analyze, ocr=use_ocr)
pipe_result = (
    infer_result.pipe_ocr_mode(image_writer)
    if use_ocr
    else infer_result.pipe_txt_mode(image_writer)
)

[32m2025-04-13 01:27:28.455[0m | [1mINFO    [0m | [36mmagic_pdf.data.dataset[0m:[36m__init__[0m:[36m157[0m - [1mlang: None[0m
[32m2025-04-13 01:27:28.513[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m67[0m - [1mcid_count: 0, text_len: 3258, cid_chars_radio: 0.0[0m
[32m2025-04-13 01:27:28.532[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mmay_batch_image_analyze[0m:[36m275[0m - [1mgpu_memory: 79 GB, batch_ratio: 16[0m
Layout Predict: 100%|██████████| 1/1 [00:00<00:00, 29.65it/s]
MFD Predict: 100%|██████████| 1/1 [00:00<00:00, 24.61it/s]
MFR Predict: 100%|██████████| 13/13 [00:00<00:00, 27.03it/s]
OCR-det Predict: 100%|██████████| 1/1 [00:00<00:00, 32.77it/s]
Table Predict: 100%|██████████| 1/1 [00:03<00:00,  3.65s/it]
Processing pages: 100%|██████████| 1/1 [00:00<00:00,  5.89it/s]


In [10]:
from IPython.display import HTML, display

In [11]:
### Collect results from the processing run (infer_result / pipe_result)
# Output of infer_result.get_infer_res(), kept for inspection
model_inference_result = infer_result.get_infer_res()
# Both getters receive local_image_dir.
# NOTE(review): presumably this is used as the image path prefix in the generated content — confirm against the MinerU API docs.
md_content = pipe_result.get_markdown(local_image_dir)
content_list_content = pipe_result.get_content_list(local_image_dir)

In [12]:
# Render the 'table_body' value of the first content-list entry as HTML.
# NOTE(review): assumes entry 0 carries a 'table_body' key; that holds for this single-table PDF, but other inputs may need a lookup for the first table entry instead.
display(HTML(content_list_content[0]['table_body']))

0,1,2,3,4,5,6,7,8,9,10
"Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)","Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)",
,,,,,,,,,,
,2019 2020,,2021,2022,2023,2024,Projections 2025,2026 2027,2028,
,,,,,,,,,,
National Production and Income,,,,,,,,,,2.1
Real GDP Real GDP (q4/q4),2.3 2.6,-2.8,5.9,2.1,1.7,1.0,1.8,2.1,2.1,2.1
,,-1.5,5.7,0.9,1.2,1.1,2.0,2.1,2.1,0.0
Net exports 1/,-0.1,-0.3,-1.2 7.0,-0.4,0.5,0.0,0.0,0.0,0.0,
Total domestic demand Final domestic demand,2.3,-2.4 -1.9,6.7,2.4,1.1,1.0,1.7,2.0,2.1,2.1
,2.3,-3.0,8.3,1.7 2.7,1.6,1.0,1.7 1.4,2.1 1.7,2.1 2.0,2.1 2.0


In [16]:
class PDFConverter:
    """Convert PDF files with MinerU (``magic_pdf``) and write the results to disk.

    Each PDF gets its own output directory named after the file (without
    extension). ``process_pdf`` handles a single file; ``process_all_pdfs``
    walks a directory tree and converts every ``.pdf`` it finds.
    """

    def __init__(self, input_dir, output_dir):
        """Store the default input and output directories.

        Args:
            input_dir: Directory that ``process_all_pdfs`` walks by default.
            output_dir: Default parent directory for the per-PDF output folders.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir

    def process_pdf(self, pdf_file_path, pdf_output_dir=None, draw_pdf=False):
        """Run the MinerU pipeline on one PDF and dump its outputs.

        Written into ``<parent>/<name>/`` (``<name>`` is the PDF file name
        without extension): ``<name>.md``, ``<name>_content_list.json`` and
        ``<name>_middle.json``. The image writer is rooted at the ``images/``
        sub-folder of that directory.

        Args:
            pdf_file_path: Path of the PDF to convert.
            pdf_output_dir: Parent directory in which the ``<name>/`` folder is
                created. Defaults to ``self.output_dir``.
            draw_pdf: When true, additionally write ``<name>_model.pdf``,
                ``<name>_layout.pdf`` and ``<name>_spans.pdf`` via the
                ``draw_*`` methods of the inference/pipeline results.

        Returns:
            None.
        """
        name_without_suffix = os.path.splitext(os.path.basename(pdf_file_path))[0]
        parent_dir = self.output_dir if pdf_output_dir is None else pdf_output_dir
        pdf_output_dir = os.path.join(parent_dir, name_without_suffix)
        local_image_dir = os.path.join(pdf_output_dir, "images")
        local_md_dir = pdf_output_dir
        # makedirs creates the parent (local_md_dir) as well, so one call suffices
        os.makedirs(local_image_dir, exist_ok=True)

        # Writers for images and for markdown/JSON artefacts
        image_writer = FileBasedDataWriter(local_image_dir)
        md_writer = FileBasedDataWriter(local_md_dir)

        # Read the PDF bytes and build the dataset
        reader = FileBasedDataReader("")
        pdf_bytes = reader.read(pdf_file_path)
        ds = PymuDocDataset(pdf_bytes)

        # Inference: OCR pipeline when classify() reports OCR, text pipeline otherwise
        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        # Optional visual debugging PDFs
        if draw_pdf:
            infer_result.draw_model(os.path.join(local_md_dir, f"{name_without_suffix}_model.pdf"))
            pipe_result.draw_layout(os.path.join(local_md_dir, f"{name_without_suffix}_layout.pdf"))
            pipe_result.draw_span(os.path.join(local_md_dir, f"{name_without_suffix}_spans.pdf"))

        # NOTE(review): MinerU's documented example passes the image folder's
        # basename as the last argument so links in the output are relative;
        # here the full local_image_dir is passed — confirm which is intended.
        pipe_result.dump_md(md_writer, f"{name_without_suffix}.md", local_image_dir)
        pipe_result.dump_content_list(md_writer, f"{name_without_suffix}_content_list.json", local_image_dir)
        pipe_result.dump_middle_json(md_writer, f"{name_without_suffix}_middle.json")

    def process_all_pdfs(self, input_dir=None, output_dir=None):
        """Convert every ``.pdf`` file found under ``input_dir`` (recursively).

        PDFs located directly in ``input_dir`` are written to
        ``<output_dir>/<name>/``. PDFs in a sub-folder are written to
        ``<output_dir>/<relative sub-folder>/<name>/`` so that files sharing a
        base name in different sub-folders do not overwrite each other.

        Args:
            input_dir: Root directory to walk. Defaults to ``self.input_dir``.
            output_dir: Root output directory. Defaults to ``self.output_dir``.

        Returns:
            None.
        """
        if input_dir is None:
            input_dir = self.input_dir
        if output_dir is None:
            output_dir = self.output_dir

        for root, dirs, files in os.walk(input_dir):
            # Sorting in place makes the traversal order deterministic
            dirs.sort()
            rel_dir = os.path.relpath(root, input_dir)
            # Mirror the sub-folder under output_dir; the top level maps to output_dir itself
            target_dir = output_dir if rel_dir == os.curdir else os.path.join(output_dir, rel_dir)
            for filename in sorted(files):
                # Case-insensitive check for the .pdf extension
                if filename.lower().endswith('.pdf'):
                    self.process_pdf(os.path.join(root, filename), target_dir)



In [17]:
# Batch run: convert every PDF under input_folder and write the results under output_folder
input_folder = '/ephemeral/home/xiong/data/Fund/pdf_parse/temp/input'
output_folder = '/ephemeral/home/xiong/data/Fund/pdf_parse/temp/output'
converter = PDFConverter(input_dir=input_folder, output_dir=output_folder)
converter.process_all_pdfs()

[32m2025-04-13 01:52:41.227[0m | [1mINFO    [0m | [36mmagic_pdf.data.dataset[0m:[36m__init__[0m:[36m157[0m - [1mlang: None[0m
[32m2025-04-13 01:52:41.284[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m67[0m - [1mcid_count: 0, text_len: 3258, cid_chars_radio: 0.0[0m
[32m2025-04-13 01:52:41.292[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mmay_batch_image_analyze[0m:[36m275[0m - [1mgpu_memory: 79 GB, batch_ratio: 16[0m
Layout Predict: 100%|██████████| 1/1 [00:00<00:00, 38.85it/s]
MFD Predict: 100%|██████████| 1/1 [00:00<00:00, 30.77it/s]
MFR Predict: 100%|██████████| 13/13 [00:00<00:00, 33.69it/s]
OCR-det Predict: 100%|██████████| 1/1 [00:00<00:00, 32.88it/s]
Table Predict: 100%|██████████| 1/1 [00:03<00:00,  3.43s/it]
Processing pages: 100%|██████████| 1/1 [00:00<00:00,  5.86it/s]
[32m2025-04-13 01:52:45.840[0m | [1mINFO    [0m | [36mmagic_pdf.data.dataset[0m:[36m__init__[0m

#### Print out the markdown content 

In [22]:
# The converter writes <output_folder>/<pdf name>/<pdf name>.md; rebuild that path for pdf_file_path
name_without_suffix = os.path.splitext(os.path.basename(pdf_file_path))[0]
md_file_path = os.path.join(output_folder, name_without_suffix, f"{name_without_suffix}.md")

# Report a missing file; otherwise load the markdown and render it inline
if not os.path.exists(md_file_path):
    print(f"Markdown file not found at: {md_file_path}")
else:
    with open(md_file_path, 'r', encoding='utf-8') as md_file:
        md_content = md_file.read()
    from IPython.display import display, Markdown
    display(Markdown(md_content))


<html><body><table><tr><td colspan="10">Table 1. United States: Selected Economic Indicators (Percentage change from previous period, unless otherwise indicated)</td></tr><tr><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr><tr><td></td><td>2019 2020</td><td></td><td>2021</td><td>2022</td><td>2023</td><td>2024</td><td>Projections 2025</td><td>2026 2027</td><td>2028</td><td></td></tr><tr><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr><tr><td>National Production and Income</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>2.1</td></tr><tr><td>Real GDP Real GDP (q4/q4)</td><td>2.3 2.6</td><td>-2.8</td><td>5.9</td><td>2.1</td><td>1.7</td><td>1.0</td><td>1.8</td><td>2.1</td><td>2.1</td><td>2.1</td></tr><tr><td></td><td></td><td>-1.5</td><td>5.7</td><td>0.9</td><td>1.2</td><td>1.1</td><td>2.0</td><td>2.1</td><td>2.1</td><td>0.0</td></tr><tr><td>Net exports 1/</td><td>-0.1</td><td>-0.3</td><td>-1.2 7.0</td><td>-0.4</td><td>0.5</td><td>0.0</td><td>0.0</td><td>0.0</td><td>0.0</td><td></td></tr><tr><td>Total domestic demand Final domestic demand</td><td>2.3</td><td>-2.4 -1.9</td><td>6.7</td><td>2.4</td><td>1.1</td><td>1.0</td><td>1.7</td><td>2.0</td><td>2.1</td><td>2.1</td></tr><tr><td></td><td>2.3</td><td>-3.0</td><td>8.3</td><td>1.7 2.7</td><td>1.6</td><td>1.0</td><td>1.7 1.4</td><td>2.1 1.7</td><td>2.1 2.0</td><td>2.1 2.0</td></tr><tr><td>Private final consumption</td><td>2.0</td><td>2.2</td><td>1.3</td><td></td><td>2.0</td><td>0.8</td><td></td><td>1.3</td><td>1.3</td><td>1.3</td></tr><tr><td>Public consumption expenditure</td><td>3.4</td><td>-1.2</td><td>5.7</td><td>-0.2 -0.5</td><td>3.0 -0.8</td><td>1.4</td><td>1.3</td><td>3.7</td><td>3.2</td><td></td></tr><tr><td>Gross fixed domestic investment</td><td>2.6</td><td>-2.3</td><td>7.4</td><td>-0.2</td><td></td><td>1.3</td><td>3.2</td><td>3.7</td><td>3.7</td><td>3.1 
3.8</td></tr><tr><td> Private fixed investment</td><td>2.5</td><td>3.9</td><td>-2.3</td><td>-2.1</td><td>-1.9 4.9</td><td>0.7</td><td>3.0 3.9</td><td>3.5</td><td>0.5</td><td>0.0</td></tr><tr><td> Public fixed investment</td><td>3.1</td><td>-0.5</td><td>0.2</td><td>0.7</td><td>-0.5</td><td>4.1 0.0</td><td>0.0</td><td>0.0</td><td>0.0</td><td>0.0</td></tr><tr><td>Change in private inventories 1/</td><td>0.0</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td></tr><tr><td>Nominal GDP</td><td>4.1</td><td>-1.5</td><td>10.7</td><td>9.2</td><td>6.0</td><td>4.1</td><td>4.3</td><td>4.2</td><td>4.1</td><td>4.1</td></tr><tr><td>Personal saving rate (% of disposable income)</td><td>8.8</td><td>16.8</td><td>11.9 17.6</td><td>3.5 18.2</td><td>4.1</td><td>4.0</td><td>4.9</td><td>4.9</td><td>4.9 17.8</td><td>5.4</td></tr><tr><td>Private investment rate (% of GDP)</td><td>17.8</td><td>17.3</td><td></td><td></td><td>17.2</td><td>17.1</td><td>17.3</td><td>17.5</td><td></td><td>18.0</td></tr><tr><td>Unemployment and Potential Output</td><td></td><td></td><td>5.4</td><td></td><td></td><td></td><td></td><td>4.0</td><td></td><td>4.0</td></tr><tr><td>Unemployment rate</td><td>3.7</td><td>8.1</td><td></td><td>3.6</td><td>3.7</td><td>4.2</td><td>4.3</td><td></td><td>4.0</td><td>62.2</td></tr><tr><td>Labor force participation rate Potential GDP</td><td>63.1</td><td>61.7</td><td>61.7</td><td>62.2</td><td>62.5</td><td>62.5</td><td>62.4</td><td>62.2</td><td>62.2 2.0</td><td></td></tr><tr><td>Output gap (% of potential GDP)</td><td>1.6 0.7</td><td>0.4 -2.5</td><td>1.8 1.5</td><td>2.2 1.4</td><td>2.1 0.9</td><td>2.0 0.0</td><td>2.0 -0.2</td><td>2.0 -0.2</td><td>-0.1</td><td>2.0</td></tr><tr><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>0.0</td></tr><tr><td> Inflation</td><td></td><td></td><td>6.8</td><td>7.1</td><td>3.6</td><td>2.5</td><td>2.4</td><td>2.1</td><td></td><td>2.2</td></tr><tr><td>CPl inflation 
(q4/q4)</td><td>2.0 2.3</td><td>1.2 1.6</td><td>5.0</td><td>6.0</td><td>4.2</td><td>2.9</td><td>2.7</td><td>2.3</td><td>2.1 2.3</td><td>2.3</td></tr><tr><td>Core CPI Inflation (q4/q4) PCE Inflation (q4/q4)</td><td>1.4</td><td>1.1</td><td>5.7</td><td>5.7</td><td>3.8</td><td>2.6</td><td>2.3</td><td>1.9</td><td>1.9</td><td></td></tr><tr><td>Core PCE Inflation (q4/q4)</td><td>1.6</td><td>1.4</td><td>4.7</td><td>4.8</td><td>4.1</td><td>2.8</td><td>2.5</td><td>2.0</td><td>2.0 </td><td>2.0</td></tr><tr><td>GDP deflator</td><td></td><td>1.3</td><td>4.5</td><td>7.0</td><td>4.3</td><td>3.0</td><td>2.5</td><td>2.1</td><td>1.9</td><td>2.0</td></tr><tr><td></td><td>1.8</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>1.9</td></tr><tr><td>Government Finances</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>-6.4</td></tr><tr><td> Federal balance (% of GDP) 2/</td><td>-4.7</td><td>-14.9</td><td>-12.3</td><td>-5.5</td><td>-5.6</td><td>-5.7</td><td>-6.4</td><td>-6.2</td><td>-5.9</td><td>108.3</td></tr><tr><td>Federal debt held by the public (% of GDP)</td><td>79.4</td><td>99.8</td><td>98.4</td><td>97.0</td><td>96.6</td><td>98.4</td><td>101.2</td><td>103.6</td><td>105.8</td><td></td></tr><tr><td>General government budget balance (% of GDP)</td><td>-5.7</td><td>-14.0</td><td>-11.6</td><td>-3.7</td><td>-6.7</td><td>-7.0</td><td>-7.3</td><td>-7.1</td><td>-6.9</td><td>-7.0</td></tr><tr><td>General government gross debt (% of GDP)</td><td>108.7</td><td>133.5</td><td>126.4</td><td>121.4</td><td>121.8</td><td>124.6</td><td>127.5</td><td>130.1</td><td>132.5</td><td>134.9</td></tr><tr><td> Interest Rates (percent; period average)</td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td></td><td>2.4</td></tr><tr><td>Fed funds rate</td><td>2.2</td><td>0.4</td><td>0.1</td><td>1.7</td><td>5.1</td><td>5.3</td><td>4.2</td><td>3.2</td><td>2.4</td><td></td></tr><tr><td>Three-month Treasury bill rate Ten-year 
government bond rate</td><td>2.1</td><td>0.4</td><td>0.0</td><td>2.1 3.0</td><td>5.2 3.8</td><td>5.3</td><td>4.2 3.6</td><td>3.2 3.4</td><td>2.4 3.4</td><td>2.4 3.4</td></tr><tr><td>Balance of Payments</td><td>2.1</td><td>0.9</td><td>1.4</td><td></td><td></td><td>3.8</td><td></td><td></td><td></td></table></body></html>

Sources: BEA; BLS; FRB; Haver Analytics; and IMF staff estimates. 1/ Contribution to real GDP growth, percentage points. 2/ Includes staff's adjustments for one-off items, including costs of financial sector support.  