In [3]:
import json
from pathlib import Path
import os
import time

import fitz
import pandas as pd

In [4]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    data_dir: str
    docling_model_dir: str
    
settings = Settings()
os.environ["HF_HOME"] = settings.docling_model_dir

In [5]:
import sys
sys.path.append("src")

# Clean PDF

In [6]:
pdf_dir = os.path.join(settings.data_dir, "allganize-RAG-Evaluation-Dataset-KO/finance")
pdf_fnames =[x for x in os.listdir(pdf_dir) if x.endswith(".pdf")]
print("num files:", len(pdf_fnames))

file_path = os.path.join(pdf_dir, pdf_fnames[0])
pdf_fnames[:10]

num files: 10


['★2019 제1회 증시콘서트 자료집_최종★.pdf',
 '240409(보도자료) 금융위 핀테크 투자 생태계 활성화 나선다.pdf',
 '2024년 3월_3. 향후 통화신용정책 방향.pdf',
 '133178946057443204_WP22-05.pdf',
 '240130(보도자료) 지방은행의 시중은행 전환시 인가방식 및 절차.pdf',
 '130292099630937500_KIFVIP2013-10.pdf',
 '2024년 3월_2. 통화신용정책 운영.pdf',
 '[별첨] 지방은행의 시중은행 전환시 인가방식 및 절차.pdf',
 '240320(보도자료) 금융권의 상생금융 추진현황.pdf',
 '한-호주 퇴직연금 포럼_책자(최종).pdf']

In [7]:
out_doc = fitz.Document()
page = fitz.Document(file_path)
out_doc.insert_pdf(page)
out_doc.save(
    "temp/clean-test.pdf",
    clean=True,
    # ascii=True,
    # sanitize=True
)

# Check CID font

In [8]:
import fitz  # PyMuPDF

def is_cid_font_in_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    for page in doc:
        font_list = page.get_fonts()
        print(font_list)
        for font in font_list:
            # Check the font type
            if "CID" in font[3]:  # Font info contains type at index 3
                return True
    return False

# Example usage
if is_cid_font_in_pdf(file_path):
    print("The PDF contains CID fonts.")
else:
    print("No CID fonts found in the PDF.")

[(1510, 'cid', 'Type0', 'BBLAYW+YDVYGOStd13', 'C0_0', 'Identity-H'), (1515, 'cid', 'Type0', 'ACWUKY+YDVYGOStd12', 'C0_1', 'Identity-H'), (1520, 'ttf', 'Type0', 'LVLNIC+KoPubBatangBold', 'C2_0', 'Identity-H'), (1525, 'ttf', 'Type0', 'GLYEQM+KoPubDotumBold', 'C2_1', 'Identity-H')]
[(1930, 'cid', 'Type0', '*ÇÑ¾çÁß°íµñ-Identity-H', 'C0_0', 'Identity-H'), (1924, 'ttf', 'Type0', 'NEHJGA+KoPubDotumBold', 'C2_0', 'Identity-H'), (1925, 'ttf', 'Type0', 'NEHJIB+KoPubDotumMedium', 'C2_1', 'Identity-H'), (1926, 'ttf', 'Type0', 'NEHJPP+HYHeadLine-Medium', 'C2_2', 'Identity-H'), (1927, 'ttf', 'Type0', 'NEHKAP+KoPubBatangBold', 'C2_3', 'Identity-H'), (1928, 'ttf', 'Type0', 'NEHKDP+MalgunGothicBold', 'C2_4', 'Identity-H'), (1929, 'ttf', 'Type0', 'NEHKFP+MalgunGothicRegular', 'C2_5', 'Identity-H'), (1993, 'n/a', 'Type0', 'Gulim', 'Gulim', 'UniKS-UTF16-H')]
[(1237, 'ttf', 'Type0', 'KCOMCF+KoPubBatangBold', 'C2_0', 'Identity-H'), (1236, 'ttf', 'Type0', 'KCOMDG+KoPubDotumBold', 'C2_1', 'Identity-H'), (1993

In [9]:
from PyPDF2 import PdfReader

def is_cid_font_in_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        # Access the page's resources
        resources = page.get("/Resources")
        if resources and "/Font" in resources:
            font_dict = resources["/Font"]
            print(font_dict)
            for font_name, font_obj in font_dict.items():
                # Check for CID fonts
                if "/CIDFont" in str(font_obj):
                    return True
    return False

# Example usage
if is_cid_font_in_pdf(file_path):
    print("The PDF contains CID fonts.")
else:
    print("No CID fonts found in the PDF.")

{'/C0_0': IndirectObject(1510, 0, 4585266944), '/C0_1': IndirectObject(1515, 0, 4585266944), '/C2_0': IndirectObject(1520, 0, 4585266944), '/C2_1': IndirectObject(1525, 0, 4585266944)}
{'/C0_0': IndirectObject(1930, 0, 4585266944), '/C2_0': IndirectObject(1924, 0, 4585266944), '/C2_1': IndirectObject(1925, 0, 4585266944), '/C2_2': IndirectObject(1926, 0, 4585266944), '/C2_3': IndirectObject(1927, 0, 4585266944), '/C2_4': IndirectObject(1928, 0, 4585266944), '/C2_5': IndirectObject(1929, 0, 4585266944)}
{'/C2_0': IndirectObject(1237, 0, 4585266944), '/C2_1': IndirectObject(1236, 0, 4585266944)}
{'/C2_0': IndirectObject(1236, 0, 4585266944), '/C2_1': IndirectObject(1238, 0, 4585266944)}
{'/C2_0': IndirectObject(1246, 0, 4585266944), '/C2_1': IndirectObject(1247, 0, 4585266944)}
{'/C2_0': IndirectObject(1276, 0, 4585266944), '/C2_1': IndirectObject(1246, 0, 4585266944), '/C2_2': IndirectObject(1247, 0, 4585266944), '/C2_3': IndirectObject(1266, 0, 4585266944), '/C2_4': IndirectObject(1258

# Try Extraction

In [10]:
# Import docling
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

from docling.datamodel.document import ConversionResult

from docling_core.types.doc import (
    ImageRefMode, TextItem, PictureItem, TableItem, DoclingDocument, RefItem, DocItemLabel
)

In [11]:
IMAGE_RESOLUTION_SCALE = 1.0

pdf_format_options = PdfPipelineOptions()
pdf_format_options.images_scale = IMAGE_RESOLUTION_SCALE
pdf_format_options.generate_page_images = True
pdf_format_options.generate_picture_images = True

# pdf_format_options.do_ocr = False
pdf_format_options.do_ocr = True
# pdf_format_options.do_table_structure = False
pdf_format_options.do_table_structure = True

converter = DocumentConverter(
    allowed_formats=[
            InputFormat.PDF,
            # InputFormat.IMAGE,
            # InputFormat.DOCX,
            # InputFormat.HTML,
            # InputFormat.PPTX,
    ],
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_options=pdf_format_options, # pipeline options go here.
            backend=PyPdfiumDocumentBackend # optional: pick an alternative backend
        ),
    }
)

In [15]:
result = converter.convert("temp/clean-test.pdf")

In [16]:
result.document.save_as_markdown(Path('temp/clean-test.md'), image_mode=ImageRefMode.REFERENCED)

# Try error text recognition

In [14]:
import re
import unicodedata

def is_broken_text(text):
    """
    Determines if a given text is broken based on presence of unusual Unicode characters,
    lack of meaningful word structure, and excessive non-printable characters.
    """
    # Check for non-printable characters
    if any(unicodedata.category(char) in ("Cc", "Cf") for char in text):
        return True

    # Check for excessive special characters or isolated symbols
    if len(re.findall(r'[\x00-\x1F\x7F-\x9F]', text)) > 2:
        return True
    
    # Check for the presence of valid Hangul, English, or common script structures
    if re.search(r'[\uAC00-\uD7AFa-zA-Z가-힣0-9]', text):  
        return False
    
    return True
# Example usage
texts = [
    "발표 1",
    "-4 -",
    "֙ೞ߈ӝ",
    "ݎ੹੢धद઱",
    "*  ߈࢚ӝ 3FWJFX",
    "߈࢚ӝ द੢ DBUBMZTU GMPX ",
    "FE  ژ ೠ ߣ੄ झగझ ࢶഥ  JOTVSBODF DVU ӝ؀"
]

for text in texts:
    print(f"'{text}' -> {'Broken' if is_broken_text(text) else 'Normal'}")

'발표 1' -> Normal
'-4 -' -> Normal
'֙ೞ߈ӝ' -> Broken
'ݎ੹੢धद઱' -> Broken
'*  ߈࢚ӝ 3FWJFX' -> Normal
'߈࢚ӝ द੢ DBUBMZTU GMPX ' -> Normal
'FE  ژ ೠ ߣ੄ झగझ ࢶഥ  JOTVSBODF DVU ӝ؀' -> Broken
