In [4]:
import json
import pymupdf
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
import unicodedata
from pathlib import Path
import html
from openai import OpenAI
from underthesea import ner

# ──────────────── CẤU HÌNH ────────────────
BOOK_METADATA = {
    "NAM_HOA_KINH": {
        "TITLE": "Nam Hoa Kinh",
        "VOLUME": "",
        "AUTHOR": "Trang Tử",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "SOURCE": "thuviensach.vn",
    },
    "TRANG_TU_NAM_HOA_KINH": {
        "TITLE": "Trang Tử Nam Hoa Kinh",
        "VOLUME": "",
        "AUTHOR": "Nguyễn Kim Vỹ",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "SOURCE": "Nhà xuất bản văn hóa",
    },
    "TRANG_TU_NAM_HOA_KINH_2": {
        "TITLE": "Trang Tử Nam Hoa Kinh",
        "VOLUME": "",
        "AUTHOR": "Thu Giang, Ngyễn Duy Cần",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "SOURCE": "Nhà xuất bản trẻ",
    },
}
BASED_ENTITY_GROUPS = [
    "PER",
    "ORG",
    "LOC",
    "ORG",
    "TME",
    "TITLE",
    "NUM",
]


In [15]:
import os
import re
import html
import unicodedata
import xml.etree.ElementTree as ET
import xml.dom.minidom as minidom

import pymupdf
import pytesseract
from PIL import Image


def normalize(s: str) -> str:
    return unicodedata.normalize("NFC", s.strip())

def clean_text(text: str) -> str:
    html_entities = {
        "&quot;": "", "&quote;": "", "quot;": "", "quote;": "",
        "&amp;": "&", "&lt;": "<", "&gt;": ">", "&apos;": "'",
        "&nbsp;": " ", "&hellip;": "...", "&mdash;": "—", "&ndash;": "–"
    }
    for entity, replacement in html_entities.items():
        text = text.replace(entity, replacement)
    text = re.sub(r"&[a-zA-Z]+;?", "", text)
    text = re.sub(r"[\u200b\u200e\u202a\u202c\ufeff]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return normalize(text.strip())

def is_vietnamese(text: str) -> bool:
    return bool(re.search(r'[àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđĐ]', text))

def split_sentences(text: str) -> list[str]:
    text = clean_text(text)
    sentences, current = [], ""
    for char in text:
        current += char
        if char in [".", "!", "?", "。"]:
            if len(current.strip()) > 10:
                cleaned = clean_text(current)
                if is_vietnamese(cleaned):
                    sentences.append(cleaned)
            current = ""
    if len(current.strip()) > 10:
        cleaned = clean_text(current)
        if is_vietnamese(cleaned):
            sentences.append(cleaned)
    return sentences

def ocr_page(page) -> str:
    pix = page.get_pixmap(matrix=pymupdf.Matrix(2, 2))
    image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    config = r'--oem 3 --psm 6'
    try:
        return clean_text(pytesseract.image_to_string(image, lang='vie', config=config))
    except Exception as e:
        print(f"OCR error: {e}")
        return ""


def build_ocr_xml_with_dynamic_sections(
    pdf_path: str,
    metadata: dict,
    output_path="output_dynamic_ocr.xml",
    code="PKS_001"
):
    print(f"📘 OCR processing PDF: {pdf_path}")
    doc = pymupdf.open(pdf_path)

    root = ET.Element("root")
    file_el = ET.SubElement(root, "FILE", ID=code)

    meta = ET.SubElement(file_el, "meta")
    for tag in ["TITLE", "VOLUME", "AUTHOR", "PERIOD", "LANGUAGE", "TRANSLATOR", "SOURCE"]:
        ET.SubElement(meta, tag).text = metadata.get(tag, "")

    sect_counter = 0
    current_sect = None
    sentence_count = 0

    for page_num in range(len(doc)):
        page_text = ocr_page(doc[page_num])

        # Detect if new section
        chapter_match = re.search(r'\bCHƯƠNG\s+[IVXLCDM\d]+', page_text, re.IGNORECASE)
        if chapter_match:
            sect_counter += 1
            sect_id = f"{code}.{sect_counter:03}"
            current_sect = ET.SubElement(file_el, "SECT", ID=sect_id, NAME=chapter_match.group().strip())
            print(f"🔖 New section: {chapter_match.group().strip()} (Page {page_num+1})")

        if current_sect is None:
            # If no "Chương" found yet, place into unnamed SECT
            current_sect = ET.SubElement(file_el, "SECT", ID=f"{code}.000", NAME="Mở đầu")

        page_id = f"{current_sect.attrib['ID']}.{page_num+1:03}"
        page_el = ET.SubElement(current_sect, "PAGE", ID=page_id)

        sentences = split_sentences(page_text)
        for i, sentence in enumerate(sentences, start=1):
            stc_id = f"{page_id}.{i:02}"
            stc_el = ET.SubElement(page_el, "STC", ID=stc_id)
            stc_el.text = sentence
            sentence_count += 1

    # Save XML
    tree = ET.ElementTree(root)
    pretty_xml = minidom.parseString(ET.tostring(tree.getroot(), encoding="utf-8"))
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(pretty_xml.toprettyxml(indent="  "))

    print(f"✅ XML saved: {output_path}")
    print(f"📊 Total sentences: {sentence_count}, Sections: {sect_counter}")


In [16]:
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/temp/TRANG TỬ NAM HOA KINH.pdf"
code = "PAS_003"
des = f"{code}_nam_hoa_kinh.xml"

metadata = BOOK_METADATA.get("TRANG_TU_NAM_HOA_KINH", {})
build_ocr_xml_with_dynamic_sections(pdf_path, metadata, des, code)

📘 OCR processing PDF: /home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/temp/TRANG TỬ NAM HOA KINH.pdf
🔖 New section: chương m (Page 4)
🔖 New section: CHƯƠNG I (Page 6)
🔖 New section: chương 46 (Page 8)
🔖 New section: chương 63 (Page 11)
🔖 New section: chương c (Page 14)
🔖 New section: chương XXXI (Page 16)
🔖 New section: chương X (Page 24)
🔖 New section: chương XXX (Page 27)
🔖 New section: CHƯƠNG II (Page 32)
🔖 New section: chương V (Page 33)
🔖 New section: chương XXII (Page 34)
🔖 New section: chương M (Page 36)
🔖 New section: Chương I (Page 38)
🔖 New section: Chương V (Page 39)
🔖 New section: chương c (Page 40)
🔖 New section: chương c (Page 41)
🔖 New section: chương Vi (Page 43)
🔖 New section: chương I (Page 46)
🔖 New section: chương VII (Page 48)
🔖 New section: chương XII (Page 49)
🔖 New section: chương c (Page 50)
🔖 New section: chương c (Page 51)
🔖 New section: CHƯƠNG III (Page 55)
🔖 New section: chương XXVII (Page 56)
🔖 New section: chương V (Page 58)
🔖 New se

In [8]:
import xml.etree.ElementTree as ET


def count_sentences(xml_file_path):
    # Parse the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Find all STC elements (sentences)
    sentences = root.findall(".//STC")

    # Count the sentences
    total_sentences = len(sentences)

    # Count sentences per section
    sections = {}
    for section in root.findall(".//SECT"):
        section_name = section.get("NAME", "Unknown")
        section_sentences = section.findall(".//STC")
        sections[section_name] = len(section_sentences)

    return total_sentences, sections


# Path to your XML file
xml_file = "/home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/nam_hoa_kinh.xml"

# Get the counts
total, section_counts = count_sentences(xml_file)

print(f"Total number of sentences: {total}")
print("\nNumber of sentences per section:")
for section, count in section_counts.items():
    print(f"- {section}: {count} sentences")

Total number of sentences: 2251

Number of sentences per section:
- Giới thiệu: 2251 sentences


In [2]:
import pymupdf
import pytesseract
from PIL import Image
import os
import re
import unicodedata
import html

def normalize(s: str) -> str:
    return unicodedata.normalize("NFC", s.strip())

def remove_html_entities(text):
    """Remove HTML entities comprehensively."""
    try:
        text = html.unescape(text)
    except:
        pass
    
    # Remove &quot; and &quote; variations
    text = text.replace("&quot;", "")
    text = text.replace("&quot", "")
    text = text.replace("&quote;", "")
    text = text.replace("&quote", "")
    text = text.replace("quot;", "")
    text = text.replace("quote;", "")
    
    # Remove other common entities
    html_entities = {
        "&amp;": "&", "&lt;": "<", "&gt;": ">", "&apos;": "'",
        "&nbsp;": " ", "&hellip;": "...", "&mdash;": "—", "&ndash;": "–"
    }
    
    for entity, replacement in html_entities.items():
        text = text.replace(entity, replacement)
    
    # Clean remaining entities with regex
    text = re.sub(r"&[a-zA-Z]+;?", "", text)
    
    return text

def clean_text(text: str) -> str:
    """Clean text comprehensively."""
    text = normalize(text)
    text = remove_html_entities(text)
    
    clean_patterns = ["***", "---", "___", "..."]
    for pattern in clean_patterns:
        text = text.replace(pattern, "")
    
    # Remove invisible characters and normalize spaces
    text = re.sub(r"[\u200b\u200e\u202a\u202c\ufeff]+", "", text)
    text = re.sub(r"\s+", " ", text)
    
    return text.strip()

def is_vietnamese(text: str) -> bool:
    """Check if text contains Vietnamese characters."""
    vietnamese_pattern = re.compile(r'[àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđĐ]')
    return bool(vietnamese_pattern.search(text))

def extract_images_from_pdf(pdf_path, output_dir="extracted_images"):
    """Extract all images from PDF and save them."""
    os.makedirs(output_dir, exist_ok=True)
    doc = pymupdf.open(pdf_path)
    image_list = []
    
    for page_num in range(len(doc)):
        page = doc[page_num]
        image_dict = page.get_images()
        
        for img_index, img in enumerate(image_dict):
            xref = img[0]
            pix = pymupdf.Pixmap(doc, xref)
            
            if pix.n - pix.alpha < 4:  # GRAY or RGB
                img_filename = f"333_BLOCK{page_num+1:03d}_LINE{img_index+1:03d}.png"
                img_path = os.path.join(output_dir, img_filename)
                pix.save(img_path)
                
                image_list.append({
                    'page': page_num + 1,
                    'filename': img_filename,
                    'path': img_path,
                    'index': img_index + 1
                })
            
            pix = None
    
    doc.close()
    return image_list

def ocr_image(image_path, lang='vie'):
    """Extract text from image using OCR."""
    try:
        # Configure tesseract for Vietnamese
        custom_config = r'--oem 3 --psm 6'
        text = pytesseract.image_to_string(
            Image.open(image_path), 
            lang=lang, 
            config=custom_config
        )
        return clean_text(text)
    except Exception as e:
        print(f"OCR error for {image_path}: {e}")
        return ""

def split_sentences(text: str) -> list[str]:
    """Split text into Vietnamese sentences."""
    text = clean_text(text)
    
    # Split by common sentence delimiters
    sentences = []
    current_sentence = ""
    
    for char in text:
        current_sentence += char
        if char in [".", "!", "?", "。", ";", ":"]:
            if current_sentence.strip():
                cleaned = clean_text(current_sentence)
                # Keep both Vietnamese and meaningful text
                if cleaned and len(cleaned) > 5:
                    sentences.append(cleaned)
            current_sentence = ""
    
    # Add remaining text
    if current_sentence.strip():
        cleaned = clean_text(current_sentence)
        if cleaned and len(cleaned) > 5:
            sentences.append(cleaned)
    
    return sentences

def process_pdf_pages_ocr(
    pdf_path,
    output_file="page_ocr_results.txt",
    start_line=13013,
    max_pages=None,
    verbose=True
):
    """Process entire PDF pages with OCR. Each sentence saved as a line."""
    if verbose:
        print(f"🔄 Processing PDF pages with OCR: {pdf_path}")

    doc = pymupdf.open(pdf_path)
    total_pages = len(doc)
    pages_to_process = min(total_pages, max_pages) if max_pages else total_pages

    results = []
    line_number = start_line
    file = open(output_file, 'w', encoding='utf-8')
    for page_num in range(pages_to_process):
        page = doc[page_num]

        # Convert page to high-res image
        mat = pymupdf.Matrix(2.0, 2.0)
        pix = page.get_pixmap(matrix=mat)
        img_path = f"temp_page_{page_num+1}.png"
        pix.save(img_path)

        if verbose:
            print(f"🔍 OCR page {page_num+1}/{pages_to_process} → {img_path}")

        try:
            ocr_text = ocr_image(img_path)
            sentences = split_sentences(ocr_text) if ocr_text else []

            for i, sentence in enumerate(sentences, start=1):
                if len(sentence.strip()) > 5:
                    filename = f"333_BLOCK{page_num+1:03d}_LINE{i:03d}.png"
                    result_line = f'"{filename}": "{sentence}",'
                    file.write(f"{line_number}\t{result_line}\n")
                    results.append((line_number, result_line))
                    if verbose:
                        print(f"✅ {line_number}: {filename}")
                    line_number += 1

        except Exception as e:
            print(f"❌ Error on page {page_num+1}: {e}")

        finally:
            if os.path.exists(img_path):
                os.remove(img_path)

        pix = None

    doc.close()

    # Write output
    # with open(output_file, 'w', encoding='utf-8') as f:
    #     for line_num, content in results:
    #         f.write(f"{line_num}\t{content}\n")

    if verbose:
        print(f"\n✅ OCR results saved to: {output_file}")
        print(f"📊 Total lines processed: {len(results)}")

    return results


🔄 Processing PDF pages with OCR: /home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/temp/TRANG TỬ NAM HOA KINH.pdf
🔍 Processing OCR for page 1
🔍 Processing OCR for page 2
🔍 Processing OCR for page 3
🔍 Processing OCR for page 4
🔍 Processing OCR for page 5
🔍 Processing OCR for page 6
🔍 Processing OCR for page 7
🔍 Processing OCR for page 8
🔍 Processing OCR for page 9
🔍 Processing OCR for page 10
🔍 Processing OCR for page 11
🔍 Processing OCR for page 12
🔍 Processing OCR for page 13
🔍 Processing OCR for page 14
🔍 Processing OCR for page 15
🔍 Processing OCR for page 16
🔍 Processing OCR for page 17
🔍 Processing OCR for page 18
🔍 Processing OCR for page 19
🔍 Processing OCR for page 20
🔍 Processing OCR for page 21
🔍 Processing OCR for page 22
🔍 Processing OCR for page 23
🔍 Processing OCR for page 24
🔍 Processing OCR for page 25
🔍 Processing OCR for page 26
🔍 Processing OCR for page 27
🔍 Processing OCR for page 28
🔍 Processing OCR for page 29
🔍 Processing OCR for page 30
🔍 Pro

In [None]:
# Example usage:
if __name__ == "__main__":
    # Method 1: Extract embedded images and OCR them
    # results = process_images_with_ocr("your_pdf_file.pdf", "image_ocr_output.txt")
    
    # Method 2: Convert each page to image and OCR (recommended for scanned PDFs)
    results = process_pdf_pages_ocr("/home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/temp/TRANG TỬ NAM HOA KINH.pdf", "page_ocr_output.txt")
    
    
    print(results)