In [19]:
import pymupdf
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
import nltk
import unicodedata
from pathlib import Path

# ──────────────── CẤU HÌNH ────────────────
# Per-book metadata, keyed by the identifier derived from the PDF file stem
# (upper-cased, with "-" and " " replaced by "_").
# The three known editions share every field except TITLE and SOURCE.
BOOK_METADATA = {
    book_key: {
        "TITLE": title,
        "VOLUME": "",
        "AUTHOR": "Trang Tử",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "SOURCE": source,
    }
    for book_key, title, source in (
        ("NAM_HOA_KINH", "Nam Hoa Kinh", "thuviensach.vn"),
        ("TRANG_TU_NAM_HOA_KINH", "Nam Hoa Kinh (Bản học thuật)", "Bản học thuật số hoá"),
        ("TRANG_TU_NAM_HOA_KINH_2", "Nam Hoa Kinh (Bản dịch)", "Bản dịch hiện đại"),
    )
}

In [None]:
# ──────────────── HÀM TIỆN ÍCH ────────────────
def normalize(s):
    """Return ``s`` converted to Unicode Normalization Form C (NFC)."""
    nfc_form = unicodedata.normalize("NFC", s)
    return nfc_form

def clean_page(text):
    """Normalize one page of extracted text and collapse its single line breaks.

    Steps, in order:
      1. Convert the text to NFC via ``normalize``.
      2. Remove, case-insensitively, everything from ``TRANG TỬ`` to the end of
         the line whenever ``NAM HOA KINH`` follows it on that same line
         (presumably the running page header).
      3. Replace every newline that is not adjacent to another newline with a
         space, i.e. join lines broken mid-sentence; runs of two or more
         newlines are left untouched.
      4. Strip leading and trailing whitespace.

    :param text: Raw text of one page.
    :return: The cleaned page text.
    """
    header_re = re.compile(r'TRANG TỬ.*NAM HOA KINH.*', flags=re.I)
    lone_newline_re = re.compile(r'(?<!\n)\n(?!\n)')

    without_header = header_re.sub('', normalize(text))
    joined = lone_newline_re.sub(' ', without_header)
    return joined.strip()

def split_paragraphs(text):
    """Split ``text`` into paragraphs at blank lines.

    A separator is a newline, optional whitespace, then another newline.
    Each piece is stripped and NFC-normalized; empty pieces are dropped.

    :param text: Text that may contain several paragraphs.
    :return: List of non-empty paragraph strings, in order.
    """
    paragraphs = []
    for chunk in re.split(r'\n\s*\n', text):
        trimmed = chunk.strip()
        if trimmed:
            paragraphs.append(normalize(trimmed))
    return paragraphs

def split_sentences(text):
    """Split ``text`` into sentences at ``.``, ``!`` and ``?``.

    A sentence boundary is a run of one or more terminators, optionally
    followed by closing quotes or brackets. Treating the run as a single
    boundary keeps "..." and "?!" attached to their sentence instead of
    emitting punctuation-only fragments, and keeps a closing quote with the
    sentence it ends rather than at the start of the next one.

    Known limitation: any period is treated as a terminator, so decimals and
    abbreviations are still split.

    :param text: Text to split.
    :return: List of stripped, NFC-normalized, non-empty sentences in order.
             Trailing text without a terminator is returned as a final item.
    """
    terminators = ".!?"
    closers = "\"'”’»)]"
    boundary = re.compile(
        "[" + re.escape(terminators) + "]+[" + re.escape(closers) + "]*"
    )

    sentences = []
    start = 0
    for match in boundary.finditer(text):
        piece = text[start:match.end()].strip()
        if piece:
            sentences.append(normalize(piece))
        start = match.end()

    # Whatever follows the last terminator is kept as its own sentence.
    tail = text[start:].strip()
    if tail:
        sentences.append(normalize(tail))

    return sentences

def detect_sections(pages):
    """Group page texts into sections based on PHẦN/CHƯƠNG heading lines.

    A page opens a new section when it contains a line that starts with
    ``PHẦN`` or ``CHƯƠNG``, followed by a Roman or Arabic number, optional
    dots, and a title. Only the first such heading on a page is used. Pages
    before the first heading go into a default section named "Giới thiệu".

    The section name is the full matched heading with runs of whitespace
    collapsed to single spaces. The pattern uses a non-capturing group so the
    whole heading is available; with a capturing group, ``findall`` returns
    only the keyword.

    NOTE(review): ``build_xml_for_book`` passes pages already processed by
    ``clean_page``, which joins single newlines into spaces, so a heading is
    only recognised where it still begins a line — confirm that is intended.

    :param pages: Iterable of page texts; pages are numbered from 1.
    :return: List of ``{"name": str, "pages": [(page_number, text), ...]}``.
             The last open section is always appended, even if it is empty.
    """
    section_pattern = re.compile(
        r'^(?:PHẦN|CHƯƠNG)\s+[IVXLCDM\d]+\.*\s+.+$', re.MULTILINE
    )
    sections = []
    current = {"name": "Giới thiệu", "pages": []}

    for page_num, txt in enumerate(pages, 1):
        heading = section_pattern.search(txt)
        if heading is None:
            current["pages"].append((page_num, txt))
            continue

        # Close the running section, unless it never received a page.
        if current["pages"]:
            sections.append(current)
        # ``\s+`` in the pattern may span a line break; flatten it for the name.
        title = " ".join(heading.group(0).split())
        current = {"name": normalize(title), "pages": [(page_num, txt)]}

    sections.append(current)
    return sections

# ──────────────── GHI FILE XML ĐẸP ────────────────
def write_pretty_xml(tree, out_path):
    """Write ``tree`` to ``out_path`` as indented XML.

    The tree's root is serialized to UTF-8 bytes, re-parsed with ``minidom``
    and pretty-printed with a two-space indent. The result is written as
    UTF-8 text.

    :param tree: An ``xml.etree.ElementTree.ElementTree``.
    :param out_path: Destination file path.
    """
    serialized = ET.tostring(tree.getroot(), encoding="utf-8")
    document = minidom.parseString(serialized)
    rendered = document.toprettyxml(indent="  ")
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.write(rendered)

# ──────────────── HÀM CHÍNH ────────────────
def build_xml_for_book(pdf_path, des="output_xml", code:str="", mode: str = "sentence"):
    """
    Build an XML file from a PDF book.

    The PDF is read page by page, each page is cleaned with ``clean_page``,
    pages are grouped by ``detect_sections``, and every page is split into
    ``STC`` elements (paragraphs or sentences, depending on ``mode``).

    Output layout: ``<root>`` holds one ``<FILE ID=...>`` with a ``<meta>``
    child, followed by one ``<SECT>`` per section containing ``<PAGE>``
    elements with ``<STC>`` children.

    :param pdf_path: Path to the PDF file.
    :param des: Path of the XML file to write (a file path, not a directory).
    :param code: Book code used as the prefix of every SECT/PAGE/STC ID.
    :param mode: ``"paragraph"`` or ``"sentence"``: the unit stored per STC.
    :raises ValueError: If ``mode`` is not one of the two supported values.
    """
    # Validate up front so a bad mode fails before any PDF work is done,
    # and is reported even when the document has no pages.
    splitters = {"paragraph": split_paragraphs, "sentence": split_sentences}
    if mode not in splitters:
        raise ValueError("Giá trị `mode` không hợp lệ; chỉ hỗ trợ 'paragraph' hoặc 'sentence'")
    split_units = splitters[mode]

    # Book id comes from the file stem; unknown books get an "_AUTO" suffix
    # and fall back to default metadata below.
    book_name = Path(pdf_path).stem.upper().replace("-", "_").replace(" ", "_")
    book_id = book_name if book_name in BOOK_METADATA else f"{book_name}_AUTO"

    # Close the document as soon as the text has been extracted.
    with pymupdf.open(pdf_path) as doc:
        pages_text = [clean_page(p.get_text()) for p in doc]

    sections = detect_sections(pages_text)
    root = ET.Element("root")
    file_el = ET.SubElement(root, "FILE", ID=book_id)

    meta_info = BOOK_METADATA.get(book_id, {})
    meta = ET.SubElement(file_el, "meta")
    meta_defaults = (
        ("TITLE", book_id.title()),
        ("VOLUME", ""),
        ("AUTHOR", "Không rõ"),
        ("PERIOD", "Không rõ"),
        ("LANGUAGE", "vi"),
        ("SOURCE", "Tự động"),
    )
    for tag, fallback in meta_defaults:
        ET.SubElement(meta, tag).text = meta_info.get(tag, fallback)

    # NOTE(review): SECT elements are attached to <root>, as siblings of
    # <FILE>, not inside it. Kept as-is; confirm this layout is intended.
    for sect_id, section in enumerate(sections, 1):
        sect_el = ET.SubElement(root, "SECT", ID=f"{code}.{sect_id:03}", NAME=section["name"])
        for page_num, page_text in section["pages"]:
            page_id = f"{code}.{sect_id:03}.{page_num:03}"
            page_el = ET.SubElement(sect_el, "PAGE", ID=page_id)
            for unit_id, unit in enumerate(split_units(page_text), 1):
                ET.SubElement(page_el, "STC", ID=f"{page_id}.{unit_id:02}").text = unit

    tree = ET.ElementTree(root)
    write_pretty_xml(tree, des)
    print(f"✅ Xuất file XML: {des}")


In [27]:
# Export Nam-hoa-kinh.pdf with one STC element per sentence.
# NOTE(review): absolute, machine-specific input path.
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/the-notes/test/TrangTu_NamHoaKinh_VanAnh/Nam-hoa-kinh.pdf"
code, mode = "PKS_001", "sentence"
des = f"nam_hoa_kinh_1_{mode}.xml"

build_xml_for_book(pdf_path, des=des, code=code, mode=mode)

✅ Xuất file XML: nam_hoa_kinh_1_sentence.xml


In [28]:
# Export Nam-hoa-kinh.pdf with one STC element per paragraph.
# NOTE(review): absolute, machine-specific input path.
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/the-notes/test/TrangTu_NamHoaKinh_VanAnh/Nam-hoa-kinh.pdf"
code, mode = "PKS_001", "paragraph"
des = f"nam_hoa_kinh_1_{mode}.xml"

build_xml_for_book(pdf_path, des=des, code=code, mode=mode)

✅ Xuất file XML: nam_hoa_kinh_1_paragraph.xml


In [29]:
# Export Trang-tu-nam-hoa-kinh.pdf with one STC element per paragraph.
# The output name is built from the file name, the book code and the mode.
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/the-notes/test/TrangTu_NamHoaKinh_VanAnh/Trang-tu-nam-hoa-kinh.pdf"
file_name = "Trang-tu-nam-hoa-kinh"
code, mode = "PKS_001", "paragraph"
des = f"{file_name}_{code}_{mode}.xml"

build_xml_for_book(pdf_path, des=des, code=code, mode=mode)

✅ Xuất file XML: Trang-tu-nam-hoa-kinh_PKS_001_paragraph.xml


In [32]:
# Export Trang-tu-nam-hoa-kinh.pdf with one STC element per sentence.
# The output name is built from the file name, the book code and the mode.
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/the-notes/test/TrangTu_NamHoaKinh_VanAnh/Trang-tu-nam-hoa-kinh.pdf"
file_name = "Trang-tu-nam-hoa-kinh"
code, mode = "PKS_001", "sentence"
des = f"{file_name}_{code}_{mode}.xml"

build_xml_for_book(pdf_path, des=des, code=code, mode=mode)

✅ Xuất file XML: Trang-tu-nam-hoa-kinh_PKS_001_sentence.xml


In [33]:
# Export "TRANG TỬ NAM HOA KINH.pdf" with one STC element per sentence.
# NOTE: this file stem becomes the id TRANG_TỬ_NAM_HOA_KINH, which is not a
# key of BOOK_METADATA, so build_xml_for_book uses the "_AUTO" id and the
# default metadata values for this book.
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/the-notes/test/TrangTu_NamHoaKinh_VanAnh/TRANG TỬ NAM HOA KINH.pdf"
file_name = "TRANG TỬ NAM HOA KINH"
code, mode = "PKS_001", "sentence"
des = f"{file_name}_{code}_{mode}.xml"

build_xml_for_book(pdf_path, des=des, code=code, mode=mode)

✅ Xuất file XML: TRANG TỬ NAM HOA KINH_PKS_001_sentence.xml


In [None]:
# Re-run of the "TRANG TỬ NAM HOA KINH.pdf" sentence export: same input,
# code, mode and output name as the cell before it.
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/the-notes/test/TrangTu_NamHoaKinh_VanAnh/TRANG TỬ NAM HOA KINH.pdf"
file_name = "TRANG TỬ NAM HOA KINH"
code, mode = "PKS_001", "sentence"
des = f"{file_name}_{code}_{mode}.xml"

build_xml_for_book(pdf_path, des=des, code=code, mode=mode)

In [3]:
import xml.etree.ElementTree as ET

def count_sentences(xml_file_path):
    """Count STC elements in an XML file, in total and per section.

    Sections that share the same NAME attribute are summed into one entry;
    previously a later section silently overwrote an earlier one with the
    same name, so the per-section counts could add up to less than the total.
    A SECT without a NAME attribute is counted under "Unknown".

    NOTE: count_sentences is defined in more than one cell of this notebook;
    whichever cell runs last wins.

    :param xml_file_path: Path to the XML file to parse.
    :return: ``(total, per_section)`` where ``total`` is the number of STC
             elements under the root and ``per_section`` maps each section
             name to the number of STC elements inside SECTs of that name.
    """
    root = ET.parse(xml_file_path).getroot()

    # Every STC anywhere under the root.
    total_sentences = len(root.findall('.//STC'))

    # Accumulate per name so duplicate section names do not clobber each other.
    sections = {}
    for section in root.findall('.//SECT'):
        section_name = section.get('NAME', 'Unknown')
        sections[section_name] = sections.get(section_name, 0) + len(section.findall('.//STC'))

    return total_sentences, sections

# XML file to inspect (absolute, machine-specific path).
xml_file = "/home/octoopt/workspace/projects/learn-from-basics/the-notes/others/nam_hoa_kinh_1_sentence.xml"

# Tally STC elements overall and per section, then report both.
total, section_counts = count_sentences(xml_file)

print(f"Total number of sentences: {total}")
print("\nNumber of sentences per section:")
for section in section_counts:
    count = section_counts[section]
    print(f"- {section}: {count} sentences")

Total number of sentences: 3061

Number of sentences per section:
- Giới thiệu: 3061 sentences


In [4]:
import xml.etree.ElementTree as ET

def count_sentences(xml_file_path):
    """Count STC elements in an XML file, in total and per section.

    Sections that share the same NAME attribute are summed into one entry;
    previously a later section silently overwrote an earlier one with the
    same name, so the per-section counts could add up to less than the total.
    A SECT without a NAME attribute is counted under "Unknown".

    NOTE: count_sentences is defined in more than one cell of this notebook;
    whichever cell runs last wins.

    :param xml_file_path: Path to the XML file to parse.
    :return: ``(total, per_section)`` where ``total`` is the number of STC
             elements under the root and ``per_section`` maps each section
             name to the number of STC elements inside SECTs of that name.
    """
    root = ET.parse(xml_file_path).getroot()

    # Every STC anywhere under the root.
    total_sentences = len(root.findall('.//STC'))

    # Accumulate per name so duplicate section names do not clobber each other.
    sections = {}
    for section in root.findall('.//SECT'):
        section_name = section.get('NAME', 'Unknown')
        sections[section_name] = sections.get(section_name, 0) + len(section.findall('.//STC'))

    return total_sentences, sections

# XML file to inspect (absolute, machine-specific path).
xml_file = "/home/octoopt/workspace/projects/learn-from-basics/the-notes/others/Trang-tu-nam-hoa-kinh_PKS_001_sentence.xml"

# Tally STC elements overall and per section, then report both.
total, section_counts = count_sentences(xml_file)

print(f"Total number of sentences: {total}")
print("\nNumber of sentences per section:")
for section in section_counts:
    count = section_counts[section]
    print(f"- {section}: {count} sentences")

Total number of sentences: 6760

Number of sentences per section:
- Giới thiệu: 6760 sentences
