In [9]:
import json
import pymupdf
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
import unicodedata
from pathlib import Path
import html
from openai import OpenAI
from underthesea import ner

# ──────────────── CONFIGURATION ────────────────
# Bibliographic metadata per source book, keyed by an internal book id.
# build_xml_for_book() copies these fields into the <meta> element of the XML.
# NOTE(review): build_xml_for_book() also reads a "TRANSLATOR" key that no
# entry defines, so <TRANSLATOR> is always written empty — confirm whether the
# AUTHOR values of the two translated editions belong under TRANSLATOR instead.
BOOK_METADATA = {
    "NAM_HOA_KINH": {
        "TITLE": "Nam Hoa Kinh",
        "VOLUME": "",
        "AUTHOR": "Trang Tử",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "SOURCE": "thuviensach.vn",
    },
    "TRANG_TU_NAM_HOA_KINH": {
        "TITLE": "Trang Tử Nam Hoa Kinh",
        "VOLUME": "",
        "AUTHOR": "Nguyễn Kim Vỹ",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "SOURCE": "Nhà xuất bản văn hóa",
    },
    "TRANG_TU_NAM_HOA_KINH_2": {
        "TITLE": "Trang Tử Nam Hoa Kinh",
        "VOLUME": "",
        # Fixed typo: was "Ngyễn Duy Cần".
        "AUTHOR": "Thu Giang, Nguyễn Duy Cần",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "SOURCE": "Nhà xuất bản trẻ",
    },
}
# NER tag suffixes (the part after the "B-"/"I-" prefix) that are kept when
# writing <ENTITY> elements; entities with any other tag are dropped.
# "ORG" used to be listed twice; the duplicate had no effect on membership
# checks and has been removed.
# NOTE(review): if the second "ORG" was meant to be another tag, add it here.
BASED_ENTITY_GROUPS = [
    "PER",
    "ORG",
    "LOC",
    "TME",
    "TITLE",
    "NUM",
]


In [23]:

# ──────────────── UTILITY FUNCTIONS ────────────────

def normalize(s: str) -> str:
    """Trim surrounding whitespace from ``s`` and return it in Unicode NFC form."""
    trimmed = s.strip()
    return unicodedata.normalize("NFC", trimmed)

def remove_html_entities(text):
    """Decode or strip HTML entities in ``text``.

    Steps, in order:

    1. ``html.unescape`` decodes every standard entity.
    2. Any ``&quot;`` text still present afterwards (for example from
       double-escaped input such as ``&amp;quot;``) is deleted, together with
       the truncated fragments ``&quot`` and ``quot;``.
    3. A handful of common entities that are still present after step 1
       (again, typically from double-escaped input) are decoded.
    4. Any remaining named entity of the form ``&name;`` is deleted.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # html.unescape does not raise for str input, so the former bare
    # ``except: pass`` only hid unrelated errors (and KeyboardInterrupt).
    text = html.unescape(text)

    # Remove &quot; variations
    for fragment in ("&quot;", "&quot", "quot;"):
        text = text.replace(fragment, "")

    # Decode other common entities left over after the first pass
    html_entities = {
        "&amp;": "&", "&lt;": "<", "&gt;": ">", "&apos;": "'",
        "&nbsp;": " ", "&hellip;": "...", "&mdash;": "—", "&ndash;": "–"
    }
    for entity, replacement in html_entities.items():
        text = text.replace(entity, replacement)

    # Strip remaining named entities. The trailing ';' is required: with an
    # optional ';' the pattern also deleted ordinary text that follows a
    # literal ampersand (e.g. "R&D" became "R").
    text = re.sub(r"&[a-zA-Z]+;", "", text)

    return text

def clean_text(text: str) -> str:
    """Return ``text`` normalized, entity-free and whitespace-collapsed.

    The text is passed through ``normalize`` and ``remove_html_entities``,
    then the literal runs ``***``, ``---``, ``___`` and ``...`` are deleted,
    a fixed set of invisible Unicode characters is removed, every whitespace
    run is replaced by a single space, and the result is stripped.
    """
    cleaned = remove_html_entities(normalize(text))

    # Delete the three-character separator runs wherever they occur.
    for marker in ("***", "---", "___", "..."):
        cleaned = cleaned.replace(marker, "")

    # Remove invisible characters, then squeeze whitespace runs to one space.
    cleaned = re.sub(r"[\u200b\u200e\u202a\u202c\ufeff]+", "", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned)

    return cleaned.strip()

# Vietnamese letters that carry diacritics, plus đ. Compiled once and matched
# case-insensitively so upper-case forms (e.g. "Ử", "Ệ") are recognised too.
_VIETNAMESE_CHAR_RE = re.compile(
    r'[àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđĐ]',
    re.IGNORECASE,
)


def is_vietnamese(text: str) -> bool:
    """Return True if ``text`` contains at least one Vietnamese-specific letter.

    Only precomposed letters with Vietnamese diacritics (and đ/Đ) are
    recognised, in either case. Text written purely with unaccented ASCII
    letters returns False.

    Args:
        text: The string to inspect.

    Returns:
        True when a Vietnamese accented letter is found, otherwise False.
    """
    return bool(_VIETNAMESE_CHAR_RE.search(text))

def split_sentences(text: str) -> list[str]:
    """Split ``text`` into cleaned sentences that contain Vietnamese letters.

    The text is cleaned with ``clean_text`` and cut after every ".", "!",
    "?" or "。"; any text after the last delimiter forms a final piece. Each
    piece is cleaned again and kept only when it is longer than 10 characters
    and ``is_vietnamese`` accepts it.
    """
    text = clean_text(text)

    # Cut the text right after each sentence delimiter.
    pieces = []
    piece_start = 0
    for position, char in enumerate(text):
        if char in (".", "!", "?", "。"):
            pieces.append(text[piece_start:position + 1])
            piece_start = position + 1
    if piece_start < len(text):
        pieces.append(text[piece_start:])

    # Apply the same filter to every piece, including the trailing one.
    sentences = []
    for piece in pieces:
        if not piece.strip():
            continue
        cleaned = clean_text(piece)
        # Only keep Vietnamese sentences
        if cleaned and len(cleaned) > 10 and is_vietnamese(cleaned):
            sentences.append(cleaned)

    return sentences

def ner_underthesea(text: str) -> list[dict]:
    """Run underthesea's ``ner`` on ``text`` with ``deep=True`` and return its result as-is."""
    return ner(text, deep=True)

# ──────────────── MAIN FUNCTION ────────────────

def build_xml_for_book(pdf_path, metadata: dict, output_path="vietnamese_parsed.xml", code="VIE_001"):
    """Extract Vietnamese sentences from a PDF and write them, with NER tags, to XML.

    Output layout::

        <root>
          <FILE ID="{code}">
            <meta> TITLE, VOLUME, AUTHOR, PERIOD, LANGUAGE, TRANSLATOR, SOURCE </meta>
            <PAGE ID="{code}.PPP">
              <STC ID="{code}.PPP.SS">sentence text
                <NER><ENTITY TYPE=".." START=".." END="..">word</ENTITY>...</NER>
              </STC>
            </PAGE>
          </FILE>
        </root>

    Pages that yield no sentences are skipped, so PAGE ids keep the original
    1-based page number. A <NER> element is written only when at least one
    entity has a tag listed in ``BASED_ENTITY_GROUPS``.

    Args:
        pdf_path: Path of the PDF to read.
        metadata: Mapping providing the <meta> fields; missing keys become "".
        output_path: Path of the XML file to write.
        code: ID prefix for the FILE, PAGE and STC elements.

    Returns:
        ``output_path``.
    """
    print(f"🔄 Processing PDF: {pdf_path}")

    # Read PDF; the context manager closes the document once the page text
    # has been extracted (it was previously left open).
    with pymupdf.open(pdf_path) as doc:
        pages_text = [p.get_text() for p in doc]
    print(f"📄 Extracted {len(pages_text)} pages")

    # Create XML structure
    root = ET.Element("root")
    file_el = ET.SubElement(root, "FILE", ID=code)

    # Metadata
    meta = ET.SubElement(file_el, "meta")
    for field in ("TITLE", "VOLUME", "AUTHOR", "PERIOD", "LANGUAGE", "TRANSLATOR", "SOURCE"):
        ET.SubElement(meta, field).text = metadata.get(field, "")

    total_sentences = 0

    # Process each page
    for page_num, page_text in enumerate(pages_text, 1):
        sentences = split_sentences(page_text)

        if not sentences:  # Skip empty pages
            continue

        page_el = ET.SubElement(file_el, "PAGE", ID=f"{code}.{page_num:03}")

        # Create sentence elements
        for sent_id, sentence in enumerate(sentences, 1):
            stc_el = ET.SubElement(
                page_el, "STC", ID=f"{code}.{page_num:03}.{sent_id:02}"
            )
            stc_el.text = sentence

            # Run NER on exactly the string stored in <STC> so that the
            # START/END offsets index into that text. split_sentences()
            # already returns cleaned sentences, so no second clean_text()
            # pass is applied here.
            entities = ner_underthesea(sentence)
            valid_entities = [
                ent for ent in (entities or [])
                if ent.get("entity", "").split("-")[-1] in BASED_ENTITY_GROUPS
            ]

            # Only emit <NER> when something survived the type filter, so no
            # empty <NER/> elements are written.
            if valid_entities:
                ner_el = ET.SubElement(stc_el, "NER")
                for entity in valid_entities:
                    entity_type = entity["entity"].split("-")[-1]
                    ET.SubElement(
                        ner_el, "ENTITY",
                        TYPE=entity_type,
                        START=str(entity.get("start", 0)),
                        END=str(entity.get("end", 0))
                    ).text = entity.get("word", "")

            total_sentences += 1

    # Write XML
    pretty = minidom.parseString(ET.tostring(root, encoding="utf-8"))
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(pretty.toprettyxml(indent="  "))

    print(f"✅ Created XML file: {output_path}")
    print(f"📊 Total Vietnamese sentences: {total_sentences}")

    return output_path

In [24]:
# Input PDF for this run (absolute path on the local machine).
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/temp/Nam-hoa-kinh.pdf"
# ID prefix used for the FILE, PAGE and STC elements of the generated XML.
code = "PAS_003"
# NOTE(review): `mode` is not read by any code visible in this notebook — confirm it is still needed.
mode = "sentence"
# Output XML file name, derived from the ID prefix.
des = f"{code}_nam_hoa_kinh.xml"

# Look up the metadata for this book (empty dict if the key is missing) and build the XML.
metadata = BOOK_METADATA.get("NAM_HOA_KINH", {})
build_xml_for_book(pdf_path, metadata, des, code)

🔄 Processing PDF: /home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/temp/Nam-hoa-kinh.pdf
📄 Extracted 139 pages


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[{'entity': 'B-LOC', 'score': np.float32(0.9969644), 'index': 1, 'word': 'vn', 'start': 0, 'end': 2}, {'entity': 'B-PER', 'score': np.float32(0.99681145), 'index': 3, 'word': 'NAM', 'start': 3, 'end': 6}, {'entity': 'I-PER', 'score': np.float32(0.9990652), 'index': 5, 'word': 'HOA', 'start': 7, 'end': 10}, {'entity': 'I-PER', 'score': np.float32(0.99887556), 'index': 8, 'word': 'KINH', 'start': 11, 'end': 15}, {'entity': 'I-PER', 'score': np.float32(0.99933535), 'index': 11, 'word': 'Trang', 'start': 16, 'end': 21}, {'entity': 'I-PER', 'score': np.float32(0.99949896), 'index': 12, 'word': 'Tử', 'start': 22, 'end': 24}]
[{'entity': 'B-PER', 'score': np.float32(0.9998584), 'index': 6, 'word': 'Nguyễn', 'start': 15, 'end': 21}, {'entity': 'I-PER', 'score': np.float32(0.9995894), 'index': 7, 'word': 'Kim', 'start': 22, 'end': 25}, {'entity': 'I-PER', 'score': np.float32(0.99982077), 'index': 8, 'word': 'Vỹ', 'start': 26, 'end': 28}]
[{'entity': 'B-LOC', 'score': np.float32(0.98745185), 'in

'PAS_003_nam_hoa_kinh.xml'

In [8]:
import xml.etree.ElementTree as ET


def count_sentences(xml_file_path):
    """Count <STC> (sentence) elements in an XML file, overall and per <SECT>.

    Args:
        xml_file_path: File name or file object accepted by ``ET.parse``.

    Returns:
        A tuple ``(total_sentences, sections)`` where ``total_sentences`` is
        the number of <STC> elements below the root and ``sections`` maps each
        <SECT> ``NAME`` attribute ("Unknown" when absent) to the number of
        <STC> elements inside sections with that name.

    Sections that share a name have their counts added together; previously
    a later section silently overwrote an earlier one. Each <SECT> is counted
    with all of its descendants, so sentences inside nested <SECT> elements
    count towards both the inner and the outer section.
    """
    # Parse the XML file
    root = ET.parse(xml_file_path).getroot()

    # Count all STC elements (sentences)
    total_sentences = len(root.findall(".//STC"))

    # Count sentences per section, accumulating over repeated names
    sections = {}
    for section in root.findall(".//SECT"):
        section_name = section.get("NAME", "Unknown")
        section_count = len(section.findall(".//STC"))
        sections[section_name] = sections.get(section_name, 0) + section_count

    return total_sentences, sections


# Path to the XML file to inspect (absolute path on the local machine).
# NOTE(review): this is not the "PAS_003_nam_hoa_kinh.xml" file written by
# build_xml_for_book() above, and that function emits PAGE rather than SECT
# elements — confirm which file is meant to be counted here.
xml_file = "/home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/nam_hoa_kinh.xml"

# Get the total and the per-section counts
total, section_counts = count_sentences(xml_file)

# Report the total first, then one line per section name
print(f"Total number of sentences: {total}")
print("\nNumber of sentences per section:")
for section, count in section_counts.items():
    print(f"- {section}: {count} sentences")

Total number of sentences: 2251

Number of sentences per section:
- Giới thiệu: 2251 sentences
