In [3]:
import json
import pymupdf
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
import unicodedata
from pathlib import Path
import html
from openai import OpenAI
from underthesea import ner

# ──────────────── CONFIGURATION ────────────────
# Per-book metadata copied into the <meta> element of the generated XML.
# Every entry carries the same keys that build_xml_for_book reads; a field
# that is unknown is left as an empty string.
BOOK_METADATA = {
    "NAM_HOA_KINH": {
        "TITLE": "Nam Hoa Kinh",
        "VOLUME": "",
        "AUTHOR": "Trang Tử",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "TRANSLATOR": "",
        "SOURCE": "thuviensach.vn",
    },
    "TRANG_TU_NAM_HOA_KINH": {
        "TITLE": "Trang Tử Nam Hoa Kinh",
        "VOLUME": "",
        "AUTHOR": "Nguyễn Kim Vỹ",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "TRANSLATOR": "",
        "SOURCE": "Nhà xuất bản văn hóa",
    },
    "TRANG_TU_NAM_HOA_KINH_2": {
        "TITLE": "Trang Tử Nam Hoa Kinh",
        "VOLUME": "",
        # Fixed typo: was "Ngyễn Duy Cần".
        "AUTHOR": "Thu Giang, Nguyễn Duy Cần",
        "PERIOD": "Chiến Quốc",
        "LANGUAGE": "vi",
        "TRANSLATOR": "",
        "SOURCE": "Nhà xuất bản trẻ",
    },
}

# Entity labels (the part after the "B-"/"I-" prefix) that are kept in the
# output. "ORG" used to be listed twice; each label now appears once.
BASED_ENTITY_GROUPS = [
    "PER",
    "ORG",
    "LOC",
    "TME",
    "TITLE",
    "NUM",
]


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ──────────────── UTILITY FUNCTIONS ────────────────

def normalize(s: str) -> str:
    """Strip leading/trailing whitespace from ``s`` and return it in Unicode NFC form."""
    trimmed = s.strip()
    return unicodedata.normalize("NFC", trimmed)

def remove_html_entities(text):
    """Decode HTML entities in ``text`` and strip any that are left over.

    Processing order:

    1. ``html.unescape`` decodes every well-formed entity.
    2. Quote entities that are still present afterwards (typically because
       the input was escaped twice) are deleted, including the broken
       fragments ``&quot`` and ``quot;``.
    3. A small table of common entities that are still present is replaced
       by the character each one stands for.
    4. Any remaining ``&name;`` sequence is deleted.

    Step 4 only matches sequences terminated by ``;``. A plain ampersand
    followed by letters (``"R&D"``, ``"Q&A"``) is ordinary text and is left
    untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # html.unescape does not fail on str input, so no try/except is needed;
    # a non-string argument is a caller error and is allowed to surface.
    text = html.unescape(text)

    # Quote entities (and their truncated forms) are dropped, not decoded.
    for fragment in ("&quot;", "&quot", "quot;"):
        text = text.replace(fragment, "")

    # Entities that survive the first pass, e.g. "&amp;lt;" -> "&lt;" -> "<".
    html_entities = {
        "&amp;": "&", "&lt;": "<", "&gt;": ">", "&apos;": "'",
        "&nbsp;": " ", "&hellip;": "...", "&mdash;": "—", "&ndash;": "–"
    }
    for entity, replacement in html_entities.items():
        text = text.replace(entity, replacement)

    # Delete whatever named entity is left. The trailing ';' is mandatory so
    # that normal text containing '&' is not eaten.
    text = re.sub(r"&[a-zA-Z][a-zA-Z0-9]*;", "", text)

    return text

def clean_text(text: str) -> str:
    """Return ``text`` normalised and stripped of decoration.

    The text is passed through ``normalize`` and ``remove_html_entities``,
    the decoration runs ``***``, ``---``, ``___`` and ``...`` are deleted,
    a fixed set of invisible characters is removed, and every run of
    whitespace is collapsed to a single space. The result has no leading
    or trailing whitespace.
    """
    text = remove_html_entities(normalize(text))

    # Decoration runs are removed in this fixed order.
    for marker in ("***", "---", "___", "..."):
        text = text.replace(marker, "")

    # Zero-width space, LRM, LRE, PDF and BOM characters.
    without_invisible = re.sub(r"[\u200b\u200e\u202a\u202c\ufeff]+", "", text)
    single_spaced = re.sub(r"\s+", " ", without_invisible)

    return single_spaced.strip()

def is_vietnamese(text: str) -> bool:
    """Return True if ``text`` contains at least one Vietnamese-specific letter.

    A Vietnamese-specific letter is a vowel carrying a Vietnamese diacritic,
    or ``đ``/``Đ``. Matching is case-insensitive, so all-caps text such as
    headings is recognised. The input is converted to NFC first so that
    letters written as base character + combining mark are recognised too.

    Args:
        text: The string to inspect.

    Returns:
        True when a Vietnamese-specific letter is found, otherwise False.
    """
    vietnamese_pattern = re.compile(
        r'[àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđĐ]',
        re.IGNORECASE,  # the class lists lowercase vowels only
    )
    return bool(vietnamese_pattern.search(unicodedata.normalize("NFC", text)))

def split_sentences(text: str) -> list[str]:
    """Break ``text`` into cleaned Vietnamese sentences.

    The text is cleaned with ``clean_text`` and cut after every ``.``, ``!``,
    ``?`` or ``。``. Each piece (and whatever trails the last terminator) is
    cleaned again and kept only if it is longer than 10 characters and
    ``is_vietnamese`` accepts it.

    Returns:
        The kept sentences, in document order.
    """
    text = clean_text(text)
    terminators = (".", "!", "?", "。")
    kept = []

    def _collect(chunk):
        # Clean one raw piece and keep it when it passes the filters.
        if not chunk.strip():
            return
        candidate = clean_text(chunk)
        if candidate and len(candidate) > 10 and is_vietnamese(candidate):
            kept.append(candidate)

    segment_start = 0
    for position, char in enumerate(text):
        if char in terminators:
            # The terminator belongs to the sentence it closes.
            _collect(text[segment_start:position + 1])
            segment_start = position + 1

    # Text after the last terminator, if any.
    _collect(text[segment_start:])

    return kept

In [5]:
def ner_underthesea(text: str) -> list[dict]:
    """Run underthesea's ``ner`` on ``text`` with ``deep=True`` and return its result unchanged."""
    return ner(text, deep=True)

In [8]:
def merge_adjacent_entities(entities: list[dict], text: str, verbose: bool = True) -> list[dict]:
    """
    Merge entities of the same type that sit next to each other in the text.

    Entities are sorted by ``start``. Two consecutive entities are merged when
    the second one starts exactly one position after the first one ends
    (``end + 1 == start``) and both have the same type, where the type is the
    part of ``entity`` after the last ``-`` (so ``B-PER`` and ``I-PER`` match).
    The merged entity keeps every field of the first entity except ``end``
    (taken from the last merged entity) and ``word`` (re-sliced from ``text``
    as ``text[start:end]``).

    The input list and its dictionaries are never modified; the returned list
    always contains copies, also when there is nothing to merge.

    Args:
        entities: Entity dictionaries with 'start', 'end', 'word', 'entity' keys.
        text: Text the offsets refer to; used to rebuild the merged 'word'.
        verbose: When True (default) print a line for every merge and a
            summary line. Pass False to keep the output quiet.

    Returns:
        List of merged entities ordered by start position. An empty or
        missing input yields an empty list.
    """
    if not entities:
        return []
    if len(entities) < 2:
        # Copy here too so callers get the same ownership as on the merge path.
        return [entity.copy() for entity in entities]

    # Sort entities by start position
    sorted_entities = sorted(entities, key=lambda x: x.get('start', 0))
    merged_entities = []
    current_entity = sorted_entities[0].copy()

    for next_entity in sorted_entities[1:]:
        current_end = current_entity.get('end', 0)
        next_start = next_entity.get('start', 0)

        # Compare the bare label so that B-/I- prefixes do not block a merge.
        current_type = current_entity.get('entity', '').split('-')[-1]
        next_type = next_entity.get('entity', '').split('-')[-1]

        if current_end + 1 == next_start and current_type == next_type:
            if verbose:
                print(f"   🔗 Merging adjacent entities: '{current_entity.get('word', '')}' + '{next_entity.get('word', '')}'")

            # Extend the span to the end of the entity being absorbed.
            current_entity['end'] = next_entity.get('end', current_end)

            # Rebuild the surface form from the text so the separating
            # character between the two entities is included.
            merged_start = current_entity.get('start', 0)
            merged_end = current_entity.get('end', 0)
            current_entity['word'] = text[merged_start:merged_end]

            # The 'entity' tag of the first entity is kept as is.
            if verbose:
                print(f"   ✅ Merged result: '{current_entity.get('word', '')}' ({current_type})")
        else:
            # No merge possible: emit what has been built and start over.
            merged_entities.append(current_entity)
            current_entity = next_entity.copy()

    # Add the last entity
    merged_entities.append(current_entity)

    if verbose and len(merged_entities) < len(sorted_entities):
        print(f"   📊 Entity merging: {len(sorted_entities)} -> {len(merged_entities)} entities")

    return merged_entities

def process_ner_with_merging(text: str) -> list[dict]:
    """
    Run NER on ``text``, keep the supported entity types and merge neighbours.

    Args:
        text: Input text for NER processing

    Returns:
        The merged entities, or an empty list when NER finds nothing or no
        entity has a type listed in ``BASED_ENTITY_GROUPS``.
    """
    detected = ner_underthesea(text)
    if not detected:
        return []

    # Keep only entities whose label (without B-/I- prefix) is supported.
    supported = []
    for ent in detected:
        label = ent.get("entity", "").split("-")[-1]
        if label in BASED_ENTITY_GROUPS:
            supported.append(ent)

    if not supported:
        return []

    return merge_adjacent_entities(supported, text)

In [11]:

"""
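──────────────── MAIN FUNCTION ────────────────
Processing steps:

1. Read the PDF and extract the text of every page
2. Split each page into sentences
3. Clean the sentences (strip special characters and HTML entities)
4. Run NER on each sentence and merge adjacent entities
5. Write the result to an XML file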

"""
def build_xml_for_book(pdf_path, metadata: dict, output_path="vietnamese_parsed.xml", code="VIE_001"):
    """Parse Vietnamese text from a PDF and write it as XML with NER annotations.

    Output layout::

        <root>
          <FILE ID=code>
            <meta> TITLE, VOLUME, AUTHOR, PERIOD, LANGUAGE, TRANSLATOR, SOURCE </meta>
            <PAGE ID="code.PPP">
              <STC ID="code.PPP.SS">sentence
                <NER><ENTITY TYPE=.. START=.. END=..>word</ENTITY>...</NER>
              </STC>
            </PAGE>
          </FILE>
        </root>

    Pages that yield no sentence are skipped; page numbers in the IDs are the
    1-based positions in the PDF. START/END are the offsets reported by the
    NER step for exactly the sentence string that is stored in the STC element.

    Args:
        pdf_path: Path of the PDF to read.
        metadata: Mapping with the <meta> field values; missing keys become
            empty elements.
        output_path: Path of the XML file to write.
        code: Identifier used for the FILE element and as ID prefix.

    Returns:
        ``output_path``.
    """
    print(f"🔄 Processing PDF: {pdf_path}")

    # Read PDF; the context manager closes the document once the text is out.
    with pymupdf.open(pdf_path) as doc:
        pages_text = [page.get_text() for page in doc]
    print(f"📄 Extracted {len(pages_text)} pages")

    # Control characters that XML 1.0 forbids. They can show up in extracted
    # PDF text and would make the pretty-printing parse below fail.
    xml_illegal = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

    # Create XML structure
    root = ET.Element("root")
    file_el = ET.SubElement(root, "FILE", ID=code)

    # Metadata
    meta = ET.SubElement(file_el, "meta")
    for field in ("TITLE", "VOLUME", "AUTHOR", "PERIOD", "LANGUAGE", "TRANSLATOR", "SOURCE"):
        ET.SubElement(meta, field).text = metadata.get(field, "")

    total_sentences = 0

    # Process each page
    for page_num, page_text in enumerate(pages_text, 1):
        sentences = split_sentences(page_text)

        if not sentences:  # Skip empty pages
            continue

        page_el = ET.SubElement(file_el, "PAGE", ID=f"{code}.{page_num:03}")

        # Create sentence elements
        for sent_id, sentence in enumerate(sentences, 1):
            # split_sentences already returns cleaned text. The same string is
            # stored and passed to NER so the offsets always index into it.
            sentence = xml_illegal.sub("", sentence)

            stc_el = ET.SubElement(
                page_el, "STC", ID=f"{code}.{page_num:03}.{sent_id:02}"
            )
            stc_el.text = sentence

            # Add NER if entities found
            merged_entities = process_ner_with_merging(sentence)
            if merged_entities:
                ner_el = ET.SubElement(stc_el, "NER")
                for entity in merged_entities:
                    entity_type = entity["entity"].split("-")[-1]
                    ET.SubElement(
                        ner_el, "ENTITY",
                        TYPE=entity_type,
                        START=str(entity.get("start", 0)),
                        END=str(entity.get("end", 0))
                    ).text = entity.get("word", "")

            total_sentences += 1

    # Write XML (re-parsed with minidom only to get indented output)
    pretty = minidom.parseString(ET.tostring(root, encoding="utf-8"))
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(pretty.toprettyxml(indent="  "))

    print(f"✅ Created XML file: {output_path}")
    print(f"📊 Total Vietnamese sentences: {total_sentences}")

    return output_path

In [12]:
# Input PDF. NOTE(review): absolute, machine-specific path — adjust before
# running on another machine.
pdf_path = "/home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/temp/Nam-hoa-kinh.pdf"
code = "PAS_003"
ver = "016"
# Named `corpus_type` rather than `type` so the builtin `type()` stays usable
# in every later cell of the notebook.
corpus_type = "donngu"
des = f"{code}_nam_hoa_kinh_{corpus_type}_{ver}.xml"

metadata = BOOK_METADATA.get("NAM_HOA_KINH", {})
build_xml_for_book(pdf_path, metadata, des, code)

🔄 Processing PDF: /home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/temp/Nam-hoa-kinh.pdf
📄 Extracted 139 pages
   🔗 Merging adjacent entities: 'NAM' + 'HOA'
   ✅ Merged result: 'NAM HOA' (PER)
   🔗 Merging adjacent entities: 'NAM HOA' + 'KINH'
   ✅ Merged result: 'NAM HOA KINH' (PER)
   🔗 Merging adjacent entities: 'NAM HOA KINH' + 'Trang'
   ✅ Merged result: 'NAM HOA KINH Trang' (PER)
   🔗 Merging adjacent entities: 'NAM HOA KINH Trang' + 'Tử'
   ✅ Merged result: 'NAM HOA KINH Trang Tử' (PER)
   📊 Entity merging: 6 -> 2 entities


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


   🔗 Merging adjacent entities: 'Nguyễn' + 'Kim'
   ✅ Merged result: 'Nguyễn Kim' (PER)
   🔗 Merging adjacent entities: 'Nguyễn Kim' + 'Vỹ'
   ✅ Merged result: 'Nguyễn Kim Vỹ' (PER)
   📊 Entity merging: 3 -> 1 entities
   🔗 Merging adjacent entities: 'THIÊN' + 'NỘI'
   ✅ Merged result: 'THIÊN NỘI' (PER)
   🔗 Merging adjacent entities: 'THIÊN NỘI' + 'THIÊNHIÊNỌC'
   ✅ Merged result: 'THIÊN NỘI THIÊN 2 NỘI THIÊN 3 Các sách chú giải Trang Tử HỌC' (PER)
   🔗 Merging adjacent entities: 'THIÊN NỘI THIÊN 2 NỘI THIÊN 3 Các sách chú giải Trang Tử HỌC' + 'THUY'
   ✅ Merged result: 'THIÊN NỘI THIÊN 2 NỘI THIÊN 3 Các sách chú giải Trang Tử HỌC THUY' (PER)
   🔗 Merging adjacent entities: 'TRANG' + 'TỬỌC'
   ✅ Merged result: 'TRANG TỬ HỌC' (PER)
   🔗 Merging adjacent entities: 'TRANG TỬ HỌC' + 'THUY'
   ✅ Merged result: 'TRANG TỬ HỌC THUY' (PER)
   🔗 Merging adjacent entities: 'TRANG' + 'TỬIÊU'
   ✅ Merged result: 'TRANG TỬ 2 TIÊU' (PER)
   🔗 Merging adjacent entities: 'DU' + 'TỔNG'
   ✅ Merged resu

'PAS_003_nam_hoa_kinh_donngu_016.xml'

In [8]:
import xml.etree.ElementTree as ET


def count_sentences(xml_file_path):
    """Count STC (sentence) elements in an XML file, overall and per section.

    Args:
        xml_file_path: Path of the XML file to parse.

    Returns:
        A tuple ``(total_sentences, sections)``. ``total_sentences`` is the
        number of STC elements anywhere in the document. ``sections`` maps the
        ``NAME`` attribute of each SECT element ("Unknown" when the attribute
        is missing) to the number of STC elements below it. Sections that
        share a name are summed rather than overwriting one another.
    """
    root = ET.parse(xml_file_path).getroot()

    # Every STC element, at any depth.
    total_sentences = len(root.findall(".//STC"))

    sections = {}
    for section in root.findall(".//SECT"):
        section_name = section.get("NAME", "Unknown")
        # Accumulate: several SECT elements may carry the same (or no) NAME.
        sections[section_name] = sections.get(section_name, 0) + len(section.findall(".//STC"))

    return total_sentences, sections


# Path of the XML file to inspect.
# NOTE(review): absolute, machine-specific path, and the file name differs from
# the one written by the build cell above — confirm this is the intended file.
xml_file = "/home/octoopt/workspace/projects/learn-from-basics/nlp-vietnamese-phd/nam_hoa_kinh.xml"

# Total number of STC elements plus a per-section breakdown
total, section_counts = count_sentences(xml_file)

# Report the overall total first, then one line per section
print(f"Total number of sentences: {total}")
print("\nNumber of sentences per section:")
for section, count in section_counts.items():
    print(f"- {section}: {count} sentences")

Total number of sentences: 2251

Number of sentences per section:
- Giới thiệu: 2251 sentences
