# XML to TXT

This script generates TXT files from XML files

In [4]:
import os, json, pprint, time
from xml.etree import ElementTree as ET

import re
import unicodedata

WHITESPACE_RE = re.compile(r"\s+")

root = "/data/cm/"
xml_root = "/data/cm/data/xml/"

In [5]:
def get_zeitschrift_struktur(path, zs_number):
    print("Zeitschrift:", zs_number, end="\n\n")
    for jahrgang in os.listdir(path+zs_number+"/"+zs_number+"/"):
        print("Jahrgang:", end=" ")
        print(jahrgang)
        print("Hefte:")
        for heft in os.listdir(path+zs_number+"/"+zs_number+"/"+jahrgang+"/"):
            print(heft, end=", ")
            seiten = len(os.listdir(path+zs_number+"/"+zs_number+"/"+jahrgang+"/"+heft))
            print(seiten, "Seiten.")
    print()
    
def xml_to_page(tree):
    # input: page parsed xml tree
    # output: 
    # page --> paragraphs --> lines
    # [[[line1], [line2]], [[line1], [line2]]] etc.
    page = []
    line = []
    par = []
    for node in tree.iter():
        if node.tag.endswith('charParams'):
            line.append(node.text)

        elif node.tag.endswith('line') and line != []:
            par.append(line)
            line = []
        elif node.tag.endswith('par') and par != []:
            page.append(par)
            par = []
    return(page)

def page_blocks_to_text(page_blocks):
    # input: output of xml_to_page
    # output: page as plain text
    text_page = ""
    for par in page_blocks:
        text_par = ""
        for line in par:
            # apparently sometimes line happens to be [None], then...
            if None in line:
                line = [el for el in line if el != None]
            
            text_line = "".join(line).strip()
            text_par += " "
            text_par += text_line
        text_page += text_par
    text_page = text_page.replace("¬ ", "").replace("- ", "").replace("= ", "").replace("^ ", "").strip()
    text_page = text_page.replace(" ;", ";").replace(" ?", "?").replace("•", "")
    text_page = text_page.replace("    ", " ").replace("   ", " ").replace("  ", " ")
    
    return(text_page)


def clean_text(text: str) -> str:
    if not text:
        return ""

    # Unicode-Normalisierung (z.B. zusammengesetzte Zeichen)
    text = unicodedata.normalize("NFC", text)

    # typische OCR-Artefakte entfernen / glätten
    replacements = {
        "¬ ": "",   # Trennfahne
        "- ": "",   # Silbentrennung am Zeilenende
        "= ": "",   # OCR-Fehler
        "^ ": "",   # OCR-Fehler
        " ;": ";",
        " ?": "?",
        "•": "",
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Steuerzeichen entfernen, aber alle Buchstaben/Ziffern/Satzzeichen lassen
    cleaned_chars = []
    for ch in text:
        cat = unicodedata.category(ch)
        # 'C*' = Control, Surrogate, Unassigned, Private use
        if cat.startswith("C"):
            # Standard-Whitespace zu normalem Leerzeichen
            if ch in ("\n", "\r", "\t"):
                cleaned_chars.append(" ")
            # sonst wegwerfen
            continue
        cleaned_chars.append(ch)

    text = "".join(cleaned_chars)

    # Whitespace normalisieren
    text = WHITESPACE_RE.sub(" ", text).strip()

    return text

In [3]:
# 1 get structure of the current folder
# 2 parse each single file
# 2.1 while parsing, create
start = time.time()
if not os.path.isdir(root+'output/txt/'):
    os.makedirs(root+'output/txt/')
    
processed_journals = [j for j in os.listdir(root+'output/txt/')]


for journal in os.listdir(xml_root):
    
    if journal not in processed_journals:
    
        start_journal = time.time()
        print("Processing", journal, end=" ")
    
        path_struct = os.walk(xml_root+"xml/"+journal)
        for dirpath, dirs, files in path_struct:
            if files != []:
        
                # parse each file
                for file in files:
                    #print(file, end=" ")
                    tree = ET.parse(dirpath+"/"+file)
                    page = xml_to_page(tree)
                    page_text = page_blocks_to_text(page)
                    page_text.replace('"', '‟')
            
                    dirout = root + 'output/txt/' + dirpath.split(xml_root)[1] + '/'
                    fileout = file.split('.')[0] + '.txt'
                    
            
                    if not os.path.exists(dirout):
                        os.makedirs(dirout)
            
                    out = open(dirout+"/"+fileout, "w", encoding="utf-8")
                    out.write(page_text)
                    out.close()
    
        duration_journal = time.time()-start_journal
        print("Done in", round(duration_journal, 2))

print(time.time()-start)         
print("finished")

Processing 2710055 Done in 0.0
Processing 9616701 Done in 0.0
Processing 2580773 Done in 0.0
Processing 2895450 Done in 0.0
Processing 2727810 Done in 0.0
Processing 3129675 Done in 0.0
Processing 6492429 Done in 0.0
Processing 2438141 Done in 0.0
Processing 2827798 Done in 0.0
Processing 5438908 Done in 0.0
Processing 4782723 Done in 0.0
Processing 7506414 Done in 0.0
Processing 9054153 Done in 0.0
Processing 9366335 Done in 0.0
Processing 4889892 Done in 0.0
Processing 2425321 Done in 0.0
Processing 6492329 Done in 0.0
Processing 700673 Done in 0.0
Processing 4895413 Done in 0.0
Processing 2542917 Done in 0.0
Processing 9583346 Done in 0.0
Processing 2549643 Done in 0.0
Processing 10638461 Done in 0.0
Processing 3139318 Done in 0.0
Processing 9620169 Done in 0.0
Processing 9051331 Done in 0.0
Processing 4781937 Done in 0.0
Processing 4911569 Done in 0.0
Processing 2625941 Done in 0.0
Processing 2446951 Done in 0.0
Processing README.md Done in 0.0
Processing 9643693 Done in 0.0
Proces