# XML Files to SOLR
This notebook reads the OCR XML files, converts each page to plain text (TXT), and serializes the result as JSON for indexing in Solr.

In [1]:
import os, json, pprint, time, re, unicodedata
from xml.etree import ElementTree as ET
# Base data directory. Later cells read from path + "xml/" and write to
# path + "output/solr/"; sub-paths are built by plain string concatenation,
# so the value has to end with "/".
path = "/data/cm/data/"

# Matches any run of whitespace; clean_text() uses it to collapse each run
# into a single space.
WHITESPACE_RE = re.compile(r"\s+")

In [2]:
def get_zeitschrift_struktur(path, zs_number):
    """Print the directory layout of one journal to stdout.

    Lists ``path + zs_number + "/" + zs_number + "/"``. Every entry found
    there is reported as a "Jahrgang"; every entry inside it is reported as a
    "Heft" together with the number of directory entries the Heft contains.

    Args:
        path: Base directory as a string. It is concatenated as-is, so it
            must already end with a path separator.
        zs_number: Journal identifier; used twice as a nested directory name.

    Returns:
        None. All results are printed.
    """
    print("Zeitschrift:", zs_number, end="\n\n")
    zs_root = path + zs_number + "/" + zs_number + "/"
    for volume in os.listdir(zs_root):
        print("Jahrgang:", volume)
        print("Hefte:")
        volume_dir = zs_root + volume + "/"
        for issue in os.listdir(volume_dir):
            # Name first, count second: the name is already on screen if
            # listing the issue directory fails.
            print(issue, end=", ")
            page_count = len(os.listdir(volume_dir + issue))
            print(page_count, "Seiten.")
    print()
    
def xml_to_page(tree):
    """Group the character text of one parsed page into paragraphs and lines.

    The tree is walked in document order. The ``text`` of every element whose
    tag ends in ``charParams`` is collected; elements whose tag ends in
    ``line`` or ``par`` mark the start of a new line or paragraph. Tags are
    matched by suffix, so namespace-qualified tags such as ``{uri}line`` match
    as well.

    Because a ``par``/``line`` element is visited *before* its own characters,
    pending content has to be closed both when the next boundary is reached
    and once more after the loop. Otherwise the last line of a paragraph
    would leak into the following paragraph and the final paragraph of the
    page would be dropped.

    Args:
        tree: Parsed XML offering ``iter()`` (an ``ElementTree`` or an
            ``Element``).

    Returns:
        A list of paragraphs; each paragraph is a list of lines; each line is
        a list of the collected ``text`` values, e.g.
        ``[[["a", "b"], ["c"]], [["d"]]]``. A value is ``None`` when the
        element has no text. Empty lines and paragraphs are omitted.
    """
    page = []
    par = []
    line = []

    for node in tree.iter():
        tag = node.tag
        if tag.endswith('charParams'):
            line.append(node.text)
        elif tag.endswith('line'):
            # A new line starts: close the previous one.
            if line:
                par.append(line)
                line = []
        elif tag.endswith('par'):
            # A new paragraph starts: the pending line still belongs to the
            # previous paragraph, so close the line first, then the paragraph.
            if line:
                par.append(line)
                line = []
            if par:
                page.append(par)
                par = []

    # Nothing follows the last line/paragraph, so close them explicitly.
    if line:
        par.append(line)
    if par:
        page.append(par)
    return page

def page_blocks_to_text(page_blocks):
    """Flatten the nested page structure into one cleaned plain-text string.

    Args:
        page_blocks: Output of ``xml_to_page``: a list of paragraphs, each a
            list of lines, each a list of character strings. ``None`` entries
            inside a line are ignored.

    Returns:
        The page as a single string. Lines are joined with a space, the
        artifact sequences listed below are removed or tightened, every run
        of whitespace is collapsed to one space and the result carries no
        leading or trailing whitespace. An empty page yields ``""``.
    """
    line_texts = []
    for par in page_blocks:
        for line in par:
            # A line may contain None where an element had no text.
            chars = [el for el in line if el is not None]
            line_texts.append("".join(chars).strip())
    text_page = " ".join(line_texts)

    # Order matters: the "<char> + space" patterns rely on the single space
    # that joins two lines (e.g. "Hal-" + " " + "lo" -> "Hallo").
    replacements = (
        ("¬ ", ""),
        ("- ", ""),
        ("= ", ""),
        ("^ ", ""),
        (" ;", ";"),
        (" ?", "?"),
        ("•", ""),
    )
    for old, new in replacements:
        text_page = text_page.replace(old, new)

    # Collapse whitespace and trim last, so that gaps opened by the removals
    # above (for instance a leading bullet) do not survive in the result.
    return " ".join(text_page.split())

def clean_text(text: str) -> str:
    """Normalize a text string and strip typical OCR artifacts.

    Steps, in order:
      1. NFC Unicode normalization.
      2. Newline, carriage return and tab become a space; every other
         character whose Unicode category starts with "C" (control, format,
         surrogate, private use, unassigned) is dropped.
      3. Whitespace runs are collapsed to a single space.
      4. Artifact sequences are removed or tightened (see ``replacements``).
      5. Whitespace is collapsed again and the result is trimmed.

    Steps 2 and 3 run before step 4 on purpose: the artifact patterns contain
    a literal space, so a hyphen followed by a newline ("Wort-\\nteil") only
    matches once the newline has been turned into a space.

    Args:
        text: Input string; ``None`` or ``""`` is accepted.

    Returns:
        The cleaned string, or ``""`` for falsy input.
    """
    if not text:
        return ""

    # Unicode normalization (e.g. composed characters)
    text = unicodedata.normalize("NFC", text)

    # Remove control characters but keep all letters/digits/punctuation.
    kept_chars = []
    for ch in text:
        if ch in ("\n", "\r", "\t"):
            # standard whitespace becomes a plain space
            kept_chars.append(" ")
        elif not unicodedata.category(ch).startswith("C"):
            kept_chars.append(ch)
        # any other 'C*' character is discarded
    text = WHITESPACE_RE.sub(" ", "".join(kept_chars))

    # Remove / smooth typical OCR artifacts.
    replacements = (
        ("¬ ", ""),   # line-break marker
        ("- ", ""),   # hyphenation at the end of a line
        ("= ", ""),   # OCR error
        ("^ ", ""),   # OCR error
        (" ;", ";"),
        (" ?", "?"),
        ("•", ""),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Normalize whitespace once more: removals can leave double spaces.
    return WHITESPACE_RE.sub(" ", text).strip()

In [3]:
# Walk path + "xml/" as Zeitschrift / Jahrgang / Heft / Seite, convert every
# page XML file to plain text and collect one record per page in `output`.
# The serialization cell reads `output` and expects the keys
# "zeitschrift", "jahrgang", "heft", "seite" and "text" on every record.

current_page = []           # every page entry name seen so far
notadirectory = []          # NotADirectoryError instances raised while walking
isadirectory = []           # IsADirectoryError instances (page entry is a directory)
start = time.time()
output = []                 # one dict per successfully converted page
processed_journals = set()  # journals already handled in this run

xml_root = path + "xml/"

for zeitschrift in os.listdir(xml_root):

    print("Processing now", zeitschrift, end=" ")

    if zeitschrift in processed_journals:
        continue

    zeitschrift_dir = xml_root + zeitschrift + "/"
    if os.path.isdir(zeitschrift_dir):
        for jahrgang in os.listdir(zeitschrift_dir):
            jahrgang_dir = zeitschrift_dir + jahrgang + "/"

            try:
                if not os.path.isdir(jahrgang_dir):
                    continue

                for heft in os.listdir(jahrgang_dir):
                    heft_dir = jahrgang_dir + heft + "/"

                    try:
                        if not os.path.isdir(heft_dir):
                            continue

                        for seite in os.listdir(heft_dir):
                            current_page.append(seite)

                            try:
                                tree = ET.parse(heft_dir + seite)
                            except IsADirectoryError as iade:
                                # The entry is a directory, not a page file:
                                # record it and go on with the next entry.
                                isadirectory.append(iade)
                                continue

                            page = xml_to_page(tree)
                            page_text = page_blocks_to_text(page)
                            # str.replace returns a new string, so the result
                            # has to be assigned back to take effect.
                            page_text = page_text.replace('"', '‟')

                            # NOTE(review): "seite" holds the page file name so
                            # that the Solr id built from these fields does not
                            # depend on directory listing order - confirm this
                            # is the intended id component.
                            output.append({
                                "zeitschrift": zeitschrift,
                                "jahrgang": jahrgang,
                                "heft": heft,
                                "seite": seite,
                                "text": page_text,
                            })

                    except NotADirectoryError as nade:
                        notadirectory.append(nade)
            except NotADirectoryError as nade:
                notadirectory.append(nade)

    # Both counters are cumulative over all journals processed so far.
    print(zeitschrift, len(notadirectory), len(isadirectory))
    processed_journals.add(zeitschrift)

running_time = time.time()-start
print("Done in", running_time)

Processing now 2710055 2710055 0 0
Processing now 9616701 9616701 0 0
Processing now 2580773 2580773 0 0
Processing now 2895450 2895450 0 1413
Processing now 2727810 2727810 0 2270
Processing now 3129675 3129675 0 2270
Processing now 6492429 6492429 0 2270
Processing now 2438141 2438141 0 2270
Processing now 2827798 

KeyboardInterrupt: 

In [None]:
# Serialize the collected page records to JSON Lines for Solr: one JSON
# object per line, holding the fields "id" and "text".

def _solr_line(elem):
    """Return the JSON string of the Solr document for one page record.

    The id is built from the record's "zeitschrift", "jahrgang", "heft" and
    "seite" values joined by underscores; non-ASCII characters are written
    unescaped.
    """
    doc_id = f"{elem['zeitschrift']}_{elem['jahrgang']}_{elem['heft']}_{elem['seite']}"
    return json.dumps({"id": doc_id, "text": elem["text"]}, ensure_ascii=False)


solr_lines = [_solr_line(elem) for elem in output]

# Write all documents in one go, separated by newlines (no trailing newline).
output_file = path + "output/solr/solr_data.jsonl"
with open(output_file, "w", encoding="utf-8") as write_file:
    write_file.write("\n".join(solr_lines))

print(f"Gespeichert: {output_file} ({len(solr_lines)} Dokumente)")
