# XML to TXT

This script generates plain-text (TXT) files from XML files, writing one TXT file per XML input file.

In [1]:
import os, json, pprint, time
from xml.etree import ElementTree as ET
# Base data directory (must end with "/"): XML input is read from root + "xml/"
# and the text output is written below root + "output/text/".
root = "/data/cm/"

In [2]:
def get_zeitschrift_struktur(path, zs_number):
    """Print the directory layout of one journal ("Zeitschrift").

    The directory ``path + zs_number + "/" + zs_number + "/"`` is listed.
    Every entry in it is reported as a "Jahrgang", every entry below that
    as a "Heft", followed by the number of directory entries the Heft
    contains ("Seiten").

    path: directory prefix; it is concatenated as-is, so it has to carry
        its own trailing "/".
    zs_number: journal identifier (directory name) as a string.

    Returns None; all results are written to stdout.
    """
    print("Zeitschrift:", zs_number, end="\n\n")
    journal_dir = path + zs_number + "/" + zs_number + "/"
    for jahrgang in os.listdir(journal_dir):
        print("Jahrgang:", jahrgang)
        print("Hefte:")
        volume_dir = journal_dir + jahrgang + "/"
        for heft in os.listdir(volume_dir):
            # The name is printed before the issue folder is listed, so it
            # is still visible if listing that folder fails.
            print(heft, end=", ")
            print(len(os.listdir(volume_dir + heft)), "Seiten.")
    print()
    
def xml_to_page(tree):
    """Collect the characters of a parsed page into paragraphs and lines.

    input: parsed XML of one page (anything offering ``.iter()``, i.e. an
        ElementTree or an Element).
    output: page --> paragraphs --> lines --> characters, e.g.
        [[[c, c, ...], [c, ...]], [[c, ...]]]

    Elements are matched by tag suffix ('charParams', 'line', 'par'), so
    namespaced tags such as '{ns}line' are recognised as well. The text of
    every 'charParams' element is stored as-is, which means an entry is
    None when the element has no text. Lines without characters and
    paragraphs without lines are left out.
    """
    page = []
    par = []
    line = []
    for node in tree.iter():
        tag = node.tag
        if tag.endswith('charParams'):
            line.append(node.text)
        elif tag.endswith('line') or tag.endswith('par'):
            # iter() walks in document order, so a 'line'/'par' element is
            # seen BEFORE its own characters. Reaching one therefore means
            # the previously collected line is complete.
            if line:
                par.append(line)
                line = []
            # A new 'par' additionally closes the previous paragraph. The
            # pending line was flushed first so it stays in the paragraph
            # it belongs to.
            if tag.endswith('par') and par:
                page.append(par)
                par = []
    # No further element follows the last line/paragraph, so they have to be
    # flushed explicitly; otherwise the end of every page is lost.
    if line:
        par.append(line)
    if par:
        page.append(par)
    return page

def page_blocks_to_text(page_blocks):
    """Flatten the nested page structure into one cleaned-up string.

    input: output of xml_to_page (paragraphs --> lines --> characters).
    output: the page as plain text.

    Every line is joined, stripped and prefixed with a single space; all
    lines of all paragraphs are then concatenated and passed through a
    fixed sequence of textual replacements.
    """
    pieces = []
    for paragraph in page_blocks:
        for chars in paragraph:
            # A line may contain None entries; they are skipped.
            joined = "".join(ch for ch in chars if ch is not None)
            pieces.append(" " + joined.strip())
    text = "".join(pieces)

    # Remove "<marker> " sequences (presumably line-break hyphenation left
    # behind by joining lines with a space -- verify against the data).
    for marker in ("¬ ", "- ", "= ", "^ "):
        text = text.replace(marker, "")
    text = text.strip()

    # Pull ';' and '?' up to the preceding word and drop bullet characters.
    for old, new in ((" ;", ";"), (" ?", "?"), ("•", "")):
        text = text.replace(old, new)

    # Shrink runs of spaces: longest pattern first, one pass per pattern.
    for run in ("    ", "   ", "  "):
        text = text.replace(run, " ")

    return text

def clean_text(text):
    """Apply the fixed clean-up replacements to a string and return the result.

    Steps, in this order:
      1. remove the sequences "¬ ", "- ", "= " and "^ ",
      2. strip leading/trailing whitespace,
      3. turn " ;" into ";" and " ?" into "?", and delete "•",
      4. replace runs of four, then three, then two spaces by a single
         space (one pass per pattern).
    """
    removals = ("¬ ", "- ", "= ", "^ ")
    rewrites = (
        (" ;", ";"),
        (" ?", "?"),
        ("•", ""),
        ("    ", " "),
        ("   ", " "),
        ("  ", " "),
    )

    cleaned = text
    for marker in removals:
        cleaned = cleaned.replace(marker, "")
    cleaned = cleaned.strip()
    for old, new in rewrites:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [3]:
# 1 get the structure of the current journal folder
# 2 parse each single XML file
# 2.1 while parsing, create the mirrored output folder and write the page text as .txt
start = time.time()

xml_root = root + "xml/"
text_root = root + "output/text/"

# Make sure the output root exists so that a first run does not fail on listdir.
os.makedirs(text_root, exist_ok=True)

# A journal counts as processed as soon as its output folder exists.
# NOTE(review): a run that was interrupted half-way leaves such a folder
# behind, so the remaining pages of that journal are skipped on the next run.
processed_journals = set(os.listdir(text_root))

for journal in os.listdir(xml_root):
    if journal in processed_journals:
        continue

    start_journal = time.time()
    print("Processing", journal, end=" ")

    for dirpath, dirs, files in os.walk(xml_root + journal):
        if not files:
            continue

        # Mirror the input sub-path below the output root. Derived from
        # `root` instead of a hard-coded '/data/cm/xml/' prefix so that the
        # script keeps working when `root` is changed.
        dirout = os.path.join(text_root, os.path.relpath(dirpath, xml_root))
        os.makedirs(dirout, exist_ok=True)

        # parse each file
        for file in files:
            tree = ET.parse(os.path.join(dirpath, file))
            page = xml_to_page(tree)
            page_text = page_blocks_to_text(page)
            # str.replace returns a new string; the result has to be
            # assigned, otherwise the replacement has no effect.
            page_text = page_text.replace('"', '‟')

            # NOTE(review): everything after the FIRST dot is dropped, so
            # "a.b.xml" and "a.c.xml" would both be written to "a.txt".
            fileout = file.split('.')[0] + '.txt'

            # `with` closes the file even if writing fails.
            with open(os.path.join(dirout, fileout), "w", encoding="utf-8") as out:
                out.write(page_text)

    duration_journal = time.time() - start_journal
    print("Done in", round(duration_journal, 2))

print(time.time() - start)

Processing 2425679 Done in 187.11
Processing 2254914 Done in 1.6
Processing 8555529 Done in 204.95
Processing 177657 Done in 0.77
Processing 2642950 Done in 29.07
Processing 6492301 Done in 6.44
Processing 4884143 Done in 23.81
Processing 5959619 Done in 2.93
Processing 2259275 Done in 137.99
Processing 4908657 Done in 5.65
Processing 9620162 Done in 31.88
Processing 9221535 Done in 12.06
Processing 9643684 Done in 38.09
Processing 1304829 Done in 256.82
Processing 9607246 Done in 5.78
Processing 6492399 Done in 34.3
Processing 3111067 Done in 144.69
Processing 2420797 Done in 3.31
Processing 4804947 Done in 1.29
Processing 2609281 Done in 130.22
Processing 3122061 Done in 125.09
Processing 2424658 Done in 72.85
Processing 4926451 Done in 57.61
Processing 2571117 Done in 47.02
Processing 2708691 Done in 10.44
Processing 8812431 Done in 0.38
Processing 9215017 Done in 3.62
Processing 2420973 Done in 72.6
Processing 2550047 Done in 26.45
Processing 4926100 Done in 8.96
Processing 3138070