# Transform "XML 1.1" to "TEI"

There are two steps to achieve this:

* Extract the labelled tokens from the XML 1.1 files
* Write this data out in TEI format

## Import Packages & define functions

In [None]:
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from xml.dom import minidom
import pandas as pd
import os
import re

In [None]:
def listdir_path(d):
    """Return every entry of directory *d* (files and sub-directories alike),
    each one joined with *d* so it can be opened directly.
    """
    return [os.path.join(d, entry) for entry in os.listdir(d)]

In [None]:
def prettify(elem):
    """Serialize *elem* and return it as an indented XML string.

    The element is dumped to UTF-8 bytes, re-parsed with minidom and
    pretty-printed using one space per nesting level.
    """
    document = minidom.parseString(tostring(elem, 'utf-8'))
    return document.toprettyxml(indent=" ")

## Extract data

In [None]:
# Directory holding the manually annotated XML 1.1 files.
# (An earlier local setup used "D:/manual_anno" instead.)
# Raw string: the path contains backslashes that must not be read as escapes.
dir_ = r".\Sequence-Labeling-for-Reference-Parsing-of-Cyrillic-Script-Scholarly-Data\Real_annotated_data\manual_anno_xml1.1"

In [None]:
# Collect the paper id of every annotation file: the third "_"-separated
# field of the file name, taken before the first ".".
list_paper_id = []
for path in listdir_path(dir_):
    # Use the real file name; indexing the "\\"-split path at a fixed position
    # returns a directory name as soon as dir_ has more than one component.
    pdf = os.path.basename(str(path))
    core_id = pdf.split(".")[0].split("_")[2]
    list_paper_id.append(core_id)
list_paper_id

In [None]:
# Paths of all annotation files to convert.
files = list(listdir_path(dir_))
files

In [None]:
# Namespaced element names read from the XML 1.1 annotation files.
_LABEL_TAG = "{http:///webanno/custom.ecore}Label"
_SOFA_TAG = "{http:///uima/cas.ecore}Sofa"

# TEI template that is filled in for every paper.
# NOTE(review): the previous literal "\template.tei" contained an escaped TAB
# character and could not be opened; this assumes the template lives in the
# working directory -- TODO confirm the intended location.
_TEMPLATE_PATH = "template.tei"

# Directory the generated TEI files are written to.
_TEI_OUT_DIR = r".\Sequence-Labeling-for-Reference-Parsing-of-Cyrillic-Script-Scholarly-Data\Real_annotated_data\TEI"

# Labels that map one-to-one onto a child of <imprint> whose text is the token:
# label -> (tag, attributes).
_IMPRINT_FIELDS = {
    "volume": ("biblScope", {"unit": "volume"}),
    "issue": ("biblScope", {"unit": "issue"}),
    "idno_doi": ("idno", {"type": "DOI"}),
    "idno_other": ("idno", {"type": "other"}),
    "publisher": ("publisher", {}),
    "address": ("pubPlace", {}),
    "pages": ("biblScope", {"unit": "pages"}),
}


def _clean_token(raw):
    """Strip surrounding whitespace, halve double spaces once and drop newlines."""
    return str(raw).strip(" \t\n").replace("  ", " ").replace("\n", "")


def _split_into_sequences(labels_doc, tokens_doc):
    """Group the flat label/token stream into sequences.

    A "ref_beg" label closes the running sequence (its own token is dropped);
    a "ref_end" label is appended and then closes the sequence. Every other
    label is simply appended. Empty sequences are never stored.

    Returns (all_seq, all_label_seq, count_ref), where count_ref is the number
    of "ref_beg" labels seen.
    """
    all_seq, all_label_seq = [], []
    sequence, label_seq = [], []
    count_ref = 0
    for label, token in zip(labels_doc, tokens_doc):
        if label == "ref_beg":
            count_ref += 1
            if sequence:
                all_seq.append(sequence)
                all_label_seq.append(label_seq)
            sequence, label_seq = [], []
            continue
        sequence.append(token)
        label_seq.append(label)
        if label == "ref_end":
            all_seq.append(sequence)
            all_label_seq.append(label_seq)
            sequence, label_seq = [], []
    if sequence:
        all_seq.append(sequence)
        all_label_seq.append(label_seq)
    return all_seq, all_label_seq, count_ref


def _merge_journal(labels, text, start):
    """Return the journal string that begins at index *start*.

    If the token right after *start* is also labelled "journal", every later
    "journal" token of the sequence is appended: joined with a space, or glued
    on in place of a trailing "-". Otherwise only the token at *start* is used.
    """
    journal_str = text[start]
    nxt = start + 1
    # Bounds check: a journal token may be the last token of a reference.
    if nxt < len(labels) and labels[nxt] == "journal":
        for lab, tok in zip(labels[nxt:], text[nxt:]):
            if lab != "journal":
                continue
            if journal_str.endswith("-"):
                journal_str = journal_str[:-1] + tok
            else:
                journal_str = journal_str + " " + tok
    return journal_str


for path in files:
    print(path)
    # Paper id: third "_"-separated field of the file name before the first ".".
    core_id = os.path.basename(str(path)).split(".")[0].split("_")[2]

    # Read the XML 1.1 annotation.
    tree = ET.parse(path)
    root = tree.getroot()

    tokens_doc = []
    labels_doc = []
    sofa_text = None
    for lt in root.findall(_LABEL_TAG):
        if sofa_text is None:
            # The annotated text is the same for every label; look it up once.
            sofa_text = root.findall(_SOFA_TAG)[0].attrib["sofaString"]
        labels_doc.append(lt.attrib["value"])
        token = sofa_text[int(lt.attrib["begin"]):int(lt.attrib["end"])]
        tokens_doc.append(_clean_token(token))

    all_seq, all_label_seq, count_ref = _split_into_sequences(labels_doc, tokens_doc)
    print(count_ref)

    # Read a fresh copy of the TEI template; its entries are filled in below.
    tree = ET.parse(_TEMPLATE_PATH)
    template_root = tree.getroot()
    listBibl = template_root[1][0][0][0]
    titleStmt = template_root[0][0][0]

    bx = 0  # running number used for the xml:id of each <biblStruct>
    for i, (labels, text) in enumerate(zip(all_label_seq, all_seq)):
        # A sequence with any "doc*" label describes the paper itself.
        if any("doc" in lab for lab in labels):
            # Header sequences may lack a title; skip it instead of failing.
            if "doc_title" in labels:
                doc_title = SubElement(titleStmt, "title", {"level": "a", "type": "main"})
                doc_title.text = text[labels.index("doc_title")]

            doc_author = SubElement(titleStmt, "author")
            for lab, tok in zip(labels, text):
                if lab == "doc_author":
                    SubElement(doc_author, "persName").text = tok
            continue

        # Every other sequence is one bibliographic reference.
        biblStruct = SubElement(listBibl, "biblStruct", {"xml:id": "b" + str(bx)})
        bx += 1
        analytic = SubElement(biblStruct, "analytic")
        monogr = SubElement(biblStruct, "monogr")
        imprint = SubElement(monogr, "imprint")
        print(i)

        # One <author> per reference; each author token becomes a <persName>.
        author = None
        if any("author" in lab for lab in labels):
            author = SubElement(analytic, "author")

        title_done = False
        journal_done = False
        for j, (lab, tok) in enumerate(zip(labels, text)):
            if lab == "author":
                SubElement(author, "persName").text = tok

            elif lab == "title":
                # Only the first title token of a reference is written.
                if not title_done:
                    title = SubElement(analytic, "title", {"level": "a"})
                    title.text = tok.replace("\n", " ")
                    title_done = True

            elif lab == "journal":
                # Only the first journal occurrence creates the element.
                if not journal_done:
                    title_j = SubElement(monogr, "title", {"level": "j"})
                    title_j.text = _merge_journal(labels, text, j)
                    journal_done = True

            elif lab == "year":
                # The year is stored as an attribute; the element has no text.
                SubElement(imprint, "date", {"type": "published", "when": tok})

            elif lab in _IMPRINT_FIELDS:
                # Each such token gets its own element (e.g. one per "pages").
                tag, attrs = _IMPRINT_FIELDS[lab]
                SubElement(imprint, tag, dict(attrs)).text = tok

    # Drop the "ns0" namespace prefix from the serialized document.
    mydata = str(prettify(template_root)).replace("ns0:", "").replace(":ns0", "")
    out_path = os.path.join(_TEI_OUT_DIR, core_id + ".xml")
    with open(out_path, "w", encoding="utf-8") as myfile:
        myfile.write(mydata)