# Transform "XML 1.1" to "Labelled Text"

There are three steps to achieve this:

* Extract data from the XML 1.1 Files
* Paste the reference strings & create labelled strings
* Write this data in a text file

## Import Packages & define functions

In [None]:
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
from xml.dom import minidom
import pandas as pd
import os
import fasttext
import re

In [None]:
def prettify(elem):
    """Render an ElementTree element as indented, human-readable XML.

    The element is serialized to UTF-8 bytes, re-read with ``minidom`` and
    returned as a string that uses one space of indentation per nesting level.
    """
    document = minidom.parseString(tostring(elem, encoding='utf-8'))
    return document.toprettyxml(indent=" ")

In [None]:
def listdir_path(d):
    """Return every entry of directory *d* (files and sub-directories) joined with *d*.

    Entries come back in the order ``os.listdir`` yields them.
    """
    return [os.path.join(d, entry) for entry in os.listdir(d)]

## Extract data

In [None]:
# Directory holding the manually annotated XML 1.1 files.
# Raw string: the Windows-style backslashes are literal path separators, not
# escape sequences (a normal literal here triggers invalid-escape warnings).
dir_ = r".\Sequence-Labeling-for-Reference-Parsing-of-Cyrillic-Script-Scholarly-Data\Real_annotated_data\manual_anno_xml1.1"

In [None]:
# Collect the full path of every entry found in dir_ and display the list.
# (A different input directory, e.g. "D:/manual_anno", can be assigned to
# dir_ beforehand.)
files = list(listdir_path(dir_))
files

In [None]:
# Namespaced element names looked up in each annotation file.
LABEL_TAG = "{http:///webanno/custom.ecore}Label"
SOFA_TAG = "{http:///uima/cas.ecore}Sofa"

# For every annotation file: locate the reference spans, wrap each labelled
# token of a reference in <label>...</label> tags and write one text file per
# paper with the labelled references separated by blank lines.
for file in files:
    # The paper id is taken from the file name itself (last path component,
    # whichever separator was used), not from a fixed position in the path.
    # The text before the first "." is kept and its first 8 characters are
    # dropped (presumably the "Core_ID_" prefix - confirm against file names).
    file_name = re.split(r"[\\/]", file)[-1]
    core_id = file_name.split(".")[0][8:]

    tree = ET.parse(file)
    root = tree.getroot()
    annotations = root.findall(LABEL_TAG)

    # --- Step 1: character spans [begin, end] of every reference -----------
    ref_span = []
    count_ref = 0  # number of references started
    ref = 0        # number of references closed
    for lt in annotations:
        label = lt.attrib["value"]
        if label == "ref_beg":
            if ref != count_ref:
                # The previous reference was never closed by a "ref_end":
                # close it where this "ref_beg" marker begins.
                ref += 1
                ref_span.append([beg, int(lt.attrib["begin"])])
            count_ref += 1
            # A reference starts right after its "ref_beg" marker.
            beg = int(lt.attrib["end"])
        elif label == "ref_end":
            # NOTE(review): a "ref_end" seen before any "ref_beg" reuses
            # whatever `beg` currently holds (or fails if it was never set);
            # confirm the annotation files never contain that ordering.
            ref += 1
            ref_span.append([beg, int(lt.attrib["end"])])

    # The document text is only needed (and only looked up) when at least one
    # reference span was found.
    sofa_string = root.findall(SOFA_TAG)[0].attrib["sofaString"] if ref_span else ""
    references_plain = [sofa_string[span[0]:span[1]] for span in ref_span]

    # --- Step 2: insert the label tags into every reference string ---------
    labelled_text = []
    for span, text in zip(ref_span, references_plain):
        add_len = 0  # characters added so far by the tags inserted into `text`
        for lt in annotations:
            label = lt.attrib["value"]
            lt_begin = int(lt.attrib["begin"])
            lt_end = int(lt.attrib["end"])
            # Skip the span markers and every label that ends outside the span.
            if label in ("ref_beg", "ref_end") or not span[0] <= lt_end <= span[1]:
                continue

            token = sofa_string[lt_begin:lt_end]
            open_tag = f"<{label}>"
            close_tag = f"</{label}>"
            # Offsets inside `text`, shifted by the tags inserted before.
            start = lt_begin - span[0] + add_len
            stop = lt_end - span[0] + add_len
            if token.endswith(" "):
                # Keep a trailing blank outside of the closing tag.
                # (endswith also copes with a zero-length token.)
                text = text[:start] + open_tag + token[:-1] + close_tag + " " + text[stop:]
            else:
                text = text[:start] + open_tag + token + close_tag + text[stop:]
            add_len += len(open_tag) + len(close_tag)

        # "issue" is exported as "number"; the idno_* tags are dropped while
        # their text is kept.
        text = text.replace("<issue>", "<number>").replace("</issue>", "</number>")
        text = text.replace("<idno_other>", "").replace("</idno_other>", "").replace("<idno_doi>", "").replace("</idno_doi>", "")
        # Undo hyphenation at line breaks, remove remaining line breaks,
        # collapse whitespace runs and drop a blank directly before a closing tag.
        text = re.sub(r"-\n\s+", "", text).replace("-\n", "").replace("\n", "")
        labelled_text.append(" ".join(text.split()).replace(" </", "</"))

    # Report papers that contain one of these two label names.
    if any("<volumen>" in ref_text or "<jo#>" in ref_text for ref_text in labelled_text):
        print(core_id)

    mydata = "\n\n".join(labelled_text)
    out_path = os.path.join(r".\Sequence-Labeling-for-Reference-Parsing-of-Cyrillic-Script-Scholarly-Data\Real_annotated_data\labelled_text_per_paper", core_id) + ".txt"
    # The context manager closes the file even if writing fails.
    with open(out_path, "w", encoding="utf-8") as myfile:
        myfile.write(mydata)