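Step 1: run `pip install classla` to install [classla](https://github.com/clarinsi/classla). The later cells also import `lxml`, `tqdm` and `cyrtranslit`; install them as well if they are missing (`pip install lxml tqdm cyrtranslit`).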

Step 2: download standard models and initialize pipelines

In [None]:
import classla

# For each language (Slovenian first, then Serbian): download the standard
# models, then build the pipeline from them.
loaded_pipelines = {}
for lang_code in ("sl", "sr"):
    classla.download(lang_code)
    loaded_pipelines[lang_code] = classla.Pipeline(lang_code)

sl_nlp = loaded_pipelines["sl"]  # Slovenian
sr_nlp = loaded_pipelines["sr"]  # Serbian

Step 3: specify the input directories and match each of them with its corresponding pipeline; then check that the files are well-formed and correct them if needed

In [None]:
from xml.sax.handler import ContentHandler
from xml.sax import make_parser
from lxml import etree
from pathlib import Path
from tqdm import tqdm

# Maps each input directory to the pipeline used to annotate its files.
input_data = {
    "source_sl": sl_nlp,
    "source_sr": sr_nlp,
}

def parsefile(file):
    """Run `file` through a SAX parser that uses a no-op content handler.

    Nothing is returned; the only observable effect is that the parser
    raises when the document cannot be parsed.
    """
    sax_parser = make_parser()
    handler = ContentHandler()
    sax_parser.setContentHandler(handler)
    sax_parser.parse(file)


# Report, for every regular file in each input directory, whether lxml can
# parse it as XML.
for directory in input_data.keys():
    for filename in Path(directory).iterdir():
        # Only regular files can be XML documents; sub-directories (for
        # example notebook checkpoint folders) are skipped instead of being
        # reported as malformed.
        if not filename.is_file():
            continue
        try:
            etree.parse(filename)
            print(f"✅ File {filename} is well-formed.")
        except Exception as e:
            # Deliberately broad: any failure is reported and the scan
            # continues with the next file.
            print(f"❌ File {filename} is NOT well-formed!\n{e}")

Step 4: process the files (this cell can be run as is, no changes required); the annotated files are written to `annotated/<input directory>/`

In [None]:
import cyrtranslit

# Token fields whose (tidied) value append_gr adds to the "gr" attribute.
known_keys = [
    "feats",
]
# Token fields that append_gr skips without any diagnostic output.
keys_to_ignore = [
    "id",
    "text",
    "lemma",
    "upos",
    "xpos",
    "head",
    "ner",
    "deprel",
]

def add_space_after(token_dict):
    """Tell whether a space should follow the token.

    Returns False only when the token has a "misc" entry that contains
    "SpaceAfter=No"; in every other case (including no "misc" entry at all)
    returns True.
    """
    if "misc" not in token_dict:
        return True
    return "SpaceAfter=No" not in token_dict["misc"]

def tidy(property_string):
    """Convert a "|"-separated "Key=Value" feature string into ",key=value" form.

    The input is lower-cased and split on "|"; each piece contributes the text
    before its first "=" as key and the text after its last "=" as value.
    When both "gender" and "number" are present they are merged into a single
    entry keyed by the gender value and holding the number value. Entries
    whose key equals their value (for instance pieces without "=") are left
    out. Every remaining entry is emitted as ",key=value"; the result is an
    empty string when nothing remains.
    """
    props = {}
    for pair in property_string.lower().split("|"):
        pieces = pair.split("=")
        props[pieces[0]] = pieces[-1]

    if "gender" in props and "number" in props:
        # Assign first, delete afterwards: same order of operations as a
        # plain "set, then pop both" sequence.
        props[props["gender"]] = props["number"]
        del props["gender"]
        del props["number"]

    return "".join(
        f",{key}={value}" for key, value in props.items() if key != value
    )

def append_gr(ana_node, token_dict):
    """Extend the "gr" attribute of `ana_node` with the token's tidied fields.

    Keys listed in `keys_to_ignore` are skipped. For keys listed in
    `known_keys` the lower-cased value is passed through `tidy` and appended
    to `ana_node.attrib["gr"]`. Any other key is reported with `print`,
    except that nothing is printed for the "misc" key or for a value that is
    exactly "SpaceAfter=No".
    """
    for key, value in token_dict.items():
        if key in keys_to_ignore:
            continue
        if key in known_keys:
            ana_node.attrib["gr"] += tidy(value.lower())
        elif key != "misc" and value != "SpaceAfter=No":
            # NOTE(review): as written this silences every "misc" value, not
            # only "SpaceAfter=No" - confirm that this is intended.
            print(f"{key} :: {value} in {token_dict}")

def append_text(root, text):
    """Append `text` at the current end of `root`'s content.

    When `root` has children, the text is added to the tail of the last
    child; otherwise it is added to `root.text`. A missing (None) tail/text
    is treated as empty.

    Args:
        root: element whose content is extended (modified in place).
        text: string to append.
    """
    # len(root) / root[-1] replace the deprecated getchildren(), which also
    # copied the whole child list on every call.
    if len(root) > 0:
        last = root[-1]
        last.tail = (last.tail or "") + text
    else:
        root.text = (root.text or "") + text

def last_child(node):
    """Return the last direct child of `node`.

    Raises:
        IndexError: if `node` has no children.
    """
    # Index the element directly instead of building a list with the
    # deprecated getchildren().
    return node[-1]

# Annotate every file of every input directory with the pipeline assigned to
# that directory and write the result to annotated/<directory>/<file name>.
for directory, pipeline in input_data.items():
    # One idempotent call creates "annotated" and the per-directory folder
    # (also when the input directory is nested).
    output_dir = Path("annotated").joinpath(directory)
    output_dir.mkdir(parents=True, exist_ok=True)

    for filename in Path(directory).iterdir():
        # Sub-directories and other non-file entries cannot be parsed as XML.
        if not filename.is_file():
            continue
        parsed_file = etree.parse(filename)
        bar_format = "File: " + filename.name + " |{bar}| {percentage:3.0f}% {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
        # The first two children of <body> are skipped
        # (presumably header material rather than sentence pairs - TODO confirm).
        for para in tqdm(parsed_file.find("body")[2:], bar_format=bar_format):
            if para.tag != "para":
                print(f"Incorrect sentence pair tag: {para.tag}")
            # Sentences are replaced inside the loop, so iterate over a snapshot
            # of the children rather than over the element being modified.
            for sentence in list(para):
                if sentence.tag != "se":
                    print(f"Incorrect sentence tag: {sentence.tag}")
                # Russian sentences and sentences without text stay untouched.
                if sentence.get("lang") == "rus" or sentence.text is None:
                    continue
                tokenized_sentence = etree.Element("se", attrib={"lang": sentence.get("lang")})
                if sentence.get("lang") == "srp":
                    # Serbian text is transliterated to Latin script before parsing.
                    parsed_sentence = pipeline(cyrtranslit.to_latin(sentence.text, "sr"))
                else:
                    parsed_sentence = pipeline(sentence.text)
                for token in parsed_sentence.iter_tokens():
                    token_dict = token.to_dict()[0]
                    if token_dict["upos"] == "PUNCT":
                        # Punctuation is kept as plain text between <w> elements.
                        append_text(tokenized_sentence, token.text)
                    else:
                        # <w><ana lex=... gr=.../>surface form</w>
                        word_token = etree.SubElement(tokenized_sentence, "w")
                        ana = etree.SubElement(word_token, "ana", attrib={"lex": token_dict["lemma"], "gr": token_dict["upos"]})
                        ana.tail = token_dict["text"]
                        append_gr(ana, token_dict)
                    if add_space_after(token_dict):
                        append_text(tokenized_sentence, " ")

                # lxml moves an element's tail together with the element, so
                # carry it over; otherwise the text/whitespace that followed
                # the original sentence would be dropped from the document.
                tokenized_sentence.tail = sentence.tail
                sentence.getparent().replace(sentence, tokenized_sentence)

        with open(output_dir.joinpath(filename.name), "bw") as annotated_file:
            annotated_file.write(etree.tostring(parsed_file, encoding="utf-8"))

Step 5: check well-formedness of the results

In [None]:
# Re-parse every annotated file and report whether it is well-formed XML.
for directory in input_data:
    annotated_dir = Path("annotated") / directory
    for filename in annotated_dir.iterdir():
        if filename.is_file():
            try:
                etree.parse(filename)
            except Exception as e:
                print(f"❌ File {filename} is NOT well-formed!\n{e}")
            else:
                print(f"✅ File {filename} is well-formed.")