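Step 1: run `pip install classla` to install [classla](https://github.com/clarinsi/classla). The cells below also import `lxml`, `tqdm` and `cyrtranslit`, so install those packages as well if they are missing.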

Step 2: download standard models and initialize pipelines

In [None]:
import classla

def _load_pipeline(lang_code):
    """Download the standard classla models for ``lang_code`` and return a pipeline for it."""
    classla.download(lang_code)
    return classla.Pipeline(lang_code)


# Slovenian pipeline
sl_nlp = _load_pipeline("sl")

# Serbian pipeline
sr_nlp = _load_pipeline("sr")

Step 3: specify the input directories and match each of them with the corresponding pipeline; check that the files are well-formed XML and correct them if needed

In [None]:
from xml.sax.handler import ContentHandler
from xml.sax import make_parser
from lxml import etree
from pathlib import Path
from tqdm import tqdm

# Input directory -> pipeline used to annotate the files found in it.
# Edit the directory names here to point at your own data.
input_data = {
    "source_sl": sl_nlp,
    "source_sr": sr_nlp,
}

def parsefile(file):
    """Run ``file`` through a SAX parser that uses a do-nothing content handler.

    Nothing is returned; whatever the SAX parser raises while reading the
    input propagates to the caller.
    """
    sax_reader = make_parser()
    # The base ContentHandler ignores every event, so the document is only read.
    noop_handler = ContentHandler()
    sax_reader.setContentHandler(noop_handler)
    sax_reader.parse(file)


# Try to parse every regular file of each input directory and report the outcome.
for source_dir in input_data:
    regular_files = (entry for entry in Path(source_dir).iterdir() if entry.is_file())
    for xml_path in regular_files:
        try:
            etree.parse(xml_path)
            print(f"✅ File {xml_path} is well-formed.")
        except Exception as parse_error:
            # Report the problem and carry on with the next file.
            print(f"❌ File {xml_path} is NOT well-formed!\n{parse_error}")

Step 4: process files (no changes required)

In [6]:
# Mapping from UD-style tags to the NCRL tag set.
# Upper-case keys are UPOS tags; lower-case keys are `feature=value` pairs
# (feature strings are lower-cased before they are looked up here).
# A value of None means the feature is recognised but left out of the output.
to_ncrl_format = {
    "ADJ": "A",
    "ADP": "PR",
    "ADV": "ADV",
    "AUX": "V,aux",
    "CCONJ": "CONJ,coord",
    "DET": "APRO",
    "INTJ": "INTJ",
    "NOUN": "S",
    "NUM": "NUM",
    "PART": "PART",
    "PRON": "SPRO",
    "PROPN": "S,propn",
    "SCONJ": "CONJ,subord",
    "VERB": "V",
    "X": "NONLEX",
    "animacy=anim": "anim",
    "animacy=inan": "inan",
    "aspect=imp": "ipf",
    "aspect=perf": "pf",
    "case=acc": "acc",
    "case=dat": "dat",
    "case=gen": "gen",
    "case=ins": "ins",
    "case=loc": "loc",
    "case=nom": "nom",
    "case=voc": "voc",
    "definite=def": "def",
    "definite=ind": "indef",
    "degree=cmp": "comp",
    "degree=pos": None,
    "degree=sup": "supr",
    "foreign=yes": None,
    "gender=fem": "f",
    "gender=masc": "m",
    "gender=neut": "n",
    "gender[psor]=fem": "poss:f",
    "gender[psor]=masc": "poss:m",
    "gender[psor]=masc,neut": "poss:m/n",
    "number[psor]=dual": "poss:du",
    "number[psor]=plur": "poss:pl",
    "number[psor]=sing": "poss:sg",
    "mood=cnd": "cond",
    "mood=imp": "imper",
    "mood=ind": "indic",
    "number=dual": "du",
    "number=plur": "pl",
    "number=sing": "sg",
    "numform=digit": "ciph",
    "numform=mult": "coll",
    "numform=word": None,
    "numtype=card": None,
    "numtype=mult": "coll",
    "numtype=ord": None,
    "person=1": "1p",
    "person=2": "2p",
    "person=3": "3p",
    "polarity=neg": "neg",
    "polarity=pos": "pos",
    "poss=yes": "poss",
    "prontype=dem": "dem",
    "prontype=ind": "ind",
    "prontype=int,rel": "interr,rel",
    "prontype=int": "interr",
    "prontype=neg": "neg",
    "prontype=prs": "pers",
    "prontype=rel": "rel",
    "prontype=tot": "tot",
    "reflex=yes": "refl",
    "tense=fut": "fut",
    "tense=past": "praet",
    "tense=pres": "praes",
    "variant=bound": "pr-pro",
    "variant=short": "brev",
    "verbform=conv": "ger",
    "verbform=fin": None,
    "verbform=inf": "inf",
    "verbform=part": "partcp",
    "verbform=sup": "sup",
    "voice=act": "act",
    "voice=pass": "pass"
}

# When the key feature is seen, the attribute named by the value is removed
# from (or kept out of) the attributes collected for the same token.
if_present_delete = {
    "gender[psor]=masc": "neut"
}

def map_to_ncrl(pos, attrs):
    """Translate a UPOS tag and a feature string into NCRL notation.

    Args:
        pos: UPOS tag of the token (e.g. ``"NOUN"``).
        attrs: ``|``-separated ``Feature=Value`` pairs; case is ignored.
            An empty string or None means "no features".

    Returns:
        A ``(pos, attrs)`` tuple: the POS tag translated through
        ``to_ncrl_format`` (tags without an entry are returned unchanged) and
        the translated attributes joined with ``,`` in the order in which they
        were first seen, without duplicates.

    Features missing from ``to_ncrl_format`` are reported on stdout and passed
    through unchanged.
    """
    res_pos = pos
    # dict used as an insertion-ordered set: keeps the output deterministic
    # (a plain set would make the attribute order vary between runs).
    res_attrs = {}
    # Attributes that must not be emitted for *this* token. Kept local so that
    # the shared to_ncrl_format table is never modified by a call.
    suppressed = set()
    for attr_pair_str in (attrs or "").lower().split("|"):
        if not attr_pair_str:
            # Nothing to map (empty feature string or stray separator).
            continue
        # Ordinals are marked with NumType=Ord; the original "numform=ord"
        # spelling is still accepted for backward compatibility.
        if attr_pair_str in ("numtype=ord", "numform=ord") and res_pos == "ADJ":
            res_pos = "ANUM"
        elif attr_pair_str == "foreign=yes":
            res_pos = "NONLEX"
        elif attr_pair_str in if_present_delete:
            unwanted = if_present_delete[attr_pair_str]
            if unwanted in res_attrs:
                del res_attrs[unwanted]
            else:
                suppressed.add(unwanted)
        if attr_pair_str in suppressed:
            continue
        if attr_pair_str not in to_ncrl_format:
            print(f"unmapped attribute: {attr_pair_str}")
            res_attrs[attr_pair_str] = None
        elif to_ncrl_format[attr_pair_str] is not None:
            res_attrs[to_ncrl_format[attr_pair_str]] = None
    # Translate the POS tag last: the checks above compare against UPOS tags,
    # while "ANUM" and "NONLEX" have no entry and are therefore left as they are.
    return to_ncrl_format.get(res_pos, res_pos), ",".join(res_attrs)

In [None]:
import cyrtranslit

# Token fields whose value is translated by map_to_ncrl().
known_keys = [
    "feats",
]
# Token fields that annotate() skips without reporting them.
keys_to_ignore = [
    "id",
    "text",
    "lemma",
    "upos",
    "xpos",
    "head",
    "ner",
    "deprel",
]

def add_space_after(token_dict):
    """Return False only if the token's ``misc`` field contains ``SpaceAfter=No``.

    Tokens without a ``misc`` field are followed by a space.
    """
    if "misc" not in token_dict:
        return True
    return "SpaceAfter=No" not in token_dict["misc"]

def annotate(ana_node, token_dict):
    """Store the grammatical description of a token in ``ana_node.attrib["gr"]``.

    Args:
        ana_node: element whose ``gr`` attribute is set.
        token_dict: token fields as a dict; ``upos`` is read directly, fields
            listed in ``known_keys`` are translated with ``map_to_ncrl`` and
            fields listed in ``keys_to_ignore`` are skipped.

    The attribute is ``"<pos>,<attrs>"`` when the token has attributes and just
    ``"<pos>"`` when it has none.
    """
    upos = token_dict.get("upos", "")
    mapped = None
    for key, value in token_dict.items():
        if key in keys_to_ignore:
            continue
        if key in known_keys:
            mapped = map_to_ncrl(upos, value)
        elif key != "misc" and value != "SpaceAfter=No":
            # Report token fields this notebook does not handle.
            print(f"{key} :: {value} in {token_dict}")
    if mapped is None:
        # Tokens without features still need their POS tag; route it through
        # map_to_ncrl so all tokens are tagged in the same way.
        mapped = map_to_ncrl(upos, "")
    pos, attrs = mapped
    # Only add the separator when there is something to put after it.
    ana_node.attrib["gr"] = f"{pos},{attrs}" if attrs else pos

def append_text(root, text):
    """Append ``text`` at the very end of ``root``'s content.

    If ``root`` has child elements, the text is added to the tail of the last
    child; otherwise it is added to ``root.text``.

    Args:
        root: element to extend (modified in place).
        text: string to append.
    """
    # len()/indexing replace the deprecated getchildren(), which also built a
    # new list of all children on every call.
    if len(root) > 0:
        node = root[-1]
        node.tail = (node.tail or "") + text
    else:
        root.text = (root.text or "") + text

def last_child(node):
    """Return the last child element of ``node``.

    Raises:
        IndexError: if ``node`` has no children.
    """
    # Direct indexing instead of the deprecated getchildren().
    return node[-1]

# Annotate every file of every input directory with its pipeline and write the
# result to annotated/<directory>/<file name>.
for directory, pipeline in input_data.items():
    output_dir = Path("annotated").joinpath(directory)
    # Create the output directory up front (also for empty input directories)
    # and tolerate re-runs of this cell.
    output_dir.mkdir(parents=True, exist_ok=True)
    for filename in Path(directory).iterdir():
        if not filename.is_file():
            # Same rule as the well-formedness checks: only regular files are processed.
            continue
        parsed_file = etree.parse(filename)
        body = parsed_file.find("body")
        if body is None:
            print(f"No <body> element found in {filename}, file skipped.")
            continue
        bar_format = "File: " + filename.name + " |{bar}| {percentage:3.0f}% {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
        for para in tqdm(body, bar_format=bar_format):
            if para.tag != "para":
                print(f"Incorrect sentence pair tag: {para.tag}")
            # Iterate over a snapshot: sentences are replaced inside the loop.
            for sentence in list(para):
                if sentence.tag != "se":
                    print(f"Incorrect sentence tag: {sentence.tag}")
                lang = sentence.get("lang")
                if lang == "rus" or sentence.text is None:
                    continue
                sentence_text = sentence.text
                if lang == "srp":
                    # Sentences marked "srp" are transliterated to Latin script before parsing.
                    sentence_text = cyrtranslit.to_latin(sentence_text, "sr")
                parsed_sentence = pipeline(sentence_text)
                # Convert each token once; the dict is needed for the check and the markup.
                tokens = [(token, token.to_dict()[0]) for token in parsed_sentence.iter_tokens()]
                if all(token_dict["upos"] in ["PUNCT", "X"] for _, token_dict in tokens):
                    # no markup required for non-textual sentences
                    continue
                tokenized_sentence = etree.Element("se", attrib={"lang": lang})
                for token, token_dict in tokens:
                    if token_dict["upos"] == "PUNCT":
                        append_text(tokenized_sentence, token.text)
                    else:
                        word_token = etree.SubElement(tokenized_sentence, "w")
                        ana = etree.SubElement(word_token, "ana", attrib={"lex": token_dict["lemma"]})
                        # The word form follows its <ana> element as tail text.
                        ana.tail = token_dict["text"]
                        annotate(ana, token_dict)
                    if add_space_after(token_dict):
                        append_text(tokenized_sentence, " ")

                # replace() takes the old element's tail away with it; carry it
                # over so the text following the sentence is kept.
                tokenized_sentence.tail = sentence.tail
                sentence.getparent().replace(sentence, tokenized_sentence)

        with open(output_dir.joinpath(filename.name), "bw") as annotated_file:
            annotated_file.write(etree.tostring(parsed_file, encoding="utf-8"))

Step 5: check well-formedness of the results

In [None]:
# Parse every regular file written to annotated/<directory> and report the outcome.
for source_dir in input_data:
    annotated_dir = Path("annotated") / source_dir
    for result_path in annotated_dir.iterdir():
        if result_path.is_file():
            try:
                etree.parse(result_path)
                print(f"✅ File {result_path} is well-formed.")
            except Exception as parse_error:
                # Report the problem and carry on with the next file.
                print(f"❌ File {result_path} is NOT well-formed!\n{parse_error}")